Something went wrong on our end
-
Sebastian Wagner authoredSebastian Wagner authored
nrLDPC_bnProc.h 99.63 KiB
/*
* Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The OpenAirInterface Software Alliance licenses this file to You under
* the OAI Public License, Version 1.1 (the "License"); you may not use this file
* except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.openairinterface.org/?page_id=698
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org
*/
/*!\file nrLDPC_bnProc.h
* \brief Defines the functions for bit node processing
* \author Sebastian Wagner (TCL Communications) Email: <mailto:sebastian.wagner@tcl.com>
* \date 27-03-2018
* \version 1.0
* \note
* \warning
*/
#ifndef __NR_LDPC_BNPROC__H__
#define __NR_LDPC_BNPROC__H__
/**
\brief Performs first part of BN processing on the BN processing buffer and stores the results in the LLR results buffer.
At every BN, the sum of the returned LLRs from the connected CNs and the LLR of the receiver input is computed.
\param p_lut Pointer to decoder LUTs
\param Z Lifting size
*/
static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
const uint8_t* lut_numBnInBnGroups = p_lut->numBnInBnGroups;
const uint32_t* lut_startAddrBnGroups = p_lut->startAddrBnGroups;
const uint16_t* lut_startAddrBnGroupsLlr = p_lut->startAddrBnGroupsLlr;
int8_t* bnProcBuf = p_procBuf->bnProcBuf;
int8_t* bnProcBufRes = p_procBuf->bnProcBufRes;
int8_t* llrRes = p_procBuf->llrRes;
int8_t* llrProcBuf = p_procBuf->llrProcBuf;
__m128i* p_bnProcBuf;
__m256i* p_bnProcBufRes;
__m128i* p_llrProcBuf;
__m256i* p_llrProcBuf256;
__m256i* p_llrRes;
// Number of BNs in Groups
uint32_t M;
//uint32_t M32rem;
uint32_t i,j;
uint32_t k;
// Offset to each bit within a group in terms of 32 Byte
uint32_t cnOffsetInGroup;
uint8_t idxBnGroup = 0;
__m256i ymm0, ymm1, ymmRes0, ymmRes1;
// =====================================================================
// Process group with 1 CN
// There is always a BN group with 1 CN
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[0]*Z + 31)>>5;
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrProcBuf256 = (__m256i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// Store results in bnProcBufRes of first CN for further processing for next iteration
// In case parity check fails
p_bnProcBufRes[i] = p_llrProcBuf256[i];
// First 16 LLRs of first CN
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymm0, ymm1);
// Second 16 LLRs of first CN
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j+1]);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymm0, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
// =====================================================================
// Process group with 2 CNs
if (lut_numBnInBnGroups[1] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[1]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[1]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 2
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<2; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 3 CNs
if (lut_numBnInBnGroups[2] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[2]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[2]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 3
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<3; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 4 CNs
if (lut_numBnInBnGroups[3] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[3]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[3]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 4
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<4; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 5 CNs
if (lut_numBnInBnGroups[4] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[4]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[4]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 5
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<5; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 6 CNs
if (lut_numBnInBnGroups[5] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[5]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[5]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 6
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<6; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 7 CNs
if (lut_numBnInBnGroups[6] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[6]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[6]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 7
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<7; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 8 CNs
if (lut_numBnInBnGroups[7] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[7]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[7]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 8
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<8; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 9 CNs
if (lut_numBnInBnGroups[8] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[8]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[8]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 9
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<9; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 10 CNs
if (lut_numBnInBnGroups[9] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[9]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[9]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 10
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<10; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 11 CNs
if (lut_numBnInBnGroups[10] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[10]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[10]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 11
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<11; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 12 CNs
if (lut_numBnInBnGroups[11] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[11]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[11]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 12
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<12; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 13 CNs
if (lut_numBnInBnGroups[12] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[12]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[12]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 13
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<13; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 14 CNs
if (lut_numBnInBnGroups[13] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[13]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[13]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 14
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<14; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 15 CNs
if (lut_numBnInBnGroups[14] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[14]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[14]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 15
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<15; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 16 CNs
if (lut_numBnInBnGroups[15] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[15]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[15]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 16
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<16; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 17 CNs
if (lut_numBnInBnGroups[16] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[16]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[16]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 16
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<17; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 18 CNs
if (lut_numBnInBnGroups[17] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[17]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[17]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 18
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<18; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 19 CNs
if (lut_numBnInBnGroups[18] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[18]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[18]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 19
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<19; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 20 CNs
if (lut_numBnInBnGroups[19] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[19]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[19]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 20
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<20; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 21 CNs
if (lut_numBnInBnGroups[20] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[20]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[20]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 21
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<21; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 22 CNs
if (lut_numBnInBnGroups[21] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[21]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[21]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 22
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<22; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 23 CNs
if (lut_numBnInBnGroups[22] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[22]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[22]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 23
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<23; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 24 CNs
if (lut_numBnInBnGroups[23] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[23]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[23]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 24
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<24; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 25 CNs
if (lut_numBnInBnGroups[24] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[24]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[24]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 25
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<25; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 26 CNs
if (lut_numBnInBnGroups[25] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[25]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[25]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 26
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<26; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 27 CNs
if (lut_numBnInBnGroups[26] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[26]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[26]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 27
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<27; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 28 CNs
if (lut_numBnInBnGroups[27] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[27]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[27]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 28
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<28; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 29 CNs
if (lut_numBnInBnGroups[28] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[28]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[28]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 29
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<29; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
// =====================================================================
// Process group with 30 CNs
if (lut_numBnInBnGroups[29] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[29]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[29]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 30
p_bnProcBuf = (__m128i*) &bnProcBuf [lut_startAddrBnGroups [idxBnGroup]];
p_llrProcBuf = (__m128i*) &llrProcBuf [lut_startAddrBnGroupsLlr[idxBnGroup]];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0,j=0; i<M; i++,j+=2)
{
// First 16 LLRs of first CN
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs
for (k=1; k<30; k++)
{
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
}
// Add LLR from receiver input
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
// Pack results back to epi8
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
*p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
// Next result
p_llrRes++;
}
}
}
/**
\brief Performs second part of BN processing on the BN processing buffer and the LLR results buffer and stores the results in the BN processing results buffer.
At every BN, the LLR of the corresponding edge is subtracted from the sum computed in bnProcPc.
\param p_lut Pointer to decoder LUTs
\param Z Lifting size
*/
static inline void nrLDPC_bnProc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
// BN Processing calculating the values to send back to the CNs for next iteration
// bnProcBufRes contains the sum of all edges to each BN at the start of each group
const uint8_t* lut_numBnInBnGroups = p_lut->numBnInBnGroups;
const uint32_t* lut_startAddrBnGroups = p_lut->startAddrBnGroups;
const uint16_t* lut_startAddrBnGroupsLlr = p_lut->startAddrBnGroupsLlr;
int8_t* bnProcBuf = p_procBuf->bnProcBuf;
int8_t* bnProcBufRes = p_procBuf->bnProcBufRes;
int8_t* llrRes = p_procBuf->llrRes;
__m256i* p_bnProcBuf;
__m256i* p_bnProcBufRes;
__m256i* p_llrRes;
__m256i* p_res;
// Number of BNs in Groups
uint32_t M;
//uint32_t M32rem;
uint32_t i;
uint32_t k;
// Offset to each bit within a group in terms of 32 Byte
uint32_t cnOffsetInGroup;
uint8_t idxBnGroup = 0;
// =====================================================================
// Process group with 1 CN
// Already done in bnProcBufPc
// =====================================================================
// Process group with 2 CNs
if (lut_numBnInBnGroups[1] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[1]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[1]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 2
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<2; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes[lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 3 CNs
if (lut_numBnInBnGroups[2] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[2]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[2]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 3
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<3; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes[lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 4 CNs
if (lut_numBnInBnGroups[3] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[3]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[3]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<4; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes[lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 5 CNs
if (lut_numBnInBnGroups[4] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[4]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[4]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 5
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<5; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 6 CNs
if (lut_numBnInBnGroups[5] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[5]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[5]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 6
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<6; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes[lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 7 CNs
if (lut_numBnInBnGroups[6] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[6]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[6]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 7
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<7; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 8 CNs
if (lut_numBnInBnGroups[7] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[7]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[7]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 8
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<8; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 9 CNs
if (lut_numBnInBnGroups[8] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[8]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[8]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 9
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<9; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 10 CNs
if (lut_numBnInBnGroups[9] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[9]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[9]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 10
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<10; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 11 CNs
if (lut_numBnInBnGroups[10] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[10]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[10]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 10
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<11; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 12 CNs
if (lut_numBnInBnGroups[11] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[11]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[11]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 12
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<12; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 13 CNs
if (lut_numBnInBnGroups[12] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[12]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[12]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 13
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<13; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 14 CNs
if (lut_numBnInBnGroups[13] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[13]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[13]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 14
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<14; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 15 CNs
if (lut_numBnInBnGroups[14] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[14]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[14]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 15
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<15; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 16 CNs
if (lut_numBnInBnGroups[15] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[15]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[15]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 16
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<16; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 17 CNs
if (lut_numBnInBnGroups[16] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[16]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[16]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 17
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<17; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 18 CNs
if (lut_numBnInBnGroups[17] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[17]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[17]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 18
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<18; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 19 CNs
if (lut_numBnInBnGroups[18] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[18]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[18]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 19
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<19; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 20 CNs
if (lut_numBnInBnGroups[19] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[19]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[19]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 20
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<20; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 21 CNs
if (lut_numBnInBnGroups[20] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[20]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[20]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 21
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<21; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 22 CNs
if (lut_numBnInBnGroups[21] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[21]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[21]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 22
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<22; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 23 CNs
if (lut_numBnInBnGroups[22] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[22]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[22]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 23
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<23; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 24 CNs
if (lut_numBnInBnGroups[23] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[23]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[23]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 24
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<24; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 25 CNs
if (lut_numBnInBnGroups[24] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[24]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[24]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 25
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<25; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 26 CNs
if (lut_numBnInBnGroups[25] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[25]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[25]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 26
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<26; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 27 CNs
if (lut_numBnInBnGroups[26] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[26]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[26]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 27
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<27; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 28 CNs
if (lut_numBnInBnGroups[27] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[27]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[27]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 28
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<28; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 29 CNs
if (lut_numBnInBnGroups[28] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[28]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[28]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 29
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<29; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
// =====================================================================
// Process group with 30 CNs
if (lut_numBnInBnGroups[29] > 0)
{
// If elements in group move to next address
idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
M = (lut_numBnInBnGroups[29]*Z + 31)>>5;
// Set the offset to each CN within a group in terms of 32 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[29]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 30
p_bnProcBuf = (__m256i*) &bnProcBuf [lut_startAddrBnGroups[idxBnGroup]];
p_bnProcBufRes = (__m256i*) &bnProcBufRes[lut_startAddrBnGroups[idxBnGroup]];
// Loop over CNs
for (k=0; k<30; k++)
{
p_res = &p_bnProcBufRes[k*cnOffsetInGroup];
p_llrRes = (__m256i*) &llrRes [lut_startAddrBnGroupsLlr[idxBnGroup]];
// Loop over BNs
for (i=0; i<M; i++)
{
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[k*cnOffsetInGroup + i]);
p_res++;
p_llrRes++;
}
}
}
}
/**
\brief Performs hard-decision on output LLRs
\param out Pointer hard-decision output, every int8_t contains 0 or 1
\param llrOut Pointer to output LLRs
\param numLLR Number of LLRs
*/
static inline void nrLDPC_llr2bit(int8_t* out, int8_t* llrOut, uint16_t numLLR)
{
__m256i* p_llrOut = (__m256i*) llrOut;
__m256i* p_out = (__m256i*) out;
int8_t* p_llrOut8;
int8_t* p_out8;
uint32_t i;
uint32_t M = numLLR>>5;
uint32_t Mr = numLLR&31;
const __m256i* p_zeros = (__m256i*) zeros256_epi8;
const __m256i* p_ones = (__m256i*) ones256_epi8;
for (i=0; i<M; i++)
{
*p_out++ = _mm256_and_si256(*p_ones, _mm256_cmpgt_epi8(*p_zeros, *p_llrOut));
p_llrOut++;
}
if (Mr > 0)
{
// Remaining LLRs that do not fit in multiples of 32 bytes
p_llrOut8 = (int8_t*) p_llrOut;
p_out8 = (int8_t*) p_out;
for (i=0; i<Mr; i++)
{
if (p_llrOut8[i] < 0)
{
p_out8[i] = 1;
}
else
{
p_out8[i] = 0;
}
}
}
}
/**
\brief Performs hard-decision on output LLRs and packs the output in 32 bit values.
\param out Pointer hard-decision output, every int8_t contains 8 bits
\param llrOut Pointer to output LLRs
\param numLLR Number of LLRs
*/
static inline void nrLDPC_llr2bitPacked(int8_t* out, int8_t* llrOut, uint16_t numLLR)
{
__m256i* p_llrOut = (__m256i*) llrOut;
uint32_t* p_bits = (uint32_t*) out;
int8_t* p_llrOut8;
uint32_t bitsTmp = 0;
uint32_t i;
uint32_t M = numLLR>>5;
uint32_t Mr = numLLR&31;
for (i=0; i<M; i++)
{
*p_bits++ = _mm256_movemask_epi8(*p_llrOut);
p_llrOut++;
}
if (Mr > 0)
{
// Remaining LLRs that do not fit in multiples of 32 bytes
p_llrOut8 = (int8_t*) p_llrOut;
for (i=0; i<Mr; i++)
{
if (p_llrOut8[i] < 0)
{
bitsTmp |= (1<<i);
}
else
{
bitsTmp &= (0<<i);
}
}
}
*p_bits = bitsTmp;
}
#endif