diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProcPc_gen_BG1_avx2.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProcPc_gen_BG1_avx2.c index feb0fa7089338aa0ebfa855ac9318ed7d9b68604..fd015580495e5c1832ff666653ad45e189d5b88a 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProcPc_gen_BG1_avx2.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProcPc_gen_BG1_avx2.c @@ -71,7 +71,7 @@ fprintf(fd, "// Process group with 1 CNs \n"); // Process group with 2 CNs - +/* if (lut_numBnInBnGroups[0] > 0) { // If elements in group move to next address @@ -123,7 +123,7 @@ fprintf(fd, "// Process group with 1 CNs \n"); } // ===================================================================== // Process group with 2 CNs - +*/ fprintf(fd, "// Process group with 2 CNs \n"); diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProcPc_gen_BG1_avx2.o b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProcPc_gen_BG1_avx2.o new file mode 100644 index 0000000000000000000000000000000000000000..8caf4b85a31f5704239e0d6aa2071be5529f93a2 Binary files /dev/null and b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProcPc_gen_BG1_avx2.o differ diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProcPc_gen_BG2_avx2.o b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProcPc_gen_BG2_avx2.o new file mode 100644 index 0000000000000000000000000000000000000000..2f6a08ea358ebdad5d329ffe676e4f502c3abf58 Binary files /dev/null and b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProcPc_gen_BG2_avx2.o differ diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProc_gen_BG1_avx2.o b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProc_gen_BG1_avx2.o new file mode 100644 index 
0000000000000000000000000000000000000000..d40fd0a3af91fdbdb82afdac3ac2f5500eec1dc6 Binary files /dev/null and b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProc_gen_BG1_avx2.o differ diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProc_gen_BG2_avx2.o b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProc_gen_BG2_avx2.o new file mode 100644 index 0000000000000000000000000000000000000000..248c7244cf105f023b1896cd4308aee4e0e8ce83 Binary files /dev/null and b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProc_gen_BG2_avx2.o differ diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProc_gen_avx2 b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProc_gen_avx2 index d2fd3730144602aa16b4728e7acbcaff53d6e823..8e7d6324bd6cb8fa9fbb81d57e9d9c56413530da 100755 Binary files a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProc_gen_avx2 and b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/bnProc_gen_avx2 differ diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/main.o b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/main.o new file mode 100644 index 0000000000000000000000000000000000000000..c9ea8704ae2d57a0c36127605f696ab9b1febb99 Binary files /dev/null and b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/main.o differ diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/sauvegarde.tar.gz b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/sauvegarde.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..438d0b4b2111a33261ab02399920965505229458 Binary files /dev/null and b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc/sauvegarde.tar.gz differ diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc_avx512/bnProcPc_gen_BG1_avx512.c 
b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc_avx512/bnProcPc_gen_BG1_avx512.c new file mode 100644 index 0000000000000000000000000000000000000000..09bd99521639960ad989eb46a832cf33ee5c96af --- /dev/null +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc_avx512/bnProcPc_gen_BG1_avx512.c @@ -0,0 +1,1768 @@ + + + + +#include <stdint.h> +#include <immintrin.h> +#include "../../nrLDPCdecoder_defs.h" +#include "../../nrLDPC_types.h" + + +void nrLDPC_bnProcPc_BG1_generator_AVX512(int R) +{ + const char *ratestr[3]={"13","23","89"}; + + if (R<0 || R>2) {printf("Illegal R %d\n",R); abort();} + + + + + char fname[50]; + sprintf(fname,"../ldpc_gen_files/bn_avx512/nrLDPC_bnProcPc_BG1_R%s_AVX512.h",ratestr[R]); + FILE *fd=fopen(fname,"w"); + if (fd == NULL) {printf("Cannot create \n");abort();} + + + + fprintf(fd,"static inline void nrLDPC_bnProcPc_BG1_R%s_AVX512(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {\n",ratestr[R]); + const uint8_t* lut_numBnInBnGroups; + const uint32_t* lut_startAddrBnGroups; + const uint16_t* lut_startAddrBnGroupsLlr; + if (R==0) { + + + lut_numBnInBnGroups = lut_numBnInBnGroups_BG1_R13; + lut_startAddrBnGroups = lut_startAddrBnGroups_BG1_R13; + lut_startAddrBnGroupsLlr = lut_startAddrBnGroupsLlr_BG1_R13; + + } + else if (R==1){ + + lut_numBnInBnGroups = lut_numBnInBnGroups_BG1_R23; + lut_startAddrBnGroups = lut_startAddrBnGroups_BG1_R23; + lut_startAddrBnGroupsLlr = lut_startAddrBnGroupsLlr_BG1_R23; + } + else if (R==2) { + + lut_numBnInBnGroups = lut_numBnInBnGroups_BG1_R89; + lut_startAddrBnGroups = lut_startAddrBnGroups_BG1_R89; + lut_startAddrBnGroupsLlr = lut_startAddrBnGroupsLlr_BG1_R89; + } + else { printf("aborting, illegal R %d\n",R); fclose(fd);abort();} + // Number of BNs in Groups +// uint32_t M; + //uint32_t M32rem; + //uint32_t i,j; + uint32_t k; + // Offset to each bit within a group in terms of 32 Byte + uint32_t cnOffsetInGroup; + uint8_t idxBnGroup = 0; + + 
fprintf(fd," __m512i zmm0, zmm1, zmmRes0, zmmRes1; \n"); + + + fprintf(fd," __m256i* p_bnProcBuf; \n"); + fprintf(fd," __m256i* p_llrProcBuf;\n"); + fprintf(fd," __m512i* p_llrRes; \n"); + fprintf(fd," uint32_t M ;\n"); + + +fprintf(fd, "// Process group with 1 CNs \n"); + + + // Process group with 2 CNs + + if (lut_numBnInBnGroups[0] > 0) + { + // If elements in group move to next address + // idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[0] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[0]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n"); + + // Loop over CNs + for (k=1; k<1; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); 
+ // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + + fprintf(fd,"}\n"); + } + // ===================================================================== + // Process group with 2 CNs + + +fprintf(fd, "// Process group with 2 CNs \n"); + + // Process group with 2 CNs + + if (lut_numBnInBnGroups[1] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[1] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[1]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n"); + + // Loop over CNs + for (k=1; k<2; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = 
_mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + + fprintf(fd,"}\n"); + } + + // ===================================================================== + // Process group with 3 CNs + + +fprintf(fd, "// Process group with 3 CNs \n"); + + // Process group with 3 CNs + + if (lut_numBnInBnGroups[2] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[2] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[2]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<3; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," 
zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + + // ===================================================================== + // Process group with 4 CNs + +fprintf(fd, "// Process group with 4 CNs \n"); + + // Process group with 4 CNs + + if (lut_numBnInBnGroups[3] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[3] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[3]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<4; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," 
zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 5 CNs + +fprintf(fd, "// Process group with 5 CNs \n"); + + // Process group with 5 CNs + + if (lut_numBnInBnGroups[4] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[4] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[4]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<5; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", 
k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + + // ===================================================================== + // Process group with 6 CNs + +fprintf(fd, "// Process group with 6 CNs \n"); + + // Process group with 6 CNs + + if (lut_numBnInBnGroups[5] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[5] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[5]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<6; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", 
k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 7 CNs + +fprintf(fd, "// Process group with 7 CNs \n"); + + // Process group with 7 CNs + + if (lut_numBnInBnGroups[6] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[6] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[6]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = 
_mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<7; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + //fprintf(fd," (__m512i*) &llrRes[%d + i] = _mm512_permutex_epi64(zmm0, 0xD8);\n",lut_startAddrBnGroupsLlr[idxBnGroup]>>6 ); + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 8 CNs + +fprintf(fd, "// Process group with 8 CNs \n"); + + // Process group with 8 CNs + + if (lut_numBnInBnGroups[7] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[7] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[7]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," 
p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<8; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + //fprintf(fd," (__m512i*) &llrRes[%d + i] = _mm512_permutex_epi64(zmm0, 0xD8);\n",lut_startAddrBnGroupsLlr[idxBnGroup]>>6 ); + + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + // ===================================================================== + // Process group with 9 CNs + +fprintf(fd, "// Process group with 9 CNs \n"); + + // Process group with 9 CNs + + if (lut_numBnInBnGroups[8] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[8] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = 
(lut_numBnInBnGroups[8]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<9; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + //fprintf(fd," (__m512i*) &llrRes[%d + i] = _mm512_permutex_epi64(zmm0, 0xD8);\n",lut_startAddrBnGroupsLlr[idxBnGroup]>>6 ); + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 10 CNs + +fprintf(fd, "// Process group with 10 CNs \n"); + + // Process group with 10 CNs + + if (lut_numBnInBnGroups[9] > 0) + { + 
// If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[9] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[9]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<10; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + + // ===================================================================== + 
+fprintf(fd, "// Process group with 11 CNs \n"); + + + + // Process group with 2 CNs + + if (lut_numBnInBnGroups[10] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[10] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[10]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<11; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = 
_mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + // ===================================================================== + // Process group with 2 CNs + + +fprintf(fd, "// Process group with 12 CNs \n"); + + // Process group with 2 CNs + + if (lut_numBnInBnGroups[11] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[11] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[11]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<12; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] 
zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + // ===================================================================== + // Process group with 13 CNs + + +fprintf(fd, "// Process group with 13 CNs \n"); + + // Process group with 3 CNs + + if (lut_numBnInBnGroups[12] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[12] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[12]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<13; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = 
_mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + + // ===================================================================== + // Process group with 4 CNs + +fprintf(fd, "// Process group with 14 CNs \n"); + + // Process group with 4 CNs + + if (lut_numBnInBnGroups[13] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[13] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[13]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<14; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + 
fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 5 CNs + +fprintf(fd, "// Process group with 15 CNs \n"); + + // Process group with 5 CNs + + if (lut_numBnInBnGroups[14] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[14] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[14]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<15; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = 
_mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + + // ===================================================================== + // Process group with 6 CNs + +fprintf(fd, "// Process group with 16 CNs \n"); + + // Process group with 6 CNs + + if (lut_numBnInBnGroups[15] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[15] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[15]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<16; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = 
_mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 17 CNs + +fprintf(fd, "// Process group with 17 CNs \n"); + + // Process group with 17 CNs + + if (lut_numBnInBnGroups[16] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[16] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[16]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop 
over CNs + for (k=1; k<17; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 18 CNs + +fprintf(fd, "// Process group with 18 CNs \n"); + + // Process group with 8 CNs + + if (lut_numBnInBnGroups[17] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[17] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[17]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + 
fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<18; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + // ===================================================================== + // Process group with 9 CNs + +fprintf(fd, "// Process group with 19 CNs \n"); + + // Process group with 9 CNs + + if (lut_numBnInBnGroups[18] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[18] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[18]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes 
[%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<19; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 20 CNs + +fprintf(fd, "// Process group with 20 CNs \n"); + + // Process group with 20 CNs + + if (lut_numBnInBnGroups[19] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[19] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[19]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf 
[%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<20; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + + + + // ===================================================================== + +fprintf(fd, "// Process group with 21 CNs \n"); + + + + // Process group with 2 CNs + + if (lut_numBnInBnGroups[20] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[20] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = 
(lut_numBnInBnGroups[20]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<21; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + // ===================================================================== + // Process group with 2 CNs + + +fprintf(fd, "// Process group with 22 CNs \n"); + + // Process group with 2 CNs + + if (lut_numBnInBnGroups[21] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + 
fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[21] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[21]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<22; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + // ===================================================================== + // Process group with 13 CNs + + +fprintf(fd, "// Process group with <23 CNs \n"); + + // Process group with 3 CNs + + if 
(lut_numBnInBnGroups[22] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[22] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[22]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<23; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + + // 
===================================================================== + // Process group with 4 CNs + +fprintf(fd, "// Process group with 24 CNs \n"); + + // Process group with 4 CNs + + if (lut_numBnInBnGroups[23] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[23] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[23]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<24; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] 
zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 5 CNs + +fprintf(fd, "// Process group with 25 CNs \n"); + + // Process group with 5 CNs + + if (lut_numBnInBnGroups[24] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[24] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[24]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<25; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = 
_mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + + // ===================================================================== + // Process group with 6 CNs + +fprintf(fd, "// Process group with 26 CNs \n"); + + // Process group with 6 CNs + + if (lut_numBnInBnGroups[25] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[25] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[25]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<26; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = 
_mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 17 CNs + +fprintf(fd, "// Process group with 27 CNs \n"); + + // Process group with 17 CNs + + if (lut_numBnInBnGroups[26] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[26] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[26]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<27; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + 
fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 18 CNs + +fprintf(fd, "// Process group with 28 CNs \n"); + + // Process group with 8 CNs + + if (lut_numBnInBnGroups[27] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[27] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[27]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<28; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = 
_mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + // ===================================================================== + // Process group with 9 CNs + +fprintf(fd, "// Process group with 29 CNs \n"); + + // Process group with 9 CNs + + if (lut_numBnInBnGroups[28] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[28] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[28]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<29; k++) + { + fprintf(fd," zmm0 = 
_mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 20 CNs + +fprintf(fd, "// Process group with 30 CNs \n"); + + // Process group with 20 CNs + + if (lut_numBnInBnGroups[29] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[29] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[29]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf 
[j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<30; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);\n"); + // zmm0 = [zmmRes1[255:256] zmmRes0[255:256] zmmRes1[127:0] zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256] zmmRes1[127:0] zmmRes0[255:256] zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + fprintf(fd,"}\n"); + fclose(fd); +}//end of the function nrLDPC_bnProcPc_BG1 + + + + + diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc_avx512/bnProcPc_gen_BG2_avx512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc_avx512/bnProcPc_gen_BG2_avx512.c new file mode 100644 index 0000000000000000000000000000000000000000..a025ae8efb3b1b653720ae741c66e2d8f1ce38a6 --- /dev/null +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc_avx512/bnProcPc_gen_BG2_avx512.c @@ -0,0 +1,1770 @@ + + + +#include <stdint.h> +#include <immintrin.h> +#include "../../nrLDPCdecoder_defs.h" +#include "../../nrLDPC_types.h" + + +void nrLDPC_bnProcPc_BG2_generator_AVX512(int R) +{ + const char *ratestr[3]={"15","13","23"}; + + if (R<0 || R>2) {printf("Illegal R %d\n",R); abort();} + + + // system("mkdir -p ../ldpc_gen_files"); + + char 
fname[50]; + sprintf(fname,"../ldpc_gen_files/bn_avx512/nrLDPC_bnProcPc_BG2_R%s_AVX512.h",ratestr[R]); + FILE *fd=fopen(fname,"w"); + if (fd == NULL) {printf("Cannot create \n");abort();} + +// fprintf(fd,"#include <stdint.h>\n"); + //fprintf(fd,"#include <immintrin.h>\n"); + + fprintf(fd,"static inline void nrLDPC_bnProcPc_BG2_R%s_AVX512(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {\n",ratestr[R]); + const uint8_t* lut_numBnInBnGroups; + const uint32_t* lut_startAddrBnGroups; + const uint16_t* lut_startAddrBnGroupsLlr; + if (R==0) { + + + lut_numBnInBnGroups = lut_numBnInBnGroups_BG2_R15; + lut_startAddrBnGroups = lut_startAddrBnGroups_BG2_R15; + lut_startAddrBnGroupsLlr = lut_startAddrBnGroupsLlr_BG2_R15; + + } + else if (R==1){ + + lut_numBnInBnGroups = lut_numBnInBnGroups_BG2_R13; + lut_startAddrBnGroups = lut_startAddrBnGroups_BG2_R13; + lut_startAddrBnGroupsLlr = lut_startAddrBnGroupsLlr_BG2_R13; + } + else if (R==2) { + + lut_numBnInBnGroups = lut_numBnInBnGroups_BG2_R23; + lut_startAddrBnGroups = lut_startAddrBnGroups_BG2_R23; + lut_startAddrBnGroupsLlr = lut_startAddrBnGroupsLlr_BG2_R23; + } + else { printf("aborting, illegal R %d\n",R); fclose(fd);abort();} + + + uint32_t k; + // Offset to each bit within a group in terms of 32 Byte + uint32_t cnOffsetInGroup; + uint8_t idxBnGroup = 0; + + fprintf(fd," __m512i zmm0,zmm1,zmmRes0,zmmRes1; \n"); + + + fprintf(fd," __m256i* p_bnProcBuf; \n"); + fprintf(fd," __m256i* p_llrProcBuf;\n"); + fprintf(fd," __m512i* p_llrRes; \n"); + fprintf(fd," uint32_t M ;\n"); + + +fprintf(fd, "// Process group with 1 CNs \n"); + + + // Process group with 2 CNs + + if (lut_numBnInBnGroups[0] > 0) + { + // If elements in group move to next address + // idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[0] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = 
(lut_numBnInBnGroups[0]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + + // Loop over CNs + /*for (k=1; k<1; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } +*/ + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[j+1]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + + fprintf(fd,"}\n"); + } + // ===================================================================== + // Process group with 2 CNs + + +fprintf(fd, "// Process group with 2 CNs \n"); + + // Process group with 2 CNs + + if (lut_numBnInBnGroups[1] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z 
+ 63)>>6;\n",lut_numBnInBnGroups[1] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[1]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n"); + + // Loop over CNs + for (k=1; k<2; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + + fprintf(fd,"}\n"); + } + + // ===================================================================== + // Process group with 3 CNs + + +fprintf(fd, "// Process group with 3 CNs \n"); + + // Process group with 3 CNs + + if (lut_numBnInBnGroups[2] > 0) + { + // If elements 
in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[2] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[2]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<3; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + + // ===================================================================== + // Process group with 4 CNs + +fprintf(fd, "// Process group with 4 CNs \n"); + + // Process group with 4 CNs + + 
if (lut_numBnInBnGroups[3] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[3] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[3]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<4; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // 
===================================================================== + // Process group with 5 CNs + +fprintf(fd, "// Process group with 5 CNs \n"); + + // Process group with 5 CNs + + if (lut_numBnInBnGroups[4] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[4] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[4]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<5; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = 
[zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + + // ===================================================================== + // Process group with 6 CNs + +fprintf(fd, "// Process group with 6 CNs \n"); + + // Process group with 6 CNs + + if (lut_numBnInBnGroups[5] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[5] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[5]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<6; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = 
_mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 7 CNs + +fprintf(fd, "// Process group with 7 CNs \n"); + + // Process group with 7 CNs + + if (lut_numBnInBnGroups[6] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[6] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[6]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<7; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j 
+1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + //fprintf(fd," (__m512i*) &llrRes[%d + i] = _mm512_permutex_epi64(zmm0, 0xD8);\n",lut_startAddrBnGroupsLlr[idxBnGroup]>>5 ); + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 8 CNs + +fprintf(fd, "// Process group with 8 CNs \n"); + + // Process group with 8 CNs + + if (lut_numBnInBnGroups[7] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[7] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[7]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<8; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = 
_mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + //fprintf(fd," (__m512i*) &llrRes[%d + i] = _mm512_permutex_epi64(zmm0, 0xD8);\n",lut_startAddrBnGroupsLlr[idxBnGroup]>>5 ); + + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + // ===================================================================== + // Process group with 9 CNs + +fprintf(fd, "// Process group with 9 CNs \n"); + + // Process group with 9 CNs + + if (lut_numBnInBnGroups[8] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[8] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[8]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<9; k++) + { + fprintf(fd," 
zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + //fprintf(fd," (__m512i*) &llrRes[%d + i] = _mm512_permutex_epi64(zmm0, 0xD8);\n",lut_startAddrBnGroupsLlr[idxBnGroup]>>5 ); + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 10 CNs + +fprintf(fd, "// Process group with 10 CNs \n"); + + // Process group with 10 CNs + + if (lut_numBnInBnGroups[9] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[9] ); + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[9]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int 
i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<10; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + + // ===================================================================== + +fprintf(fd, "// Process group with 11 CNs \n"); + + + + // Process group with 2 CNs + + if (lut_numBnInBnGroups[10] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[10] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[10]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes 
= (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<11; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + // ===================================================================== + // Process group with 2 CNs + + +fprintf(fd, "// Process group with 12 CNs \n"); + + // Process group with 2 CNs + + if (lut_numBnInBnGroups[11] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[11] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[11]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf 
[%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<12; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + // ===================================================================== + // Process group with 13 CNs + + +fprintf(fd, "// Process group with 13 CNs \n"); + + // Process group with 3 CNs + + if (lut_numBnInBnGroups[12] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[12] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup 
= (lut_numBnInBnGroups[12]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<13; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + + // ===================================================================== + // Process group with 4 CNs + +fprintf(fd, "// Process group with 14 CNs \n"); + + // Process group with 4 CNs + + if (lut_numBnInBnGroups[13] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," 
M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[13] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[13]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<14; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 5 CNs + +fprintf(fd, "// Process group with 15 CNs \n"); + + // Process group with 5 CNs + + if (lut_numBnInBnGroups[14] > 0) + { 
+ // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[14] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[14]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<15; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + + // ===================================================================== + // Process 
group with 6 CNs + +fprintf(fd, "// Process group with 16 CNs \n"); + + // Process group with 6 CNs + + if (lut_numBnInBnGroups[15] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[15] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[15]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<16; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = 
_mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 17 CNs + +fprintf(fd, "// Process group with 17 CNs \n"); + + // Process group with 17 CNs + + if (lut_numBnInBnGroups[16] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[16] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[16]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<17; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = 
[zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 18 CNs + +fprintf(fd, "// Process group with 18 CNs \n"); + + // Process group with 8 CNs + + if (lut_numBnInBnGroups[17] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[17] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[17]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<18; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = 
_mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + // ===================================================================== + // Process group with 9 CNs + +fprintf(fd, "// Process group with 19 CNs \n"); + + // Process group with 9 CNs + + if (lut_numBnInBnGroups[18] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[18] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[18]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<19; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = 
_mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 20 CNs + +fprintf(fd, "// Process group with 20 CNs \n"); + + // Process group with 20 CNs + + if (lut_numBnInBnGroups[19] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[19] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[19]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<20; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = 
_mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + + + + // ===================================================================== + +fprintf(fd, "// Process group with 21 CNs \n"); + + + + // Process group with 2 CNs + + if (lut_numBnInBnGroups[20] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[20] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[20]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<21; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + 
fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + // ===================================================================== + // Process group with 2 CNs + + +fprintf(fd, "// Process group with 22 CNs \n"); + + // Process group with 2 CNs + + if (lut_numBnInBnGroups[21] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[21] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[21]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<22; k++) + { + fprintf(fd," zmm0 = 
_mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + // ===================================================================== + // Process group with 13 CNs + + +fprintf(fd, "// Process group with <23 CNs \n"); + + // Process group with 3 CNs + + if (lut_numBnInBnGroups[22] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[22] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[22]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + 
fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<23; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + + // ===================================================================== + // Process group with 4 CNs + +fprintf(fd, "// Process group with 24 CNs \n"); + + // Process group with 4 CNs + + if (lut_numBnInBnGroups[23] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[23] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[23]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for 
(int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<24; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 5 CNs + +fprintf(fd, "// Process group with 25 CNs \n"); + + // Process group with 5 CNs + + if (lut_numBnInBnGroups[24] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[24] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[24]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf 
[%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<25; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + + // ===================================================================== + // Process group with 6 CNs + +fprintf(fd, "// Process group with 26 CNs \n"); + + // Process group with 6 CNs + + if (lut_numBnInBnGroups[25] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[25] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[25]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + 
fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<26; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 17 CNs + +fprintf(fd, "// Process group with 27 CNs \n"); + + // Process group with 17 CNs + + if (lut_numBnInBnGroups[26] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[26] );; + + // Set the offset to each CN 
within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[26]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<27; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // ===================================================================== + // Process group with 18 CNs + +fprintf(fd, "// Process group with 28 CNs \n"); + + // Process group with 8 CNs + + if (lut_numBnInBnGroups[27] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of 
groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[27] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[27]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<28; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + // ===================================================================== + // Process group with 9 CNs + +fprintf(fd, "// Process group with 29 CNs \n"); + + // Process 
group with 9 CNs + + if (lut_numBnInBnGroups[28] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[28] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[28]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<29; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = [zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + + // 
===================================================================== + // Process group with 20 CNs + +fprintf(fd, "// Process group with 30 CNs \n"); + + // Process group with 20 CNs + + if (lut_numBnInBnGroups[29] > 0) + { + // If elements in group move to next address + idxBnGroup++; + + // Number of groups of 32 BNs for parallel processing + fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numBnInBnGroups[29] );; + + // Set the offset to each CN within a group in terms of 16 Byte + cnOffsetInGroup = (lut_numBnInBnGroups[29]*NR_LDPC_ZMAX)>>5; + + // Set pointers to start of group 2 + fprintf(fd," p_bnProcBuf = (__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); + fprintf(fd," p_llrProcBuf = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + fprintf(fd," p_llrRes = (__m512i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); + // Loop over BNs + fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); + // First 16 LLRs of first CN + fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); + fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]);\n"); + + // Loop over CNs + for (k=1; k<30; k++) + { + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); + + fprintf(fd, " zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); \n"); + } + + // Add LLR from receiver input + fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);\n"); + fprintf(fd," zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);\n"); + + fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); + fprintf(fd," zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1);\n"); + + // Pack results back to epi8 + fprintf(fd," zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1);\n"); + //zmm0 = [zmmRes1[255:256]zmmRes0[255:256]zmmRes1[127:0]zmmRes0[127:0]] + // p_llrRes = 
[zmmRes1[255:256]zmmRes1[127:0]zmmRes0[255:256]zmmRes0[127:0]] + fprintf(fd," p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8);\n"); + + fprintf(fd,"}\n"); + } + + fprintf(fd,"}\n"); + fclose(fd); +}//end of the function nrLDPC_bnProcPc_BG2 + + + + + + + + + diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc_avx512/bnProc_gen_avx512 b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc_avx512/bnProc_gen_avx512 index f686a271f18d31892f83a65819d96e3439325fd9..dabfabe18fdbb098b9ea9a0996079e0fde890445 100755 Binary files a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc_avx512/bnProc_gen_avx512 and b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc_avx512/bnProc_gen_avx512 differ diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc_avx512/main.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc_avx512/main.c index 3b2bc3ede498433e920ff2431f816ea7aa1a0d53..2bc870f17b10d31ab6e6ad4ac0bb761f5c0ef012 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc_avx512/main.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_bnProc_avx512/main.c @@ -4,8 +4,8 @@ #define NB_R 3 void nrLDPC_bnProc_BG1_generator_AVX512(int); void nrLDPC_bnProc_BG2_generator_AVX512(int); -//void nrLDPC_bnProcPc_BG1_generator_AVX2(int); -//void nrLDPC_bnProcPc_BG2_generator_AVX2(int); +void nrLDPC_bnProcPc_BG1_generator_AVX512(int); +void nrLDPC_bnProcPc_BG2_generator_AVX512(int); int main() { @@ -15,8 +15,8 @@ int main() nrLDPC_bnProc_BG1_generator_AVX512(R[i]); nrLDPC_bnProc_BG2_generator_AVX512(R[i]); -// nrLDPC_bnProcPc_BG1_generator_AVX2(R[i]); -// nrLDPC_bnProcPc_BG2_generator_AVX2(R[i]); + nrLDPC_bnProcPc_BG1_generator_AVX512(R[i]); + nrLDPC_bnProcPc_BG2_generator_AVX512(R[i]); } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProc/nrLDPC_bnProc_BG2_R13_AVX2.c 
b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProc/nrLDPC_bnProc_BG2_R13_AVX2.c new file mode 100644 index 0000000000000000000000000000000000000000..3f1c421cebdee768cc056933786826f33ddf9fe6 --- /dev/null +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProc/nrLDPC_bnProc_BG2_R13_AVX2.c @@ -0,0 +1,440 @@ +#include <stdint.h> +#include <immintrin.h> +void nrLDPC_bnProc_BG2_R13_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) { + __m256i* p_bnProcBuf; + __m256i* p_bnProcBufRes; + __m256i* p_llrRes; + __m256i* p_res; + uint32_t M, i; +// Process group with 2 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [6912]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [6912]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [6912]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [6912]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} +// Process group with 3 CNs +// Process group with 4 CNs + M = (2*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [7680]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [7680]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [7296]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [7296]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [7296]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [7296]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} +// Process group with 5 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [10752]; + 
p_bnProcBufRes = (__m256i*) &bnProcBufRes [10752]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [8064]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [8064]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [8064]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [8064]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [8064]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} +// Process group with 6 CNs + M = (5*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [12672]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [12672]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [8448]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [8448]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} + p_res = &p_bnProcBufRes[120]; + p_llrRes = (__m256i*) &llrRes [8448]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]); +} + p_res = &p_bnProcBufRes[180]; + p_llrRes = (__m256i*) &llrRes [8448]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[180 + i]); +} + p_res = &p_bnProcBufRes[240]; + p_llrRes = (__m256i*) &llrRes [8448]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[240 + i]); +} + p_res = &p_bnProcBufRes[300]; + p_llrRes = (__m256i*) &llrRes [8448]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[300 + i]); +} +// Process group with 
7 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [24192]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [24192]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [10368]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [10368]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [10368]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [10368]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [10368]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [10368]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [10368]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} +// Process group with 8 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [26880]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [26880]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [10752]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [10752]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [10752]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [10752]; + for (i=0;i<M;i++) { + p_res[i] = 
_mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [10752]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [10752]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [10752]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} + p_res = &p_bnProcBufRes[84]; + p_llrRes = (__m256i*) &llrRes [10752]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[84 + i]); +} +// Process group with 9 CNs +// Process group with 10 CNs +// Process group with 11 CNs +// Process group with 12 CNs +// Process group with 13 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [29952]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [29952]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [11136]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [11136]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [11136]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [11136]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [11136]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [11136]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) 
&llrRes [11136]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} + p_res = &p_bnProcBufRes[84]; + p_llrRes = (__m256i*) &llrRes [11136]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[84 + i]); +} + p_res = &p_bnProcBufRes[96]; + p_llrRes = (__m256i*) &llrRes [11136]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]); +} + p_res = &p_bnProcBufRes[108]; + p_llrRes = (__m256i*) &llrRes [11136]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[108 + i]); +} + p_res = &p_bnProcBufRes[120]; + p_llrRes = (__m256i*) &llrRes [11136]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]); +} + p_res = &p_bnProcBufRes[132]; + p_llrRes = (__m256i*) &llrRes [11136]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[132 + i]); +} + p_res = &p_bnProcBufRes[144]; + p_llrRes = (__m256i*) &llrRes [11136]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[144 + i]); +} +// Process group with 14 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [34944]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [34944]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [11520]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [11520]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [11520]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [11520]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [11520]; + for (i=0;i<M;i++) { + p_res[i] = 
_mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [11520]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [11520]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} + p_res = &p_bnProcBufRes[84]; + p_llrRes = (__m256i*) &llrRes [11520]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[84 + i]); +} + p_res = &p_bnProcBufRes[96]; + p_llrRes = (__m256i*) &llrRes [11520]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]); +} + p_res = &p_bnProcBufRes[108]; + p_llrRes = (__m256i*) &llrRes [11520]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[108 + i]); +} + p_res = &p_bnProcBufRes[120]; + p_llrRes = (__m256i*) &llrRes [11520]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]); +} + p_res = &p_bnProcBufRes[132]; + p_llrRes = (__m256i*) &llrRes [11520]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[132 + i]); +} + p_res = &p_bnProcBufRes[144]; + p_llrRes = (__m256i*) &llrRes [11520]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[144 + i]); +} + p_res = &p_bnProcBufRes[156]; + p_llrRes = (__m256i*) &llrRes [11520]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[156 + i]); +} +// Process group with 15 CNs +// Process group with 16 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [40320]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [40320]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [11904]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [11904]; + for (i=0;i<M;i++) { + p_res[i] = 
_mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [11904]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [11904]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [11904]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [11904]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [11904]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} + p_res = &p_bnProcBufRes[84]; + p_llrRes = (__m256i*) &llrRes [11904]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[84 + i]); +} + p_res = &p_bnProcBufRes[96]; + p_llrRes = (__m256i*) &llrRes [11904]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]); +} + p_res = &p_bnProcBufRes[108]; + p_llrRes = (__m256i*) &llrRes [11904]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[108 + i]); +} + p_res = &p_bnProcBufRes[120]; + p_llrRes = (__m256i*) &llrRes [11904]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]); +} + p_res = &p_bnProcBufRes[132]; + p_llrRes = (__m256i*) &llrRes [11904]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[132 + i]); +} + p_res = &p_bnProcBufRes[144]; + p_llrRes = (__m256i*) &llrRes [11904]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[144 + i]); +} + p_res = &p_bnProcBufRes[156]; + p_llrRes = (__m256i*) &llrRes [11904]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[156 
+ i]); +} + p_res = &p_bnProcBufRes[168]; + p_llrRes = (__m256i*) &llrRes [11904]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[168 + i]); +} + p_res = &p_bnProcBufRes[180]; + p_llrRes = (__m256i*) &llrRes [11904]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[180 + i]); +} +// Process group with 17 CNs +// Process group with 18 CNs +// Process group with 19 CNs +// Process group with 20 CNs +// Process group with 21 CNs +// Process group with 22 CNs +// Process group with <23 CNs +// Process group with 24 CNs +// Process group with 25 CNs +// Process group with 26 CNs +// Process group with 27 CNs +// Process group with 28 CNs +// Process group with 29 CNs +// Process group with 30 CNs +} diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProc/nrLDPC_bnProc_BG2_R15_AVX2.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProc/nrLDPC_bnProc_BG2_R15_AVX2.c new file mode 100644 index 0000000000000000000000000000000000000000..f649102df7a8cde518242a39292f8180778425e4 --- /dev/null +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProc/nrLDPC_bnProc_BG2_R15_AVX2.c @@ -0,0 +1,799 @@ +#include <stdint.h> +#include <immintrin.h> +void nrLDPC_bnProc_BG2_R15_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) { + __m256i* p_bnProcBuf; + __m256i* p_bnProcBufRes; + __m256i* p_llrRes; + __m256i* p_res; + uint32_t M, i; +// Process group with 2 CNs +// Process group with 3 CNs +// Process group with 4 CNs +// Process group with 5 CNs + M = (2*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [14592]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [14592]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [14592]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [14592]; + for (i=0;i<M;i++) { + p_res[i] = 
_mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [14592]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [14592]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} + p_res = &p_bnProcBufRes[96]; + p_llrRes = (__m256i*) &llrRes [14592]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]); +} +// Process group with 6 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [18432]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [18432]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [15360]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [15360]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [15360]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [15360]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [15360]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [15360]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} +// Process group with 7 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [20736]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [20736]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [15744]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = 
&p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [15744]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [15744]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [15744]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [15744]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [15744]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [15744]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} +// Process group with 8 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [23424]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [23424]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [16128]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [16128]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [16128]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [16128]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [16128]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [16128]; + for 
(i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [16128]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} + p_res = &p_bnProcBufRes[84]; + p_llrRes = (__m256i*) &llrRes [16128]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[84 + i]); +} +// Process group with 9 CNs + M = (2*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [26496]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [26496]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [16512]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [16512]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [16512]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [16512]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} + p_res = &p_bnProcBufRes[96]; + p_llrRes = (__m256i*) &llrRes [16512]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]); +} + p_res = &p_bnProcBufRes[120]; + p_llrRes = (__m256i*) &llrRes [16512]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]); +} + p_res = &p_bnProcBufRes[144]; + p_llrRes = (__m256i*) &llrRes [16512]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[144 + i]); +} + p_res = &p_bnProcBufRes[168]; + p_llrRes = (__m256i*) &llrRes [16512]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[168 + i]); +} + p_res = &p_bnProcBufRes[192]; + p_llrRes = (__m256i*) &llrRes [16512]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], 
p_bnProcBuf[192 + i]); +} +// Process group with 10 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [33408]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [33408]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [17280]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [17280]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [17280]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [17280]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [17280]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [17280]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [17280]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} + p_res = &p_bnProcBufRes[84]; + p_llrRes = (__m256i*) &llrRes [17280]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[84 + i]); +} + p_res = &p_bnProcBufRes[96]; + p_llrRes = (__m256i*) &llrRes [17280]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]); +} + p_res = &p_bnProcBufRes[108]; + p_llrRes = (__m256i*) &llrRes [17280]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[108 + i]); +} +// Process group with 11 CNs +// Process group with 12 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [37248]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [37248]; + p_res = 
&p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [17664]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [17664]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [17664]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [17664]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [17664]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [17664]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [17664]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} + p_res = &p_bnProcBufRes[84]; + p_llrRes = (__m256i*) &llrRes [17664]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[84 + i]); +} + p_res = &p_bnProcBufRes[96]; + p_llrRes = (__m256i*) &llrRes [17664]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]); +} + p_res = &p_bnProcBufRes[108]; + p_llrRes = (__m256i*) &llrRes [17664]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[108 + i]); +} + p_res = &p_bnProcBufRes[120]; + p_llrRes = (__m256i*) &llrRes [17664]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]); +} + p_res = &p_bnProcBufRes[132]; + p_llrRes = (__m256i*) &llrRes [17664]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[132 + i]); +} +// Process group with 13 CNs + M = (1*Z + 31)>>5; + 
p_bnProcBuf = (__m256i*) &bnProcBuf [41856]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [41856]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [18048]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [18048]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [18048]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [18048]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [18048]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [18048]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [18048]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} + p_res = &p_bnProcBufRes[84]; + p_llrRes = (__m256i*) &llrRes [18048]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[84 + i]); +} + p_res = &p_bnProcBufRes[96]; + p_llrRes = (__m256i*) &llrRes [18048]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]); +} + p_res = &p_bnProcBufRes[108]; + p_llrRes = (__m256i*) &llrRes [18048]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[108 + i]); +} + p_res = &p_bnProcBufRes[120]; + p_llrRes = (__m256i*) &llrRes [18048]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]); +} + p_res = &p_bnProcBufRes[132]; + p_llrRes = (__m256i*) &llrRes [18048]; + for (i=0;i<M;i++) { + p_res[i] = 
_mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[132 + i]); +} + p_res = &p_bnProcBufRes[144]; + p_llrRes = (__m256i*) &llrRes [18048]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[144 + i]); +} +// Process group with 14 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [46848]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [46848]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [18432]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [18432]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [18432]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [18432]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [18432]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [18432]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [18432]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} + p_res = &p_bnProcBufRes[84]; + p_llrRes = (__m256i*) &llrRes [18432]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[84 + i]); +} + p_res = &p_bnProcBufRes[96]; + p_llrRes = (__m256i*) &llrRes [18432]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]); +} + p_res = &p_bnProcBufRes[108]; + p_llrRes = (__m256i*) &llrRes [18432]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[108 + i]); +} + p_res 
= &p_bnProcBufRes[120]; + p_llrRes = (__m256i*) &llrRes [18432]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]); +} + p_res = &p_bnProcBufRes[132]; + p_llrRes = (__m256i*) &llrRes [18432]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[132 + i]); +} + p_res = &p_bnProcBufRes[144]; + p_llrRes = (__m256i*) &llrRes [18432]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[144 + i]); +} + p_res = &p_bnProcBufRes[156]; + p_llrRes = (__m256i*) &llrRes [18432]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[156 + i]); +} +// Process group with 15 CNs +// Process group with 16 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [52224]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [52224]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [18816]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [18816]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [18816]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [18816]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [18816]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [18816]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [18816]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} + p_res = &p_bnProcBufRes[84]; + 
p_llrRes = (__m256i*) &llrRes [18816]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[84 + i]); +} + p_res = &p_bnProcBufRes[96]; + p_llrRes = (__m256i*) &llrRes [18816]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]); +} + p_res = &p_bnProcBufRes[108]; + p_llrRes = (__m256i*) &llrRes [18816]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[108 + i]); +} + p_res = &p_bnProcBufRes[120]; + p_llrRes = (__m256i*) &llrRes [18816]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]); +} + p_res = &p_bnProcBufRes[132]; + p_llrRes = (__m256i*) &llrRes [18816]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[132 + i]); +} + p_res = &p_bnProcBufRes[144]; + p_llrRes = (__m256i*) &llrRes [18816]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[144 + i]); +} + p_res = &p_bnProcBufRes[156]; + p_llrRes = (__m256i*) &llrRes [18816]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[156 + i]); +} + p_res = &p_bnProcBufRes[168]; + p_llrRes = (__m256i*) &llrRes [18816]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[168 + i]); +} + p_res = &p_bnProcBufRes[180]; + p_llrRes = (__m256i*) &llrRes [18816]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[180 + i]); +} +// Process group with 17 CNs +// Process group with 18 CNs +// Process group with 19 CNs +// Process group with 20 CNs +// Process group with 21 CNs +// Process group with 22 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [58368]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [58368]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + 
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} + p_res = &p_bnProcBufRes[84]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[84 + i]); +} + p_res = &p_bnProcBufRes[96]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]); +} + p_res = &p_bnProcBufRes[108]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[108 + i]); +} + p_res = &p_bnProcBufRes[120]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]); +} + p_res = &p_bnProcBufRes[132]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[132 + i]); +} + p_res = &p_bnProcBufRes[144]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[144 + i]); +} + p_res = &p_bnProcBufRes[156]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], 
p_bnProcBuf[156 + i]); +} + p_res = &p_bnProcBufRes[168]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[168 + i]); +} + p_res = &p_bnProcBufRes[180]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[180 + i]); +} + p_res = &p_bnProcBufRes[192]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[192 + i]); +} + p_res = &p_bnProcBufRes[204]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[204 + i]); +} + p_res = &p_bnProcBufRes[216]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[216 + i]); +} + p_res = &p_bnProcBufRes[228]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[228 + i]); +} + p_res = &p_bnProcBufRes[240]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[240 + i]); +} + p_res = &p_bnProcBufRes[252]; + p_llrRes = (__m256i*) &llrRes [19200]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[252 + i]); +} +// Process group with <23 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [66816]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [66816]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = 
&p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} + p_res = &p_bnProcBufRes[84]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[84 + i]); +} + p_res = &p_bnProcBufRes[96]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]); +} + p_res = &p_bnProcBufRes[108]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[108 + i]); +} + p_res = &p_bnProcBufRes[120]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]); +} + p_res = &p_bnProcBufRes[132]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[132 + i]); +} + p_res = &p_bnProcBufRes[144]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[144 + i]); +} + p_res = &p_bnProcBufRes[156]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[156 + i]); +} + p_res = &p_bnProcBufRes[168]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[168 + i]); +} + p_res = &p_bnProcBufRes[180]; + p_llrRes = 
(__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[180 + i]); +} + p_res = &p_bnProcBufRes[192]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[192 + i]); +} + p_res = &p_bnProcBufRes[204]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[204 + i]); +} + p_res = &p_bnProcBufRes[216]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[216 + i]); +} + p_res = &p_bnProcBufRes[228]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[228 + i]); +} + p_res = &p_bnProcBufRes[240]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[240 + i]); +} + p_res = &p_bnProcBufRes[252]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[252 + i]); +} + p_res = &p_bnProcBufRes[264]; + p_llrRes = (__m256i*) &llrRes [19584]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[264 + i]); +} +// Process group with 24 CNs +// Process group with 25 CNs +// Process group with 26 CNs +// Process group with 27 CNs +// Process group with 28 CNs +// Process group with 29 CNs +// Process group with 30 CNs +} diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProc/nrLDPC_bnProc_BG2_R23_AVX2.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProc/nrLDPC_bnProc_BG2_R23_AVX2.c new file mode 100644 index 0000000000000000000000000000000000000000..59a1613099c8da476887c855153e5a4b2ebdd575 --- /dev/null +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProc/nrLDPC_bnProc_BG2_R23_AVX2.c @@ -0,0 +1,153 @@ +#include <stdint.h> +#include <immintrin.h> +void 
nrLDPC_bnProc_BG2_R23_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) { + __m256i* p_bnProcBuf; + __m256i* p_bnProcBufRes; + __m256i* p_llrRes; + __m256i* p_res; + uint32_t M, i; +// Process group with 2 CNs + M = (3*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [1152]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [1152]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [1152]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [1152]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} +// Process group with 3 CNs + M = (5*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [3456]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [3456]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [2304]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [2304]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} + p_res = &p_bnProcBufRes[120]; + p_llrRes = (__m256i*) &llrRes [2304]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]); +} +// Process group with 4 CNs + M = (3*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [9216]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [9216]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [4224]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [4224]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [4224]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} + p_res = &p_bnProcBufRes[108]; + p_llrRes = (__m256i*) &llrRes 
[4224]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[108 + i]); +} +// Process group with 5 CNs + M = (2*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [13824]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [13824]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [5376]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [5376]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [5376]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); +} + p_res = &p_bnProcBufRes[72]; + p_llrRes = (__m256i*) &llrRes [5376]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); +} + p_res = &p_bnProcBufRes[96]; + p_llrRes = (__m256i*) &llrRes [5376]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]); +} +// Process group with 6 CNs + M = (1*Z + 31)>>5; + p_bnProcBuf = (__m256i*) &bnProcBuf [17664]; + p_bnProcBufRes = (__m256i*) &bnProcBufRes [17664]; + p_res = &p_bnProcBufRes[0]; + p_llrRes = (__m256i*) &llrRes [6144]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); +} + p_res = &p_bnProcBufRes[12]; + p_llrRes = (__m256i*) &llrRes [6144]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); +} + p_res = &p_bnProcBufRes[24]; + p_llrRes = (__m256i*) &llrRes [6144]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); +} + p_res = &p_bnProcBufRes[36]; + p_llrRes = (__m256i*) &llrRes [6144]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); +} + p_res = &p_bnProcBufRes[48]; + p_llrRes = (__m256i*) &llrRes [6144]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + 
i]); +} + p_res = &p_bnProcBufRes[60]; + p_llrRes = (__m256i*) &llrRes [6144]; + for (i=0;i<M;i++) { + p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); +} +// Process group with 7 CNs +// Process group with 8 CNs +// Process group with 9 CNs +// Process group with 10 CNs +// Process group with 11 CNs +// Process group with 12 CNs +// Process group with 13 CNs +// Process group with 14 CNs +// Process group with 15 CNs +// Process group with 16 CNs +// Process group with 17 CNs +// Process group with 18 CNs +// Process group with 19 CNs +// Process group with 20 CNs +// Process group with 21 CNs +// Process group with 22 CNs +// Process group with <23 CNs +// Process group with 24 CNs +// Process group with 25 CNs +// Process group with 26 CNs +// Process group with 27 CNs +// Process group with 28 CNs +// Process group with 29 CNs +// Process group with 30 CNs +} diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProcPc/nrLDPC_bnProcPc_BG1_R13_AVX2.h b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProcPc/nrLDPC_bnProcPc_BG1_R13_AVX2.h index a4508d2d7a3db278a3700ccba3ae4c0c0fad8168..b10a842171b778f7d4323b6e1ad35ac6deb01b95 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProcPc/nrLDPC_bnProcPc_BG1_R13_AVX2.h +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProcPc/nrLDPC_bnProcPc_BG1_R13_AVX2.h @@ -5,20 +5,6 @@ static inline void nrLDPC_bnProcPc_BG1_R13_AVX2(int8_t* bnProcBuf,int8_t* llrRes __m256i* p_llrRes; uint32_t M ; // Process group with 1 CNs - M = (42*Z + 31)>>5; - p_bnProcBuf = (__m128i*) &bnProcBuf [0]; - p_llrProcBuf = (__m128i*) &llrProcBuf [0]; - p_llrRes = (__m256i*) &llrRes [0]; - for (int i=0,j=0;i<M;i++,j+=2) { - ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]); - ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]); - ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]); - ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0); - ymm1 = 
_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]); - ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1); - ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1); - p_llrRes[i] = _mm256_permute4x64_epi64(ymm0, 0xD8); -} // Process group with 2 CNs // Process group with 3 CNs // Process group with 4 CNs diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProcPc/nrLDPC_bnProcPc_BG1_R23_AVX2.h b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProcPc/nrLDPC_bnProcPc_BG1_R23_AVX2.h index 47fd449fe44011f19cfdd68225d5c79fb62e37d5..8647e8cde6a758fa7e38dbbda53184d7559cfc38 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProcPc/nrLDPC_bnProcPc_BG1_R23_AVX2.h +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProcPc/nrLDPC_bnProcPc_BG1_R23_AVX2.h @@ -5,20 +5,6 @@ static inline void nrLDPC_bnProcPc_BG1_R23_AVX2(int8_t* bnProcBuf,int8_t* llrRes __m256i* p_llrRes; uint32_t M ; // Process group with 1 CNs - M = (9*Z + 31)>>5; - p_bnProcBuf = (__m128i*) &bnProcBuf [0]; - p_llrProcBuf = (__m128i*) &llrProcBuf [0]; - p_llrRes = (__m256i*) &llrRes [0]; - for (int i=0,j=0;i<M;i++,j+=2) { - ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]); - ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]); - ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]); - ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0); - ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]); - ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1); - ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1); - p_llrRes[i] = _mm256_permute4x64_epi64(ymm0, 0xD8); -} // Process group with 2 CNs M = (1*Z + 31)>>5; p_bnProcBuf = (__m128i*) &bnProcBuf [3456]; diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProcPc/nrLDPC_bnProcPc_BG1_R89_AVX2.h b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProcPc/nrLDPC_bnProcPc_BG1_R89_AVX2.h index 8c7fa43f5d51237654926b1f51cc3b66b907533c..aee271db84d036d34c317d326ede442d84b51056 100644 --- 
a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProcPc/nrLDPC_bnProcPc_BG1_R89_AVX2.h +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bnProcPc/nrLDPC_bnProcPc_BG1_R89_AVX2.h @@ -5,20 +5,6 @@ static inline void nrLDPC_bnProcPc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* llrRes __m256i* p_llrRes; uint32_t M ; // Process group with 1 CNs - M = (1*Z + 31)>>5; - p_bnProcBuf = (__m128i*) &bnProcBuf [0]; - p_llrProcBuf = (__m128i*) &llrProcBuf [0]; - p_llrRes = (__m256i*) &llrRes [0]; - for (int i=0,j=0;i<M;i++,j+=2) { - ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]); - ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]); - ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]); - ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0); - ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]); - ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1); - ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1); - p_llrRes[i] = _mm256_permute4x64_epi64(ymm0, 0xD8); -} // Process group with 2 CNs M = (3*Z + 31)>>5; p_bnProcBuf = (__m128i*) &bnProcBuf [384]; diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bn_avx512/nrLDPC_bnProcPc_BG1_R13_AVX512.h b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bn_avx512/nrLDPC_bnProcPc_BG1_R13_AVX512.h new file mode 100644 index 0000000000000000000000000000000000000000..1db81b7a061e6c15bcbe6d842f7765a708aa16b3 --- /dev/null +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bn_avx512/nrLDPC_bnProcPc_BG1_R13_AVX512.h @@ -0,0 +1,743 @@ +static inline void nrLDPC_bnProcPc_BG1_R13_AVX512(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) { + __m512i zmm0, zmm1, zmmRes0, zmmRes1; + __m256i* p_bnProcBuf; + __m256i* p_llrProcBuf; + __m512i* p_llrRes; + uint32_t M ; +// Process group with 1 CNs + M = (42*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [0]; + p_llrProcBuf = (__m256i*) &llrProcBuf [0]; + p_llrRes = (__m512i*) &llrRes [0]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmm0 = 
_mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 2 CNs +// Process group with 3 CNs +// Process group with 4 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [16128]; + p_llrProcBuf = (__m256i*) &llrProcBuf [16128]; + p_llrRes = (__m512i*) &llrRes [16128]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 5 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [17664]; + p_llrProcBuf = (__m256i*) &llrProcBuf [16512]; + p_llrRes = (__m512i*) &llrRes [16512]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = 
_mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 6 CNs + M = (2*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [19584]; + p_llrProcBuf = (__m256i*) &llrProcBuf [16896]; + p_llrRes = (__m512i*) &llrRes [16896]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, 
zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 7 CNs + M = (4*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [24192]; + p_llrProcBuf = (__m256i*) &llrProcBuf [17664]; + p_llrRes = (__m512i*) &llrRes [17664]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[240 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[240 + j +1]); + zmmRes1 = 
_mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[288 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[288 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 8 CNs + M = (3*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [34944]; + p_llrProcBuf = (__m256i*) &llrProcBuf [19200]; + p_llrRes = (__m512i*) &llrRes [19200]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[216 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = 
_mm512_cvtepi8_epi16(p_bnProcBuf[216 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[252 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[252 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 9 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [44160]; + p_llrProcBuf = (__m256i*) &llrProcBuf [20352]; + p_llrRes = (__m512i*) &llrRes [20352]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, 
zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 10 CNs + M = (4*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [47616]; + p_llrProcBuf = (__m256i*) &llrProcBuf [20736]; + p_llrRes = (__m512i*) &llrRes [20736]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[240 + j]); + zmmRes0 = 
_mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[240 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[288 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[288 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[336 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[336 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[384 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[384 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[432 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[432 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 11 CNs + M = (3*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [62976]; + p_llrProcBuf = (__m256i*) &llrProcBuf [22272]; + p_llrRes = (__m512i*) &llrRes [22272]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = 
_mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[216 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[216 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[252 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[252 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[288 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[288 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[324 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[324 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[360 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[360 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 12 CNs + M = (4*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [75648]; + 
p_llrProcBuf = (__m256i*) &llrProcBuf [23424]; + p_llrRes = (__m512i*) &llrRes [23424]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[240 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[240 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[288 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[288 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[336 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[336 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[384 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[384 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[432 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[432 + j +1]); + zmmRes1 = 
_mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[480 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[480 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[528 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[528 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 13 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [94080]; + p_llrProcBuf = (__m256i*) &llrProcBuf [24960]; + p_llrRes = (__m512i*) &llrRes [24960]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j 
+1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 14 CNs +// Process group with 15 CNs +// Process group with 16 CNs +// Process group with 17 CNs +// Process group with 18 CNs +// Process group with 19 CNs +// Process group with 20 CNs +// Process group with 21 CNs +// Process 
group with 22 CNs +// Process group with <23 CNs +// Process group with 24 CNs +// Process group with 25 CNs +// Process group with 26 CNs +// Process group with 27 CNs +// Process group with 28 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [99072]; + p_llrProcBuf = (__m256i*) &llrProcBuf [25344]; + p_llrRes = (__m512i*) &llrRes [25344]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = 
_mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[156 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[156 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[168 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[168 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[204 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[204 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[216 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); 
+ zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[216 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[228 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[228 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[240 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[240 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[252 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[252 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[264 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[264 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[276 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[276 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[288 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[288 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[300 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[300 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[312 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[312 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[324 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[324 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, 
zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 29 CNs +// Process group with 30 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [109824]; + p_llrProcBuf = (__m256i*) &llrProcBuf [25728]; + p_llrRes = (__m512i*) &llrRes [25728]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = 
_mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[156 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[156 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[168 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[168 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[204 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[204 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); 
+ zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[216 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[216 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[228 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[228 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[240 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[240 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[252 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[252 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[264 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[264 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[276 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[276 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[288 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[288 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[300 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[300 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[312 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[312 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[324 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[324 + j +1]); + zmmRes1 = 
/**
 * Generated AVX-512 "bnProcPc" kernel (BG1 / R23 variant, per the function name).
 *
 * For each group of the parity-check layout (groups are keyed by their number
 * of CNs), every iteration:
 *   1. sign-extends 2 x 32 int8 values from bnProcBuf to int16,
 *   2. adds, with int16 saturation, the values of every further CN of the
 *      group (read at a fixed per-group stride inside bnProcBuf),
 *   3. adds the matching 64 int8 values from llrProcBuf,
 *   4. packs the sums back to saturated int8 and stores 64 bytes to llrRes.
 *
 * The accumulators are seeded from the first bnProcBuf load in EVERY group.
 * They are declared inside the loop so that they can never be read before
 * being assigned and never carry a sum from one iteration into the next.
 *
 * No code is emitted for the groups with 7-10 and 13-30 CNs in this variant.
 *
 * @param bnProcBuf   int8 input messages; read from the hard-coded byte
 *                    offsets below.
 * @param llrRes      int8 output; written at the same byte offsets that are
 *                    used for llrProcBuf.
 * @param llrProcBuf  int8 input LLRs.
 * @param Z           scales the iteration count of each group:
 *                    M = (n*Z + 63) >> 6, i.e. 64 values per iteration.
 *
 * NOTE(review): the buffers are dereferenced through __m256i* / __m512i*, so
 * they presumably have to be 32-/64-byte aligned -- confirm at the allocation
 * site.
 * NOTE(review): _mm512_packs_epi16 interleaves per 128-bit lane and
 * _mm512_permutex_epi64 only permutes inside each 256-bit half, so with
 * control 0xD8 the 64 stored bytes are ordered 0-15, 32-47, 16-31, 48-63
 * relative to the inputs. Confirm that consumers of llrRes expect this
 * layout; a full-width _mm512_permutexvar_epi64 would be needed for linear
 * order. Left unchanged here.
 */
static inline void nrLDPC_bnProcPc_BG1_R23_AVX512(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
  __m256i* p_bnProcBuf;
  __m256i* p_llrProcBuf;
  __m512i* p_llrRes;
  uint32_t M;

  // Process group with 1 CNs
  M = (9*Z + 63)>>6;
  p_bnProcBuf  = (__m256i*) &bnProcBuf [0];
  p_llrProcBuf = (__m256i*) &llrProcBuf[0];
  p_llrRes     = (__m512i*) &llrRes    [0];
  for (uint32_t i = 0, j = 0; i < M; i++, j += 2) {
    // Seed the accumulators with the single CN message
    __m512i zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf[j]);
    __m512i zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);
    // Add the input LLRs
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_llrProcBuf[j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_llrProcBuf[j + 1]));
    // Pack to saturated int8 and store
    p_llrRes[i] = _mm512_permutex_epi64(_mm512_packs_epi16(zmmRes0, zmmRes1), 0xD8);
  }

  // Process group with 2 CNs
  M = (1*Z + 63)>>6;
  p_bnProcBuf  = (__m256i*) &bnProcBuf [3456];
  p_llrProcBuf = (__m256i*) &llrProcBuf[3456];
  p_llrRes     = (__m512i*) &llrRes    [3456];
  for (uint32_t i = 0, j = 0; i < M; i++, j += 2) {
    // Seed the accumulators with the first CN message
    __m512i zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf[j]);
    __m512i zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_llrProcBuf[j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_llrProcBuf[j + 1]));
    p_llrRes[i] = _mm512_permutex_epi64(_mm512_packs_epi16(zmmRes0, zmmRes1), 0xD8);
  }

  // Process group with 3 CNs
  M = (5*Z + 63)>>6;
  p_bnProcBuf  = (__m256i*) &bnProcBuf [4224];
  p_llrProcBuf = (__m256i*) &llrProcBuf[3840];
  p_llrRes     = (__m512i*) &llrRes    [3840];
  for (uint32_t i = 0, j = 0; i < M; i++, j += 2) {
    __m512i zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf[j]);
    __m512i zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_llrProcBuf[j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_llrProcBuf[j + 1]));
    p_llrRes[i] = _mm512_permutex_epi64(_mm512_packs_epi16(zmmRes0, zmmRes1), 0xD8);
  }

  // Process group with 4 CNs
  M = (3*Z + 63)>>6;
  p_bnProcBuf  = (__m256i*) &bnProcBuf [9984];
  p_llrProcBuf = (__m256i*) &llrProcBuf[5760];
  p_llrRes     = (__m512i*) &llrRes    [5760];
  for (uint32_t i = 0, j = 0; i < M; i++, j += 2) {
    __m512i zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf[j]);
    __m512i zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_llrProcBuf[j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_llrProcBuf[j + 1]));
    p_llrRes[i] = _mm512_permutex_epi64(_mm512_packs_epi16(zmmRes0, zmmRes1), 0xD8);
  }

  // Process group with 5 CNs
  M = (7*Z + 63)>>6;
  p_bnProcBuf  = (__m256i*) &bnProcBuf [14592];
  p_llrProcBuf = (__m256i*) &llrProcBuf[6912];
  p_llrRes     = (__m512i*) &llrRes    [6912];
  for (uint32_t i = 0, j = 0; i < M; i++, j += 2) {
    __m512i zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf[j]);
    __m512i zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[168 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[168 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[252 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[252 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[336 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[336 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_llrProcBuf[j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_llrProcBuf[j + 1]));
    p_llrRes[i] = _mm512_permutex_epi64(_mm512_packs_epi16(zmmRes0, zmmRes1), 0xD8);
  }

  // Process group with 6 CNs
  M = (8*Z + 63)>>6;
  p_bnProcBuf  = (__m256i*) &bnProcBuf [28032];
  p_llrProcBuf = (__m256i*) &llrProcBuf[9600];
  p_llrRes     = (__m512i*) &llrRes    [9600];
  for (uint32_t i = 0, j = 0; i < M; i++, j += 2) {
    __m512i zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf[j]);
    __m512i zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[288 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[288 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[384 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[384 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[480 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[480 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_llrProcBuf[j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_llrProcBuf[j + 1]));
    p_llrRes[i] = _mm512_permutex_epi64(_mm512_packs_epi16(zmmRes0, zmmRes1), 0xD8);
  }

  // Groups with 7, 8, 9 and 10 CNs: nothing to process in this variant

  // Process group with 11 CNs
  M = (1*Z + 63)>>6;
  p_bnProcBuf  = (__m256i*) &bnProcBuf [46464];
  p_llrProcBuf = (__m256i*) &llrProcBuf[12672];
  p_llrRes     = (__m512i*) &llrRes    [12672];
  for (uint32_t i = 0, j = 0; i < M; i++, j += 2) {
    __m512i zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf[j]);
    __m512i zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_llrProcBuf[j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_llrProcBuf[j + 1]));
    p_llrRes[i] = _mm512_permutex_epi64(_mm512_packs_epi16(zmmRes0, zmmRes1), 0xD8);
  }

  // Process group with 12 CNs
  M = (1*Z + 63)>>6;
  p_bnProcBuf  = (__m256i*) &bnProcBuf [50688];
  p_llrProcBuf = (__m256i*) &llrProcBuf[13056];
  p_llrRes     = (__m512i*) &llrRes    [13056];
  for (uint32_t i = 0, j = 0; i < M; i++, j += 2) {
    __m512i zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf[j]);
    __m512i zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j + 1]));
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_llrProcBuf[j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_llrProcBuf[j + 1]));
    p_llrRes[i] = _mm512_permutex_epi64(_mm512_packs_epi16(zmmRes0, zmmRes1), 0xD8);
  }

  // Groups with 13 to 30 CNs: nothing to process in this variant
}
_mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 3 CNs + M = (21*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [2688]; + p_llrProcBuf = (__m256i*) &llrProcBuf [1536]; + p_llrRes = (__m512i*) &llrRes [1536]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[252 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[252 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[504 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[504 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 4 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [26880]; + p_llrProcBuf = (__m256i*) &llrProcBuf [9600]; + p_llrRes = (__m512i*) &llrRes [9600]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = 
_mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 5 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [28416]; + p_llrProcBuf = (__m256i*) &llrProcBuf [9984]; + p_llrRes = (__m512i*) &llrRes [9984]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1); + 
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 6 CNs +// Process group with 7 CNs +// Process group with 8 CNs +// Process group with 9 CNs +// Process group with 10 CNs +// Process group with 11 CNs +// Process group with 12 CNs +// Process group with 13 CNs +// Process group with 14 CNs +// Process group with 15 CNs +// Process group with 16 CNs +// Process group with 17 CNs +// Process group with 18 CNs +// Process group with 19 CNs +// Process group with 20 CNs +// Process group with 21 CNs +// Process group with 22 CNs +// Process group with <23 CNs +// Process group with 24 CNs +// Process group with 25 CNs +// Process group with 26 CNs +// Process group with 27 CNs +// Process group with 28 CNs +// Process group with 29 CNs +// Process group with 30 CNs +} diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bn_avx512/nrLDPC_bnProcPc_BG2_R13_AVX512.h b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bn_avx512/nrLDPC_bnProcPc_BG2_R13_AVX512.h new file mode 100644 index 0000000000000000000000000000000000000000..67fc3eea3d3edb3a7aabbe62e6e42f06340c045c --- /dev/null +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bn_avx512/nrLDPC_bnProcPc_BG2_R13_AVX512.h @@ -0,0 +1,441 @@ +static inline void nrLDPC_bnProcPc_BG2_R13_AVX512(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) { + __m512i zmm0,zmm1,zmmRes0,zmmRes1; + __m256i* p_bnProcBuf; + __m256i* p_llrProcBuf; + __m512i* p_llrRes; + uint32_t M ; +// Process group with 1 CNs + M = (18*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [0]; + p_llrProcBuf = (__m256i*) &llrProcBuf [0]; + p_llrRes = (__m512i*) &llrRes [0]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[j+1]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = 
_mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 2 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [6912]; + p_llrProcBuf = (__m256i*) &llrProcBuf [6912]; + p_llrRes = (__m512i*) &llrRes [6912]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 3 CNs +// Process group with 4 CNs + M = (2*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [7680]; + p_llrProcBuf = (__m256i*) &llrProcBuf [7296]; + p_llrRes = (__m512i*) &llrRes [7296]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = 
_mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 5 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [10752]; + p_llrProcBuf = (__m256i*) &llrProcBuf [8064]; + p_llrRes = (__m512i*) &llrRes [8064]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 6 CNs + M = (5*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [12672]; + p_llrProcBuf = (__m256i*) &llrProcBuf [8448]; + p_llrRes = (__m512i*) &llrRes [8448]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = 
_mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[240 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[240 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[300 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[300 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 7 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [24192]; + p_llrProcBuf = (__m256i*) &llrProcBuf [10368]; + p_llrRes = (__m512i*) &llrRes [10368]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + 
zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 8 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [26880]; + p_llrProcBuf = (__m256i*) &llrProcBuf [10752]; + p_llrRes = (__m512i*) &llrRes [10752]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + 
zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 9 CNs +// Process group with 10 CNs +// Process group with 11 CNs +// Process group with 12 CNs +// Process group with 13 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [29952]; + p_llrProcBuf = (__m256i*) &llrProcBuf [11136]; + p_llrRes = (__m512i*) &llrRes [11136]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = 
_mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = 
_mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 14 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [34944]; + p_llrProcBuf = (__m256i*) &llrProcBuf [11520]; + p_llrRes = (__m512i*) &llrRes [11520]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 
= _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[156 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[156 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 15 CNs +// Process group with 16 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [40320]; + p_llrProcBuf = (__m256i*) &llrProcBuf [11904]; + p_llrRes = (__m512i*) &llrRes [11904]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = 
_mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = 
_mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[156 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[156 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[168 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[168 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 17 CNs +// Process group with 18 CNs +// Process group with 19 CNs +// Process group with 20 CNs +// Process group with 21 CNs +// Process group with 22 CNs +// Process group with <23 CNs +// Process group with 24 CNs +// Process group with 25 CNs +// Process group with 26 CNs +// Process group with 27 CNs +// Process group with 28 CNs +// Process group with 29 CNs +// Process group with 30 CNs +} diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bn_avx512/nrLDPC_bnProcPc_BG2_R15_AVX512.h b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bn_avx512/nrLDPC_bnProcPc_BG2_R15_AVX512.h new file mode 100644 index 0000000000000000000000000000000000000000..b6fce03ca4c4d4fca379c89b0f3eec94ca9d9658 --- /dev/null +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/bn_avx512/nrLDPC_bnProcPc_BG2_R15_AVX512.h @@ -0,0 +1,751 @@ +static inline 
void nrLDPC_bnProcPc_BG2_R15_AVX512(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) { + __m512i zmm0,zmm1,zmmRes0,zmmRes1; + __m256i* p_bnProcBuf; + __m256i* p_llrProcBuf; + __m512i* p_llrRes; + uint32_t M ; +// Process group with 1 CNs + M = (38*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [0]; + p_llrProcBuf = (__m256i*) &llrProcBuf [0]; + p_llrRes = (__m512i*) &llrRes [0]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[j+1]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 2 CNs +// Process group with 3 CNs +// Process group with 4 CNs +// Process group with 5 CNs + M = (2*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [14592]; + p_llrProcBuf = (__m256i*) &llrProcBuf [14592]; + p_llrRes = (__m512i*) &llrRes [14592]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = 
_mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 6 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [18432]; + p_llrProcBuf = (__m256i*) &llrProcBuf [15360]; + p_llrRes = (__m512i*) &llrRes [15360]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 7 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = 
(__m256i*) &bnProcBuf [20736]; + p_llrProcBuf = (__m256i*) &llrProcBuf [15744]; + p_llrRes = (__m512i*) &llrRes [15744]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 8 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [23424]; + p_llrProcBuf = (__m256i*) &llrProcBuf [16128]; + p_llrRes = (__m512i*) &llrRes [16128]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf 
[j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 9 CNs + M = (2*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [26496]; + p_llrProcBuf = (__m256i*) &llrProcBuf [16512]; + p_llrRes = (__m512i*) &llrRes [16512]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf 
[j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[168 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[168 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 10 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf 
= (__m256i*) &bnProcBuf [33408]; + p_llrProcBuf = (__m256i*) &llrProcBuf [17280]; + p_llrRes = (__m512i*) &llrRes [17280]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j +1]); + 
zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 11 CNs +// Process group with 12 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [37248]; + p_llrProcBuf = (__m256i*) &llrProcBuf [17664]; + p_llrRes = (__m512i*) &llrRes [17664]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = 
_mm512_cvtepi8_epi16(p_bnProcBuf[84 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 13 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [41856]; + p_llrProcBuf = (__m256i*) &llrProcBuf [18048]; + p_llrRes = (__m512i*) &llrRes [18048]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + 
zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = 
_mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 14 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [46848]; + p_llrProcBuf = (__m256i*) &llrProcBuf [18432]; + p_llrRes = (__m512i*) &llrRes [18432]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 
= _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[156 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[156 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 15 CNs +// Process group with 16 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [52224]; + p_llrProcBuf = (__m256i*) &llrProcBuf [18816]; + p_llrRes = (__m512i*) &llrRes [18816]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = 
_mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = 
_mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[156 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[156 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[168 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[168 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 17 CNs +// Process group with 18 CNs +// Process group with 19 CNs +// Process group with 20 CNs +// Process group with 21 CNs +// Process group with 22 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [58368]; + p_llrProcBuf = (__m256i*) &llrProcBuf [19200]; + p_llrRes = (__m512i*) &llrRes [19200]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = 
_mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[156 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = 
_mm512_cvtepi8_epi16(p_bnProcBuf[156 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[168 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[168 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[204 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[204 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[216 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[216 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[228 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[228 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[240 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[240 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[252 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[252 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j +1 ]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_packs_epi16(zmmRes0,zmmRes1); + p_llrRes[i] = _mm512_permutex_epi64(zmm0, 0xD8); +} +// Process group with 
<23 CNs + M = (1*Z + 63)>>6; + p_bnProcBuf = (__m256i*) &bnProcBuf [66816]; + p_llrProcBuf = (__m256i*) &llrProcBuf [19584]; + p_llrRes = (__m512i*) &llrRes [19584]; + for (int i=0,j=0;i<M;i++,j+=2) { + zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); + zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf [j +1]); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[48 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[60 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[84 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[96 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[108 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = 
_mm512_cvtepi8_epi16(p_bnProcBuf[108 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[120 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[132 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[144 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[156 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[156 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[168 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[168 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[180 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[192 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[204 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[204 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[216 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[216 + j +1]); + zmmRes1 = _mm512_adds_epi16(zmmRes1,zmm1); + zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[228 + j]); + zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); + zmm1 = 
/**
 * Process one group of the bnProcPc step: for every byte position of the
 * group, add up the numCn messages stored in bnProcBuf and the matching byte
 * of llrProcBuf, saturate the sum to 8 bits and store it in llrRes.
 *
 * The arithmetic is done on 16-bit lanes (sign-extended from int8_t) and is
 * narrowed back to int8_t with signed saturation at the very end.
 *
 * @param bnProcBuf  start of the group's messages; read through a __m256i
 *                   pointer, so it must be suitably aligned for that type
 * @param llrProcBuf start of the group's input values; read through a
 *                   __m256i pointer (same alignment requirement)
 * @param llrRes     start of the group's output; written through a __m512i
 *                   pointer, so it must be suitably aligned for that type
 * @param numCn      number of message blocks summed per position (the "N" of
 *                   "group with N CNs"); must be at least 1
 * @param numBn      size multiplier of the group; presumably the number of
 *                   bit nodes in the group -- confirm against the generator
 * @param Z          scales the amount of data processed: the loop handles
 *                   M = (numBn*Z + 63) >> 6 blocks of 64 bytes
 */
static inline void nrLDPC_bnProcPc_BG2_R23_AVX512_group(const int8_t *bnProcBuf,
                                                        const int8_t *llrProcBuf,
                                                        int8_t *llrRes,
                                                        uint32_t numCn,
                                                        uint32_t numBn,
                                                        uint16_t Z)
{
  const __m256i *p_bnProcBuf = (const __m256i *)bnProcBuf;
  const __m256i *p_llrProcBuf = (const __m256i *)llrProcBuf;
  __m512i *p_llrRes = (__m512i *)llrRes;

  // Distance between two consecutive message blocks of the same position,
  // in __m256i units: 12 per unit of numBn, i.e. 384 bytes / 32 bytes.
  // (384 is presumably NR_LDPC_ZMAX -- confirm against the generator.)
  const uint32_t cnOffsetInGroup = numBn * 12u;

  // Number of 64-byte output blocks, rounded up.
  const uint32_t M = (numBn * Z + 63u) >> 6;

  // _mm512_packs_epi16 packs per 128-bit lane, so its result interleaves
  // 8-byte chunks of the two sources: a0 b0 a1 b1 a2 b2 a3 b3.
  // This index vector restores the linear order a0 a1 a2 a3 b0 b1 b2 b3.
  // A permute that only works within 256-bit halves (_mm512_permutex_epi64
  // with 0xD8) cannot do this for 512-bit vectors.
  const __m512i packOrder = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);

  for (uint32_t i = 0, j = 0; i < M; i++, j += 2) {
    // Seed the accumulators with the first message block.
    __m512i zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf[j]);
    __m512i zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);

    // Add the remaining message blocks.
    for (uint32_t k = 1; k < numCn; k++) {
      const __m256i *p_cn = &p_bnProcBuf[k * cnOffsetInGroup + j];
      zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_cn[0]));
      zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_cn[1]));
    }

    // Add the input values.
    zmmRes0 = _mm512_adds_epi16(zmmRes0, _mm512_cvtepi8_epi16(p_llrProcBuf[j]));
    zmmRes1 = _mm512_adds_epi16(zmmRes1, _mm512_cvtepi8_epi16(p_llrProcBuf[j + 1]));

    // Narrow to 8 bits with saturation and put the bytes back in order.
    const __m512i packed = _mm512_packs_epi16(zmmRes0, zmmRes1);
    p_llrRes[i] = _mm512_permutexvar_epi64(packOrder, packed);
  }
}

/**
 * bnProcPc step for the BG2 / R23 configuration, AVX-512 version.
 *
 * For each non-empty group (1 to 6 CNs) the messages in bnProcBuf and the
 * values in llrProcBuf are summed with saturation and written to llrRes.
 * The byte offsets of every group inside the three buffers are fixed
 * constants of this configuration.
 *
 * @param bnProcBuf  input messages (read only); accessed through __m256i
 *                   pointers at the offsets listed below
 * @param llrRes     output buffer; accessed through __m512i pointers
 * @param llrProcBuf input values (read only); accessed through __m256i
 *                   pointers
 * @param Z          scales the number of 64-byte blocks processed per group
 */
static inline void nrLDPC_bnProcPc_BG2_R23_AVX512(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
  // Process group with 1 CNs
  nrLDPC_bnProcPc_BG2_R23_AVX512_group(&bnProcBuf[0], &llrProcBuf[0], &llrRes[0], 1, 3, Z);
  // Process group with 2 CNs
  nrLDPC_bnProcPc_BG2_R23_AVX512_group(&bnProcBuf[1152], &llrProcBuf[1152], &llrRes[1152], 2, 3, Z);
  // Process group with 3 CNs
  nrLDPC_bnProcPc_BG2_R23_AVX512_group(&bnProcBuf[3456], &llrProcBuf[2304], &llrRes[2304], 3, 5, Z);
  // Process group with 4 CNs
  nrLDPC_bnProcPc_BG2_R23_AVX512_group(&bnProcBuf[9216], &llrProcBuf[4224], &llrRes[4224], 4, 3, Z);
  // Process group with 5 CNs
  nrLDPC_bnProcPc_BG2_R23_AVX512_group(&bnProcBuf[13824], &llrProcBuf[5376], &llrRes[5376], 5, 2, Z);
  // Process group with 6 CNs
  nrLDPC_bnProcPc_BG2_R23_AVX512_group(&bnProcBuf[17664], &llrProcBuf[6144], &llrRes[6144], 6, 1, Z);
  // Groups with 7 to 30 CNs contain no processing for this configuration.
}