Commit b1e28edd authored by Florian Kaltenberger's avatar Florian Kaltenberger

Merge branch 'enhancement-43-AVX2' into bugfix-48-L1L2signaling

corrected some additional FFT related issues from AVX2 merge when running with real-time MODEM.

Conflicts:
	openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
parents 801e343c c0f6881c
......@@ -174,7 +174,7 @@ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath -Wl,${OPENAIR_
# these changes are related to hardcoded path to include .h files
add_definitions(-DCMAKER)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O2")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O3")
set(GIT_BRANCH "UNKNOWN")
......@@ -949,6 +949,7 @@ set(PHY_SRC
${OPENAIR1_DIR}/PHY/CODING/crc_byte.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
${OPENAIR1_DIR}/PHY/CODING/lte_rate_matching.c
${OPENAIR1_DIR}/PHY/CODING/rate_matching.c
${OPENAIR1_DIR}/PHY/CODING/viterbi.c
......
......@@ -37,6 +37,7 @@
#include "extern_3GPPinterleaver.h"
#else
#include "vars.h"
#include <stdint.h>
#endif
#include <stdio.h>
#include <string.h>
......@@ -48,6 +49,7 @@
#define print_shorts(s,x) printf("%s %x,%x,%x,%x,%x,%x,%x,%x\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7])
#define print_ints(s,x) printf("%s %x %x %x %x\n",s,(x)[0],(x)[1],(x)[2],(x)[3])
#define print_bytes2(s,x) printf("%s %x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7],(x)[8],(x)[9],(x)[10],(x)[11],(x)[12],(x)[13],(x)[14],(x)[15],(x)[16],(x)[17],(x)[18],(x)[19],(x)[20],(x)[21],(x)[22],(x)[23],(x)[24],(x)[25],(x)[26],(x)[27],(x)[28],(x)[29],(x)[30],(x)[31])
//#define DEBUG_TURBO_ENCODER 1
#define CALLGRIND 1
......@@ -57,16 +59,12 @@ unsigned long long threegpplte_interleaver_tmp;
#if defined(__x86_64__) || defined(__i386__)
struct treillis {
union {
__m64 systematic_64[3];
char systematic_8[24];
};
union {
__m64 parity1_64[3];
char parity1_8[24];
__m64 systematic_andp1_64[3];
uint8_t systematic_andp1_8[24];
};
union {
__m64 parity2_64[3];
char parity2_8[24];
uint8_t parity2_8[24];
};
int exit_state;
} __attribute__ ((aligned(64)));
......@@ -75,12 +73,8 @@ struct treillis {
struct treillis {
union {
uint8x8_t systematic_64[3];
char systematic_8[24];
}__attribute__((aligned(64)));
union {
uint8x8_t parity1_64[3];
char parity1_8[24];
uint8x8_t systematic_andp1_64[3];
char systematic_andp1_8[24];
}__attribute__((aligned(64)));
union {
uint8x8_t parity2_64[3];
......@@ -91,6 +85,7 @@ struct treillis {
#endif
struct treillis all_treillis[8][256];
int all_treillis_initialized=0;
static inline unsigned char threegpplte_rsc(unsigned char input,unsigned char *state)
......@@ -116,18 +111,20 @@ void treillis_table_init(void)
unsigned char v, current_state;
// clear all_treillis
for (i=0; i<8; i++)
for (i=0; i<8; i++) {
bzero( all_treillis[i], sizeof(all_treillis[0]) );
}
for (i=0; i<8; i++) { //all possible initial states
for (j=0; j<=255; j++) { // all possible values of a byte
current_state=i;
for (b=0; b<8 ; b++ ) { // pre-compute the image of the byte j in _m128i vector right place
all_treillis[i][j].systematic_8[b*3]= (j&(1<<(7-b)))>>(7-b);
v=threegpplte_rsc( all_treillis[i][j].systematic_8[b*3] ,
all_treillis[i][j].systematic_andp1_8[b*3]= (j&(1<<(7-b)))>>(7-b);
v=threegpplte_rsc( all_treillis[i][j].systematic_andp1_8[b*3] ,
&current_state);
all_treillis[i][j].parity1_8[b*3+1]=v; // for the yparity1
all_treillis[i][j].systematic_andp1_8[b*3+1]=v; // for the yparity1
// all_treillis[i][j].parity1_8[b*3+1]=v; // for the yparity1
all_treillis[i][j].parity2_8[b*3+2]=v; // for the yparity2
}
......@@ -143,9 +140,10 @@ void treillis_table_init(void)
char interleave_compact_byte(short * base_interleaver,unsigned char * input, unsigned char * output, int n)
{
char expandInput[768*8] __attribute__((aligned(16)));
char expandInput[768*8] __attribute__((aligned(32)));
int i,loop=n>>4;
#if defined(__x86_64__) || defined(__i386__)
#ifndef __AVX2__
__m128i *i_128=(__m128i *)input, *o_128=(__m128i*)expandInput;
__m128i tmp1, tmp2, tmp3, tmp4;
__m128i BIT_MASK = _mm_set_epi8( 0b00000001,
......@@ -164,6 +162,43 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
0b00100000,
0b01000000,
0b10000000);
#else
__m256i *i_256=(__m256i *)input, *o_256=(__m256i*)expandInput;
__m256i tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
__m256i BIT_MASK = _mm256_set_epi8( 0b00000001,
0b00000010,
0b00000100,
0b00001000,
0b00010000,
0b00100000,
0b01000000,
0b10000000,
0b00000001,
0b00000010,
0b00000100,
0b00001000,
0b00010000,
0b00100000,
0b01000000,
0b10000000,
0b00000001,
0b00000010,
0b00000100,
0b00001000,
0b00010000,
0b00100000,
0b01000000,
0b10000000,
0b00000001,
0b00000010,
0b00000100,
0b00001000,
0b00010000,
0b00100000,
0b01000000,
0b10000000);
#endif
#elif defined(__arm__)
uint8x16_t *i_128=(uint8x16_t *)input, *o_128=(uint8x16_t *)expandInput;
uint8x16_t tmp1,tmp2;
......@@ -187,46 +222,126 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
0b00000010,
0b00000001};
#endif
#ifndef __AVX2__
if ((n&15) > 0)
loop++;
#else
loop=n>>5;
if ((n&31) > 0)
loop++;
#endif
for (i=0; i<loop ; i++ ) {
/* int cur_byte=i<<3; */
/* for (b=0;b<8;b++) */
/* expandInput[cur_byte+b] = (input[i]&(1<<(7-b)))>>(7-b); */
// int cur_byte=i<<3;
// for (b=0;b<8;b++)
// expandInput[cur_byte+b] = (input[i]&(1<<(7-b)))>>(7-b);
#if defined(__x86_64__) || defined(__i386__)
tmp1=_mm_load_si128(i_128++);
tmp2=_mm_unpacklo_epi8(tmp1,tmp1);
tmp3=_mm_unpacklo_epi16(tmp2,tmp2);
tmp4=_mm_unpacklo_epi32(tmp3,tmp3);
#ifndef __AVX2__
tmp1=_mm_load_si128(i_128++); // tmp1 = B0,B1,...,B15
tmp2=_mm_unpacklo_epi8(tmp1,tmp1); // tmp2 = B0,B0,B1,B1,...,B7,B7
tmp3=_mm_unpacklo_epi16(tmp2,tmp2); // tmp3 = B0,B0,B0,B0,B1,B1,B1,B1,B2,B2,B2,B2,B3,B3,B3,B3
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); // tmp4 - B0,B0,B0,B0,B0,B0,B0,B0,B1,B1,B1,B1,B1,B1,B1,B1
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);
tmp4=_mm_unpackhi_epi32(tmp3,tmp3);
tmp4=_mm_unpackhi_epi32(tmp3,tmp3); // tmp4 - B2,B2,B2,B2,B2,B2,B2,B2,B3,B3,B3,B3,B3,B3,B3,B3
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp3=_mm_unpackhi_epi16(tmp2,tmp2);
tmp4=_mm_unpacklo_epi32(tmp3,tmp3);
tmp3=_mm_unpackhi_epi16(tmp2,tmp2); // tmp3 = B4,B4,B4,B4,B5,B5,B5,B5,B6,B6,B6,B6,B7,B7,B7,B7
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); // tmp4 - B4,B4,B4,B4,B4,B4,B4,B4,B5,B5,B5,B5,B5,B5,B5,B5
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp4=_mm_unpackhi_epi32(tmp3,tmp3);
tmp4=_mm_unpackhi_epi32(tmp3,tmp3); // tmp4 - B6,B6,B6,B6,B6,B6,B6,B6,B7,B7,B7,B7,B7,B7,B7,B7
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp2=_mm_unpackhi_epi8(tmp1,tmp1);
tmp3=_mm_unpacklo_epi16(tmp2,tmp2);
tmp4=_mm_unpacklo_epi32(tmp3,tmp3);
tmp2=_mm_unpackhi_epi8(tmp1,tmp1); // tmp2 = B8,B8,B9,B9,...,B15,B15
tmp3=_mm_unpacklo_epi16(tmp2,tmp2); // tmp3 = B8,B8,B8,B8,B9,B9,B9,B9,B10,B10,B10,B10,B11,B11,B11,B11
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); // tmp4 = B8,B8,B8,B8,B8,B8,B8,B8,B9,B9,B9,B9,B9,B9,B9,B9
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp4=_mm_unpackhi_epi32(tmp3,tmp3);
tmp4=_mm_unpackhi_epi32(tmp3,tmp3); // tmp4 = B10,B10,B10,B10,B10,B10,B10,B10,B11,B11,B11,B11,B11,B11,B11,B11
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp3=_mm_unpackhi_epi16(tmp2,tmp2);
tmp4=_mm_unpacklo_epi32(tmp3,tmp3);
tmp3=_mm_unpackhi_epi16(tmp2,tmp2); // tmp3 = B12,B12,B12,B12,B13,B13,B13,B13,B14,B14,B14,B14,B15,B15,B15,B15
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); // tmp4 = B12,B12,B12,B12,B12,B12,B12,B12,B13,B13,B13,B13,B13,B13,B13,B13
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp4=_mm_unpackhi_epi32(tmp3,tmp3);
tmp4=_mm_unpackhi_epi32(tmp3,tmp3); // tmp4 = B14,B14,B14,B14,B14,B14,B14,B14,B15,B15,B15,B15,B15,B15,B15,B15
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
#else
tmp1=_mm256_load_si256(i_256++); // tmp1 = B0,B1,...,B15,...,B31
//print_bytes2("in",(uint8_t*)&tmp1);
tmp2=_mm256_unpacklo_epi8(tmp1,tmp1); // tmp2 = B0,B0,B1,B1,...,B7,B7,B16,B16,B17,B17,...,B23,B23
tmp3=_mm256_unpacklo_epi16(tmp2,tmp2); // tmp3 = B0,B0,B0,B0,B1,B1,B1,B1,B2,B2,B2,B2,B3,B3,B3,B3,B16,B16,B16,B16,...,B19,B19,B19,B19
tmp4=_mm256_unpacklo_epi32(tmp3,tmp3); // tmp4 - B0,B0,B0,B0,B0,B0,B0,B0,B1,B1,B1,B1,B1,B1,B1,B1,B16,B16...,B17..,B17
tmp5=_mm256_unpackhi_epi32(tmp3,tmp3); // tmp5 - B2,B2,B2,B2,B2,B2,B2,B2,B3,B3,B3,B3,B3,B3,B3,B3,B18...,B18,B19,...,B19
tmp6=_mm256_insertf128_si256(tmp4,_mm256_extracti128_si256(tmp5,0),1); // tmp6 = B0 B1 B2 B3
tmp7=_mm256_insertf128_si256(tmp5,_mm256_extracti128_si256(tmp4,1),0); // tmp7 = B16 B17 B18 B19
//print_bytes2("tmp2",(uint8_t*)&tmp2);
//print_bytes2("tmp3",(uint8_t*)&tmp3);
//print_bytes2("tmp4",(uint8_t*)&tmp4);
//print_bytes2("tmp5",(uint8_t*)&tmp4);
//print_bytes2("tmp6",(uint8_t*)&tmp6);
//print_bytes2("tmp7",(uint8_t*)&tmp7);
o_256[0]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp6,BIT_MASK),BIT_MASK);
//print_bytes2("out",(uint8_t*)o_256);
o_256[4]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp7,BIT_MASK),BIT_MASK);;
//print_bytes2("out",(uint8_t*)(o_256+4));
tmp3=_mm256_unpackhi_epi16(tmp2,tmp2); // tmp3 = B4,B4,B4,B4,B5,B5,B5,B5,B6,B6,B6,B6,B7,B7,B7,B7,B20,B20,B20,B20,...,B23,B23,B23,B23
tmp4=_mm256_unpacklo_epi32(tmp3,tmp3); // tmp4 - B4,B4,B4,B4,B4,B4,B4,B4,B5,B5,B5,B5,B5,B5,B5,B5,B20,B20...,B21..,B21
tmp5=_mm256_unpackhi_epi32(tmp3,tmp3); // tmp5 - B6,B6,B6,B6,B6,B6,B6,B6,B7,B7,B7,B7,B7,B7,B7,B7,B22...,B22,B23,...,B23
tmp6=_mm256_insertf128_si256(tmp4,_mm256_extracti128_si256(tmp5,0),1); // tmp6 = B4 B5 B6 B7
tmp7=_mm256_insertf128_si256(tmp5,_mm256_extracti128_si256(tmp4,1),0); // tmp7 = B20 B21 B22 B23
//print_bytes2("tmp2",(uint8_t*)&tmp2);
//print_bytes2("tmp3",(uint8_t*)&tmp3);
//print_bytes2("tmp4",(uint8_t*)&tmp4);
//print_bytes2("tmp5",(uint8_t*)&tmp4);
//print_bytes2("tmp6",(uint8_t*)&tmp6);
//print_bytes2("tmp7",(uint8_t*)&tmp7);
o_256[1]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp6,BIT_MASK),BIT_MASK);
//print_bytes2("out",(uint8_t*)(o_256+1));
o_256[5]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp7,BIT_MASK),BIT_MASK);;
//print_bytes2("out",(uint8_t*)(o_256+4));
tmp2=_mm256_unpackhi_epi8(tmp1,tmp1); // tmp2 = B8 B9 B10 B11 B12 B13 B14 B15 B25 B26 B27 B28 B29 B30 B31
tmp3=_mm256_unpacklo_epi16(tmp2,tmp2); // tmp3 = B8,B9,B10,B11,B26,B27,B28,B29
tmp4=_mm256_unpacklo_epi32(tmp3,tmp3); // tmp4 - B8,B9,B26,B27
tmp5=_mm256_unpackhi_epi32(tmp3,tmp3); // tmp5 - B10,B11,B28,B29
tmp6=_mm256_insertf128_si256(tmp4,_mm256_extracti128_si256(tmp5,0),1); // tmp6 = B8 B9 B10 B11
tmp7=_mm256_insertf128_si256(tmp5,_mm256_extracti128_si256(tmp4,1),0); // tmp7 = B26 B27 B28 B29
//print_bytes2("tmp2",(uint8_t*)&tmp2);
//print_bytes2("tmp3",(uint8_t*)&tmp3);
//print_bytes2("tmp4",(uint8_t*)&tmp4);
//print_bytes2("tmp5",(uint8_t*)&tmp4);
//print_bytes2("tmp6",(uint8_t*)&tmp6);
//print_bytes2("tmp7",(uint8_t*)&tmp7);
o_256[2]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp6,BIT_MASK),BIT_MASK);
//print_bytes2("out",(uint8_t*)(o_256+2));
o_256[6]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp7,BIT_MASK),BIT_MASK);;
//print_bytes2("out",(uint8_t*)(o_256+4));
tmp3=_mm256_unpackhi_epi16(tmp2,tmp2); // tmp3 = B12 B13 B14 B15 B28 B29 B30 B31
tmp4=_mm256_unpacklo_epi32(tmp3,tmp3); // tmp4 = B12 B13 B28 B29
tmp5=_mm256_unpackhi_epi32(tmp3,tmp3); // tmp5 = B14 B15 B30 B31
tmp6=_mm256_insertf128_si256(tmp4,_mm256_extracti128_si256(tmp5,0),1); // tmp6 = B12 B13 B14 B15
tmp7=_mm256_insertf128_si256(tmp5,_mm256_extracti128_si256(tmp4,1),0); // tmp7 = B28 B29 B30 B31
//print_bytes2("tmp2",(uint8_t*)&tmp2);
//print_bytes2("tmp3",(uint8_t*)&tmp3);
//print_bytes2("tmp4",(uint8_t*)&tmp4);
//print_bytes2("tmp5",(uint8_t*)&tmp4);
//print_bytes2("tmp6",(uint8_t*)&tmp6);
//print_bytes2("tmp7",(uint8_t*)&tmp7);
o_256[3]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp6,BIT_MASK),BIT_MASK);
//print_bytes2("out",(uint8_t*)(o_256+3));
o_256[7]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp7,BIT_MASK),BIT_MASK);;
//print_bytes2("out",(uint8_t*)(o_256+7));
o_256+=8;
#endif
#elif defined(__arm__)
tmp1=vld1q_u8((uint8_t*)i_128);
//print_bytes("tmp1:",(uint8_t*)&tmp1);
......@@ -302,11 +417,17 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
i_128++;
#endif
}
short * ptr_intl=base_interleaver;
#if defined(__x86_64) || defined(__i386__)
#ifndef __AVX2__
__m128i tmp;
uint16_t *systematic2_ptr=(unsigned short *) output;
uint16_t *systematic2_ptr=(uint16_t *) output;
#else
__m256i tmp;
uint32_t *systematic2_ptr=(uint32_t *) output;
#endif
#elif defined(__arm__)
uint8x16_t tmp;
const uint8_t __attribute__ ((aligned (16))) _Powers[16]=
......@@ -316,11 +437,15 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
uint8x16_t Powers= vld1q_u8(_Powers);
uint8_t *systematic2_ptr=(uint8_t *) output;
#endif
#ifndef __AVX2__
int input_length_words=n>>1;
#else
int input_length_words=n>>2;
#endif
for ( i=0; i< input_length_words ; i ++ ) {
#if defined(__x86_64__) || defined(__i386__)
#ifndef __AVX2__
tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],7);
tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],6);
tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],5);
......@@ -338,6 +463,45 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+1);
tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+0);
*systematic2_ptr++=(unsigned short)_mm_movemask_epi8(tmp);
#else
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],7);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],6);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],5);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],4);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],3);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],2);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],1);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],0);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+7);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+6);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+5);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+4);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+3);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+2);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+1);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+0);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+7);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+6);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+5);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+4);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+3);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+2);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+1);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+0);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+7);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+6);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+5);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+4);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+3);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+2);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+1);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+0);
*systematic2_ptr++=(unsigned int)_mm256_movemask_epi8(tmp);
#endif
#elif defined(__arm__)
tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,7);
tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,6);
......@@ -391,8 +555,9 @@ void threegpplte_turbo_encoder(unsigned char *input,
unsigned short input_length_bits = input_length_bytes<<3;
short * base_interleaver;
if ( all_treillis_initialized == 0 )
if ( all_treillis_initialized == 0 ) {
treillis_table_init();
}
// look for f1 and f2 precomputed interleaver values
for (i=0; i < 188 && f1f2mat[i].nb_bits != input_length_bits; i++);
......@@ -405,7 +570,7 @@ void threegpplte_turbo_encoder(unsigned char *input,
}
unsigned char systematic2[768];
unsigned char systematic2[768] __attribute__((aligned(32)));
interleave_compact_byte(base_interleaver,input,systematic2,input_length_bytes);
#if defined(__x86_64__) || defined(__i386__)
......@@ -419,22 +584,26 @@ void threegpplte_turbo_encoder(unsigned char *input,
for ( state0=state1=i=0 ; i<input_length_bytes; i++ ) {
cur_s1=input[i];
cur_s2=systematic2[i];
for ( code_rate=0; code_rate<3; code_rate++) {
#if defined(__x86_64__) || defined(__i386__)
*ptr_output++ = _mm_add_pi8(all_treillis[state0][cur_s1].systematic_64[code_rate],
_mm_add_pi8(all_treillis[state0][cur_s1].parity1_64[code_rate],
all_treillis[state1][cur_s2].parity2_64[code_rate]));
/*
*ptr_output++ = _mm_add_pi8(all_treillis[state0][cur_s1].systematic_64[code_rate],
_mm_add_pi8(all_treillis[state0][cur_s1].parity1_64[code_rate],
all_treillis[state1][cur_s2].parity2_64[code_rate]));
*/
*ptr_output++ = _mm_add_pi8(all_treillis[state0][cur_s1].systematic_andp1_64[code_rate],
all_treillis[state1][cur_s2].parity2_64[code_rate]);
#elif defined(__arm__)
uint8x8_t ptmp = vadd_u8(all_treillis[state0][cur_s1].parity1_64[code_rate],
all_treillis[state1][cur_s2].parity2_64[code_rate]);
*ptr_output++ = vadd_u8(all_treillis[state0][cur_s1].systematic_64[code_rate],
ptmp);
*ptr_output++ = vadd_u8(all_treillis[state0][cur_s1].systematic_andp1_64[code_rate],
all_treillis[state0][cur_s1].parity1_64[code_rate]);
#endif
}
state0=all_treillis[state0][cur_s1].exit_state;
state1=all_treillis[state1][cur_s2].exit_state;
}
state0=all_treillis[state0][cur_s1].exit_state;
state1=all_treillis[state1][cur_s2].exit_state;
}
x=output+(input_length_bits*3);
......@@ -485,7 +654,7 @@ void threegpplte_turbo_encoder(unsigned char *input,
int main(int argc,char **argv)
{
unsigned char input[INPUT_LENGTH+16],state,state2;
unsigned char input[INPUT_LENGTH+32],state,state2;
unsigned char output[12+(3*(INPUT_LENGTH<<3))],x,z;
int i;
unsigned char out;
......@@ -510,7 +679,7 @@ int main(int argc,char **argv)
memset((void*)input,0,INPUT_LENGTH+16);
for (i=0; i<INPUT_LENGTH; i++) {
input[i] = i*219;
printf("Input %d : %x\n",i,input[i]);
printf("Input %d : %d\n",i,input[i]);
}
threegpplte_turbo_encoder(&input[0],
......
/*******************************************************************************
OpenAirInterface
Copyright(c) 1999 - 2014 Eurecom
OpenAirInterface is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OpenAirInterface is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OpenAirInterface.The full GNU General Public License is
included in this distribution in the file called "COPYING". If not,
see <http://www.gnu.org/licenses/>.
Contact Information
OpenAirInterface Admin: openair_admin@eurecom.fr
OpenAirInterface Tech : openair_tech@eurecom.fr
OpenAirInterface Dev : openair4g-devel@lists.eurecom.fr
Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE
*******************************************************************************/
/* file: 3gpplte_turbo_decoder_sse_16bit.c
purpose: Routines for implementing max-logmap decoding of Turbo-coded (DLSCH) transport channels from 36-212, V8.6 2009-03
authors: raymond.knopp@eurecom.fr, Laurent Thomas (Alcatel-Lucent)
date: 21.10.2009
Note: This version of the routine currently requires SSE2,SSSE3 and SSE4.1 equipped computers. It uses 16-bit inputs for
LLRS and uses 16-bit arithmetic for the internal computations!
Changelog: 17.11.2009 FK SSE4.1 not required anymore
Aug. 2012 new parallelization options for higher speed (8-way parallelization)
Jan. 2013 8-bit LLR support with 16-way parallelization
Feb. 2013 New interleaving and hard-decision optimizations (L. Thomas)
May 2013 Extracted 16bit code
*/
///
///
#ifdef __AVX2__
#include "PHY/sse_intrin.h"
#ifndef TEST_DEBUG
#include "PHY/defs.h"
#include "PHY/CODING/defs.h"
#include "PHY/CODING/lte_interleaver_inline.h"
#include "extern_3GPPinterleaver.h"
#else
#include "defs.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#endif
#ifdef MEX
#include "mex.h"
#endif
//#define DEBUG_LOGMAP
#ifdef DEBUG_LOGMAP
#define print_shorts(s,x) fprintf(fdavx2,"%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7]);fprintf(fdavx2b,"%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[8],(x)[9],(x)[10],(x)[11],(x)[12],(x)[13],(x)[14],(x)[15])
FILE *fdavx2,*fdavx2b;
#else
#endif
#define print_bytes(s,x) printf("%s %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7],(x)[8],(x)[9],(x)[10],(x)[11],(x)[12],(x)[13],(x)[14],(x)[15],(x)[16],(x)[17],(x)[18],(x)[19],(x)[20],(x)[21],(x)[22],(x)[23],(x)[24],(x)[25],(x)[26],(x)[27],(x)[28],(x)[29],(x)[30],(x)[31])
typedef int16_t llr_t; // internal decoder LLR data is 16-bit fixed
typedef int16_t channel_t;
#define MAX 256
void log_map16avx2(llr_t* systematic,channel_t* y_parity, llr_t* m11, llr_t* m10, llr_t *alpha, llr_t *beta, llr_t* ext,uint16_t frame_length,unsigned char term_flag,unsigned char F,int offset8_flag,time_stats_t *alpha_stats,time_stats_t *beta_stats,time_stats_t *gamma_stats,time_stats_t *ext_stats);
void compute_gamma16avx2(llr_t* m11,llr_t* m10,llr_t* systematic, channel_t* y_parity, uint16_t frame_length,unsigned char term_flag);
void compute_alpha16avx2(llr_t*alpha,llr_t *beta, llr_t* m11,llr_t* m10, uint16_t frame_length,unsigned char F);
void compute_beta16avx2(llr_t*alpha, llr_t* beta,llr_t* m11,llr_t* m10, uint16_t frame_length,unsigned char F,int offset8_flag);
void compute_ext16avx2(llr_t* alpha,llr_t* beta,llr_t* m11,llr_t* m10,llr_t* extrinsic, llr_t* ap, uint16_t frame_length);
void log_map16avx2(llr_t* systematic,
channel_t* y_parity,
llr_t* m11,
llr_t* m10,
llr_t *alpha,
llr_t *beta,
llr_t* ext,
uint16_t frame_length,
unsigned char term_flag,
unsigned char F,
int offset8_flag,
time_stats_t *alpha_stats,
time_stats_t *beta_stats,
time_stats_t *gamma_stats,
time_stats_t *ext_stats)
{
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"log_map (avx2_16bit), frame_length %d\n",frame_length);
fprintf(fdavx2b,"log_map (avx2_16bit), frame_length %d\n",frame_length);
#endif
start_meas(gamma_stats) ;
compute_gamma16avx2(m11,m10,systematic,y_parity,frame_length,term_flag) ;
stop_meas(gamma_stats);
start_meas(alpha_stats) ;
compute_alpha16avx2(alpha,beta,m11,m10,frame_length,F) ;
stop_meas(alpha_stats);
start_meas(beta_stats) ;
compute_beta16avx2(alpha,beta,m11,m10,frame_length,F,offset8_flag) ;
stop_meas(beta_stats);
start_meas(ext_stats) ;
compute_ext16avx2(alpha,beta,m11,m10,ext,systematic,frame_length) ;
stop_meas(ext_stats);
}
void compute_gamma16avx2(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
uint16_t frame_length,unsigned char term_flag)
{
int k,K1;
__m256i *systematic128 = (__m256i *)systematic;
__m256i *y_parity128 = (__m256i *)y_parity;
__m256i *m10_128 = (__m256i *)m10;
__m256i *m11_128 = (__m256i *)m11;
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"compute_gamma (avx2_16bit), %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
fprintf(fdavx2b,"compute_gamma (avx2_16bit), %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
#endif
K1=frame_length>>3;
for (k=0; k<K1; k++) {
m11_128[k] = _mm256_srai_epi16(_mm256_adds_epi16(systematic128[k],y_parity128[k]),1);
m10_128[k] = _mm256_srai_epi16(_mm256_subs_epi16(systematic128[k],y_parity128[k]),1);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Loop index k %d\n",k);
fprintf(fdavx2b,"Loop index k %d\n",k);
print_shorts("sys",(int16_t*)&systematic128[k]);
print_shorts("yp",(int16_t*)&y_parity128[k]);
print_shorts("m11",(int16_t*)&m11_128[k]);
print_shorts("m10",(int16_t*)&m10_128[k]);
#endif
}
// Termination
m11_128[k] = _mm256_srai_epi16(_mm256_adds_epi16(systematic128[k+term_flag],y_parity128[k]),1);
m10_128[k] = _mm256_srai_epi16(_mm256_subs_epi16(systematic128[k+term_flag],y_parity128[k]),1);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Loop index k %d (term flag %d)\n",k,term_flag);
fprintf(fdavx2b,"Loop index k %d (term flag %d)\n",k,term_flag);
print_shorts("sys",(int16_t*)&systematic128[k+term_flag]);
print_shorts("yp",(int16_t*)&y_parity128[k]);
print_shorts("m11",(int16_t*)&m11_128[k]);
print_shorts("m10",(int16_t*)&m10_128[k]);
#endif
}
#define L 40
void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16_t frame_length,unsigned char F)
{
int k,l,l2,K1,rerun_flag=0;
__m256i *alpha128=(__m256i *)alpha,*alpha_ptr;
__m256i a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p;
__m256i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
__m256i new0,new1,new2,new3,new4,new5,new6,new7;
__m256i alpha_max;
unsigned long long timein,timeout;
l2 = L>>3;
K1 = (frame_length>>3);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Compute alpha (avx2_16bit)\n");
fprintf(fdavx2b,"Compute alpha (avx2_16bit)\n");
#endif
timein = rdtsc_oai();
for (l=K1;; l=l2,rerun_flag=1) {
alpha128 = (__m256i *)alpha;
if (rerun_flag == 0) {
alpha128[0] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,0,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,0);
alpha128[1] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
alpha128[2] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
alpha128[3] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
alpha128[4] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
alpha128[5] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
alpha128[6] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
alpha128[7] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Initial alpha\n");
fprintf(fdavx2b,"Initial alpha\n");
print_shorts("a0",(int16_t*)&alpha128[0]);
print_shorts("a1",(int16_t*)&alpha128[1]);
print_shorts("a2",(int16_t*)&alpha128[2]);
print_shorts("a3",(int16_t*)&alpha128[3]);
print_shorts("a4",(int16_t*)&alpha128[4]);
print_shorts("a5",(int16_t*)&alpha128[5]);
print_shorts("a6",(int16_t*)&alpha128[6]);
print_shorts("a7",(int16_t*)&alpha128[7]);
#endif
} else {
//set initial alpha in columns 1-7 from final alpha from last run in columns 0-6
alpha128[0] = _mm256_slli_si256(alpha128[frame_length],