Commit 015187e5 authored by knopp's avatar knopp
Browse files

AVX2 optimization activated for gamma computation in 16-bit turbo decoder (single-codeword)

parent 8d4405bd
......@@ -134,7 +134,7 @@ else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2")
endif()
if (CPUINFO MATCHES "sse4_2")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2 -msse4.2")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2 -msse4.2 -fno-tree-vectorize")
endif()
if (CPUINFO MATCHES "sse4_1")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.1")
......
......@@ -223,6 +223,7 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
0b00000001};
#endif
#ifndef __AVX2__
if ((n&15) > 0)
loop++;
......
......@@ -186,12 +186,16 @@ void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16
__m256i new0,new1,new2,new3,new4,new5,new6,new7;
__m256i alpha_max;
unsigned long long timein,timeout;
l2 = L>>3;
K1 = (frame_length>>3);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Compute alpha (avx2_16bit)\n");
fprintf(fdavx2b,"Compute alpha (avx2_16bit)\n");
#endif
timein = rdtsc_oai();
for (l=K1;; l=l2,rerun_flag=1) {
alpha128 = (__m256i *)alpha;
......@@ -378,6 +382,9 @@ void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16
if (rerun_flag==1)
break;
}
timeout = rdtsc_oai();
printf("alpha: inner loop time %llu\n",timeout-timein);
}
......@@ -386,9 +393,10 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
int k,rerun_flag=0;
__m256i m11_128,m10_128;
__m256i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
__m256i new0,new1,new2,new3,new4,new5,new6,new7;
__m256i *m11p,*m10p;
register __m256i b0,b1,b2,b3,b4,b5,b6,b7;
register __m256i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
register __m256i new0,new1,new2,new3,new4,new5,new6,new7;
__m256i *beta128,*alpha128,*beta_ptr;
__m256i beta_max;
......@@ -398,6 +406,8 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
llr_t beta0,beta1;
llr_t beta0_cw2,beta1_cw2;
unsigned long long timein,timeout;
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"compute_beta (avx2_16bit), %p,%p,%p,%p,framelength %d,F %d\n",
beta,m_11,m_10,alpha,frame_length,F);
......@@ -590,56 +600,74 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
#endif
int loopval=((rerun_flag==0)?0:((frame_length-L)>>3));
printf("beta: rerun %d => loopval %d\n",rerun_flag,loopval);
timein = rdtsc_oai();
m11p = (frame_length>>3)-1+(__m256i*)m_11;
m10p = (frame_length>>3)-1+(__m256i*)m_10;
for (k=(frame_length>>3)-1; k>=loopval; k--) {
m11_128=((__m256i*)m_11)[k];
m10_128=((__m256i*)m_10)[k];
m_b0 = _mm256_adds_epi16(beta_ptr[4],m11_128); //m11
m_b1 = _mm256_subs_epi16(beta_ptr[4],m11_128); //m00
m_b2 = _mm256_subs_epi16(beta_ptr[5],m10_128); //m01
m_b3 = _mm256_adds_epi16(beta_ptr[5],m10_128); //m10
m_b4 = _mm256_adds_epi16(beta_ptr[6],m10_128); //m10
m_b5 = _mm256_subs_epi16(beta_ptr[6],m10_128); //m01
m_b6 = _mm256_subs_epi16(beta_ptr[7],m11_128); //m00
m_b7 = _mm256_adds_epi16(beta_ptr[7],m11_128); //m11
new0 = _mm256_subs_epi16(beta_ptr[0],m11_128); //m00
new1 = _mm256_adds_epi16(beta_ptr[0],m11_128); //m11
new2 = _mm256_adds_epi16(beta_ptr[1],m10_128); //m10
new3 = _mm256_subs_epi16(beta_ptr[1],m10_128); //m01
new4 = _mm256_subs_epi16(beta_ptr[2],m10_128); //m01
new5 = _mm256_adds_epi16(beta_ptr[2],m10_128); //m10
new6 = _mm256_adds_epi16(beta_ptr[3],m11_128); //m11
new7 = _mm256_subs_epi16(beta_ptr[3],m11_128); //m00
b4 = _mm256_load_si256(&beta_ptr[4]);
b5 = _mm256_load_si256(&beta_ptr[5]);
b6 = _mm256_load_si256(&beta_ptr[6]);
b7 = _mm256_load_si256(&beta_ptr[7]);
m_b0 = _mm256_adds_epi16(b4,*m11p); //m11
m_b1 = _mm256_subs_epi16(b4,*m11p); //m00
m_b2 = _mm256_subs_epi16(b5,*m10p); //m01
m_b3 = _mm256_adds_epi16(b5,*m10p); //m10
m_b4 = _mm256_adds_epi16(b6,*m10p); //m10
m_b5 = _mm256_subs_epi16(b6,*m10p); //m01
m_b6 = _mm256_subs_epi16(b7,*m11p); //m00
m_b7 = _mm256_adds_epi16(b7,*m11p); //m11
b0 = _mm256_load_si256(&beta_ptr[0]);
b1 = _mm256_load_si256(&beta_ptr[1]);
b2 = _mm256_load_si256(&beta_ptr[2]);
b3 = _mm256_load_si256(&beta_ptr[3]);
new0 = _mm256_subs_epi16(b0,*m11p); //m00
new1 = _mm256_adds_epi16(b0,*m11p); //m11
new2 = _mm256_adds_epi16(b1,*m10p); //m10
new3 = _mm256_subs_epi16(b1,*m10p); //m01
new4 = _mm256_subs_epi16(b2,*m10p); //m01
new5 = _mm256_adds_epi16(b2,*m10p); //m10
new6 = _mm256_adds_epi16(b3,*m11p); //m11
new7 = _mm256_subs_epi16(b3,*m11p); //m00
b0 = _mm256_max_epi16(m_b0,new0);
b1 = _mm256_max_epi16(m_b1,new1);
b2 = _mm256_max_epi16(m_b2,new2);
b3 = _mm256_max_epi16(m_b3,new3);
b4 = _mm256_max_epi16(m_b4,new4);
b5 = _mm256_max_epi16(m_b5,new5);
b6 = _mm256_max_epi16(m_b6,new6);
b7 = _mm256_max_epi16(m_b7,new7);
beta_max = _mm256_max_epi16(b0,b1);
beta_max = _mm256_max_epi16(beta_max ,b2);
beta_max = _mm256_max_epi16(beta_max ,b3);
beta_max = _mm256_max_epi16(beta_max ,b4);
beta_max = _mm256_max_epi16(beta_max ,b5);
beta_max = _mm256_max_epi16(beta_max ,b6);
beta_max = _mm256_max_epi16(beta_max ,b7);
beta_ptr-=8;
beta_ptr[0] = _mm256_max_epi16(m_b0,new0);
beta_ptr[1] = _mm256_max_epi16(m_b1,new1);
beta_ptr[2] = _mm256_max_epi16(m_b2,new2);
beta_ptr[3] = _mm256_max_epi16(m_b3,new3);
beta_ptr[4] = _mm256_max_epi16(m_b4,new4);
beta_ptr[5] = _mm256_max_epi16(m_b5,new5);
beta_ptr[6] = _mm256_max_epi16(m_b6,new6);
beta_ptr[7] = _mm256_max_epi16(m_b7,new7);
beta_max = _mm256_max_epi16(beta_ptr[0],beta_ptr[1]);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[2]);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[3]);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[4]);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[5]);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[6]);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[7]);
beta_ptr[0] = _mm256_subs_epi16(beta_ptr[0],beta_max);
beta_ptr[1] = _mm256_subs_epi16(beta_ptr[1],beta_max);
beta_ptr[2] = _mm256_subs_epi16(beta_ptr[2],beta_max);
beta_ptr[3] = _mm256_subs_epi16(beta_ptr[3],beta_max);
beta_ptr[4] = _mm256_subs_epi16(beta_ptr[4],beta_max);
beta_ptr[5] = _mm256_subs_epi16(beta_ptr[5],beta_max);
beta_ptr[6] = _mm256_subs_epi16(beta_ptr[6],beta_max);
beta_ptr[7] = _mm256_subs_epi16(beta_ptr[7],beta_max);
m11p--;
m10p--;
beta_ptr[0] = _mm256_subs_epi16(b0,beta_max);
beta_ptr[1] = _mm256_subs_epi16(b1,beta_max);
beta_ptr[2] = _mm256_subs_epi16(b2,beta_max);
beta_ptr[3] = _mm256_subs_epi16(b3,beta_max);
beta_ptr[4] = _mm256_subs_epi16(b4,beta_max);
beta_ptr[5] = _mm256_subs_epi16(b5,beta_max);
beta_ptr[6] = _mm256_subs_epi16(b6,beta_max);
beta_ptr[7] = _mm256_subs_epi16(b7,beta_max);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Loop index %d, mb\n",k);
......@@ -658,6 +686,8 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
#endif
}
timeout = rdtsc_oai();
printf("beta: inner loop time %llu\n",timeout-timein);
if (rerun_flag==1)
break;
......@@ -968,7 +998,7 @@ unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y,
yp2 = yparity2;
#if 0
for (i=0; i<n; i+=8) {
pi2_p = &pi2tab16avx2[iind][i];
......@@ -1084,9 +1114,23 @@ unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y,
yp128_cw2+=3;
}
yp=(llr_t*)yp128;
yp_cw2=(llr_t*)yp128_cw2;
#else
pi2_p = &pi2tab16avx2[iind][0];
for (i=0,j=0; i<n; i++) {
s[*pi2_p] = y[j];
s[*pi2_p+8] = y2[j++];
yp1[*pi2_p] = y[j];
yp1[*pi2_p+8] = y2[j++];
yp2[*pi2_p] = y[j];
yp2[(*pi2_p++)+8] = y2[j++];
}
yp=(llr_t*)&y[j];
yp_cw2=(llr_t*)&y2[j];
#endif
// Termination
for (i=0; i<3; i++) {
......
......@@ -144,12 +144,25 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity
fprintf(fdsse4,"compute_gamma (sse_16bit), %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
#endif
#ifndef __AVX2__
K1=frame_length>>3;
#else
if ((frame_length&15) > 0)
K1=(frame_length+1)>>4;
else
K1=frame_length>>4;
#endif
for (k=0; k<K1; k++) {
#if defined(__x86_64__) || defined(__i386__)
#ifndef __AVX2__
m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k],y_parity128[k]),1);
m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k],y_parity128[k]),1);
#else
((__m256i*)m11_128)[k] = _mm256_srai_epi16(_mm256_adds_epi16(((__m256i*)systematic128)[k],((__m256i*)y_parity128)[k]),1);
// ((__m256i*)m10_128)[k] = _mm256_srai_epi16(_mm256_subs_epi16(((__m256i*)y_parity128)[k],((__m256i*)systematic128)[k]),1);
((__m256i*)m10_128)[k] = _mm256_srai_epi16(_mm256_subs_epi16(((__m256i*)systematic128)[k],((__m256i*)y_parity128)[k]),1);
#endif
#elif defined(__arm__)
m11_128[k] = vhaddq_s16(systematic128[k],y_parity128[k]);
m10_128[k] = vhsubq_s16(systematic128[k],y_parity128[k]);
......@@ -164,13 +177,19 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity
#endif
}
k=frame_length>>3;
// Termination
#if defined(__x86_64__) || defined(__i386__)
m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k+term_flag],y_parity128[k]),1);
//#ifndef __AVX2__
#if 1
m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k+term_flag],y_parity128[k]),1);
#else
m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(y_parity128[k],systematic128[k+term_flag]),1);
#endif
#elif defined(__arm__)
m11_128[k] = vhaddq_s16(systematic128[k+term_flag],y_parity128[k]);
m10_128[k] = vhsubq_s16(systematic128[k+term_flag],y_parity128[k]);
m10_128[k] = vhsubq_s16(systematic128[k+term_flag],y_parity128[k]);
#endif
#ifdef DEBUG_LOGMAP
......@@ -188,11 +207,21 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
{
int k,l,l2,K1,rerun_flag=0;
#if defined(__x86_64__) || defined(__i386__)
__m128i *alpha128=(__m128i *)alpha,*alpha_ptr;
__m128i a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p;
__m128i *alpha128=(__m128i *)alpha,*alpha_ptr,*m11p,*m10p;
//#ifndef __AVX2__
#if 1
__m128i a0,a1,a2,a3,a4,a5,a6,a7;
__m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
__m128i new0,new1,new2,new3,new4,new5,new6,new7;
__m128i alpha_max;
#else
__m256i *alpha256=(__m256i *)alpha,*alpha_ptr256,m11,m10;
__m256i a01,a23,a45,a67,a02,a13,a64,a75;
__m256i m_b01,m_b23,m_b45,m_b67,new01,new23,new45,new67;
__m256i m11m10_256;
__m256i alpha_max;
#endif
#elif defined(__arm__)
int16x8_t *alpha128=(int16x8_t *)alpha,*alpha_ptr;
int16x8_t a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p;
......@@ -208,6 +237,10 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
for (l=K1;; l=l2,rerun_flag=1) {
#if defined(__x86_64__) || defined(__i386__)
alpha128 = (__m128i *)alpha;
//#ifdef __AVX2__
#if 0
alpha256 = (__m256i *)alpha;
#endif
#elif defined(__arm__)
alpha128 = (int16x8_t *)alpha;
#endif
......@@ -288,6 +321,11 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
}
alpha_ptr = &alpha128[0];
//#ifdef __AVX2__
#if 0
alpha_ptr256 = &alpha256[0];
#endif
#if defined(__x86_64__) || defined(__i386__)
m11p = (__m128i*)m_11;
m10p = (__m128i*)m_10;
......@@ -300,6 +338,8 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
k++) {
#if defined(__x86_64__) || defined(__i386__)
//#ifndef __AVX2__
#if 1
a1=_mm_load_si128(&alpha_ptr[1]);
a3=_mm_load_si128(&alpha_ptr[3]);
a5=_mm_load_si128(&alpha_ptr[5]);
......@@ -344,6 +384,37 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
alpha_max = _mm_max_epi16(alpha_max,a5);
alpha_max = _mm_max_epi16(alpha_max,a6);
alpha_max = _mm_max_epi16(alpha_max,a7);
#else
a02=_mm256_load_si256(&alpha_ptr256[0]);
a13=_mm256_load_si256(&alpha_ptr256[1]);
a64=_mm256_load_si256(&alpha_ptr256[2]);
a75=_mm256_load_si256(&alpha_ptr256[3]);
m11m10_256 = _mm256_insertf128_si256(m11m10_256,*m11p,0);
m11m10_256 = _mm256_insertf128_si256(m11m10_256,*m10p,1);
m_b01 = _mm256_adds_epi16(a13,m11m10_256); //negative m10
m_b23 = _mm256_subs_epi16(a75,m11m10_256); //negative m10
m_b45 = _mm256_subs_epi16(a13,m11m10_256); //negative m10
m_b67 = _mm256_adds_epi16(a75,m11m10_256); //negative m10
new01 = _mm256_subs_epi16(a02,m11m10_256); //negative m10
new23 = _mm256_adds_epi16(a64,m11m10_256); //negative m10
new45 = _mm256_adds_epi16(a02,m11m10_256); //negative m10
new67 = _mm256_subs_epi16(a64,m11m10_256); //negative m10
a01 = _mm256_max_epi16(m_b01,new01);
a23 = _mm256_max_epi16(m_b23,new23);
a45 = _mm256_max_epi16(m_b45,new45);
a67 = _mm256_max_epi16(m_b67,new67);
alpha_max = _mm256_max_epi16(a01,a23);
alpha_max = _mm256_max_epi16(alpha_max,a45);
alpha_max = _mm256_max_epi16(alpha_max,a67);
alpha_max = _mm256_max_epi16(alpha_max,_mm256_permutevar8x32_epi32(alpha_max,_mm256_set_epi32(3,2,1,0,7,6,5,4)));
#endif
#elif defined(__arm__)
m_b0 = vqaddq_s16(alpha_ptr[1],*m11p); // m11
m_b4 = vqsubq_s16(alpha_ptr[1],*m11p); // m00=-m11
......@@ -383,9 +454,15 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
#endif
alpha_ptr+=8;
//#ifdef __AVX2__
#if 0
alpha_ptr256+=4;
#endif
m11p++;
m10p++;
#if defined(__x86_64__) || defined(__i386__)
//#ifndef __AVX2__
#if 1
alpha_ptr[0] = _mm_subs_epi16(a0,alpha_max);
alpha_ptr[1] = _mm_subs_epi16(a1,alpha_max);
alpha_ptr[2] = _mm_subs_epi16(a2,alpha_max);
......@@ -394,6 +471,18 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
alpha_ptr[5] = _mm_subs_epi16(a5,alpha_max);
alpha_ptr[6] = _mm_subs_epi16(a6,alpha_max);
alpha_ptr[7] = _mm_subs_epi16(a7,alpha_max);
#else
a01 = _mm256_subs_epi16(a01,alpha_max);
a23 = _mm256_subs_epi16(a23,alpha_max);
a45 = _mm256_subs_epi16(a45,alpha_max);
a67 = _mm256_subs_epi16(a67,alpha_max);
alpha_ptr256[0] = _mm256_permute2x128_si256(a01,a23,0x20); //a02
alpha_ptr256[1] = _mm256_permute2x128_si256(a01,a23,0x13); //a13
alpha_ptr256[2] = _mm256_permute2x128_si256(a45,a67,0x02); //a64
alpha_ptr256[3] = _mm256_permute2x128_si256(a45,a67,0x31); //a75
#endif
#elif defined(__arm__)
alpha_ptr[0] = vqsubq_s16(a0,alpha_max);
alpha_ptr[1] = vqsubq_s16(a1,alpha_max);
......@@ -488,8 +577,12 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
// fprintf(fdsse4,"beta init: offset8 %d\n",offset8_flag);
m11=(int16_t)m_11[2+frame_length];
//#ifndef __AVX2__
#if 1
m10=(int16_t)m_10[2+frame_length];
#else
m10=-(int16_t)m_10[2+frame_length];
#endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"m11,m10 %d,%d\n",m11,m10);
#endif
......@@ -643,6 +736,9 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
m11_128=((__m128i*)m_11)[k];
m10_128=((__m128i*)m_10)[k];
//#ifndef __AVX2__
#if 1
m_b0 = _mm_adds_epi16(beta_ptr[4],m11_128); //m11
m_b1 = _mm_subs_epi16(beta_ptr[4],m11_128); //m00
m_b2 = _mm_subs_epi16(beta_ptr[5],m10_128); //m01
......@@ -652,6 +748,7 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
m_b6 = _mm_subs_epi16(beta_ptr[7],m11_128); //m00
m_b7 = _mm_adds_epi16(beta_ptr[7],m11_128); //m11
new0 = _mm_subs_epi16(beta_ptr[0],m11_128); //m00
new1 = _mm_adds_epi16(beta_ptr[0],m11_128); //m11
new2 = _mm_adds_epi16(beta_ptr[1],m10_128); //m10
......@@ -661,8 +758,29 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
new6 = _mm_adds_epi16(beta_ptr[3],m11_128); //m11
new7 = _mm_subs_epi16(beta_ptr[3],m11_128); //m00
#else
b01=_mm256_load_si256(&((__m256i*)beta_ptr)[0]);
b23=_mm256_load_si256(&((__m256i*)beta_ptr)[1]);
b45=_mm256_load_si256(&((__m256i*)beta_ptr)[2]);
b67=_mm256_load_si256(&((__m256i*)beta_ptr)[3]);
m11m10_256 = _mm256_insertf128_si256(m11m10_256,m11_128,0);
m11m10_256 = _mm256_insertf128_si256(m11m10_256,m10_128,1);
m_b02 = _mm256_adds_epi16(b45,m11m10_256); //negative m10
m_b13 = _mm256_subs_epi16(b45,m11m10_256); //negative m10
m_b64 = _mm256_subs_epi16(b67,m11m10_256); //negative m10
m_b75 = _mm256_adds_epi16(b67,m11m10_256); //negative m10
new02 = _mm256_subs_epi16(b01,m11m10_256); //negative m10
new13 = _mm256_adds_epi16(b01,m11m10_256); //negative m10
new64 = _mm256_adds_epi16(b23,m11m10_256); //negative m10
new75 = _mm256_subs_epi16(b23,m11m10_256); //negative m10
#endif
beta_ptr-=8;
//#ifndef __AVX2__
#if 1
beta_ptr[0] = _mm_max_epi16(m_b0,new0);
beta_ptr[1] = _mm_max_epi16(m_b1,new1);
beta_ptr[2] = _mm_max_epi16(m_b2,new2);
......@@ -688,6 +806,28 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr[5] = _mm_subs_epi16(beta_ptr[5],beta_max);
beta_ptr[6] = _mm_subs_epi16(beta_ptr[6],beta_max);
beta_ptr[7] = _mm_subs_epi16(beta_ptr[7],beta_max);
#else
b02 = _mm256_max_epi16(m_b02,new02);
b13 = _mm256_max_epi16(m_b13,new13);
b64 = _mm256_max_epi16(m_b64,new64);
b75 = _mm256_max_epi16(m_b75,new75);
beta_max = _mm256_max_epi16(b02,b13);
beta_max = _mm256_max_epi16(beta_max,b64);
beta_max = _mm256_max_epi16(beta_max,b75);
beta_max = _mm256_max_epi16(beta_max,_mm256_permutevar8x32_epi32(beta_max,_mm256_set_epi32(3,2,1,0,7,6,5,4)));
b02 = _mm256_subs_epi16(b02,beta_max);
b13 = _mm256_subs_epi16(b13,beta_max);
b64 = _mm256_subs_epi16(b64,beta_max);
b75 = _mm256_subs_epi16(b75,beta_max);
((__m256i*)beta_ptr)[0] = _mm256_permute2x128_si256(b02,b13,0x02); //b01
((__m256i*)beta_ptr)[1] = _mm256_permute2x128_si256(b02,b13,0x31); //b23
((__m256i*)beta_ptr)[2] = _mm256_permute2x128_si256(b64,b75,0x13); //b45
((__m256i*)beta_ptr)[3] = _mm256_permute2x128_si256(b64,b75,0x20); //b67
#endif
#elif defined(__arm__)
m11_128=((int16x8_t*)m_11)[k];
m10_128=((int16x8_t*)m_10)[k];
......@@ -820,6 +960,9 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
print_shorts("b6:",&beta_ptr[6]);
print_shorts("b7:",&beta_ptr[7]);
*/
//#ifndef __AVX2__
#if 1
m00_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
m11_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11;
m00_3 = _mm_adds_epi16(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00;
......@@ -836,6 +979,32 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
m10_2 = _mm_adds_epi16(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10;
m10_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
m01_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;
#else
m00_1 = _mm_adds_epi16(alpha_ptr[0],beta_ptr[0]); //ALPHA_BETA_1m00;
m10_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
m11_1 = _mm_adds_epi16(alpha_ptr[0],beta_ptr[4]); //ALPHA_BETA_1m11;
m01_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;
m11_2 = _mm_adds_epi16(alpha_ptr[1],beta_ptr[0]); //ALPHA_BETA_2m11;
m01_2 = _mm_adds_epi16(alpha_ptr[3],beta_ptr[1]); //ALPHA_BETA_2m01;
m00_2 = _mm_adds_epi16(alpha_ptr[1],beta_ptr[4]); //ALPHA_BETA_2m00;
m10_2 = _mm_adds_epi16(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10;
m11_3 = _mm_adds_epi16(alpha_ptr[6],beta_ptr[3]); //ALPHA_BETA_3m11;
m01_3 = _mm_adds_epi16(alpha_ptr[4],beta_ptr[2]); //ALPHA_BETA_3m01;
m00_3 = _mm_adds_epi16(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00;
m10_3 = _mm_adds_epi16(alpha_ptr[4],beta_ptr[6]); //ALPHA_BETA_3m10;
m00_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
m10_4 = _mm_adds_epi16(alpha_ptr[5],beta_ptr[2]); //ALPHA_BETA_4m10;
m11_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11;
m01_4 = _mm_adds_epi16(alpha_ptr[5],beta_ptr[6]); //ALPHA_BETA_4m01;
#endif
/*
print_shorts("m11_1:",&m11_1);
print_shorts("m11_2:",&m11_2);
......@@ -1030,19 +1199,19 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
n is the size in bits of the coded block, with the tail */
llr_t systematic0[n+16] __attribute__ ((aligned(16)));
llr_t systematic1[n+16] __attribute__ ((aligned(16)));
llr_t systematic2[n+16] __attribute__ ((aligned(16)));
llr_t yparity1[n+16] __attribute__ ((aligned(16)));
llr_t yparity2[n+16] __attribute__ ((aligned(16)));
llr_t systematic0[n+16] __attribute__ ((aligned(32)));
llr_t systematic1[n+16] __attribute__ ((aligned(32)));
llr_t systematic2[n+16] __attribute__ ((aligned(32)));
llr_t yparity1[n+16] __attribute__ ((aligned(32)));
llr_t yparity2[n+16] __attribute__ ((aligned(32)));
llr_t ext[n+128] __attribute__((aligned(16)));
llr_t ext2[n+128] __attribute__((aligned(16)));
llr_t ext[n+128] __attribute__((aligned(32)));
llr_t ext2[n+128] __attribute__((aligned(32)));
llr_t alpha[(n+16)*8] __attribute__ ((aligned(16)));
llr_t beta[(n+16)*8] __attribute__ ((aligned(16)));
llr_t m11[n+16] __attribute__ ((aligned(16)));
llr_t m10[n+16] __attribute__ ((aligned(16)));
llr_t alpha[(n+16)*8] __attribute__ ((aligned(32)));
llr_t beta[(n+16)*8] __attribute__ ((aligned(32)));
llr_t m11[n+32] __attribute__ ((aligned(32)));
llr_t m10[n+32] __attribute__ ((aligned(32)));
int *pi2_p,*pi4_p,*pi5_p,*pi6_p;
......
......@@ -61,21 +61,18 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14]
x2 = Ncp +
(Nid_cell<<1) +
(((1+(Nid_cell<<1))*(1 + (((frame_parms->Ncp==0)?4:3)*l) + (7*(1+ns))))<<10); //cinit
(((1+(Nid_cell<<1))*(1 + (((frame_parms->Ncp==0)?4:3)*l) + (7*(1+ns))))<<10); //cinit
//x2 = frame_parms->Ncp + (Nid_cell<<1) + (1+(Nid_cell<<1))*(1 + (3*l) + (7*(1+ns))); //cinit
//n = 0
// printf("cinit (ns %d, l %d) => %d\n",ns,l,x2);
x1 = 1+ (1<<31);
x2=x2 ^ ((x2 ^ (x2>>1) ^ (x2>>2) ^ (x2>>3))<<31);
// skip first 50 double words (1600 bits)
//printf("n=0 : x1 %x, x2 %x\n",x1,x2);
for (n=1; n<50; n++) {
x1 = (x1>>1) ^ (x1>>4);
x1 = x1 ^ (x1<<31) ^ (x1<<28);