Commit 8d4405bd authored by knopp's avatar knopp

added avx2 optimized turbo decoder for 16-bit LLR. This decoder parallelizes...

Added an AVX2-optimized turbo decoder for 16-bit LLRs. This decoder parallelizes by decoding 2 code segments concurrently. It requires updates to dlsch_decoding.c to identify when the new parallel version can be used. Also includes other minor changes related to memory allocations for future AVX2 optimizations (32-byte alignment).
parent 27b1707e
......@@ -134,7 +134,7 @@ else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2")
endif()
if (CPUINFO MATCHES "sse4_2")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.2")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2 -msse4.2")
endif()
if (CPUINFO MATCHES "sse4_1")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.1")
......@@ -168,7 +168,7 @@ set(CMAKE_CXX_FLAGS
# these changes are related to hardcoded path to include .h files
add_definitions(-DCMAKER)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O2")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O3")
# Below has been put in comment because does not work with
# SVN authentication.
......@@ -840,6 +840,7 @@ set(PHY_SRC
${OPENAIR1_DIR}/PHY/CODING/crc_byte.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
${OPENAIR1_DIR}/PHY/CODING/lte_rate_matching.c
${OPENAIR1_DIR}/PHY/CODING/rate_matching.c
${OPENAIR1_DIR}/PHY/CODING/viterbi.c
......
/*******************************************************************************
OpenAirInterface
Copyright(c) 1999 - 2014 Eurecom
OpenAirInterface is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OpenAirInterface is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OpenAirInterface.The full GNU General Public License is
included in this distribution in the file called "COPYING". If not,
see <http://www.gnu.org/licenses/>.
Contact Information
OpenAirInterface Admin: openair_admin@eurecom.fr
OpenAirInterface Tech : openair_tech@eurecom.fr
OpenAirInterface Dev : openair4g-devel@lists.eurecom.fr
Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE
*******************************************************************************/
/* file: 3gpplte_turbo_decoder_avx2_16bit.c
purpose: Routines for implementing max-logmap decoding of Turbo-coded (DLSCH) transport channels from 36-212, V8.6 2009-03
authors: raymond.knopp@eurecom.fr, Laurent Thomas (Alcatel-Lucent)
date: 21.10.2009
Note: This version of the routine requires an AVX2-capable CPU (the whole file is compiled only when __AVX2__ is defined).
It uses 16-bit inputs for LLRS and uses 16-bit arithmetic for the internal computations!
Changelog: 17.11.2009 FK SSE4.1 not required anymore
Aug. 2012 new parallelization options for higher speed (8-way parallelization)
Jan. 2013 8-bit LLR support with 16-way parallelization
Feb. 2013 New interleaving and hard-decision optimizations (L. Thomas)
May 2013 Extracted 16bit code
*/
///
///
#ifdef __AVX2__
#include "PHY/sse_intrin.h"
#ifndef TEST_DEBUG
#include "PHY/defs.h"
#include "PHY/CODING/defs.h"
#include "PHY/CODING/lte_interleaver_inline.h"
#include "extern_3GPPinterleaver.h"
#else
#include "defs.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#endif
#ifdef MEX
#include "mex.h"
#endif
//#define DEBUG_LOGMAP
#ifdef DEBUG_LOGMAP
#define print_shorts(s,x) fprintf(fdavx2,"%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7]);fprintf(fdavx2b,"%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[8],(x)[9],(x)[10],(x)[11],(x)[12],(x)[13],(x)[14],(x)[15])
FILE *fdavx2,*fdavx2b;
#else
#endif
#define print_bytes(s,x) printf("%s %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7],(x)[8],(x)[9],(x)[10],(x)[11],(x)[12],(x)[13],(x)[14],(x)[15],(x)[16],(x)[17],(x)[18],(x)[19],(x)[20],(x)[21],(x)[22],(x)[23],(x)[24],(x)[25],(x)[26],(x)[27],(x)[28],(x)[29],(x)[30],(x)[31])
// Element types used throughout this file. Both are 16-bit signed integers.
typedef int16_t llr_t; // internal decoder LLR data is 16-bit fixed
// Type of the parity input (y_parity) handed to log_map16avx2 / compute_gamma16avx2.
typedef int16_t channel_t;
// Metric magnitude constant: compute_alpha16avx2 seeds its first alpha
// vectors with the values 0 and -MAX/2.
#define MAX 256
// Forward declarations of the decoder stages.
// log_map16avx2 is the driver: it calls the four compute_* stages in the
// order gamma, alpha, beta, ext and wraps each call in start_meas/stop_meas
// on the matching time_stats_t argument.
void log_map16avx2(llr_t* systematic,channel_t* y_parity, llr_t* m11, llr_t* m10, llr_t *alpha, llr_t *beta, llr_t* ext,uint16_t frame_length,unsigned char term_flag,unsigned char F,int offset8_flag,time_stats_t *alpha_stats,time_stats_t *beta_stats,time_stats_t *gamma_stats,time_stats_t *ext_stats);
// Writes m11 = (systematic + y_parity) >> 1 and m10 = (systematic - y_parity) >> 1
// (saturating 16-bit arithmetic), one 256-bit vector at a time.
void compute_gamma16avx2(llr_t* m11,llr_t* m10,llr_t* systematic, channel_t* y_parity, uint16_t frame_length,unsigned char term_flag);
// Fills the alpha buffer from m11/m10; the beta and F arguments are not used by the body.
void compute_alpha16avx2(llr_t*alpha,llr_t *beta, llr_t* m11,llr_t* m10, uint16_t frame_length,unsigned char F);
// Fills the beta buffer from alpha and m11/m10.
void compute_beta16avx2(llr_t*alpha, llr_t* beta,llr_t* m11,llr_t* m10, uint16_t frame_length,unsigned char F,int offset8_flag);
// Called last by log_map16avx2, which passes its `systematic` buffer as `ap`.
void compute_ext16avx2(llr_t* alpha,llr_t* beta,llr_t* m11,llr_t* m10,llr_t* extrinsic, llr_t* ap, uint16_t frame_length);
/*
 * Driver for one pass of the AVX2 16-bit decoder.
 *
 * Runs the four stages strictly in this order, each one timed with
 * start_meas()/stop_meas() on its own statistics object:
 *   1. compute_gamma16avx2 : fills m11 / m10 from systematic and y_parity
 *   2. compute_alpha16avx2 : fills alpha from m11 / m10
 *   3. compute_beta16avx2  : fills beta from alpha and m11 / m10
 *   4. compute_ext16avx2   : produces ext; `systematic` is passed as its
 *                            a-priori (`ap`) argument
 *
 * term_flag is forwarded to the gamma stage, F to the alpha and beta stages,
 * and offset8_flag to the beta stage only.
 */
void log_map16avx2(llr_t* systematic, channel_t* y_parity,
                   llr_t* m11, llr_t* m10,
                   llr_t *alpha, llr_t *beta, llr_t* ext,
                   uint16_t frame_length,
                   unsigned char term_flag, unsigned char F, int offset8_flag,
                   time_stats_t *alpha_stats, time_stats_t *beta_stats,
                   time_stats_t *gamma_stats, time_stats_t *ext_stats)
{
#ifdef DEBUG_LOGMAP
  fprintf(fdavx2,"log_map (avx2_16bit), frame_length %d\n",frame_length);
  fprintf(fdavx2b,"log_map (avx2_16bit), frame_length %d\n",frame_length);
#endif

  /* Stage 1: gamma */
  start_meas(gamma_stats);
  compute_gamma16avx2(m11, m10, systematic, y_parity, frame_length, term_flag);
  stop_meas(gamma_stats);

  /* Stage 2: alpha */
  start_meas(alpha_stats);
  compute_alpha16avx2(alpha, beta, m11, m10, frame_length, F);
  stop_meas(alpha_stats);

  /* Stage 3: beta */
  start_meas(beta_stats);
  compute_beta16avx2(alpha, beta, m11, m10, frame_length, F, offset8_flag);
  stop_meas(beta_stats);

  /* Stage 4: extrinsic output */
  start_meas(ext_stats);
  compute_ext16avx2(alpha, beta, m11, m10, ext, systematic, frame_length);
  stop_meas(ext_stats);
}
/*
 * Gamma stage: for every 256-bit vector of the inputs compute
 *     m11 = (systematic + y_parity) >> 1
 *     m10 = (systematic - y_parity) >> 1
 * using saturating 16-bit add/sub followed by an arithmetic right shift.
 *
 * frame_length>>3 vectors are processed in the main loop; one further
 * vector is produced afterwards for the termination, where the systematic
 * input is read term_flag vectors further on while the parity index stays
 * at the loop's final value.
 *
 * All four buffers are accessed through __m256i pointers.
 */
void compute_gamma16avx2(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
                         uint16_t frame_length,unsigned char term_flag)
{
  __m256i *sys_v = (__m256i *)systematic;
  __m256i *par_v = (__m256i *)y_parity;
  __m256i *m10_v = (__m256i *)m10;
  __m256i *m11_v = (__m256i *)m11;
  __m256i sum, diff;
  int n_vec;
  int k;

#ifdef DEBUG_LOGMAP
  fprintf(fdavx2,"compute_gamma (avx2_16bit), %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
  fprintf(fdavx2b,"compute_gamma (avx2_16bit), %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
#endif

  n_vec = frame_length>>3;

  for (k=0; k<n_vec; k++) {
    /* inputs are re-read from memory for the second result, exactly as the
       two independent statements of the straightforward formulation do */
    sum      = _mm256_adds_epi16(sys_v[k],par_v[k]);
    m11_v[k] = _mm256_srai_epi16(sum,1);
    diff     = _mm256_subs_epi16(sys_v[k],par_v[k]);
    m10_v[k] = _mm256_srai_epi16(diff,1);
#ifdef DEBUG_LOGMAP
    fprintf(fdavx2,"Loop index k %d\n",k);
    fprintf(fdavx2b,"Loop index k %d\n",k);
    print_shorts("sys",(int16_t*)&sys_v[k]);
    print_shorts("yp",(int16_t*)&par_v[k]);
    print_shorts("m11",(int16_t*)&m11_v[k]);
    print_shorts("m10",(int16_t*)&m10_v[k]);
#endif
  }

  // Termination: k == n_vec here
  sum      = _mm256_adds_epi16(sys_v[k+term_flag],par_v[k]);
  m11_v[k] = _mm256_srai_epi16(sum,1);
  diff     = _mm256_subs_epi16(sys_v[k+term_flag],par_v[k]);
  m10_v[k] = _mm256_srai_epi16(diff,1);
#ifdef DEBUG_LOGMAP
  fprintf(fdavx2,"Loop index k %d (term flag %d)\n",k,term_flag);
  fprintf(fdavx2b,"Loop index k %d (term flag %d)\n",k,term_flag);
  print_shorts("sys",(int16_t*)&sys_v[k+term_flag]);
  print_shorts("yp",(int16_t*)&par_v[k]);
  print_shorts("m11",(int16_t*)&m11_v[k]);
  print_shorts("m10",(int16_t*)&m10_v[k]);
#endif
}
#define L 40
/*
 * Alpha stage.
 *
 * The alpha buffer is viewed as groups of eight __m256i vectors (one vector
 * per metric index 0..7).  Starting from group 0, each step reads the current
 * group plus one m_11 / m_10 vector and writes the next group:
 *   - two saturating add/sub candidates are formed per output metric,
 *   - the larger candidate is kept,
 *   - the maximum over the eight kept metrics is subtracted from each of them.
 *
 * Two passes are made:
 *   pass 0: group 0 is set to 0 in 16-bit elements 0 and 8 of vector 0 and to
 *           -MAX/2 everywhere else; frame_length>>3 steps are run.
 *   pass 1: group 0 is rebuilt from the group at vector index frame_length
 *           (the last group written by pass 0), byte-shifted left by one
 *           16-bit element with _mm256_slli_si256; elements 0 and 8 of
 *           vectors 1..7 are then set to -MAX/2 and only L>>3 steps are
 *           re-run.
 *
 * NOTE(review): the two 128-bit halves of every vector are treated
 * identically here; presumably they carry the two code segments that are
 * decoded concurrently - confirm against the caller.
 *
 * The beta and F parameters are not used by this function.
 */
void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16_t frame_length,unsigned char F)
{
  const int full_steps  = (frame_length>>3);
  const int rerun_steps = L>>3;
  int pass, step, n_steps, i;
  __m256i *alpha_v, *cur, *nxt;
  __m256i *g11p, *g10p;
  __m256i g11, g10;
  __m256i a0,a1,a2,a3,a4,a5,a6,a7;
  __m256i odd0,odd1,odd2,odd3,odd4,odd5,odd6,odd7;
  __m256i evn0,evn1,evn2,evn3,evn4,evn5,evn6,evn7;
  __m256i max01,max23,max45,max67;
  __m256i alpha_max;

#ifdef DEBUG_LOGMAP
  fprintf(fdavx2,"Compute alpha (avx2_16bit)\n");
  fprintf(fdavx2b,"Compute alpha (avx2_16bit)\n");
#endif

  for (pass=0; pass<2; pass++) {
    n_steps = (pass == 0) ? full_steps : rerun_steps;
    alpha_v = (__m256i *)alpha;

    if (pass == 0) {
      /* vector 0: elements 0 and 8 are 0, all others -MAX/2 */
      alpha_v[0] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,0,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,0);

      /* vectors 1..7: every element is -MAX/2 */
      for (i=1; i<8; i++)
        alpha_v[i] = _mm256_set1_epi16(-MAX/2);

#ifdef DEBUG_LOGMAP
      fprintf(fdavx2,"Initial alpha\n");
      fprintf(fdavx2b,"Initial alpha\n");
      print_shorts("a0",(int16_t*)&alpha_v[0]);
      print_shorts("a1",(int16_t*)&alpha_v[1]);
      print_shorts("a2",(int16_t*)&alpha_v[2]);
      print_shorts("a3",(int16_t*)&alpha_v[3]);
      print_shorts("a4",(int16_t*)&alpha_v[4]);
      print_shorts("a5",(int16_t*)&alpha_v[5]);
      print_shorts("a6",(int16_t*)&alpha_v[6]);
      print_shorts("a7",(int16_t*)&alpha_v[7]);
#endif
    } else {
      //set initial alpha in columns 1-7 from final alpha from last run in columns 0-6
      for (i=0; i<8; i++)
        alpha_v[i] = _mm256_slli_si256(alpha_v[i+frame_length],2);

      // set initial alpha in column 0 to (0,-MAX/2,...,-MAX/2)
      // (elements 0 and 8 of vectors 1..7; vector 0 keeps the zeros shifted in)
      for (i=1; i<8; i++) {
        alpha[16*i]   = -MAX/2;
        alpha[16*i+8] = -MAX/2;
      }

#ifdef DEBUG_LOGMAP
      fprintf(fdavx2,"Second run\n");
      fprintf(fdavx2b,"Second run\n");
      print_shorts("a0",(int16_t*)&alpha_v[0]);
      print_shorts("a1",(int16_t*)&alpha_v[1]);
      print_shorts("a2",(int16_t*)&alpha_v[2]);
      print_shorts("a3",(int16_t*)&alpha_v[3]);
      print_shorts("a4",(int16_t*)&alpha_v[4]);
      print_shorts("a5",(int16_t*)&alpha_v[5]);
      print_shorts("a6",(int16_t*)&alpha_v[6]);
      print_shorts("a7",(int16_t*)&alpha_v[7]);
#endif
    }

    cur  = &alpha_v[0];
    g11p = (__m256i*)m_11;
    g10p = (__m256i*)m_10;

    for (step=0; step<n_steps; step++) {
      /* all inputs of this step are read before anything is written */
      g11 = *g11p++;
      g10 = *g10p++;
      nxt = cur + 8;

      a0 = _mm256_load_si256(&cur[0]);
      a1 = _mm256_load_si256(&cur[1]);
      a2 = _mm256_load_si256(&cur[2]);
      a3 = _mm256_load_si256(&cur[3]);
      a4 = _mm256_load_si256(&cur[4]);
      a5 = _mm256_load_si256(&cur[5]);
      a6 = _mm256_load_si256(&cur[6]);
      a7 = _mm256_load_si256(&cur[7]);

      /* candidates built from the odd-indexed metrics */
      odd0 = _mm256_adds_epi16(a1,g11); // m11
      odd4 = _mm256_subs_epi16(a1,g11); // m00=-m11
      odd1 = _mm256_subs_epi16(a3,g10); // m01=-m10
      odd5 = _mm256_adds_epi16(a3,g10); // m10
      odd2 = _mm256_adds_epi16(a5,g10); // m10
      odd6 = _mm256_subs_epi16(a5,g10); // m01=-m10
      odd3 = _mm256_subs_epi16(a7,g11); // m00=-m11
      odd7 = _mm256_adds_epi16(a7,g11); // m11

      /* candidates built from the even-indexed metrics */
      evn0 = _mm256_subs_epi16(a0,g11); // m00=-m11
      evn4 = _mm256_adds_epi16(a0,g11); // m11
      evn1 = _mm256_adds_epi16(a2,g10); // m10
      evn5 = _mm256_subs_epi16(a2,g10); // m01=-m10
      evn2 = _mm256_subs_epi16(a4,g10); // m01=-m10
      evn6 = _mm256_adds_epi16(a4,g10); // m10
      evn3 = _mm256_adds_epi16(a6,g11); // m11
      evn7 = _mm256_subs_epi16(a6,g11); // m00=-m11

      /* keep the larger candidate for every output metric */
      a0 = _mm256_max_epi16(odd0,evn0);
      a1 = _mm256_max_epi16(odd1,evn1);
      a2 = _mm256_max_epi16(odd2,evn2);
      a3 = _mm256_max_epi16(odd3,evn3);
      a4 = _mm256_max_epi16(odd4,evn4);
      a5 = _mm256_max_epi16(odd5,evn5);
      a6 = _mm256_max_epi16(odd6,evn6);
      a7 = _mm256_max_epi16(odd7,evn7);

      /* element-wise maximum over the eight metrics */
      max01 = _mm256_max_epi16(a0,a1);
      max23 = _mm256_max_epi16(a2,a3);
      max45 = _mm256_max_epi16(a4,a5);
      max67 = _mm256_max_epi16(a6,a7);
      alpha_max = _mm256_max_epi16(_mm256_max_epi16(max01,max23),
                                   _mm256_max_epi16(max45,max67));

      /* write the next group with the maximum subtracted */
      nxt[0] = _mm256_subs_epi16(a0,alpha_max);
      nxt[1] = _mm256_subs_epi16(a1,alpha_max);
      nxt[2] = _mm256_subs_epi16(a2,alpha_max);
      nxt[3] = _mm256_subs_epi16(a3,alpha_max);
      nxt[4] = _mm256_subs_epi16(a4,alpha_max);
      nxt[5] = _mm256_subs_epi16(a5,alpha_max);
      nxt[6] = _mm256_subs_epi16(a6,alpha_max);
      nxt[7] = _mm256_subs_epi16(a7,alpha_max);

      cur = nxt;

#ifdef DEBUG_LOGMAP
      fprintf(fdavx2,"Loop index %d\n",step);
      fprintf(fdavx2b,"Loop index %d\n",step);
      print_shorts("mb0",(int16_t*)&odd0);
      print_shorts("mb1",(int16_t*)&odd1);
      print_shorts("mb2",(int16_t*)&odd2);
      print_shorts("mb3",(int16_t*)&odd3);
      print_shorts("mb4",(int16_t*)&odd4);
      print_shorts("mb5",(int16_t*)&odd5);
      print_shorts("mb6",(int16_t*)&odd6);
      print_shorts("mb7",(int16_t*)&odd7);
      fprintf(fdavx2,"Loop index %d, new\n",step);
      fprintf(fdavx2b,"Loop index %d, new\n",step);
      print_shorts("new0",(int16_t*)&evn0);
      print_shorts("new1",(int16_t*)&evn1);
      print_shorts("new2",(int16_t*)&evn2);
      print_shorts("new3",(int16_t*)&evn3);
      print_shorts("new4",(int16_t*)&evn4);
      print_shorts("new5",(int16_t*)&evn5);
      print_shorts("new6",(int16_t*)&evn6);
      print_shorts("new7",(int16_t*)&evn7);
      fprintf(fdavx2,"Loop index %d, after max\n",step);
      fprintf(fdavx2b,"Loop index %d, after max\n",step);
      print_shorts("a0",(int16_t*)&a0);
      print_shorts("a1",(int16_t*)&a1);
      print_shorts("a2",(int16_t*)&a2);
      print_shorts("a3",(int16_t*)&a3);
      print_shorts("a4",(int16_t*)&a4);
      print_shorts("a5",(int16_t*)&a5);
      print_shorts("a6",(int16_t*)&a6);
      print_shorts("a7",(int16_t*)&a7);
      fprintf(fdavx2,"Loop index %d\n",step);
      fprintf(fdavx2b,"Loop index %d\n",step);
      print_shorts("a0",(int16_t*)&cur[0]);
      print_shorts("a1",(int16_t*)&cur[1]);
      print_shorts("a2",(int16_t*)&cur[2]);
      print_shorts("a3",(int16_t*)&cur[3]);
      print_shorts("a4",(int16_t*)&cur[4]);
      print_shorts("a5",(int16_t*)&cur[5]);
      print_shorts("a6",(int16_t*)&cur[6]);
      print_shorts("a7",(int16_t*)&cur[7]);
#endif
    }
  }
}
void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_t frame_length,unsigned char F,int offset8_flag)
{
int k,rerun_flag=0;
__m256i m11_128,m10_128;
__m256i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
__m256i new0,new1,new2,new3,new4,new5,new6,new7;
__m256i *beta128,*alpha128,*beta_ptr;
__m256i beta_max;
llr_t m11,m10,beta0_16,beta1_16,beta2_16,beta3_16,beta4_16,beta5_16,beta6_16,beta7_16,beta0_2,beta1_2,beta2_2,beta3_2,beta_m;
llr_t m11_cw2,m10_cw2,beta0_cw2_16,beta1_cw2_16,beta2_cw2_16,beta3_cw2_16,beta4_cw2_16,beta5_cw2_16,beta6_cw2_16,beta7_cw2_16,beta0_2_cw2,beta1_2_cw2,beta2_2_cw2,beta3_2_cw2,beta_m_cw2;
llr_t beta0,beta1;
llr_t beta0_cw2,beta1_cw2;
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"compute_beta (avx2_16bit), %p,%p,%p,%p,framelength %d,F %d\n",
beta,m_11,m_10,alpha,frame_length,F);
fprintf(fdavx2b,"compute_beta (avx2_16bit), %p,%p,%p,%p,framelength %d,F %d\n",
beta,m_11,m_10,alpha,frame_length,F);
#endif
// termination for beta initialization
// fprintf(fdavx2,"beta init: offset8 %d\n",offset8_flag);
m11=(int16_t)m_11[(frame_length<<1)+2];
m10=(int16_t)m_10[(frame_length<<1)+2];
m11_cw2=(int16_t)m_11[(frame_length<<1)+8+2];
m10_cw2=(int16_t)m_10[(frame_length<<1)+8+2];
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"m11,m10 %d,%d\n",m11,m10);
fprintf(fdavx2b,"m11,m10 %d,%d\n",m11_cw2,m10_cw2);
#endif
beta0 = -m11;//M0T_TERM;
beta1 = m11;//M1T_TERM;
beta0_cw2 = -m11_cw2;//M0T_TERM;
beta1_cw2 = m11_cw2;//M1T_TERM;
m11=(int16_t)m_11[(frame_length<<1)+1];
m10=(int16_t)m_10[(frame_length<<1)+1];
m11_cw2=(int16_t)m_11[(frame_length<<1)+1+8];
m10_cw2=(int16_t)m_10[(frame_length<<1)+1+8];
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"m11,m10 %d,%d\n",m11,m10);
fprintf(fdavx2b,"m11,m10 %d,%d\n",m11_cw2,m10_cw2);
#endif
beta0_2 = beta0-m11;//+M0T_TERM;
beta1_2 = beta0+m11;//+M1T_TERM;
beta2_2 = beta1+m10;//M2T_TERM;
beta3_2 = beta1-m10;//+M3T_TERM;
beta0_2_cw2 = beta0_cw2-m11_cw2;//+M0T_TERM;
beta1_2_cw2 = beta0_cw2+m11_cw2;//+M1T_TERM;
beta2_2_cw2 = beta1_cw2+m10_cw2;//M2T_TERM;
beta3_2_cw2 = beta1_cw2-m10_cw2;//+M3T_TERM;
m11=(int16_t)m_11[frame_length<<1];
m10=(int16_t)m_10[frame_length<<1];
m11_cw2=(int16_t)m_11[(frame_length<<1)+8];
m10_cw2=(int16_t)m_10[(frame_length<<1)+8];
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"m11,m10 %d,%d\n",m11,m10);
fprintf(fdavx2b,"m11,m10 %d,%d\n",m11_cw2,m10_cw2);
#endif
beta0_16 = beta0_2-m11;//+M0T_TERM;
beta1_16 = beta0_2+m11;//+M1T_TERM;
beta2_16 = beta1_2+m10;//+M2T_TERM;
beta3_16 = beta1_2-m10;//+M3T_TERM;
beta4_16 = beta2_2-m10;//+M4T_TERM;
beta5_16 = beta2_2+m10;//+M5T_TERM;
beta6_16 = beta3_2+m11;//+M6T_TERM;
beta7_16 = beta3_2-m11;//+M7T_TERM;
beta0_cw2_16 = beta0_2_cw2-m11_cw2;//+M0T_TERM;
beta1_cw2_16 = beta0_2_cw2+m11_cw2;//+M1T_TERM;
beta2_cw2_16 = beta1_2_cw2+m10_cw2;//+M2T_TERM;
beta3_cw2_16 = beta1_2_cw2-m10_cw2;//+M3T_TERM;
beta4_cw2_16 = beta2_2_cw2-m10_cw2;//+M4T_TERM;
beta5_cw2_16 = beta2_2_cw2+m10_cw2;//+M5T_TERM;
beta6_cw2_16 = beta3_2_cw2+m11_cw2;//+M6T_TERM;
beta7_cw2_16 = beta3_2_cw2-m11_cw2;//+M7T_TERM;
beta_m = (beta0_16>beta1_16) ? beta0_16 : beta1_16;
beta_m = (beta_m>beta2_16) ? beta_m : beta2_16;
beta_m = (beta_m>beta3_16) ? beta_m : beta3_16;
beta_m = (beta_m>beta4_16) ? beta_m : beta4_16;
beta_m = (beta_m>beta5_16) ? beta_m : beta5_16;
beta_m = (beta_m>beta6_16) ? beta_m : beta6_16;
beta_m = (beta_m>beta7_16) ? beta_m : beta7_16;
beta_m_cw2 = (beta0_cw2_16>beta1_cw2_16) ? beta0_cw2_16 : beta1_cw2_16;
beta_m_cw2 = (beta_m_cw2>beta2_cw2_16) ? beta_m_cw2 : beta2_cw2_16;
beta_m_cw2 = (beta_m_cw2>beta3_cw2_16) ? beta_m_cw2 : beta3_cw2_16;
beta_m_cw2 = (beta_m_cw2>beta4_cw2_16) ? beta_m_cw2 : beta4_cw2_16;
beta_m_cw2 = (beta_m_cw2>beta5_cw2_16) ? beta_m_cw2 : beta5_cw2_16;
beta_m_cw2 = (beta_m_cw2>beta6_cw2_16) ? beta_m_cw2 : beta6_cw2_16;
beta_m_cw2 = (beta_m_cw2>beta7_cw2_16) ? beta_m_cw2 : beta7_cw2_16;
beta0_16=beta0_16-beta_m;
beta1_16=beta1_16-beta_m;
beta2_16=beta2_16-beta_m;
beta3_16=beta3_16-beta_m;
beta4_16=beta4_16-beta_m;
beta5_16=beta5_16-beta_m;
beta6_16=beta6_16-beta_m;
beta7_16=beta7_16-beta_m;
beta0_cw2_16=beta0_cw2_16-beta_m_cw2;
beta1_cw2_16=beta1_cw2_16-beta_m_cw2;
beta2_cw2_16=beta2_cw2_16-beta_m_cw2;
beta3_cw2_16=beta3_cw2_16-beta_m_cw2;
beta4_cw2_16=beta4_cw2_16-beta_m_cw2;
beta5_cw2_16=beta5_cw2_16-beta_m_cw2;
beta6_cw2_16=beta6_cw2_16-beta_m_cw2;
beta7_cw2_16=beta7_cw2_16-beta_m_cw2;
for (rerun_flag=0;; rerun_flag=1) {
beta_ptr = (__m256i*)&beta[frame_length<<4];
alpha128 = (__m256i*)&alpha[0];
if (rerun_flag == 0) {
beta_ptr[0] = alpha128[(frame_length)];
beta_ptr[1] = alpha128[1+(frame_length)];
beta_ptr[2] = alpha128[2+(frame_length)];
beta_ptr[3] = alpha128[3+(frame_length)];
beta_ptr[4] = alpha128[4+(frame_length)];
beta_ptr[5] = alpha128[5+(frame_length)];
beta_ptr[6] = alpha128[6+(frame_length)];
beta_ptr[7] = alpha128[7+(frame_length)];
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"beta init \n");
fprintf(fdavx2b,"beta init \n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif
} else {
beta128 = (__m256i*)&beta[0];
beta_ptr[0] = _mm256_srli_si256(beta128[0],2);
beta_ptr[1] = _mm256_srli_si256(beta128[1],2);
beta_ptr[2] = _mm256_srli_si256(beta128[2],2);
beta_ptr[3] = _mm256_srli_si256(beta128[3],2);