[GITLAB] - UPGRADE TO v12 on Wednesday the 18th of December at 11.30AM

Commit b1e28edd authored by Florian Kaltenberger's avatar Florian Kaltenberger

Merge branch 'enhancement-43-AVX2' into bugfix-48-L1L2signaling

corrected some additional FFT related issues from AVX2 merge when running with real-time MODEM.

Conflicts:
	openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
parents 801e343c c0f6881c
......@@ -174,7 +174,7 @@ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath -Wl,${OPENAIR_
# these changes are related to hardcoded path to include .h files
add_definitions(-DCMAKER)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O2")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O3")
set(GIT_BRANCH "UNKNOWN")
......@@ -949,6 +949,7 @@ set(PHY_SRC
${OPENAIR1_DIR}/PHY/CODING/crc_byte.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
${OPENAIR1_DIR}/PHY/CODING/lte_rate_matching.c
${OPENAIR1_DIR}/PHY/CODING/rate_matching.c
${OPENAIR1_DIR}/PHY/CODING/viterbi.c
......
This diff is collapsed.
This diff is collapsed.
......@@ -5,7 +5,7 @@ RATE12CC_SRC = ccoding_byte.c viterbi.c crc_byte.c
all: 3gpplte_sse
3gpplte_sse: $(TURBO_SRC)
gcc -o 3gpplte_sse 3gpplte_sse.c -msse4 -Wall -g -ggdb -DMAIN
gcc -o 3gpplte_sse 3gpplte_sse.c -msse4 -Wall -g -ggdb -DTC_MAIN -I../..
......
......@@ -483,6 +483,24 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
time_stats_t *intl1_stats,
time_stats_t *intl2_stats);
uint8_t phy_threegpplte_turbo_decoder16avx2(int16_t *y,
int16_t *y2,
uint8_t *decoded_bytes,
uint8_t *decoded_bytes2,
uint16_t n,
uint16_t interleaver_f1,
uint16_t interleaver_f2,
uint8_t max_iterations,
uint8_t crc_type,
uint8_t F,
time_stats_t *init_stats,
time_stats_t *alpha_stats,
time_stats_t *beta_stats,
time_stats_t *gamma_stats,
time_stats_t *ext_stats,
time_stats_t *intl1_stats,
time_stats_t *intl2_stats);
/*!
\brief This routine performs max-logmap detection for the 3GPP turbo code (with termination). It is optimized for SIMD processing and 8-bit
LLR arithmetic, and requires SSE2,SSSE3 and SSE4.1 (gcc >=4.3 and appropriate CPU)
......
......@@ -888,12 +888,16 @@ void phy_init_lte_top(LTE_DL_FRAME_PARMS *lte_frame_parms)
ccodelte_init();
ccodelte_init_inv();
treillis_table_init();
phy_generate_viterbi_tables();
phy_generate_viterbi_tables_lte();
init_td8();
init_td16();
#ifdef __AVX2__
init_td16avx2();
#endif
lte_sync_time_init(lte_frame_parms);
......
This diff is collapsed.
......@@ -203,8 +203,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
multadd_complex_vector_real_scalar(dl_ch-(phy_vars_ue->lte_frame_parms.ofdm_symbol_size<<1),
phy_vars_ue->ch_est_alpha,dl_ch-(phy_vars_ue->lte_frame_parms.ofdm_symbol_size<<1),
1,phy_vars_ue->lte_frame_parms.ofdm_symbol_size);
#ifdef DEBUG_CH
printf("k %d, first_carrier %d\n",k,phy_vars_ue->lte_frame_parms.first_carrier_offset);
#endif
if ((phy_vars_ue->lte_frame_parms.N_RB_DL==6) ||
(phy_vars_ue->lte_frame_parms.N_RB_DL==50) ||
(phy_vars_ue->lte_frame_parms.N_RB_DL==100)) {
......@@ -213,7 +214,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
// Treat first 2 pilots specially (left edge)
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
// printf("pilot 0 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#ifdef DEBUG_CH
printf("pilot 0 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(fl,
ch,
dl_ch,
......@@ -224,7 +227,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
// printf("pilot 1 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#ifdef DEBUG_CH
printf("pilot 1 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f2l2,
ch,
dl_ch,
......@@ -235,15 +240,13 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
for (pilot_cnt=2; pilot_cnt<((phy_vars_ue->lte_frame_parms.N_RB_DL)-1); pilot_cnt+=2) {
// printf("%d\n",dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]);
// printf("pilot[%d][%d] (%d,%d)\n",p,pilot_cnt,pil[0],pil[1]);
// printf("rx[%d] -> (%d,%d)\n", k, rxF[0], rxF[1]);
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); //Re
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); //Im
// printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]);
#ifdef DEBUG_CH
printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f,
ch,
dl_ch,
......@@ -254,13 +257,11 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
rxF+=12;
dl_ch+=8;
// printf("pilot[%d][%d] (%d,%d)\n",p,rb,pil[0],pil[1]);
// printf("rx[%d] -> (%d,%d)\n", k+6, rxF[0], rxF[1]);
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
// printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]);
#ifdef DEBUG_CH
printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f2,
ch,
dl_ch,
......@@ -281,15 +282,17 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
rxF = (int16_t *)&rxdataF[aarx][((symbol_offset+1+k))];
#ifdef DEBUG_CH
printf("second half k %d\n",k);
#endif
for (pilot_cnt=0; pilot_cnt<((phy_vars_ue->lte_frame_parms.N_RB_DL)-3); pilot_cnt+=2) {
// printf("pilot[%d][%d] (%d,%d)\n",p,pilot_cnt,pil[0],pil[1]);
// printf("rx[%d] -> (%d,%d)\n", k+6, rxF[0], rxF[1]);
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
// printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]);
#ifdef DEBUG_CH
printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f,
ch,
dl_ch,
......@@ -300,8 +303,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
// printf("**rb %d %d\n",rb,dl_ch-(int16_T *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]);
#ifdef DEBUG_CH
printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f2,
ch,
dl_ch,
......@@ -314,8 +318,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
// printf("pilot 49: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#ifdef DEBUG_CH
printf("pilot %d: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(fr,
ch,
dl_ch,
......@@ -326,7 +331,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
// printf("pilot 50: rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#ifdef DEBUG_CH
printf("pilot %d: rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f2r2,
ch,
dl_ch,
......
......@@ -52,8 +52,8 @@
int* sync_corr_ue0 = NULL;
int* sync_corr_ue1 = NULL;
int* sync_corr_ue2 = NULL;
int sync_tmp[2048*4] __attribute__((aligned(16)));
short syncF_tmp[2048*2] __attribute__((aligned(16)));
int sync_tmp[2048*4] __attribute__((aligned(32)));
short syncF_tmp[2048*2] __attribute__((aligned(32)));
......
......@@ -56,8 +56,8 @@ void lte_sync_timefreq(PHY_VARS_UE *ue,int band,unsigned int DL_freq)
{
#if defined(__x86_64__) || defined(__i386__)
UE_SCAN_INFO_t *scan_info = &ue->scan_info[band];
int16_t spectrum[12288] __attribute__((aligned(16)));
int16_t spectrum_p5ms[12288] __attribute__((aligned(16)));
int16_t spectrum[12288] __attribute__((aligned(32)));
int16_t spectrum_p5ms[12288] __attribute__((aligned(32)));
int i,f,band_idx;
__m128i autocorr0[256/4],autocorr1[256/4],autocorr2[256/4];
__m128i autocorr0_t[256/4],autocorr1_t[256/4],autocorr2_t[256/4];
......
......@@ -61,21 +61,18 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14]
x2 = Ncp +
(Nid_cell<<1) +
(((1+(Nid_cell<<1))*(1 + (((frame_parms->Ncp==0)?4:3)*l) + (7*(1+ns))))<<10); //cinit
(((1+(Nid_cell<<1))*(1 + (((frame_parms->Ncp==0)?4:3)*l) + (7*(1+ns))))<<10); //cinit
//x2 = frame_parms->Ncp + (Nid_cell<<1) + (1+(Nid_cell<<1))*(1 + (3*l) + (7*(1+ns))); //cinit
//n = 0
// printf("cinit (ns %d, l %d) => %d\n",ns,l,x2);
x1 = 1+ (1<<31);
x2=x2 ^ ((x2 ^ (x2>>1) ^ (x2>>2) ^ (x2>>3))<<31);
// skip first 50 double words (1600 bits)
//printf("n=0 : x1 %x, x2 %x\n",x1,x2);
for (n=1; n<50; n++) {
x1 = (x1>>1) ^ (x1>>4);
x1 = x1 ^ (x1<<31) ^ (x1<<28);
x2 = (x2>>1) ^ (x2>>2) ^ (x2>>3) ^ (x2>>4);
x2 = x2 ^ (x2<<31) ^ (x2<<30) ^ (x2<<29) ^ (x2<<28);
// printf("x1 : %x, x2 : %x\n",x1,x2);
}
for (n=0; n<14; n++) {
......@@ -84,7 +81,6 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14]
x2 = (x2>>1) ^ (x2>>2) ^ (x2>>3) ^ (x2>>4);
x2 = x2 ^ (x2<<31) ^ (x2<<30) ^ (x2<<29) ^ (x2<<28);
lte_gold_table[ns][l][n] = x1^x2;
// printf("n=%d : c %x\n",n,x1^x2);
}
}
......
......@@ -117,7 +117,7 @@ LTE_UE_DLSCH_t *new_ue_dlsch(uint8_t Kmimo,uint8_t Mdlharq,uint32_t Nsoft,uint8_
dlsch->max_turbo_iterations = max_turbo_iterations;
for (i=0; i<Mdlharq; i++) {
// msg("new_ue_dlsch: Harq process %d\n",i);
// printf("new_ue_dlsch: Harq process %d\n",i);
dlsch->harq_processes[i] = (LTE_DL_UE_HARQ_t *)malloc16(sizeof(LTE_DL_UE_HARQ_t));
if (dlsch->harq_processes[i]) {
......@@ -156,7 +156,7 @@ LTE_UE_DLSCH_t *new_ue_dlsch(uint8_t Kmimo,uint8_t Mdlharq,uint32_t Nsoft,uint8_
return(dlsch);
}
msg("new_ue_dlsch with size %zu: exit_flag = %u\n",sizeof(LTE_DL_UE_HARQ_t), exit_flag);
printf("new_ue_dlsch with size %zu: exit_flag = %u\n",sizeof(LTE_DL_UE_HARQ_t), exit_flag);
free_ue_dlsch(dlsch);
return(NULL);
......@@ -187,6 +187,27 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
uint8_t crc_type;
#ifdef DEBUG_DLSCH_DECODING
uint16_t i;
#endif
#ifdef __AVX2__
int Kr_last,skipped_last=0;
uint8_t (*tc_2cw)(int16_t *y,
int16_t *y2,
uint8_t *,
uint8_t *,
uint16_t,
uint16_t,
uint16_t,
uint8_t,
uint8_t,
uint8_t,
time_stats_t *,
time_stats_t *,
time_stats_t *,
time_stats_t *,
time_stats_t *,
time_stats_t *,
time_stats_t *);
#endif
uint8_t (*tc)(int16_t *y,
uint8_t *,
......@@ -204,28 +225,35 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
time_stats_t *,
time_stats_t *);
if (!dlsch_llr) {
msg("dlsch_decoding.c: NULL dlsch_llr pointer\n");
printf("dlsch_decoding.c: NULL dlsch_llr pointer\n");
return(dlsch->max_turbo_iterations);
}
if (!harq_process) {
msg("dlsch_decoding.c: NULL harq_process pointer\n");
printf("dlsch_decoding.c: NULL harq_process pointer\n");
return(dlsch->max_turbo_iterations);
}
if (!frame_parms) {
msg("dlsch_decoding.c: NULL frame_parms pointer\n");
printf("dlsch_decoding.c: NULL frame_parms pointer\n");
return(dlsch->max_turbo_iterations);
}
if (subframe>9) {
msg("dlsch_decoding.c: Illegal subframe index %d\n",subframe);
printf("dlsch_decoding.c: Illegal subframe index %d\n",subframe);
return(dlsch->max_turbo_iterations);
}
if (llr8_flag == 0)
if (llr8_flag == 0) {
#ifdef __AVX2__
tc_2cw = phy_threegpplte_turbo_decoder16avx2;
#endif
tc = phy_threegpplte_turbo_decoder16;
}
else
tc = phy_threegpplte_turbo_decoder8;
......@@ -233,13 +261,13 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
/*
if (nb_rb > frame_parms->N_RB_DL) {
msg("dlsch_decoding.c: Illegal nb_rb %d\n",nb_rb);
printf("dlsch_decoding.c: Illegal nb_rb %d\n",nb_rb);
return(max_turbo_iterations);
}*/
/*harq_pid = dlsch->current_harq_pid;
if (harq_pid >= 8) {
msg("dlsch_decoding.c: Illegal harq_pid %d\n",harq_pid);
printf("dlsch_decoding.c: Illegal harq_pid %d\n",harq_pid);
return(max_turbo_iterations);
}
*/
......@@ -254,7 +282,7 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
G = harq_process->G;
//get_G(frame_parms,nb_rb,dlsch->rb_alloc,mod_order,num_pdcch_symbols,phy_vars_ue->frame,subframe);
// msg("DLSCH Decoding, harq_pid %d Ndi %d\n",harq_pid,harq_process->Ndi);
// printf("DLSCH Decoding, harq_pid %d Ndi %d\n",harq_pid,harq_process->Ndi);
if (harq_process->round == 0) {
// This is a new packet, so compute quantities regarding segmentation
......@@ -273,7 +301,7 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
/*
else {
msg("dlsch_decoding.c: Ndi>0 not checked yet!!\n");
printf("dlsch_decoding.c: Ndi>0 not checked yet!!\n");
return(max_turbo_iterations);
}
*/
......@@ -300,10 +328,14 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
break;
}
if (harq_process->C >= MAX_NUM_DLSCH_SEGMENTS/bw_scaling) {
if (harq_process->C > MAX_NUM_DLSCH_SEGMENTS/bw_scaling) {
LOG_E(PHY,"Illegal harq_process->C %d > %d\n",harq_process->C,MAX_NUM_DLSCH_SEGMENTS/bw_scaling);
return((1+dlsch->max_turbo_iterations));
}
#ifdef DEBUG_DLSCH_DECODING
printf("Segmentation: C %d, Cminus %d, Kminus %d, Kplus %d\n",harq_process->C,harq_process->Cminus,harq_process->Kminus,harq_process->Kplus);
#endif
for (r=0; r<harq_process->C; r++) {
......@@ -324,7 +356,7 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
else if (Kr_bytes <= 768)
iind = 123 + ((Kr_bytes-256)>>3);
else {
msg("dlsch_decoding: Illegal codeword size %d!!!\n",Kr_bytes);
printf("dlsch_decoding: Illegal codeword size %d!!!\n",Kr_bytes);
return(dlsch->max_turbo_iterations);
}
......@@ -418,15 +450,12 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
printf("\n");
*/
//#ifndef __AVX2__
#if 1
if (err_flag == 0) {
start_meas(dlsch_turbo_decoding_stats);
#ifdef TURBO_S
ret = phy_threegpplte_turbo_decoder_scalar
#else
ret = tc
#endif
(&harq_process->d[r][96],
harq_process->c[r],
Kr,
......@@ -446,7 +475,130 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
stop_meas(dlsch_turbo_decoding_stats);
}
#else
if ((harq_process->C == 1) ||
((r==harq_process->C-1) && (skipped_last==0))) { // last segment with odd number of segments
start_meas(dlsch_turbo_decoding_stats);
ret = tc
(&harq_process->d[r][96],
harq_process->c[r],
Kr,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
stop_meas(dlsch_turbo_decoding_stats);
// printf("single decode, exit\n");
// exit(-1);
}
else {
// we can merge code segments
if ((skipped_last == 0) && (r<harq_process->C-1)) {
skipped_last = 1;
Kr_last = Kr;
}
else {
skipped_last=0;
if (Kr_last == Kr) { // decode 2 code segments with AVX2 version
#ifdef DEBUG_DLSCH_DECODING
printf("single decoding segment %d (%p)\n",r-1,&harq_process->d[r-1][96]);
#endif
start_meas(dlsch_turbo_decoding_stats);
#ifdef DEBUG_DLSCH_DECODING
printf("double decoding segments %d,%d (%p,%p)\n",r-1,r,&harq_process->d[r-1][96],&harq_process->d[r][96]);
#endif
ret = tc_2cw
(&harq_process->d[r-1][96],
&harq_process->d[r][96],
harq_process->c[r-1],
harq_process->c[r],
Kr,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
/*
ret = tc
(&harq_process->d[r-1][96],
harq_process->c[r-1],
Kr_last,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
exit(-1);*/
stop_meas(dlsch_turbo_decoding_stats);
}
else { // Kr_last != Kr
start_meas(dlsch_turbo_decoding_stats);
ret = tc
(&harq_process->d[r-1][96],
harq_process->c[r-1],
Kr_last,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
stop_meas(dlsch_turbo_decoding_stats);
start_meas(dlsch_turbo_decoding_stats);
ret = tc
(&harq_process->d[r][96],
harq_process->c[r],
Kr,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
stop_meas(dlsch_turbo_decoding_stats);
}
}
}
#endif
if ((err_flag == 0) && (ret>=(1+dlsch->max_turbo_iterations))) {// a Code segment is in error so break;
......
......@@ -1898,17 +1898,17 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms,
for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {
dl_ch0_128 = (__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12];
dl_ch1_128 = (__m128i *)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12];
dl_ch0_128 = (__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; // hr,0
dl_ch1_128 = (__m128i *)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; // hr,1
dl_ch_mag0_128 = (__m128i *)&dl_ch_mag0[aarx][symbol*frame_parms->N_RB_DL*12];
dl_ch_mag0_128b = (__m128i *)&dl_ch_magb0[aarx][symbol*frame_parms->N_RB_DL*12];
dl_ch_mag1_128 = (__m128i *)&dl_ch_mag1[aarx][symbol*frame_parms->N_RB_DL*12];
dl_ch_mag1_128b = (__m128i *)&dl_ch_magb1[aarx][symbol*frame_parms->N_RB_DL*12];
rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12];
rxdataF_comp0_128 = (__m128i *)&rxdataF_comp0[aarx][symbol*frame_parms->N_RB_DL*12];
rxdataF_comp1_128 = (__m128i *)&rxdataF_comp1[aarx][symbol*frame_parms->N_RB_DL*12];
rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; // yr
rxdataF_comp0_128 = (__m128i *)&rxdataF_comp0[aarx][symbol*frame_parms->N_RB_DL*12]; // yr,0 = yr * conj(hr,0)
rxdataF_comp1_128 = (__m128i *)&rxdataF_comp1[aarx][symbol*frame_parms->N_RB_DL*12]; // yr,1 = yr * conj(hr,1)
for (rb=0; rb<nb_rb; rb++) {
......
......@@ -608,7 +608,7 @@ int32_t generate_prach( PHY_VARS_UE *phy_vars_ue, uint8_t eNB_id, uint8_t subfra
uint8_t preamble_index = phy_vars_ue->prach_resources[eNB_id]->ra_PreambleIndex;
uint8_t tdd_mapindex = phy_vars_ue->prach_resources[eNB_id]->ra_TDD_map_index;
int16_t *prachF = phy_vars_ue->lte_ue_prach_vars[eNB_id]->prachF;
static int16_t prach_tmp[45600*2] __attribute__((aligned(16)));
static int16_t prach_tmp[45600*2] __attribute__((aligned(32)));
int16_t *prach = prach_tmp;
int16_t *prach2;
int16_t amp = phy_vars_ue->lte_ue_prach_vars[eNB_id]->amp;
......
......@@ -91,7 +91,7 @@ void PHY_ofdm_mod(int *input, /// pointer to complex input
)
{
static short temp[2048*4] __attribute__((aligned(16)));
static short temp[2048*4] __attribute__((aligned(32)));
unsigned short i,j;
short k;
......@@ -143,9 +143,18 @@ void PHY_ofdm_mod(int *input, /// pointer to complex input
printf("[PHY] symbol %d/%d offset %d (%p,%p -> %p)\n",i,nb_symbols,i*fftsize+(i*nb_prefix_samples),input,&input[i*fftsize],&output[(i*fftsize) + ((i)*nb_prefix_samples)]);
#endif
#ifndef __AVX2__
// handle 128-bit alignment for 128-bit SIMD (SSE4,NEON,AltiVEC)
idft((int16_t *)&input[i*fftsize],
(fftsize==128) ? (int16_t *)temp : (int16_t *)&output[(i*fftsize) + ((1+i)*nb_prefix_samples)],
1);
#else
// on AVX2 need 256-bit alignment
idft((int16_t *)&input[i*fftsize],
(fftsize<=512) ? (int16_t *)temp : (int16_t *)&output[(i*fftsize) + ((1+i)*nb_prefix_samples)],
1);
#endif
// Copy to frame buffer with Cyclic Extension
// Note: will have to adjust for synchronization offset!
......@@ -158,7 +167,12 @@ void PHY_ofdm_mod(int *input, /// pointer to complex input
// msg("Doing cyclic prefix method\n");
if (fftsize==128) {
#ifndef __AVX2__
if (fftsize==128)
#else
if (fftsize<=512)
#endif
{
for (j=0; j<fftsize ; j++) {
output_ptr[j] = temp_ptr[j];
}
......
......@@ -56,7 +56,7 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,
unsigned int rx_offset;
void (*dft)(int16_t *,int16_t *, int);
int tmp_dft_in[256]; // This is for misalignment issues for 6 and 15 PRBs
int tmp_dft_in[2048]; // This is for misalignment issues for 6 and 15 PRBs
switch (frame_parms->ofdm_symbol_size) {
case 128:
......@@ -115,8 +115,8 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,
memset(&ue_common_vars->rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],0,frame_parms->ofdm_symbol_size*sizeof(int));
rx_offset = sample_offset + slot_offset + nb_prefix_samples0 + subframe_offset - SOFFSET;
// Align with 128 bit
rx_offset = rx_offset - rx_offset % 4;
// Align with 256 bit
// rx_offset = rx_offset&0xfffffff8;
#ifdef DEBUG_FEP
// if (phy_vars_ue->frame <100)
......@@ -131,9 +131,9 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,
(short *)&ue_common_vars->rxdata[aa][0],
frame_parms->ofdm_symbol_size*sizeof(int));
if ((rx_offset&3)!=0) { // if input to dft is not 128-bit aligned, issue for size 6 and 15 PRBs
if ((rx_offset&7)!=0) { // if input to dft is not 256-bit aligned, issue for size 6,15 and 25 PRBs
memcpy((void *)tmp_dft_in,
(void *)&ue_common_vars->rxdata[aa][(rx_offset-nb_prefix_samples0) % frame_length_samples],
(void *)&ue_common_vars->rxdata[aa][rx_offset % frame_length_samples],
frame_parms->ofdm_symbol_size*sizeof(int));
dft((int16_t *)tmp_dft_in,
(int16_t *)&ue_common_vars->rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1);
......@@ -146,8 +146,8 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,
}
} else {
rx_offset += (frame_parms->ofdm_symbol_size+nb_prefix_samples) +
(frame_parms->ofdm_symbol_size+nb_prefix_samples)*(l-1);
rx_offset += (frame_parms->ofdm_symbol_size+nb_prefix_samples)*l;// +
// (frame_parms->ofdm_symbol_size+nb_prefix_samples)*(l-1);
#ifdef DEBUG_FEP
// if (phy_vars_ue->frame <100)
......@@ -162,7 +162,7 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,
start_meas(&phy_vars_ue->rx_dft_stats);
if ((rx_offset&3)!=0) { // if input to dft is not 128-bit aligned, issue for size 6 and 15 PRBs
if ((rx_offset&7)!=0) { // if input to dft is not 128-bit aligned, issue for size 6 and 15 PRBs
memcpy((void *)tmp_dft_in,
(void *)&ue_common_vars->rxdata[aa][(rx_offset) % frame_length_samples],
frame_parms->ofdm_symbol_size*sizeof(int));
......
lte_dfts: lte_dfts.c
gcc -O2 -mavx2 -g -ggdb -o lte_dfts lte_dfts.c time_meas.c file_output.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR2_DIR/COMMON -DUSER_MODE -DMR_MAIN -DNB_ANTENNAS_RX=1 # -DD256STATS #-DD64STATS
lte_dfts_sse4: lte_dfts.c
gcc -O2 -msse4.1 -g -ggdb -o lte_dfts_sse4 lte_dfts.c time_meas.c file_output.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR2_DIR/COMMON -DUSER_MODE -DMR_MAIN -DNB_ANTENNAS_RX=1 # -DD256STATS #-DD64STATS
lte_dfts.s: lte_dfts.c
lte_dfts_avx2: lte_dfts.c
gcc -O2 -mavx2 -g -ggdb -o lte_dfts_avx2 lte_dfts.c time_meas.c file_output.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR2_DIR/COMMON -DUSER_MODE -DMR_MAIN -DNB_ANTENNAS_RX=1 # -DD256STATS #-DD64STATS
lte_dfts_avx2.s: lte_dfts.c
gcc -O2 -mavx2 -S lte_dfts.c time_meas.c file_output.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR2_DIR/COMMON -DUSER_MODE -DMR_MAIN -DNB_ANTENNAS_RX=1 # -DD256STATS #-DD64STATS
dft_cycles: lte_dfts
./lte_dfts | egrep cycles
lte_dfts_sse4.s: lte_dfts.c
gcc -O2 -msse4.1 -S lte_dfts.c time_meas.c file_output.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR2_DIR/COMMON -DUSER_MODE -DMR_MAIN -DNB_ANTENNAS_RX=1 # -DD256STATS #-DD64STATS
dft_cycles_avx2: lte_dfts_avx2