Commit 8d4405bd authored by knopp

Added an AVX2-optimized turbo decoder for 16-bit LLRs. This decoder parallelizes...

Added an AVX2-optimized turbo decoder for 16-bit LLRs. This decoder parallelizes by decoding two code segments concurrently. It requires updates to dlsch_decoding.c to identify when the new parallel version can be used. Other minor changes relate to memory allocations for future AVX2 optimizations (32-byte alignment).
parent 27b1707e
......@@ -134,7 +134,7 @@ else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2")
endif()
if (CPUINFO MATCHES "sse4_2")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.2")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2 -msse4.2")
endif()
if (CPUINFO MATCHES "sse4_1")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.1")
......@@ -168,7 +168,7 @@ set(CMAKE_CXX_FLAGS
# these changes are related to hardcoded path to include .h files
add_definitions(-DCMAKER)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O2")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O3")
# Below has been put in comment because does not work with
# SVN authentication.
......@@ -840,6 +840,7 @@ set(PHY_SRC
${OPENAIR1_DIR}/PHY/CODING/crc_byte.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
${OPENAIR1_DIR}/PHY/CODING/lte_rate_matching.c
${OPENAIR1_DIR}/PHY/CODING/rate_matching.c
${OPENAIR1_DIR}/PHY/CODING/viterbi.c
......
This diff is collapsed.
......@@ -483,6 +483,24 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
time_stats_t *intl1_stats,
time_stats_t *intl2_stats);
/*!
\brief AVX2 variant of the 16-bit LLR turbo decoder that decodes two code segments in a single call.
Both segments share the same block length and interleaver parameters; the caller in dlsch_decoding.c
only selects this routine when the two consecutive segments have equal size.
\param y 16-bit input for the first code segment
\param y2 16-bit input for the second code segment
\param decoded_bytes Buffer associated with the first code segment (presumably receives its decoded bytes -- confirm against the implementation)
\param decoded_bytes2 Buffer associated with the second code segment (presumably receives its decoded bytes -- confirm against the implementation)
\param n Block length, common to both code segments
\param interleaver_f1 First interleaver parameter (the caller takes it from f1f2mat_old[2*iind])
\param interleaver_f2 Second interleaver parameter (the caller takes it from f1f2mat_old[2*iind+1])
\param max_iterations Maximum number of turbo iterations
\param crc_type CRC type selector passed through from the caller
\param F Caller passes harq_process->F or 0 (presumably the number of filler bits -- TODO confirm)
\param init_stats Timing statistics for the init stage
\param alpha_stats Timing statistics for the alpha stage
\param beta_stats Timing statistics for the beta stage
\param gamma_stats Timing statistics for the gamma stage
\param ext_stats Timing statistics for the ext stage
\param intl1_stats Timing statistics for the intl1 stage
\param intl2_stats Timing statistics for the intl2 stage
\returns Status value; the caller treats a result >= 1+max_iterations as a code segment in error
(presumably the number of iterations used otherwise -- confirm against the implementation)
*/
uint8_t phy_threegpplte_turbo_decoder16avx2(int16_t *y,
int16_t *y2,
uint8_t *decoded_bytes,
uint8_t *decoded_bytes2,
uint16_t n,
uint16_t interleaver_f1,
uint16_t interleaver_f2,
uint8_t max_iterations,
uint8_t crc_type,
uint8_t F,
time_stats_t *init_stats,
time_stats_t *alpha_stats,
time_stats_t *beta_stats,
time_stats_t *gamma_stats,
time_stats_t *ext_stats,
time_stats_t *intl1_stats,
time_stats_t *intl2_stats);
/*!
\brief This routine performs max-logmap detection for the 3GPP turbo code (with termination). It is optimized for SIMD processing and 8-bit
LLR arithmetic, and requires SSE2,SSSE3 and SSE4.1 (gcc >=4.3 and appropriate CPU)
......
......@@ -895,7 +895,9 @@ void phy_init_lte_top(LTE_DL_FRAME_PARMS *lte_frame_parms)
init_td8();
init_td16();
#ifdef __AVX2__
init_td16avx2();
#endif
lte_sync_time_init(lte_frame_parms);
......
This diff is collapsed.
......@@ -52,8 +52,8 @@
int* sync_corr_ue0 = NULL;
int* sync_corr_ue1 = NULL;
int* sync_corr_ue2 = NULL;
int sync_tmp[2048*4] __attribute__((aligned(16)));
short syncF_tmp[2048*2] __attribute__((aligned(16)));
int sync_tmp[2048*4] __attribute__((aligned(32)));
short syncF_tmp[2048*2] __attribute__((aligned(32)));
......
......@@ -56,8 +56,8 @@ void lte_sync_timefreq(PHY_VARS_UE *ue,int band,unsigned int DL_freq)
{
#if defined(__x86_64__) || defined(__i386__)
UE_SCAN_INFO_t *scan_info = &ue->scan_info[band];
int16_t spectrum[12288] __attribute__((aligned(16)));
int16_t spectrum_p5ms[12288] __attribute__((aligned(16)));
int16_t spectrum[12288] __attribute__((aligned(32)));
int16_t spectrum_p5ms[12288] __attribute__((aligned(32)));
int i,f,band_idx;
__m128i autocorr0[256/4],autocorr1[256/4],autocorr2[256/4];
__m128i autocorr0_t[256/4],autocorr1_t[256/4],autocorr2_t[256/4];
......
......@@ -186,6 +186,27 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
uint8_t crc_type;
#ifdef DEBUG_DLSCH_DECODING
uint16_t i;
#endif
#ifdef __AVX2__
int Kr_last,skipped_last=0;
uint8_t (*tc_2cw)(int16_t *y,
int16_t *y2,
uint8_t *,
uint8_t *,
uint16_t,
uint16_t,
uint16_t,
uint8_t,
uint8_t,
uint8_t,
time_stats_t *,
time_stats_t *,
time_stats_t *,
time_stats_t *,
time_stats_t *,
time_stats_t *,
time_stats_t *);
#endif
uint8_t (*tc)(int16_t *y,
uint8_t *,
......@@ -203,6 +224,9 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
time_stats_t *,
time_stats_t *);
if (!dlsch_llr) {
printf("dlsch_decoding.c: NULL dlsch_llr pointer\n");
return(dlsch->max_turbo_iterations);
......@@ -223,8 +247,12 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
return(dlsch->max_turbo_iterations);
}
if (llr8_flag == 0)
if (llr8_flag == 0) {
#ifdef __AVX2__
tc_2cw = phy_threegpplte_turbo_decoder16avx2;
#endif
tc = phy_threegpplte_turbo_decoder16;
}
else
tc = phy_threegpplte_turbo_decoder8;
......@@ -300,6 +328,10 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
LOG_E(PHY,"Illegal harq_process->C %d > %d\n",harq_process->C,MAX_NUM_DLSCH_SEGMENTS/bw_scaling);
return((1+dlsch->max_turbo_iterations));
}
#ifdef DEBUG_DLSCH_DECODING
printf("Segmentation: C %d, Cminus %d, Kminus %d, Kplus %d\n",harq_process->C,harq_process->Cminus,harq_process->Kminus,harq_process->Kplus);
#endif
for (r=0; r<harq_process->C; r++) {
......@@ -414,15 +446,11 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
printf("\n");
*/
#ifndef __AVX2__
if (err_flag == 0) {
start_meas(dlsch_turbo_decoding_stats);
#ifdef TURBO_S
ret = phy_threegpplte_turbo_decoder_scalar
#else
ret = tc
#endif
(&harq_process->d[r][96],
harq_process->c[r],
Kr,
......@@ -442,7 +470,130 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
stop_meas(dlsch_turbo_decoding_stats);
}
#else
if ((harq_process->C == 1) ||
((r==harq_process->C-1) && (skipped_last==0))) { // last segment with odd number of segments
start_meas(dlsch_turbo_decoding_stats);
ret = tc
(&harq_process->d[r][96],
harq_process->c[r],
Kr,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
stop_meas(dlsch_turbo_decoding_stats);
// printf("single decode, exit\n");
// exit(-1);
}
else {
// we can merge code segments
if ((skipped_last == 0) && (r<harq_process->C-1)) {
skipped_last = 1;
Kr_last = Kr;
}
else {
skipped_last=0;
if (Kr_last == Kr) { // decode 2 code segments with AVX2 version
#ifdef DEBUG_DLSCH_DECODING
printf("single decoding segment %d (%p)\n",r-1,&harq_process->d[r-1][96]);
#endif
start_meas(dlsch_turbo_decoding_stats);
#ifdef DEBUG_DLSCH_DECODING
printf("double decoding segments %d,%d (%p,%p)\n",r-1,r,&harq_process->d[r-1][96],&harq_process->d[r][96]);
#endif
ret = tc_2cw
(&harq_process->d[r-1][96],
&harq_process->d[r][96],
harq_process->c[r-1],
harq_process->c[r],
Kr,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
/*
ret = tc
(&harq_process->d[r-1][96],
harq_process->c[r-1],
Kr_last,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
exit(-1);*/
stop_meas(dlsch_turbo_decoding_stats);
}
else { // Kr_last != Kr
start_meas(dlsch_turbo_decoding_stats);
ret = tc
(&harq_process->d[r-1][96],
harq_process->c[r-1],
Kr_last,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
stop_meas(dlsch_turbo_decoding_stats);
start_meas(dlsch_turbo_decoding_stats);
ret = tc
(&harq_process->d[r][96],
harq_process->c[r],
Kr,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
stop_meas(dlsch_turbo_decoding_stats);
}
}
}
#endif
if ((err_flag == 0) && (ret>=(1+dlsch->max_turbo_iterations))) {// a Code segment is in error so break;
......
......@@ -79,9 +79,9 @@ extern int exit_openair;
//extern void do_OFDM_mod(mod_sym_t **txdataF, int32_t **txdata, uint32_t frame, uint16_t next_slot, LTE_DL_FRAME_PARMS *frame_parms);
unsigned char dlsch_input_buffer[2700] __attribute__ ((aligned(16)));
int eNB_sync_buffer0[640*6] __attribute__ ((aligned(16)));
int eNB_sync_buffer1[640*6] __attribute__ ((aligned(16)));
unsigned char dlsch_input_buffer[2700] __attribute__ ((aligned(32)));
int eNB_sync_buffer0[640*6] __attribute__ ((aligned(32)));
int eNB_sync_buffer1[640*6] __attribute__ ((aligned(32)));
int *eNB_sync_buffer[2] = {eNB_sync_buffer0, eNB_sync_buffer1};
extern uint16_t hundred_times_log10_NPRB[100];
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment