diff --git a/executables/lte-ru.c b/executables/lte-ru.c index 326e55b2887ac3b5f55843b8dba5c9b50c535a06..7b291aa611f67313c83f67ec9e1b958cb560bc58 100644 --- a/executables/lte-ru.c +++ b/executables/lte-ru.c @@ -1815,6 +1815,12 @@ static void *ru_thread( void *param ) { // do RX front-end processing (frequency-shift, dft) if needed if (ru->feprx) ru->feprx(ru, proc->tti_rx); + if (ru->dft_in_levdB==-1) { + int sigenergy=0; + for (int aa=0;aa<ru->nb_rx;aa++) + sigenergy += signal_energy(ru->common.rxdata[aa]+proc->tti_rx*ru->frame_parms->samples_per_tti,2048); + ru->dft_in_levdB = dB_fixed(sigenergy)+30; + } // wakeup all eNB processes waiting for this RU AssertFatal((ret=pthread_mutex_lock(&proc->mutex_eNBs))==0,"mutex_lock returns %d\n",ret); diff --git a/executables/nr-ru.c b/executables/nr-ru.c index 90d7e235dec71a2ffd77f78aafca1ad07e05e9fa..f7bb294f16c5bbc82e2f4074e6d11fbac5861782 100644 --- a/executables/nr-ru.c +++ b/executables/nr-ru.c @@ -1320,6 +1320,12 @@ void *ru_thread(void *param) // set the tti that was generated to busy rx_tti_busy[proc->tti_rx % RU_RX_SLOT_DEPTH] = true; ru->feprx(ru,proc->tti_rx); + if (ru->dft_in_levdB==-1) { + int sigenergy=0; + for (int aa=0;aa<ru->nb_rx;aa++) + sigenergy += signal_energy(ru->common.rxdata[aa]+fp->get_samples_slot_timestamp(proc->tti_rx,fp,0),2048); + ru->dft_in_levdB = dB_fixed(sigenergy)+40; + } LOG_D(NR_PHY, "Setting %d.%d (%d) to busy\n", proc->frame_rx, proc->tti_rx, proc->tti_rx % RU_RX_SLOT_DEPTH); clock_gettime(CLOCK_MONOTONIC,&ru->rt_ru_profiling.return_RU_feprx[rt_prof_idx]); //LOG_M("rxdata.m","rxs",ru->common.rxdata[0],1228800,1,1); diff --git a/openair1/PHY/INIT/lte_init_ru.c b/openair1/PHY/INIT/lte_init_ru.c index 967d352769a9cf703c0fa0912f3c078233830d81..108eee38b21fd569b1a508d1f891400387d811e4 100644 --- a/openair1/PHY/INIT/lte_init_ru.c +++ b/openair1/PHY/INIT/lte_init_ru.c @@ -181,6 +181,7 @@ int phy_init_RU(RU_t *ru) { } // !=IF5 ru->common.sync_corr = (uint32_t *)malloc16_clear( 
LTE_NUMBER_OF_SUBFRAMES_PER_FRAME*sizeof(uint32_t)*fp->samples_per_tti ); + ru->dft_in_levdB = -1; return(0); } diff --git a/openair1/PHY/INIT/nr_init_ru.c b/openair1/PHY/INIT/nr_init_ru.c index 674c70cdc39eae21dcef00eda29f0196b3b3a6d3..075d2e20877663e8b8008d5f75abad662acca107 100644 --- a/openair1/PHY/INIT/nr_init_ru.c +++ b/openair1/PHY/INIT/nr_init_ru.c @@ -131,6 +131,7 @@ int nr_phy_init_RU(RU_t *ru) init_prach_ru_list(ru); + ru->dft_in_levdB = -1; return(0); } diff --git a/openair1/PHY/LTE_ESTIMATION/lte_adjust_sync_eNB.c b/openair1/PHY/LTE_ESTIMATION/lte_adjust_sync_eNB.c index 137035d0d71b850cf7ca5aaa4196dabd78247bb2..beac833e4f9ce274e7d3da6ce2cb19bb03d66ccc 100644 --- a/openair1/PHY/LTE_ESTIMATION/lte_adjust_sync_eNB.c +++ b/openair1/PHY/LTE_ESTIMATION/lte_adjust_sync_eNB.c @@ -69,7 +69,7 @@ int lte_est_timing_advance(LTE_DL_FRAME_PARMS *frame_parms, break; } if (len) - dft(get_dft(len), (int16_t *)lte_eNB_srs->srs_ch_estimates[aa], (int16_t *)lte_eNB_srs->srs_ch_estimates_time[aa], 1); + dft(get_dft(len), (int16_t *)lte_eNB_srs->srs_ch_estimates[aa], (int16_t *)lte_eNB_srs->srs_ch_estimates_time[aa], get_dft_scaling(len,0)); #ifdef DEBUG_PHY sprintf(fname,"srs_ch_estimates_time_%d%d.m",ind,aa); sprintf(vname,"srs_time_%d%d",ind,aa); diff --git a/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c b/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c index fe995885cb42e35d54a581ade0910068d9302862..d2ce4513bd677b7df30f653942ce44a16f37552a 100644 --- a/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c +++ b/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c @@ -671,7 +671,7 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue, idft(get_idft(s), (int16_t *)&dl_ch_estimates[(p << 1) + aarx][8], (int16_t *)vars->dl_ch_estimates_time[eNB_offset][(p << 1) + aarx], - 1); + get_idft_scaling(s,1)); } } } diff --git a/openair1/PHY/LTE_ESTIMATION/lte_dl_mbsfn_channel_estimation.c b/openair1/PHY/LTE_ESTIMATION/lte_dl_mbsfn_channel_estimation.c index 
90606c63c4659f88bab3b9482bce31390667a9ad..1698e7bf26323a1d229b19cbc75971060d1315c2 100644 --- a/openair1/PHY/LTE_ESTIMATION/lte_dl_mbsfn_channel_estimation.c +++ b/openair1/PHY/LTE_ESTIMATION/lte_dl_mbsfn_channel_estimation.c @@ -749,7 +749,7 @@ int lte_dl_mbsfn_channel_estimation(PHY_VARS_UE *ue, (int16_t *)&tmp[8], (int16_t *)ue->common_vars.common_vars_rx_data_per_thread[ue->current_thread_id[subframe]] .dl_ch_estimates_time[eNB_offset][aa], - 1); + get_idft_scaling(len,1)); } } return(0); @@ -918,7 +918,7 @@ int lte_dl_mbsfn_khz_1dot25_channel_estimation(PHY_VARS_UE *ue, idft(get_idft(len), (int16_t *)&tmp[8], (int16_t *)ue->common_vars.common_vars_rx_data_per_thread[ue->current_thread_id[subframe]].dl_ch_estimates_time[0][aa], - 1); + get_idft_scaling(len,1)); } } return(0); diff --git a/openair1/PHY/LTE_ESTIMATION/lte_sync_time.c b/openair1/PHY/LTE_ESTIMATION/lte_sync_time.c index 4a1980af8c3cf3d06296458d369bbe1f8f58037c..97a63704480a1218d125cfc2966ad36225776603 100644 --- a/openair1/PHY/LTE_ESTIMATION/lte_sync_time.c +++ b/openair1/PHY/LTE_ESTIMATION/lte_sync_time.c @@ -58,7 +58,7 @@ static void doIdft(int size, short *in, short *out) { LOG_E(PHY, "Unknown N_RB_DL %d\n", size); return; } - idft(get_idft(len), in, out, 1); + idft(get_idft(len), in, out, get_idft_scaling(len,0)); } static void copyPrimary( c16_t *out, struct complex16 *in, int ofdmSize) { @@ -205,7 +205,7 @@ int ru_sync_time_init(RU_t *ru) { // LTE_UE_COMMON *common_vars return -1; } idft(get_idft(len), (int16_t *)&dmrsp[0][3 * ru->frame_parms->ofdm_symbol_size], ru->dmrssync, - 1); /// complex output + get_idft_scaling(len,0)); /// complex output return(0); } diff --git a/openair1/PHY/LTE_ESTIMATION/lte_sync_timefreq.c b/openair1/PHY/LTE_ESTIMATION/lte_sync_timefreq.c index bfda00607f8beef42dbbc65164a5b3684ceb5611..34ce9453c60fdeb9e2cfe91f6496e0dc447e70f7 100644 --- a/openair1/PHY/LTE_ESTIMATION/lte_sync_timefreq.c +++ b/openair1/PHY/LTE_ESTIMATION/lte_sync_timefreq.c @@ -79,7 +79,7 @@ void 
lte_sync_timefreq(PHY_VARS_UE *ue,int band,unsigned int DL_freq) //compute frequency-domain representation of 6144-sample chunk dft(DFT_6144,(int16_t *)rxp, - sp,1); + sp,get_dft_scaling(6144,0)); /* @@ -274,7 +274,7 @@ void lte_sync_timefreq(PHY_VARS_UE *ue,int band,unsigned int DL_freq) } // ifft, accumulate energy over two half-frames - idft(IDFT_256,(int16_t*)autocorr0,(int16_t*)tmp_t,1); + idft(IDFT_256,(int16_t*)autocorr0,(int16_t*)tmp_t,get_idft_scaling(256,1)); /* if (i==12288) { sprintf(fname,"corr256F_%d.m",abs(f)); @@ -292,12 +292,12 @@ void lte_sync_timefreq(PHY_VARS_UE *ue,int band,unsigned int DL_freq) for (re=0; re<(256/4); re++) autocorr0_t[re] = simde_mm_add_epi32(autocorr0_t[re], simde_mm_madd_epi16(tmp_t[re], tmp_t[re])); - idft(IDFT_256,(int16_t*)autocorr1,(int16_t*)tmp_t,1); + idft(IDFT_256,(int16_t*)autocorr1,(int16_t*)tmp_t,get_idft_scaling(256,1)); for (re=0; re<(256/4); re++) autocorr1_t[re] = simde_mm_add_epi32(autocorr1_t[re], simde_mm_madd_epi16(tmp_t[re], tmp_t[re])); - idft(IDFT_256,(int16_t*)autocorr2,(int16_t*)tmp_t,1); + idft(IDFT_256,(int16_t*)autocorr2,(int16_t*)tmp_t,get_idft_scaling(256,1)); for (re=0; re<(256/4); re++) autocorr2_t[re] = simde_mm_add_epi32(autocorr2_t[re], simde_mm_madd_epi16(tmp_t[re], tmp_t[re])); diff --git a/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c b/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c index a2c64945344f7e6e30cc23e84ff4e3c6cbd1fa5e..1812ef8d814a8b10ec634e526c2269127ed223de 100644 --- a/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c +++ b/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c @@ -240,7 +240,7 @@ int32_t lte_ul_channel_estimation(LTE_DL_FRAME_PARMS *frame_parms, LOG_E(PHY, "Unknown N_RB_DL %d\n", frame_parms->N_RB_DL); return -1; } - idft(get_idft(len), (int16_t *)temp_in_ifft_0, (int16_t *)ul_ch_estimates_time[aa], 1); + idft(get_idft(len), (int16_t *)temp_in_ifft_0, (int16_t *)ul_ch_estimates_time[aa], get_idft_scaling(len,1)); #if T_TRACER if (aa 
== 0) @@ -511,7 +511,7 @@ int32_t lte_ul_channel_estimation_RRU(LTE_DL_FRAME_PARMS *frame_parms, LOG_E(PHY, "Unknown N_RB_DL %d\n", frame_parms->N_RB_DL); return -1; } - idft(get_idft(len), (int16_t *)temp_in_ifft_0, (int16_t *)ul_ch_estimates_time[aa], 1); + idft(get_idft(len), (int16_t *)temp_in_ifft_0, (int16_t *)ul_ch_estimates_time[aa], get_idft_scaling(len,1)); #if T_TRACER if (aa == 0) diff --git a/openair1/PHY/LTE_TRANSPORT/prach.c b/openair1/PHY/LTE_TRANSPORT/prach.c index 92350194ca6512e2227f377a419677bcf0288d69..d19b8875447f19b6aaae4c222ea730f15bab0a53 100644 --- a/openair1/PHY/LTE_TRANSPORT/prach.c +++ b/openair1/PHY/LTE_TRANSPORT/prach.c @@ -320,9 +320,9 @@ void rx_prach0(PHY_VARS_eNB *eNB, break; } - dft(get_dft(fft_size), prach2, rxsigF[aa], 1); + dft(get_dft(fft_size), prach2, rxsigF[aa], get_dft_scaling(fft_size,ru->dft_in_levdB)); if (prach_fmt > 1 && prach_fmt != 4) - dft(get_dft(fft_size), prach2 + 2 * fft_size, rxsigF[aa] + 2 * fft_size, 1); + dft(get_dft(fft_size), prach2 + 2 * fft_size, rxsigF[aa] + 2 * fft_size, get_dft_scaling(fft_size,ru->dft_in_levdB)); k = (12*n_ra_prb) - 6*fp->N_RB_UL; @@ -529,13 +529,13 @@ void rx_prach0(PHY_VARS_eNB *eNB, // Now do IFFT of size 1024 (N_ZC=839) or 256 (N_ZC=139) if (N_ZC == 839) { log2_ifft_size = 10; - idft(IDFT_1024,(int16_t*)prachF,prach_ifft_tmp,1); + idft(IDFT_1024,(int16_t*)prachF,prach_ifft_tmp,get_idft_scaling(1024,1)); // compute energy and accumulate over receive antennas and repetitions for BR for (i=0; i<2048; i++) prach_ifft[i] += (prach_ifft_tmp[i<<1]*prach_ifft_tmp[i<<1] + prach_ifft_tmp[1+(i<<1)]*prach_ifft_tmp[1+(i<<1)])>>9; } else { - idft(IDFT_256,(int16_t*)prachF,prach_ifft_tmp,1); + idft(IDFT_256,(int16_t*)prachF,prach_ifft_tmp,get_idft_scaling(256,1)); /* scaling schedule is requested for the same size as the transform (256), mirroring the 1024 branch */ log2_ifft_size = 8; // compute energy and accumulate over receive antennas and repetitions for BR diff --git a/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c b/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c index
25068537054b769c1dbd0ba25e870fb8944606cf..4604f752eb84d48edb5bc280ee0d071b76ed5b47 100644 --- a/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c +++ b/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c @@ -130,9 +130,9 @@ void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) { dft_size_idx_t dftsize = get_dft(Msc_PUSCH); switch (Msc_PUSCH) { case 12: - dft(dftsize, (int16_t *)idft_in0, (int16_t *)idft_out0, 0); - dft(dftsize, (int16_t *)idft_in1, (int16_t *)idft_out1, 0); - dft(dftsize, (int16_t *)idft_in2, (int16_t *)idft_out2, 0); + dft(dftsize, (int16_t *)idft_in0, (int16_t *)idft_out0, (uint32_t*)0); + dft(dftsize, (int16_t *)idft_in1, (int16_t *)idft_out1, (uint32_t*)0); + dft(dftsize, (int16_t *)idft_in2, (int16_t *)idft_out2, (uint32_t*)0); norm128 = simde_mm_set1_epi16(9459); for (i=0; i<12; i++) { @@ -144,9 +144,9 @@ void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) { break; default: - dft(dftsize, idft_in0, idft_out0, 1); - dft(dftsize, idft_in1, idft_out1, 1); - dft(dftsize, idft_in2, idft_out2, 1); + dft(dftsize, idft_in0, idft_out0, (uint32_t*)1); + dft(dftsize, idft_in1, idft_out1, (uint32_t*)1); + dft(dftsize, idft_in2, idft_out2, (uint32_t*)1); } for (i=0,ip=0; i<Msc_PUSCH; i++,ip+=4) { diff --git a/openair1/PHY/LTE_UE_TRANSPORT/prach_ue.c b/openair1/PHY/LTE_UE_TRANSPORT/prach_ue.c index 9fe33c189d3c71fa305a36f4b53622082cadeb3c..5c30ac0fd764bb744136f11236b439736c3bc903 100644 --- a/openair1/PHY/LTE_UE_TRANSPORT/prach_ue.c +++ b/openair1/PHY/LTE_UE_TRANSPORT/prach_ue.c @@ -356,12 +356,12 @@ int32_t generate_prach( PHY_VARS_UE *ue, uint8_t eNB_id, uint8_t subframe, uint1 break; } if (prach_fmt == 4) { - idft(get_idft(len), prachF, prach2, 1); + idft(get_idft(len), prachF, prach2, get_idft_scaling(len,0)); // TODO: account for repeated format in dft output memmove(prach, prach + 2 * len, Ncp << 2); prach_len = len + Ncp; } else { - idft(get_idft(len), prachF, prach2, 1); + idft(get_idft(len), prachF, 
prach2, get_idft_scaling(len,0)); memmove(prach, prach + 2 * len, Ncp << 2); prach_len = len + Ncp; if (prach_fmt > 1) { diff --git a/openair1/PHY/LTE_UE_TRANSPORT/ulsch_modulation.c b/openair1/PHY/LTE_UE_TRANSPORT/ulsch_modulation.c index 9c4ccf067992bd0d8396ade60e8b3a5f4459bf80..df230f4d37348f6b2a5a20d874972cbf487b22ea 100644 --- a/openair1/PHY/LTE_UE_TRANSPORT/ulsch_modulation.c +++ b/openair1/PHY/LTE_UE_TRANSPORT/ulsch_modulation.c @@ -100,9 +100,9 @@ void dft_lte(int32_t *z,struct complex16 *input, int32_t Msc_PUSCH, uint8_t Nsym switch (Msc_PUSCH) { case 12: - dft(dftsize, (int16_t *)dft_in0, (int16_t *)dft_out0, 0); - dft(dftsize, (int16_t *)dft_in1, (int16_t *)dft_out1, 0); - dft(dftsize, (int16_t *)dft_in2, (int16_t *)dft_out2, 0); + dft(dftsize, (int16_t *)dft_in0, (int16_t *)dft_out0, (uint32_t*)0); + dft(dftsize, (int16_t *)dft_in1, (int16_t *)dft_out1, (uint32_t*)0); + dft(dftsize, (int16_t *)dft_in2, (int16_t *)dft_out2, (uint32_t*)0); norm128 = simde_mm_set1_epi16(9459); for (i = 0; i < 12; i++) { ((simde__m128i *)dft_out0)[i] = simde_mm_slli_epi16(simde_mm_mulhi_epi16(((simde__m128i *)dft_out0)[i], norm128), 1); @@ -113,9 +113,9 @@ void dft_lte(int32_t *z,struct complex16 *input, int32_t Msc_PUSCH, uint8_t Nsym break; default: - dft(dftsize, (int16_t *)dft_in0, (int16_t *)dft_out0, 1); - dft(dftsize, (int16_t *)dft_in1, (int16_t *)dft_out1, 1); - dft(dftsize, (int16_t *)dft_in2, (int16_t *)dft_out2, 1); + dft(dftsize, (int16_t *)dft_in0, (int16_t *)dft_out0, (uint32_t*)1); + dft(dftsize, (int16_t *)dft_in1, (int16_t *)dft_out1, (uint32_t*)1); + dft(dftsize, (int16_t *)dft_in2, (int16_t *)dft_out2, (uint32_t*)1); break; } diff --git a/openair1/PHY/MODULATION/nr_modulation.c b/openair1/PHY/MODULATION/nr_modulation.c index e8b215acccaa9dd1333fdc700c83af1e4e71fb23..f53453275e0c28fb61121a9cbff7916a89c1c4b8 100644 --- a/openair1/PHY/MODULATION/nr_modulation.c +++ b/openair1/PHY/MODULATION/nr_modulation.c @@ -344,7 +344,8 @@ void nr_dft(c16_t *z, c16_t *d, 
uint32_t Msc_PUSCH) dft_size_idx_t dftsize = get_dft(Msc_PUSCH); switch (Msc_PUSCH) { case 12: - dft(dftsize, (int16_t *)dft_in0, (int16_t *)dft_out0, 0); + dft(DFT_12,(int16_t *)dft_in0, (int16_t *)dft_out0, (uint32_t*)0); + norm128 = simde_mm_set1_epi16(9459); for (i = 0; i < 12; i++) { ((simde__m128i *)dft_out0)[i] = simde_mm_slli_epi16(simde_mm_mulhi_epi16(((simde__m128i *)dft_out0)[i], norm128), 1); @@ -352,7 +353,7 @@ void nr_dft(c16_t *z, c16_t *d, uint32_t Msc_PUSCH) break; default: - dft(dftsize, (int16_t *)dft_in0, (int16_t *)dft_out0, 1); + dft(dftsize, (int16_t *)dft_in0, (int16_t *)dft_out0, (uint32_t*)1); break; } diff --git a/openair1/PHY/MODULATION/nr_modulation.h b/openair1/PHY/MODULATION/nr_modulation.h index f066a967b5b8d47178fc4eea23050c7f8a2b14c8..46fbacccfb1b1c222c3d8ec1679f9ce7d82a349c 100644 --- a/openair1/PHY/MODULATION/nr_modulation.h +++ b/openair1/PHY/MODULATION/nr_modulation.h @@ -78,6 +78,7 @@ void nr_ue_layer_mapping(const c16_t *mod_symbs, const int n_layers, const int n \param symbol symbol within slot (0..12/14) \param Ns Slot number (0..19) \param sample_offset offset within rxdata (points to beginning of subframe) +\param levdB Input level to select scaling of dft in OFDM demod */ int nr_slot_fep_ul(NR_DL_FRAME_PARMS *frame_parms, @@ -85,7 +86,8 @@ int nr_slot_fep_ul(NR_DL_FRAME_PARMS *frame_parms, int32_t *rxdataF, unsigned char symbol, unsigned char Ns, - int sample_offset); + int sample_offset, + uint32_t levdB); /*! 
\brief This function implements the dft transform precoding in PUSCH diff --git a/openair1/PHY/MODULATION/ofdm_mod.c b/openair1/PHY/MODULATION/ofdm_mod.c index e25f7cd7f2457542bded79101f3bc0185e45311a..36c6756b7f88c9475c56e013077b54005eb418bd 100644 --- a/openair1/PHY/MODULATION/ofdm_mod.c +++ b/openair1/PHY/MODULATION/ofdm_mod.c @@ -137,6 +137,7 @@ void PHY_ofdm_mod(const int *input, /// pointer to complex input return; idft_size_idx_t idft_size = get_idft(fftsize); + uint32_t *scaling_sched = get_idft_scaling(fftsize,0); #ifdef DEBUG_OFDM_MOD printf("[PHY] OFDM mod (size %d,prefix %d) Symbols %d, input %p, output %p\n", @@ -169,11 +170,11 @@ void PHY_ofdm_mod(const int *input, /// pointer to complex input // Current idft implementation uses AVX-256: Check if buffer is already aligned to 256 bits (32 bytes) if ((uintptr_t)output_ptr % 32 == 0) { // output ptr is aligned, do ifft inplace - idft(idft_size, (int16_t *)&input[i * fftsize], (int16_t *)output_ptr, 1); + idft(idft_size, (int16_t *)&input[i * fftsize], (int16_t *)output_ptr, scaling_sched); } else { // output ptr is not aligned, needs an extra memcpy c16_t temp[fftsize] __attribute__((aligned(IDFT_OUTPUT_BUFFER_ALIGNMENT))); - idft(idft_size, (int16_t *)&input[i * fftsize], (int16_t *)temp, 1); + idft(idft_size, (int16_t *)&input[i * fftsize], (int16_t *)temp, scaling_sched); memcpy((void *)output_ptr, (void *)temp, sizeof(temp)); } // perform cyclic prefix insertion @@ -184,7 +185,7 @@ void PHY_ofdm_mod(const int *input, /// pointer to complex input case CYCLIC_SUFFIX: { // Use alignment of 64 bytes c16_t temp[fftsize] __attribute__((aligned(IDFT_OUTPUT_BUFFER_ALIGNMENT))); - idft(idft_size, (int16_t *)&input[i * fftsize], (int16_t *)temp, 1); + idft(idft_size, (int16_t *)&input[i * fftsize], (int16_t *)temp, scaling_sched); int *output_ptr = &output[(i * fftsize) + (i * nb_prefix_samples)]; memcpy(output_ptr, temp, sizeof(temp)); memcpy(&output_ptr[fftsize], temp, nb_prefix_samples * sizeof(c16_t)); @@ 
-197,7 +198,7 @@ void PHY_ofdm_mod(const int *input, /// pointer to complex input case NONE: { c16_t temp[fftsize] __attribute__((aligned(IDFT_OUTPUT_BUFFER_ALIGNMENT))); - idft(idft_size, (int16_t *)&input[i * fftsize], (int16_t *)temp, 1); + idft(idft_size, (int16_t *)&input[i * fftsize], (int16_t *)temp, scaling_sched); int *output_ptr = &output[i * fftsize]; memcpy(output_ptr, temp, sizeof(temp)); break; diff --git a/openair1/PHY/MODULATION/slot_fep.c b/openair1/PHY/MODULATION/slot_fep.c index 8488545355228e53b36690b5f045ea03c6321608..37ae8e4f7f2f58858075d796dc5ad343c19262b8 100644 --- a/openair1/PHY/MODULATION/slot_fep.c +++ b/openair1/PHY/MODULATION/slot_fep.c @@ -74,6 +74,7 @@ int slot_fep(PHY_VARS_UE *ue, return(-1); } + uint32_t sigenergy_avg=0; for (aa=0; aa<frame_parms->nb_antennas_rx; aa++) { memset(&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],0,frame_parms->ofdm_symbol_size*sizeof(int)); rx_offset = sample_offset + slot_offset + nb_prefix_samples0 + subframe_offset - SOFFSET; @@ -85,17 +86,24 @@ int slot_fep(PHY_VARS_UE *ue, memcpy((short *)&common_vars->rxdata[aa][frame_length_samples], (short *)&common_vars->rxdata[aa][0], frame_parms->ofdm_symbol_size*sizeof(int)); - + uint32_t sigenergy=0; + int dft_in_levdB; + if (ue->dft_in_levdB < 0) { + sigenergy=signal_energy((int32_t*)&common_vars->rxdata[aa][rx_offset % frame_length_samples],frame_parms->ofdm_symbol_size); /* index wrapped with %, like the dft input below; length passed as a sample count, as in the other signal_energy calls, not a byte count */ + dft_in_levdB = dB_fixed(sigenergy); + sigenergy_avg += (sigenergy/frame_parms->nb_antennas_rx); + } + else dft_in_levdB = ue->dft_in_levdB; if ((rx_offset&7)!=0) { // if input to dft is not 256-bit aligned, issue for size 6,15 and 25 PRBs memcpy((void *)tmp_dft_in, (void *)&common_vars->rxdata[aa][rx_offset % frame_length_samples], frame_parms->ofdm_symbol_size*sizeof(int)); dft(dftsizeidx,(int16_t *)tmp_dft_in, - (int16_t
*)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1); + (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],get_dft_scaling(s,dft_in_levdB)); } else { // use dft input from RX buffer directly start_UE_TIMING(ue->rx_dft_stats); dft(dftsizeidx,(int16_t *)&common_vars->rxdata[aa][(rx_offset) % frame_length_samples], - (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1); + (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],get_dft_scaling(s,dft_in_levdB)); stop_UE_TIMING(ue->rx_dft_stats); } } else { @@ -120,14 +128,15 @@ int slot_fep(PHY_VARS_UE *ue, (void *)&common_vars->rxdata[aa][(rx_offset) % frame_length_samples], frame_parms->ofdm_symbol_size*sizeof(int)); dft(dftsizeidx,(int16_t *)tmp_dft_in, - (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1); + (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],get_dft_scaling(s,ue->dft_in_levdB)); } else { // use dft input from RX buffer directly dft(dftsizeidx,(int16_t *)&common_vars->rxdata[aa][(rx_offset) % frame_length_samples], - (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1); + (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],get_dft_scaling(s,ue->dft_in_levdB)); } stop_UE_TIMING(ue->rx_dft_stats); } + if (ue->dft_in_levdB < 0 && sigenergy_avg > 0) ue->dft_in_levdB = dB_fixed(sigenergy_avg)+20; /* latch only a level that was actually measured: sigenergy_avg is still 0 when this call did not go through the measuring branch */ #ifdef DEBUG_FEP @@ -249,11 +258,11 @@ int front_end_fft(PHY_VARS_UE *ue,
(void *)&common_vars->rxdata[aa][rx_offset % frame_length_samples], frame_parms->ofdm_symbol_size*sizeof(int)); dft(dftsizeidx,(int16_t *)tmp_dft_in, - (int16_t *)&common_vars->common_vars_rx_data_per_thread[threadId].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1); + (int16_t *)&common_vars->common_vars_rx_data_per_thread[threadId].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],get_dft_scaling(s,ue->dft_in_levdB)); } else { // use dft input from RX buffer directly start_meas(&ue->rx_dft_stats); dft(dftsizeidx,(int16_t *)&common_vars->rxdata[aa][(rx_offset) % frame_length_samples], - (int16_t *)&common_vars->common_vars_rx_data_per_thread[threadId].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1); + (int16_t *)&common_vars->common_vars_rx_data_per_thread[threadId].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],get_dft_scaling(s,ue->dft_in_levdB)); stop_meas(&ue->rx_dft_stats); } } else { @@ -279,10 +288,10 @@ int front_end_fft(PHY_VARS_UE *ue, (void *)&common_vars->rxdata[aa][(rx_offset) % frame_length_samples], frame_parms->ofdm_symbol_size*sizeof(int)); dft(dftsizeidx,(int16_t *)tmp_dft_in, - (int16_t *)&common_vars->common_vars_rx_data_per_thread[threadId].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1); + (int16_t *)&common_vars->common_vars_rx_data_per_thread[threadId].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],get_dft_scaling(s,ue->dft_in_levdB)); } else { // use dft input from RX buffer directly dft(dftsizeidx,(int16_t *)&common_vars->rxdata[aa][(rx_offset) % frame_length_samples], - (int16_t *)&common_vars->common_vars_rx_data_per_thread[threadId].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1); + (int16_t *)&common_vars->common_vars_rx_data_per_thread[threadId].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],get_dft_scaling(s,ue->dft_in_levdB)); } stop_meas(&ue->rx_dft_stats); diff --git a/openair1/PHY/MODULATION/slot_fep_mbsfn.c b/openair1/PHY/MODULATION/slot_fep_mbsfn.c index 
fa3421959387a2ba9b8db8527450a38ad31dffa5..ca22b3b0c75fb46e1db6eeac4ffc763d05385ec3 100644 --- a/openair1/PHY/MODULATION/slot_fep_mbsfn.c +++ b/openair1/PHY/MODULATION/slot_fep_mbsfn.c @@ -84,7 +84,7 @@ int slot_fep_mbsfn(PHY_VARS_UE *ue, nb_prefix_samples0 + subframe_offset - SOFFSET) % frame_length_samples], - (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[subframe]].rxdataF[aa][frame_parms->ofdm_symbol_size*l],1); + (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[subframe]].rxdataF[aa][frame_parms->ofdm_symbol_size*l],get_dft_scaling(s,ue->dft_in_levdB)); stop_UE_TIMING(ue->rx_dft_stats); } else { if ((sample_offset + @@ -102,7 +102,7 @@ int slot_fep_mbsfn(PHY_VARS_UE *ue, (frame_parms->ofdm_symbol_size+nb_prefix_samples)*(l-1) + subframe_offset- SOFFSET) % frame_length_samples], - (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[subframe]].rxdataF[aa][frame_parms->ofdm_symbol_size*l],1); + (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[subframe]].rxdataF[aa][frame_parms->ofdm_symbol_size*l],get_dft_scaling(s,ue->dft_in_levdB)); stop_UE_TIMING(ue->rx_dft_stats); } } @@ -223,7 +223,7 @@ int slot_fep_mbsfn_khz_1dot25(PHY_VARS_UE *ue, nb_prefix_samples + subframe_offset - SOFFSET) % frame_length_samples], - (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[subframe]].rxdataF[aa][0],1); + (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[subframe]].rxdataF[aa][0],get_dft_scaling(ofdm_symbol_size,ue->dft_in_levdB)); stop_UE_TIMING(ue->rx_dft_stats); } diff --git a/openair1/PHY/MODULATION/slot_fep_nr.c b/openair1/PHY/MODULATION/slot_fep_nr.c index bd61d100f837e6d60f5800ca97da077cb2b287a3..269fcf39c32570fbd020b4703fe8b3cefff56e40 100644 --- a/openair1/PHY/MODULATION/slot_fep_nr.c +++ b/openair1/PHY/MODULATION/slot_fep_nr.c @@ -86,6 +86,12 @@ int nr_slot_fep(PHY_VARS_NR_UE *ue, Ns, symbol, 
nb_prefix_samples, nb_prefix_samples0, rx_offset, dB_fixed(signal_energy((int32_t *)&common_vars->rxdata[0][rx_offset],frame_parms->ofdm_symbol_size))); #endif + uint32_t *scaling_sched=NULL; + + if (ue && ue->dft_in_levdB >=0) + scaling_sched = get_dft_scaling(frame_parms->ofdm_symbol_size,ue->dft_in_levdB); + + uint32_t sigenergy_avg=0; for (unsigned char aa=0; aa<frame_parms->nb_antennas_rx; aa++) { int16_t *rxdata_ptr = (int16_t *)&rxdata[aa][rx_offset]; @@ -110,18 +116,25 @@ int nr_slot_fep(PHY_VARS_NR_UE *ue, if (ue) start_meas_nr_ue_phy(ue, RX_DFT_STATS); - + else + scaling_sched = get_dft_scaling(frame_parms->ofdm_symbol_size,dB_fixed(signal_energy((int32_t*)rxdata_ptr,frame_parms->ofdm_symbol_size))); /* energy over one OFDM symbol; dftsize is the size index passed to dft(), not a sample count */ + + if (ue && ue->dft_in_levdB < 0) { // this means dft scaling level needs to be recomputed + uint32_t sigenergy= signal_energy((int32_t*)rxdata_ptr,frame_parms->ofdm_symbol_size); /* same sample count as the debug measurement at the top of this function */ + scaling_sched = get_dft_scaling(frame_parms->ofdm_symbol_size,dB_fixed(sigenergy)); + sigenergy_avg += sigenergy/frame_parms->nb_antennas_rx; + } dft(dftsize, rxdata_ptr, (int16_t *)&rxdataF[aa][frame_parms->ofdm_symbol_size*symbol], - 1); + scaling_sched); if (ue) stop_meas_nr_ue_phy(ue, RX_DFT_STATS); apply_nr_rotation_RX(frame_parms, rxdataF[aa], frame_parms->symbol_rotation[linktype], slot, N_RB, 0, symbol, 1); } - + if (ue && ue->dft_in_levdB < 0) ue->dft_in_levdB = dB_fixed(sigenergy_avg) + 20; #ifdef DEBUG_FEP printf("slot_fep: done\n"); #endif @@ -134,12 +147,14 @@ int nr_slot_fep_ul(NR_DL_FRAME_PARMS *frame_parms, int32_t *rxdataF, unsigned char symbol, unsigned char Ns, - int sample_offset) + int sample_offset, + uint32_t levdB) { unsigned int nb_prefix_samples = frame_parms->nb_prefix_samples; unsigned int nb_prefix_samples0 = frame_parms->nb_prefix_samples0; dft_size_idx_t dftsize = get_dft(frame_parms->ofdm_symbol_size); + uint32_t *scaling_sched = get_dft_scaling(frame_parms->ofdm_symbol_size,levdB); // This is for misalignment issues int32_t tmp_dft_in[8192] __attribute__ ((aligned (32))); @@ -183,7 +198,7
@@ int nr_slot_fep_ul(NR_DL_FRAME_PARMS *frame_parms, dft(dftsize, rxdata_ptr, (int16_t *)&rxdataF[symbol * frame_parms->ofdm_symbol_size], - 1); + scaling_sched); return 0; } diff --git a/openair1/PHY/MODULATION/slot_fep_ul.c b/openair1/PHY/MODULATION/slot_fep_ul.c index 51c215fe4656266a8b47239b5c7ebcf395f54fb3..a6fa11864b3297704df4f9ad86fa40a68d0ce738 100644 --- a/openair1/PHY/MODULATION/slot_fep_ul.c +++ b/openair1/PHY/MODULATION/slot_fep_ul.c @@ -82,7 +82,7 @@ int slot_fep_ul(RU_t *ru, #endif dft( dftsize,(int16_t *)&common->rxdata_7_5kHz[aa][rx_offset], (int16_t *)&common->rxdataF[aa][fp->ofdm_symbol_size*symbol], - 1 + get_dft_scaling(s,ru->dft_in_levdB) ); } else { @@ -94,13 +94,13 @@ int slot_fep_ul(RU_t *ru, fp->ofdm_symbol_size*sizeof(int)); dft( dftsize,(short *) tmp_dft_in, (short*) &common->rxdataF[aa][fp->ofdm_symbol_size*symbol], - 1 + get_dft_scaling(s,ru->dft_in_levdB) ); } else{ dft( dftsize,(short *)&common->rxdata_7_5kHz[aa][rx_offset], (short*)&common->rxdataF[aa][fp->ofdm_symbol_size*symbol], - 1 + get_dft_scaling(s,ru->dft_in_levdB) ); } } diff --git a/openair1/PHY/NR_ESTIMATION/nr_measurements_gNB.c b/openair1/PHY/NR_ESTIMATION/nr_measurements_gNB.c index 42b0b0b5155a4ffb7d0fdca407faa5d5f02ba129..5d78835e8e431068a0f63e40fe204a6e1806f6d8 100644 --- a/openair1/PHY/NR_ESTIMATION/nr_measurements_gNB.c +++ b/openair1/PHY/NR_ESTIMATION/nr_measurements_gNB.c @@ -102,7 +102,7 @@ void dump_nr_I0_stats(FILE *fd,PHY_VARS_gNB *gNB) { if (i%25 == 24) fprintf(fd,"\n"); } fprintf(fd,"\n"); - fprintf(fd,"max_IO = %d (%d), min_I0 = %d (%d), avg_I0 = %d dB",max_I0,amax,min_I0,amin,gNB->measurements.n0_subband_power_avg_dB); + fprintf(fd,"max_IO = %d (%d), min_I0 = %d (%d), avg_I0 = %d dB, dft_in_levdB %d dB",max_I0,amax,min_I0,amin,gNB->measurements.n0_subband_power_avg_dB,gNB->RU_list[0]->dft_in_levdB); if (gNB->frame_parms.nb_antennas_rx>1) { fprintf(fd,"("); for (int aarx=0;aarx<gNB->frame_parms.nb_antennas_rx;aarx++) diff --git 
a/openair1/PHY/NR_TRANSPORT/nr_prach.c b/openair1/PHY/NR_TRANSPORT/nr_prach.c index 3f41712f2cf734aac57bb718c69c18714177e53e..2b1dfcf7257b264c21386c1f75b7e2cb0e6dff59 100644 --- a/openair1/PHY/NR_TRANSPORT/nr_prach.c +++ b/openair1/PHY/NR_TRANSPORT/nr_prach.c @@ -381,6 +381,7 @@ void rx_nr_prach_ru(RU_t *ru, int prachFormat, int numRA, int prachStartSymbol, } const dft_size_idx_t dftsize = get_dft(dftlen); + uint32_t *scaling_sched = get_dft_scaling(dftlen,ru->dft_in_levdB); // Do forward transform if (LOG_DEBUGFLAG(PRACH)) { @@ -416,7 +417,7 @@ void rx_nr_prach_ru(RU_t *ru, int prachFormat, int numRA, int prachStartSymbol, // do DFT int16_t *prach2 = prach[aa] + (2*Ncp); // times 2 for complex samples for (int i = 0; i < reps; i++) - dft(dftsize, prach2 + 2*dftlen*i, rxsigF[aa] + 2*dftlen*i, 1); + dft(dftsize, prach2 + 2*dftlen*i, rxsigF[aa] + 2*dftlen*i, scaling_sched); //LOG_M("ru_rxsigF_tmp.m","rxsFtmp", rxsigF[aa], dftlen*2*reps, 1, 1); @@ -641,12 +642,12 @@ void rx_nr_prach(PHY_VARS_gNB *gNB, // Now do IFFT of size 1024 (N_ZC=839) or 256 (N_ZC=139) if (N_ZC == 839) { - idft(IDFT_1024, prachF, prach_ifft_tmp, 1); + idft(IDFT_1024, prachF, prach_ifft_tmp, IDFT_SCALING_1024[0]); // compute energy and accumulate over receive antennas for (int i = 0; i < 1024; i++) prach_ifft[i] += (int32_t)prach_ifft_tmp[i<<1]*(int32_t)prach_ifft_tmp[i<<1] + (int32_t)prach_ifft_tmp[1+(i<<1)]*(int32_t)prach_ifft_tmp[1+(i<<1)]; } else { - idft(IDFT_256, prachF, prach_ifft_tmp, 1); + idft(IDFT_256, prachF, prach_ifft_tmp, IDFT_SCALING_256[0]); log2_ifft_size = 8; // compute energy and accumulate over receive antennas and repetitions for BR for (int i = 0; i < 256; i++) diff --git a/openair1/PHY/NR_TRANSPORT/nr_ulsch_demodulation.c b/openair1/PHY/NR_TRANSPORT/nr_ulsch_demodulation.c index 396c8d89f4f3c9101dc5f645b7c6602ee80b52c6..1017191c49e2f5bba8b1b223834e706cbc4c25f2 100644 --- a/openair1/PHY/NR_TRANSPORT/nr_ulsch_demodulation.c +++ 
b/openair1/PHY/NR_TRANSPORT/nr_ulsch_demodulation.c @@ -35,7 +35,7 @@ void nr_idft(int32_t *z, uint32_t Msc_PUSCH) dft_size_idx_t dftsize = get_dft(Msc_PUSCH); switch (Msc_PUSCH) { case 12: - dft(dftsize, (int16_t *)idft_in0, (int16_t *)idft_out0, 0); + dft(dftsize, (int16_t *)idft_in0, (int16_t *)idft_out0, (uint32_t*)0); norm128 = simde_mm_set1_epi16(9459); @@ -45,7 +45,7 @@ void nr_idft(int32_t *z, uint32_t Msc_PUSCH) break; default: - dft(dftsize, idft_in0, idft_out0, 1); + dft(dftsize, idft_in0, idft_out0, (uint32_t*)1); break; } diff --git a/openair1/PHY/NR_UE_TRANSPORT/nr_prach.c b/openair1/PHY/NR_UE_TRANSPORT/nr_prach.c index e2928b2df465322e67b269a95b65b1633b076a54..e7364be44cd368e49f63b676512380646b94d267 100644 --- a/openair1/PHY/NR_UE_TRANSPORT/nr_prach.c +++ b/openair1/PHY/NR_UE_TRANSPORT/nr_prach.c @@ -428,7 +428,8 @@ int32_t generate_nr_prach(PHY_VARS_NR_UE *ue, uint8_t gNB_id, int frame, uint8_t // This is after cyclic prefix c16_t *prach2 = prach + Ncp; const idft_size_idx_t idft_size = get_idft(dftlen); - idft(idft_size, prachF, (int16_t *)prach, 1); + uint32_t *scaling_sched = get_idft_scaling(dftlen,0); + idft(idft_size, prachF, (int16_t *)prach, scaling_sched); memmove(prach2, prach, (dftlen << 2)); if (prach_sequence_length == 0) { diff --git a/openair1/PHY/NR_UE_TRANSPORT/pss_nr.c b/openair1/PHY/NR_UE_TRANSPORT/pss_nr.c index 180ca999184d69c4e03d9a83cfb1f7bbaaf4c19e..51859b8e3005249401c487d1627cd26b731eaa58 100644 --- a/openair1/PHY/NR_UE_TRANSPORT/pss_nr.c +++ b/openair1/PHY/NR_UE_TRANSPORT/pss_nr.c @@ -162,7 +162,7 @@ void generate_pss_nr_time(const NR_DL_FRAME_PARMS *fp, const int N_ID_2, int ssb idft((int16_t)get_idft(fp->ofdm_symbol_size), (int16_t *)synchroF_tmp, /* complex input but legacy type is wrong*/ (int16_t *)pssTime, /* complex output */ - 1); /* scaling factor */ + get_idft_scaling(fp->ofdm_symbol_size,1)); /* scaling factor */ #ifdef DBG_PSS_NR @@ -201,7 +201,7 @@ void generate_pss_nr_time(const NR_DL_FRAME_PARMS *fp, const 
int N_ID_2, int ssb dft((int16_t)get_dft(length), synchro_tmp, /* complex input */ synchroF_tmp, /* complex output */ - 1); /* scaling factor */ + get_dft_scaling(length,0)); /* scaling factor */ if ((N_ID_2 == 0) && (length == 256)) { LOG_M("pss_f_0.m","pss_f_0",synchroF_tmp,length,1,1); @@ -731,7 +731,7 @@ void sl_generate_pss_ifft_samples(sl_nr_ue_phy_params_t *sl_ue_params, SL_NR_UE_ idft((int16_t)get_idft(sl_fp->ofdm_symbol_size), (int16_t *)&pss_F[0], /* complex input */ (int16_t *)&pss_T[0], /* complex output */ - 1); /* scaling factor */ + get_idft_scaling(sl_fp->ofdm_symbol_size,1)); /* scaling factor */ } #ifdef SL_DUMP_PSBCH_TX_SAMPLES diff --git a/openair1/PHY/NR_UE_TRANSPORT/sss_nr.c b/openair1/PHY/NR_UE_TRANSPORT/sss_nr.c index 0a13462d8c37de53dcd82489327ee8e97bf0f37f..286ae1d2ac8545768139adec28a5f97149da4425 100644 --- a/openair1/PHY/NR_UE_TRANSPORT/sss_nr.c +++ b/openair1/PHY/NR_UE_TRANSPORT/sss_nr.c @@ -169,7 +169,7 @@ void insert_sss_nr(int16_t *sss_time, idft(IDFT_2048, (int16_t *)synchroF_tmp, /* complex input */ (int16_t *)synchro_tmp, /* complex output */ - 1); /* scaling factor */ + IDFT_SCALING_2048[1]); /* scaling factor */ /* then get final sss in time */ memcpy(sss_time, synchro_tmp, ofdm_symbol_size * sizeof(c16_t)); diff --git a/openair1/PHY/TOOLS/Makefile b/openair1/PHY/TOOLS/Makefile index a881bd40707c6fd82af1f083cfaa3c98baf41aee..14d24112c5631a21bd02a3dc9c71e9e12e164cef 100644 --- a/openair1/PHY/TOOLS/Makefile +++ b/openair1/PHY/TOOLS/Makefile @@ -1,8 +1,11 @@ oai_dfts_sse4: oai_dfts.c - gcc -O3 -std=gnu99 -msse4.1 -o oai_dfts_sse4 oai_dfts.c time_meas.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR_HOME -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR_TARGETS/COMMON -I$$OPENAIR_HOME/radio/COMMON -I$$OPENAIR2_DIR -I$$OPENAIR2_DIR/COMMON -I$$OPENAIR_HOME/common/utils -I$$OPENAIR_HOME/common/utils/T -I$$OPENAIR_HOME/common/utils/msc -I$$OPENAIR_HOME/nfapi/open-nFAPI/nfapi/public_inc -DMR_MAIN -DNB_ANTENNAS_RX=1 -lm -lpthread # -DD256STATS 
#-DD64STATS + gcc -O3 -std=gnu99 -msse4.1 -o oai_dfts_sse4 oai_dfts.c time_meas.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR_HOME -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR_TARGETS/COMMON -I$$OPENAIR_HOME/radio/COMMON -I$$OPENAIR2_DIR -I$$OPENAIR2_DIR/COMMON -I$$OPENAIR_HOME/common/utils -I$$OPENAIR_HOME/common/utils/T -I$$OPENAIR_HOME/common/utils/msc -I$$OPENAIR_HOME/nfapi/open-nFAPI/nfapi/public_inc -DMR_MAIN -DNB_ANTENNAS_RX=1 -DNB_ANTENNAS_TX=1 -lm -lpthread # -DD256STATS #-DD64STATS -oai_dfts_avx2: oai_dfts.c - gcc -O2 -std=gnu99 -mavx2 -g -ggdb -o oai_dfts_avx2 oai_dfts.c time_meas.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR_HOME -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR_TARGETS/COMMON -I$$OPENAIR_HOME/radio/COMMON -I$$OPENAIR2_DIR -I$$OPENAIR2_DIR/COMMON -I$$OPENAIR_HOME/common/utils -I$$OPENAIR_HOME/common/utils/T -I$$OPENAIR_HOME/common/utils/msc -I$$OPENAIR_HOME/nfapi/open-nFAPI/nfapi/public_inc -DMR_MAIN -DNB_ANTENNAS_RX=1 -lm -lpthread # -DD256STATS #-DD64STATS +oai_dfts_avx2: oai_dfts.c fft_double.c + gcc -O2 -std=gnu99 -mavx2 -g -ggdb -o oai_dfts_avx2 fft_double.c oai_dfts.c ../../../common/utils/time_meas.c ../../SIMULATION/TOOLS/taus.c ../../SIMULATION/TOOLS/rangen_double.c -I$$OPENAIR_HOME -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR_TARGETS/COMMON -I$$OPENAIR_HOME/radio/COMMON -I$$OPENAIR2_DIR -I$$OPENAIR2_DIR/COMMON -I$$OPENAIR_HOME/common/utils -I$$OPENAIR_HOME/common/utils/T -I$$OPENAIR_HOME/common/utils/msc -I$$OPENAIR_HOME/nfapi/open-nFAPI/nfapi/public_inc -DMR_MAIN -DNB_ANTENNAS_RX=1 -DNB_ANTENNAS_TX=1 -DMAX_NUM_CCs=1 -lm -lpthread # -DD256STATS #-DD64STATS + +oai_dfts_avx512: oai_dfts.c fft_double.c + gcc -O2 -std=gnu99 -mavx512bw -march=skylake-avx512 -mtune=skylake-avx512 -g -ggdb -o oai_dfts_avx512 fft_double.c oai_dfts.c ../../../common/utils/time_meas.c ../../SIMULATION/TOOLS/taus.c ../../SIMULATION/TOOLS/rangen_double.c -I$$OPENAIR_HOME -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR_TARGETS/COMMON
-I$$OPENAIR_HOME/radio/COMMON -I$$OPENAIR2_DIR -I$$OPENAIR2_DIR/COMMON -I$$OPENAIR_HOME/common/utils -I$$OPENAIR_HOME/common/utils/T -I$$OPENAIR_HOME/common/utils/msc -I$$OPENAIR_HOME/nfapi/open-nFAPI/nfapi/public_inc -DMR_MAIN -DNB_ANTENNAS_RX=1 -DNB_ANTENNAS_TX=1 -DMAX_NUM_CCs=1 -lm -lpthread # -DD256STATS #-DD64STATS oai_dfts_avx2.s: oai_dfts.c gcc -O2 -std=gnu99 -mavx2 -S oai_dfts.c time_meas.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR_HOME -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR_TARGETS/COMMON -I$$OPENAIR_HOME/radio/COMMON -I$$OPENAIR2_DIR -I$$OPENAIR2_DIR/COMMON -I$$OPENAIR_HOME/common/utils -I$$OPENAIR_HOME/common/utils/T -I$$OPENAIR_HOME/common/utils/msc -I$$OPENAIR_HOME/nfapi/open-nFAPI/nfapi/public_inc -DMR_MAIN -DNB_ANTENNAS_RX=1 -lm -lpthread # -DD256STATS #-DD64STATS @@ -15,5 +18,5 @@ oai_dfts_sse4.s: oai_dfts.c dft_cycles_avx2: oai_dfts_avx2 ./oai_dfts_avx2 | grep -E cycles -oai_dfts_aarch64: oai_dfts_neon.c - gcc -O2 -std=gnu99 -gdwarf-2 -lgcc -lrt -g -ggdb -o oai_dfts_neon oai_dfts_neon.c ../../../common/utils/time_meas.c ../../SIMULATION/TOOLS/taus.c $$OPENAIR_HOME/common/utils/LOG/log.c ../../SIMULATION/TOOLS/rangen_double.c -I$$OPENAIR_HOME -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR_TARGETS/COMMON -I$$OPENAIR_HOME/radio/COMMON -I$$OPENAIR2_DIR -I$$OPENAIR2_DIR/COMMON -I$$OPENAIR_HOME/common/utils -I$$OPENAIR_HOME/common/utils/T -I$$OPENAIR_HOME/common/utils/msc -I$$OPENAIR_HOME/nfapi/open-nFAPI/nfapi/public_inc -DMR_MAIN -DNB_ANTENNAS_RX=1 -DNB_ANTENNAS_TX=1 -DMAX_NUM_CCs=1 -lm -lpthread +oai_dfts_aarch64: oai_dfts_neon.c fft_double.c + gcc -O2 -std=gnu99 -gdwarf-2 -lgcc -lrt -g -ggdb -o oai_dfts_neon fft_double.c oai_dfts_neon.c ../../../common/utils/time_meas.c ../../SIMULATION/TOOLS/taus.c ../../SIMULATION/TOOLS/rangen_double.c -I$$OPENAIR_HOME -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR_TARGETS/COMMON -I$$OPENAIR_HOME/radio/COMMON -I$$OPENAIR2_DIR -I$$OPENAIR2_DIR/COMMON -I$$OPENAIR_HOME/common/utils 
-I$$OPENAIR_HOME/common/utils/T -I$$OPENAIR_HOME/common/utils/msc -I$$OPENAIR_HOME/nfapi/open-nFAPI/nfapi/public_inc -DMR_MAIN -DNB_ANTENNAS_RX=1 -DNB_ANTENNAS_TX=1 -DMAX_NUM_CCs=1 -lm -lpthread diff --git a/openair1/PHY/TOOLS/dfts_load.c b/openair1/PHY/TOOLS/dfts_load.c index 2e4ddb595fe5796f8d484d3baec978ea5089f9d9..a8a455ddd69098ac7deadc3fe3515579acc84de2 100644 --- a/openair1/PHY/TOOLS/dfts_load.c +++ b/openair1/PHY/TOOLS/dfts_load.c @@ -40,6 +40,56 @@ #include "common/config/config_userapi.h" #include "common/utils/load_module_shlib.h" +uint32_t DFT_SCALING_64[5][2] = {{3,0},{2,1},{1,2},{1,2},{1,2}}; +uint32_t DFT_SCALING_128[5][3] = {{4,0,0},{3,1,0},{2,2,0},{1,3,0},{0,4,0}}; +uint32_t DFT_SCALING_256[5][3] = {{4,0,0},{3,1,0},{2,2,0},{1,3,0},{0,4,0}}; +int32_t DFT_SCALING_512_THRES[7] = {53,57,59,63,65,69,100}; +uint32_t DFT_SCALING_512[7][4] = {{5,0,0,0},{4,1,0,0},{4,0,1,0},{3,1,1,0},{3,0,1,1},{2,1,1,1},{2,0,1,2}}; +uint32_t DFT_SCALING_768[][4] = {{1,2,2,0},{1,2,2,0},{1,2,2,0},{1,2,2,0},{1,2,2,0}}; +int32_t DFT_SCALING_1024_THRES[5] = {49,55,63,69,100}; +uint32_t DFT_SCALING_1024[5][4] = {{5,0,0,0},{4,1,0,0},{3,1,1,0},{2,1,1,1},{1,1,1,2}}; +uint32_t DFT_SCALING_1536[5][5] = {{1,1,2,2,0},{1,1,2,2,0},{1,1,2,2,0},{1,1,2,2,0},{1,1,2,2,0}}; +int32_t DFT_SCALING_2048_THRES[10] = {47,49,53,55,59,61,65,67,69,100}; +uint32_t DFT_SCALING_2048[10][5] = {{1,1,1,1,2},{1,1,1,1,2},{1,1,1,1,2},{1,1,1,1,2},{1,1,1,1,2},{1,1,1,1,2},{1,1,1,1,2},{1,1,1,1,2},{1,1,1,1,2},{1,1,1,1,2}} ; //}{{6,0,0,0,0},{5,1,0,0,0},{5,0,1,0,0},{4,1,1,0,0},{4,0,1,1,0},{3,1,1,1,0},{3,0,1,1,1},{2,1,1,1,1},{2,0,2,0,2},{1,0,1,0,4}}; +uint32_t DFT_SCALING_3072[5][5] = {{1,4,1,0,0},{1,0,3,2,0},{1,0,3,2,0},{1,0,3,2,0},{1,0,3,2,0}}; +int32_t DFT_SCALING_4096_THRES[8] = {43,49,57,61,63,65,69,100}; +uint32_t DFT_SCALING_4096[8][5] = {{6,0,0,0,0},{5,1,0,0,0},{4,1,1,0,0},{3,1,1,1,0},{2,2,0,1,1},{2,1,1,1,1},{1,1,2,1,1},{0,0,3,0,3}}; +uint32_t DFT_SCALING_6144[5][6] = 
{{1,1,0,3,2,0},{1,1,0,3,2,0},{1,1,0,3,2,0},{1,1,0,3,2,0},{1,1,0,3,2,0}}; +uint32_t DFT_SCALING_8192[5][6] = {{1,0,0,3,3,0},{1,0,0,3,3,0},{1,0,0,3,3,0},{1,0,0,3,3,0},{1,0,0,3,3,0}}; +uint32_t DFT_SCALING_9216[5][6] = {{1,1,0,3,2,1},{1,1,0,3,2,0},{1,1,0,3,2,0},{1,1,0,3,2,0},{1,1,0,3,2,0}}; +uint32_t DFT_SCALING_12288[5][6] = {{1,0,0,3,3,0},{1,0,0,3,3,0},{1,0,0,3,3,0},{1,0,0,3,3,0},{1,0,0,3,3,0}}; +uint32_t DFT_SCALING_16384[5][6] = {{0,0,1,3,3,0},{0,0,1,3,3,0},{0,0,1,3,3,0},{0,0,1,3,3,0},{0,0,1,3,3,0}}; +uint32_t DFT_SCALING_18432[5][7] = {{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0}}; +uint32_t DFT_SCALING_24576[5][7] = {{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0}}; +uint32_t DFT_SCALING_32768[5][7] = {{1,0,0,1,3,3,0},{1,0,0,1,3,3,0},{1,0,0,1,3,3,0},{1,0,0,1,3,3,0},{1,0,0,1,3,3,0}}; +uint32_t DFT_SCALING_36864[5][7] = {{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0}}; +uint32_t DFT_SCALING_49152[5][7] = {{1,0,0,1,3,3,0},{1,0,0,1,3,3,0},{1,0,0,1,3,3,0},{1,0,0,1,3,3,0},{1,0,0,1,3,3,0}}; +uint32_t DFT_SCALING_65536[5][7] = {{0,0,0,2,3,3,0},{0,0,0,2,3,3,0},{0,0,0,2,3,3,0},{0,0,0,2,3,3,0},{0,0,0,2,3,3,0}}; +uint32_t DFT_SCALING_73728[5][8] = {{1,1,1,0,0,3,3,0},{1,1,1,0,0,3,3,0},{1,1,1,0,0,3,3,0},{1,1,1,0,0,3,3,0},{1,1,1,0,0,3,3,0}}; +uint32_t DFT_SCALING_98304[5][8] = {{1,1,0,0,1,3,3,0},{1,1,0,0,1,3,3,0},{1,1,0,0,1,3,3,0},{1,1,0,0,1,3,3,0},{1,1,0,0,1,3,3,0}}; + +uint32_t IDFT_SCALING_128[2][2] = {{2,2},{1,3}}; +uint32_t IDFT_SCALING_256[2][2] = {{2,2},{1,3}}; +uint32_t IDFT_SCALING_512[2][3] = {{1,2,2},{1,1,3}}; +uint32_t IDFT_SCALING_768[2][3] = {{1,2,2},{1,1,3}}; +uint32_t IDFT_SCALING_1024[2][3] = {{4,1,0},{1,1,3}}; +uint32_t IDFT_SCALING_1536[2][4] = {{1,1,1,3},{1,1,1,3}}; +uint32_t IDFT_SCALING_2048[2][4] = {{3,2,1,0},{1,1,1,3}}; +uint32_t IDFT_SCALING_3072[2][4] = {{1,1,1,3},{1,1,1,3}}; +uint32_t IDFT_SCALING_4096[2][4] = {{3,2,1,0},{1,1,1,3}}; +uint32_t 
IDFT_SCALING_6144[2][5] = {{1,1,0,3,2},{1,1,1,1,3}}; +uint32_t IDFT_SCALING_8192[2][5] = {{1,0,0,3,3},{1,1,1,1,3}}; +uint32_t IDFT_SCALING_9216[2][5] = {{1,0,0,3,3},{1,1,1,1,3}}; +uint32_t IDFT_SCALING_12288[2][5] = {{1,0,0,3,3},{1,1,1,1,3}}; +uint32_t IDFT_SCALING_16384[2][5] = {{0,0,1,3,3},{1,1,1,1,3}}; +uint32_t IDFT_SCALING_18432[2][6] = {{1,1,0,0,3,3},{1,1,1,1,1,3}}; +uint32_t IDFT_SCALING_24576[2][6] = {{1,1,0,0,3,3},{1,1,1,1,1,3}}; +uint32_t IDFT_SCALING_32768[2][6] = {{1,0,0,1,3,3},{1,1,1,1,1,3}}; +uint32_t IDFT_SCALING_36864[2][6] = {{1,1,0,0,3,3},{1,1,1,1,1,3}}; +uint32_t IDFT_SCALING_49152[2][6] = {{1,0,0,1,3,3},{1,1,1,1,1,3}}; +uint32_t IDFT_SCALING_65536[2][6] = {{0,0,0,2,3,3},{1,1,1,1,1,3}}; +uint32_t IDFT_SCALING_73728[2][7] = {{1,1,1,0,0,3,3},{1,1,1,1,1,1,3}}; +uint32_t IDFT_SCALING_98304[2][7] = {{1,1,0,0,1,3,3},{1,1,1,1,1,1,3}}; /* function description array, to be used when loading the dfts/idfts lib */ static loader_shlibfunc_t shlib_fdesc[2]; diff --git a/openair1/PHY/TOOLS/fft_double.c b/openair1/PHY/TOOLS/fft_double.c new file mode 100644 index 0000000000000000000000000000000000000000..4109d0681748f491f76c211e83f0c7b94b9f236f --- /dev/null +++ b/openair1/PHY/TOOLS/fft_double.c @@ -0,0 +1,89 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include "tools_defs.h" + + +void twiddle(cd_t *W, int N, double stuff) +{ + W->r=cos(stuff*2.0*M_PI/(double)N); + W->i=-sin(stuff*2.0*M_PI/(double)N); +} + + +int bitrev64[64] = {0,32,16,48,8,40,24,56,4,36,20,52,12,44,28,60,2,34,18,50,10,42,26,58, +6,38,22,54,14,46,30,62,1,33,17,49,9,41,25,57,5,37,21,53,13,45,29,61, +3,35,19,51,11,43,27,59,7,39,23,55,15,47,31,63}; +int bitrev128[128]; +int bitrev256[256]; +int bitrev512[512]; +int bitrev1024[1024]; +int bitrev2048[2048]; +int bitrev4096[4096]; + +void init_bitrev() { + + // 128 + for (int i=0;i<64;i++) { bitrev128[i]=2*bitrev64[i]; bitrev128[i+64]=1+bitrev128[i]; } + + // 256 + for (int i=0;i<128;i++) { bitrev256[i]=2*bitrev128[i]; 
bitrev256[i+128]=1+bitrev256[i]; } + + // 512 + for (int i=0;i<256;i++) { bitrev512[i]=2*bitrev256[i]; bitrev512[i+256]=1+bitrev512[i]; } + + // 1024 + for (int i=0;i<512;i++) { bitrev1024[i]=2*bitrev512[i]; bitrev1024[i+512]=1+bitrev1024[i]; } + + // 2048 + for (int i=0;i<1024;i++) { bitrev2048[i]=2*bitrev1024[i]; bitrev2048[i+1024]=1+bitrev2048[i]; } + + // 4096 + for (int i=0;i<2048;i++) { bitrev4096[i]=2*bitrev2048[i]; bitrev4096[i+2048]=1+bitrev4096[i]; } + +} + +/** RADIX-2 FFT ALGORITHM */ +/* Double precision*/ +void radix2(cd_t *x, int N) +{ + int n2, k1, N1, N2; + cd_t W, bfly[2]; + + N1=2; + N2=N/2; + /** Do 2 Point DFT */ + for (n2=0; n2<N2; n2++) + { + /** Radix 2 butterfly */ + bfly[0].r = (x[n2].r + x[N2 + n2].r); + bfly[0].i = (x[n2].i + x[N2 + n2].i); + + bfly[1].r = (x[n2].r - x[N2 + n2].r); + bfly[1].i = (x[n2].i - x[N2 + n2].i); + + + + twiddle(&W, N, (double)n2); + x[n2].r = bfly[0].r; + x[n2].i = bfly[0].i; + x[n2 + N2].r = bfly[1].r*W.r - bfly[1].i*W.i; + x[n2 + N2].i = bfly[1].i*W.r + bfly[1].r*W.i; + } + + /** Don't recurse if we're down to one butterfly */ + if (N2!=1) { + radix2(&x[0], N2); + radix2(&x[N2], N2); + } +} + +void normalize(cd_t *x,cd_t *y, int *bitrev, int N) { + for (int i=0;i<N;i++) { + y[i].r = x[bitrev[i]].r / sqrt((double)N); + y[i].i = x[bitrev[i]].i / sqrt((double)N); + } +} + + + diff --git a/openair1/PHY/TOOLS/oai_dfts.c b/openair1/PHY/TOOLS/oai_dfts.c index 36bae5bd3ddea5c9781fd6070f746650226fdfd9..ced79bef7d7f9e46c8bd48d131f0c3c6ef38d063 100644 --- a/openair1/PHY/TOOLS/oai_dfts.c +++ b/openair1/PHY/TOOLS/oai_dfts.c @@ -58,7 +58,6 @@ #define print_ints(s,x) printf("%s %d %d %d %d\n",s,(x)[0],(x)[1],(x)[2],(x)[3]) - const static int16_t conjugatedft[32] __attribute__((aligned(32))) = {-1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1}; @@ -1038,45 +1037,82 @@ __attribute__((always_inline)) static inline void bfly5_tw1(simde__m128i *x0, *(y4) = simde_mm_adds_epi16(*(x0), *(y4)); } -// performs 4x4 transpose of input x 
(complex interleaved) using 128bit SIMD intrinsics +// performs 8x4 transpose of input x (complex interleaved) using 256bit SIMD intrinsics // i.e. x = [x0r x0i x1r x1i ... x15r x15i], y = [x0r x0i x4r x4i x8r x8i x12r x12i x1r x1i x5r x5i x9r x9i x13r x13i x2r x2i ... x15r x15i] __attribute__((always_inline)) static inline void transpose16_ooff_simd256(simde__m256i *x, simde__m256i *y, int off) { - register simde__m256i ytmp0, ytmp1, ytmp2, ytmp3, ytmp4, ytmp5, ytmp6, ytmp7; + // x[0] = [x0 x1 x2 x3 x4 x5 x6 x7] + // x[1] = [x8 x9 x10 x11 x12 x13 x14 x15] + // x[2] = [x16 x17 x18 x19 x20 x21 x22 x23] + // x[3] = [x24 x25 x26 x27 x28 x29 x30 x31] + // y[0] = [x0 x4 x8 x12 x16 x20 x24 x28] + // y[off] = [x1 x5 x9 x13 x17 x21 x25 x29] + // y[2*off] = [x2 x6 x10 x14 x18 x22 x26 x30] + // y[3*off] = [x3 x7 x11 x15 x19 x23 x27 x31] simde__m256i *y2 = y; +#ifndef __AVX512VBMI__ + register simde__m256i ytmp0, ytmp1, ytmp2, ytmp3, ytmp4, ytmp5, ytmp6, ytmp7; simde__m256i const perm_mask = simde_mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0); - ytmp0 = simde_mm256_permutevar8x32_epi32(x[0],perm_mask); // x00 x10 x01 x11 x02 x12 x03 x13 - ytmp1 = simde_mm256_permutevar8x32_epi32(x[1],perm_mask); // x20 x30 x21 x31 x22 x32 x23 x33 - ytmp2 = simde_mm256_permutevar8x32_epi32(x[2],perm_mask); // x40 x50 x41 x51 x42 x52 x43 x53 - ytmp3 = simde_mm256_permutevar8x32_epi32(x[3],perm_mask); // x60 x70 x61 x71 x62 x72 x63 x73 - ytmp4 = simde_mm256_unpacklo_epi64(ytmp0,ytmp1); // x00 x10 x20 x30 x01 x11 x21 x31 - ytmp5 = simde_mm256_unpackhi_epi64(ytmp0,ytmp1); // x02 x12 x22 x32 x03 x13 x23 x33 - ytmp6 = simde_mm256_unpacklo_epi64(ytmp2,ytmp3); // x40 x50 x60 x70 x41 x51 x61 x71 - ytmp7 = simde_mm256_unpackhi_epi64(ytmp2,ytmp3); // x42 x52 x62 x72 x43 x53 x63 x73 + ytmp0 = simde_mm256_permutevar8x32_epi32(x[0],perm_mask); // x0 x4 x2 x6 x1 x5 x3 x7 + ytmp1 = simde_mm256_permutevar8x32_epi32(x[1],perm_mask); // x8 x12 x10 x14 x9 x13 x11 x15 + ytmp2 = 
simde_mm256_permutevar8x32_epi32(x[2],perm_mask); // x16 x20 x18 x22 x17 x21 x19 x23 + + ytmp3 = simde_mm256_permutevar8x32_epi32(x[3],perm_mask); // x24 x28 x26 x30 x25 x29 x27 x31 + + ytmp4 = simde_mm256_unpacklo_epi64(ytmp0,ytmp1); // x0 x4 x8 x12 x1 x5 x9 x13 + ytmp5 = simde_mm256_unpackhi_epi64(ytmp0,ytmp1); // x2 x6 x10 x14 x3 x7 x11 x15 + ytmp6 = simde_mm256_unpacklo_epi64(ytmp2,ytmp3); // x16 x20 x24 x28 x17 x21 x25 x29 + ytmp7 = simde_mm256_unpackhi_epi64(ytmp2,ytmp3); // x18 x22 x26 x30 x19 x23 x27 x31 - *y2 = simde_mm256_insertf128_si256(ytmp4,simde_mm256_extracti128_si256(ytmp6,0),1); //x00 x10 x20 x30 x40 x50 x60 x70 + *y2 = simde_mm256_insertf128_si256(ytmp4,simde_mm256_extracti128_si256(ytmp6,0),1); // x0 x4 x8 x12 x16 x20 x24 x28 y2+=off; - *y2 = simde_mm256_insertf128_si256(ytmp6,simde_mm256_extracti128_si256(ytmp4,1),0); //x01 x11 x21 x31 x41 x51 x61 x71 + *y2 = simde_mm256_insertf128_si256(ytmp6,simde_mm256_extracti128_si256(ytmp4,1),0); // x1 x5 x9 x13 x17 x21 x25 x29 y2+=off; - *y2 = simde_mm256_insertf128_si256(ytmp5,simde_mm256_extracti128_si256(ytmp7,0),1); //x00 x10 x20 x30 x40 x50 x60 x70 + *y2 = simde_mm256_insertf128_si256(ytmp5,simde_mm256_extracti128_si256(ytmp7,0),1); // x2 x6 x10 x14 x18 x22 x26 x30 y2+=off; - *y2 = simde_mm256_insertf128_si256(ytmp7,simde_mm256_extracti128_si256(ytmp5,1),0); //x01 x11 x21 x31 x41 x51 x61 x71 + *y2 = simde_mm256_insertf128_si256(ytmp7,simde_mm256_extracti128_si256(ytmp5,1),0); // x3 x7 x11 x15 x19 x23 x27 x31 +#else + register simde__m256i ytmp0, ytmp1, ytmp2, ytmp3; + simde__m256i const perm_mask1 = simde_mm256_set_epi32(13, 9, 5, 1, 12, 8, 4, 0); + simde__m256i const perm_mask2 = simde_mm256_set_epi32(15, 11, 7, 3, 14, 10, 6, 2); + + simde__m256i const perm_mask3 = simde_mm256_set_epi64x(5, 4, 1, 0); + simde__m256i const perm_mask4 = simde_mm256_set_epi64x(7, 6, 3, 2); + ytmp0 = _mm256_permutex2var_epi32(x[0],perm_mask1,x[1]); // x0 x4 x8 x12 x1 x5 x9 x13 + ytmp1 = 
_mm256_permutex2var_epi32(x[2],perm_mask1,x[3]); // x16 x20 x24 x28 x17 x21 x25 x29 + ytmp2 = _mm256_permutex2var_epi32(x[0],perm_mask2,x[1]); // x2 x6 x10 x14 x3 x7 x11 x15 + ytmp3 = _mm256_permutex2var_epi32(x[2],perm_mask2,x[3]); // x18 x22 x26 x30 x19 x23 x27 x31 + *y2 = _mm256_permutex2var_epi64(ytmp0,perm_mask3,ytmp1); + y2+=off; + *y2 = _mm256_permutex2var_epi64(ytmp0,perm_mask4,ytmp1); + y2+=off; + *y2 = _mm256_permutex2var_epi64(ytmp2,perm_mask3,ytmp3); + y2+=off; + *y2 = _mm256_permutex2var_epi64(ytmp2,perm_mask4,ytmp3); +#endif } __attribute__((always_inline)) static inline void transpose4_ooff_simd256(simde__m256i *x, simde__m256i *y, int off) { - simde__m256i const perm_mask = simde_mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); - simde__m256i perm_tmp0, perm_tmp1; - // x[0] = [x0 x1 x2 x3 x4 x5 x6 x7] // x[1] = [x8 x9 x10 x11 x12 x13 x14] // y[0] = [x0 x2 x4 x6 x8 x10 x12 x14] // y[off] = [x1 x3 x5 x7 x9 x11 x13 x15] +#ifndef __AVX512VBMI__ + simde__m256i const perm_mask = simde_mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); + simde__m256i perm_tmp0, perm_tmp1; + perm_tmp0 = simde_mm256_permutevar8x32_epi32(x[0],perm_mask); perm_tmp1 = simde_mm256_permutevar8x32_epi32(x[1],perm_mask); y[0] = simde_mm256_insertf128_si256(perm_tmp0,simde_mm256_extracti128_si256(perm_tmp1,0),1); y[off] = simde_mm256_insertf128_si256(perm_tmp1,simde_mm256_extracti128_si256(perm_tmp0,1),0); +#else + __m256i const perm_mask1 = _mm256_set_epi32(14,12,10,8,6,4,2,0); + __m256i const perm_mask2 = _mm256_set_epi32(15,13,11,9,7,5,3,1); + y[0] = _mm256_permutex2var_epi32(x[0],perm_mask1,x[1]); + y[off] = _mm256_permutex2var_epi32(x[0],perm_mask2,x[1]); +#endif } // 16-point optimized DFT kernel @@ -1186,8 +1222,10 @@ static inline void dft16(int16_t *x,int16_t *y) __attribute__((always_inline) } #endif +//#define USE_DFT16_SHIFT + // Does two 16-point DFTS (x[0 .. 
15] is 128 LSBs of input vector, x[16..31] is in 128 MSBs) -__attribute__((always_inline)) static inline void dft16_simd256(int16_t *x, int16_t *y) +__attribute__((always_inline)) static inline void dft16_simd256(int16_t *x, int16_t *y,int scale) { simde__m256i *tw16a_256 = (simde__m256i *)tw16arep, *tw16b_256 = (simde__m256i *)tw16brep, *x256 = (simde__m256i *)x, *y256 = (simde__m256i *)y; @@ -1226,7 +1264,10 @@ __attribute__((always_inline)) static inline void dft16_simd256(int16_t *x, int1 0, 3, 2); - +#ifdef __AVX512VBMI__ + const __m256i outputshufa = _mm256_set_epi64x(5,4,1,0); + const __m256i outputshufb = _mm256_set_epi64x(7,6,3,2); +#endif // First stage : 4 Radix-4 butterflies without input twiddles x02t = simde_mm256_adds_epi16(x256[0],x256[2]); @@ -1247,14 +1288,18 @@ __attribute__((always_inline)) static inline void dft16_simd256(int16_t *x, int1 print_shorts256("xtmp2",(int16_t*)&xtmp2); print_shorts256("xtmp3",(int16_t*)&xtmp3);*/ - ytmp0 = simde_mm256_unpacklo_epi32(xtmp0,xtmp1); - ytmp1 = simde_mm256_unpackhi_epi32(xtmp0,xtmp1); - ytmp2 = simde_mm256_unpacklo_epi32(xtmp2,xtmp3); - ytmp3 = simde_mm256_unpackhi_epi32(xtmp2,xtmp3); - xtmp0 = simde_mm256_unpacklo_epi64(ytmp0,ytmp2); - xtmp1 = simde_mm256_unpackhi_epi64(ytmp0,ytmp2); - xtmp2 = simde_mm256_unpacklo_epi64(ytmp1,ytmp3); - xtmp3 = simde_mm256_unpackhi_epi64(ytmp1,ytmp3); + // x0 x1 x2 x3 x4 x5 x6 x7 + // x8 x9 x10 x11 x12 x13 x14 x15 + // x16 x17 x18 x19 x20 x21 x22 x23 + // x24 x25 x26 x27 x28 x29 x30 x31 + ytmp0 = simde_mm256_unpacklo_epi32(xtmp0,xtmp1); // x0 x8 x1 x9 x4 x12 x5 x13 + ytmp1 = simde_mm256_unpackhi_epi32(xtmp0,xtmp1); // x2 x10 x3 x11 x6 x14 x7 x15 + ytmp2 = simde_mm256_unpacklo_epi32(xtmp2,xtmp3); // x16 x24 x17 x25 x20 x28 x21 x29 + ytmp3 = simde_mm256_unpackhi_epi32(xtmp2,xtmp3); // x18 x26 x19 x27 x22 x30 x23 x31 + xtmp0 = simde_mm256_unpacklo_epi64(ytmp0,ytmp2); // x0 x8 x16 x24 x4 x12 x20 x28 + xtmp1 = simde_mm256_unpackhi_epi64(ytmp0,ytmp2); // x1 x9 x17 x25 x5 x13 
x21 x29 + xtmp2 = simde_mm256_unpacklo_epi64(ytmp1,ytmp3); // x2 x10 x18 x26 x6 x14 x22 x30 + xtmp3 = simde_mm256_unpackhi_epi64(ytmp1,ytmp3); // x3 x11 x19 x27 x7 x15 x23 x31 // Second stage : 4 Radix-4 butterflies with input twiddles xtmp1 = packed_cmult2_256(xtmp1,tw16a_256[0],tw16b_256[0]); @@ -1268,27 +1313,32 @@ __attribute__((always_inline)) static inline void dft16_simd256(int16_t *x, int1 x02t = simde_mm256_adds_epi16(xtmp0,xtmp2); x13t = simde_mm256_adds_epi16(xtmp1,xtmp3); - ytmp0 = simde_mm256_srai_epi16(simde_mm256_adds_epi16(x02t, x13t), 2); - ytmp2 = simde_mm256_srai_epi16(simde_mm256_subs_epi16(x02t, x13t), 2); + ytmp0 = simde_mm256_srai_epi16(simde_mm256_adds_epi16(x02t, x13t), scale); + ytmp2 = simde_mm256_srai_epi16(simde_mm256_subs_epi16(x02t, x13t), scale); x1_flip = simde_mm256_sign_epi16(xtmp1, *(simde__m256i *)conjugatedft); x1_flip = simde_mm256_shuffle_epi8(x1_flip,complex_shuffle); x3_flip = simde_mm256_sign_epi16(xtmp3, *(simde__m256i *)conjugatedft); x3_flip = simde_mm256_shuffle_epi8(x3_flip,complex_shuffle); x02t = simde_mm256_subs_epi16(xtmp0,xtmp2); x13t = simde_mm256_subs_epi16(x1_flip,x3_flip); - ytmp1 = simde_mm256_srai_epi16(simde_mm256_adds_epi16(x02t, x13t), 2); // x0 + x1f - x2 - x3f - ytmp3 = simde_mm256_srai_epi16(simde_mm256_subs_epi16(x02t, x13t), 2); // x0 - x1f - x2 + x3f + ytmp1 = simde_mm256_srai_epi16(simde_mm256_adds_epi16(x02t, x13t), scale); // x0 + x1f - x2 - x3f + ytmp3 = simde_mm256_srai_epi16(simde_mm256_subs_epi16(x02t, x13t), scale); // x0 - x1f - x2 + x3f // [y0 y1 y2 y3 y16 y17 y18 y19] // [y4 y5 y6 y7 y20 y21 y22 y23] // [y8 y9 y10 y11 y24 y25 y26 y27] // [y12 y13 y14 y15 y28 y29 y30 y31] - +#ifndef __AVX512VBMI__ y256[0] = simde_mm256_insertf128_si256(ytmp0,simde_mm256_extracti128_si256(ytmp1,0),1); y256[1] = simde_mm256_insertf128_si256(ytmp2,simde_mm256_extracti128_si256(ytmp3,0),1); y256[2] = simde_mm256_insertf128_si256(ytmp1,simde_mm256_extracti128_si256(ytmp0,1),0); y256[3] = 
simde_mm256_insertf128_si256(ytmp3,simde_mm256_extracti128_si256(ytmp2,1),0); - +#else + y256[0] = _mm256_permutex2var_epi64(ytmp0,outputshufa,ytmp1); + y256[1] = _mm256_permutex2var_epi64(ytmp2,outputshufa,ytmp3); + y256[2] = _mm256_permutex2var_epi64(ytmp0,outputshufb,ytmp1); + y256[3] = _mm256_permutex2var_epi64(ytmp2,outputshufb,ytmp3); +#endif // [y0 y1 y2 y3 y4 y5 y6 y7] // [y8 y9 y10 y11 y12 y13 y14 y15] // [y16 y17 y18 y19 y20 y21 y22 y23] @@ -1401,6 +1451,10 @@ __attribute__((always_inline)) static inline void idft16_simd256(int16_t *x, int 3, 2); +#ifdef __AVX512VBMI__ + const __m256i outputshufa = _mm256_set_epi64x(5,4,1,0); + const __m256i outputshufb = _mm256_set_epi64x(7,6,3,2); +#endif // First stage : 4 Radix-4 butterflies without input twiddles x02t = simde_mm256_adds_epi16(x256[0],x256[2]); @@ -1448,11 +1502,17 @@ __attribute__((always_inline)) static inline void idft16_simd256(int16_t *x, int // [y8 y9 y10 y11 y24 y25 y26 y27] // [y12 y13 y14 y15 y28 y29 y30 y31] +#ifndef __AVX512VBMI__ y256[0] = simde_mm256_insertf128_si256(ytmp0,simde_mm256_extracti128_si256(ytmp1,0),1); y256[1] = simde_mm256_insertf128_si256(ytmp2,simde_mm256_extracti128_si256(ytmp3,0),1); y256[2] = simde_mm256_insertf128_si256(ytmp1,simde_mm256_extracti128_si256(ytmp0,1),0); y256[3] = simde_mm256_insertf128_si256(ytmp3,simde_mm256_extracti128_si256(ytmp2,1),0); - +#else + y256[0] = _mm256_permutex2var_epi64(ytmp0,outputshufa,ytmp1); + y256[1] = _mm256_permutex2var_epi64(ytmp2,outputshufa,ytmp3); + y256[2] = _mm256_permutex2var_epi64(ytmp0,outputshufb,ytmp1); + y256[3] = _mm256_permutex2var_epi64(ytmp2,outputshufb,ytmp3); +#endif } // 64-point optimized DFT @@ -1520,14 +1580,13 @@ const static int16_t tw64c[96] __attribute__((aligned(32))) = { #define set1_int16_simd256(a) simde_mm256_set1_epi16(a); #define mulhi_int16_simd256(a,b) simde_mm256_mulhrs_epi16(a,b); //simde_mm256_slli_epi16(simde_mm256_mulhi_epi16(a,b),1); -void dft64(int16_t *x,int16_t *y,unsigned char scale) 
+void dft64(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[16],ytmp[16],*tw64a_256=(simd256_q15_t *)tw64a,*tw64b_256=(simd256_q15_t *)tw64b,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y; - simd256_q15_t xintl0,xintl1,xintl2,xintl3,xintl4,xintl5,xintl6,xintl7; - simd256_q15_t const perm_mask = simde_mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0); - + int scale16=0; + if (scale) scale16 = scale[1]; #ifdef D64STATS time_stats_t ts_t,ts_d,ts_b; @@ -1541,53 +1600,38 @@ void dft64(int16_t *x,int16_t *y,unsigned char scale) stop_meas(&ts_t); start_meas(&ts_d); #endif - /* - print_shorts256("x2560",(int16_t*)x256); - print_shorts256("x2561",(int16_t*)(x256+1)); - print_shorts256("x2562",(int16_t*)(x256+2)); - print_shorts256("x2563",(int16_t*)(x256+3)); - print_shorts256("x2564",(int16_t*)(x256+4)); - print_shorts256("x2565",(int16_t*)(x256+5)); - print_shorts256("x2566",(int16_t*)(x256+6)); - print_shorts256("x2567",(int16_t*)(x256+7)); - */ - xintl0 = simde_mm256_permutevar8x32_epi32(x256[0],perm_mask); // x0 x4 x1 x5 x2 x6 x3 x7 - xintl1 = simde_mm256_permutevar8x32_epi32(x256[1],perm_mask); // x8 x12 x9 x13 x10 x14 x11 x15 - xintl2 = simde_mm256_permutevar8x32_epi32(x256[2],perm_mask); // x16 x20 x17 x21 x18 x22 x19 x23 - xintl3 = simde_mm256_permutevar8x32_epi32(x256[3],perm_mask); // x24 x28 x25 x29 x26 x30 x27 x31 - xintl4 = simde_mm256_permutevar8x32_epi32(x256[4],perm_mask); // x32 x28 x25 x29 x26 x30 x27 x31 - xintl5 = simde_mm256_permutevar8x32_epi32(x256[5],perm_mask); // x40 x28 x25 x29 x26 x30 x27 x31 - xintl6 = simde_mm256_permutevar8x32_epi32(x256[6],perm_mask); // x48 x28 x25 x29 x26 x30 x27 x31 - xintl7 = simde_mm256_permutevar8x32_epi32(x256[7],perm_mask); // x56 x28 x25 x29 x26 x30 x27 x31 - /* - print_shorts256("xintl0",(int16_t*)&xintl0); - print_shorts256("xintl1",(int16_t*)&xintl1); - print_shorts256("xintl2",(int16_t*)&xintl2); - print_shorts256("xintl3",(int16_t*)&xintl3); - print_shorts256("xintl4",(int16_t*)&xintl4); - 
print_shorts256("xintl5",(int16_t*)&xintl5); - print_shorts256("xintl6",(int16_t*)&xintl6); - print_shorts256("xintl7",(int16_t*)&xintl7); - */ +#ifndef __AVX512VBMI__ + simd256_q15_t xintl0,xintl1,xintl2,xintl3,xintl4,xintl5,xintl6,xintl7; + simd256_q15_t const perm_mask = simde_mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0); + xintl0 = simde_mm256_permutevar8x32_epi32(x256[0],perm_mask); // x0 x4 x2 x6 x1 x5 x3 x7 + xintl1 = simde_mm256_permutevar8x32_epi32(x256[1],perm_mask); // x8 x12 x10 x14 x9 x13 x11 x15 + xintl2 = simde_mm256_permutevar8x32_epi32(x256[2],perm_mask); // x16 x20 x18 x22 x17 x21 x19 x23 + xintl3 = simde_mm256_permutevar8x32_epi32(x256[3],perm_mask); // x24 x28 x26 x30 x25 x29 x27 x31 + xintl4 = simde_mm256_permutevar8x32_epi32(x256[4],perm_mask); // x32 x36 x34 x38 x33 x37 x35 x39 + xintl5 = simde_mm256_permutevar8x32_epi32(x256[5],perm_mask); + xintl6 = simde_mm256_permutevar8x32_epi32(x256[6],perm_mask); + xintl7 = simde_mm256_permutevar8x32_epi32(x256[7],perm_mask); xtmp[0] = simde_mm256_unpacklo_epi64(xintl0,xintl1); // x0 x4 x8 x12 x1 x5 x9 x13 xtmp[4] = simde_mm256_unpackhi_epi64(xintl0,xintl1); // x2 x6 x10 x14 x3 x7 x11 x15 - xtmp[1] = simde_mm256_unpacklo_epi64(xintl2,xintl3); // x16 x20 x24 x28 x17 x21 x25 x29 - xtmp[5] = simde_mm256_unpackhi_epi64(xintl2,xintl3); // x18 x22 x26 x30 x19 x23 x27 x31 - xtmp[2] = simde_mm256_unpacklo_epi64(xintl4,xintl5); // x32 x36 x40 x44 x33 x37 x41 x45 - xtmp[6] = simde_mm256_unpackhi_epi64(xintl4,xintl5); // x34 x38 x42 x46 x35 x39 x43 x47 - xtmp[3] = simde_mm256_unpacklo_epi64(xintl6,xintl7); // x48 x52 x56 x60 x49 x53 x57 x61 - xtmp[7] = simde_mm256_unpackhi_epi64(xintl6,xintl7); // x50 x54 x58 x62 x51 x55 x59 x63 - /* - print_shorts256("xtmp0",(int16_t*)xtmp); - print_shorts256("xtmp1",(int16_t*)(xtmp+1)); - print_shorts256("xtmp2",(int16_t*)(xtmp+2)); - print_shorts256("xtmp3",(int16_t*)(xtmp+3)); - print_shorts256("xtmp4",(int16_t*)(xtmp+4)); - print_shorts256("xtmp5",(int16_t*)(xtmp+5)); - 
print_shorts256("xtmp6",(int16_t*)(xtmp+6)); - print_shorts256("xtmp7",(int16_t*)(xtmp+7)); - */ - dft16_simd256((int16_t*)(xtmp),(int16_t*)ytmp); + xtmp[1] = simde_mm256_unpacklo_epi64(xintl2,xintl3); + xtmp[5] = simde_mm256_unpackhi_epi64(xintl2,xintl3); + xtmp[2] = simde_mm256_unpacklo_epi64(xintl4,xintl5); + xtmp[6] = simde_mm256_unpackhi_epi64(xintl4,xintl5); + xtmp[3] = simde_mm256_unpacklo_epi64(xintl6,xintl7); + xtmp[7] = simde_mm256_unpackhi_epi64(xintl6,xintl7); +#else + __m256i const perm_mask1 = _mm256_set_epi32(13, 9, 5, 1, 12, 8, 4, 0); + __m256i const perm_mask2 = _mm256_set_epi32(15, 11, 7, 3, 14, 10, 6, 2); + xtmp[0] = _mm256_permutex2var_epi32(x256[0],perm_mask1,x256[1]); // x0 x4 x8 x12 x1 x5 x9 x13 + xtmp[1] = _mm256_permutex2var_epi32(x256[2],perm_mask1,x256[3]); // x16 x20 x24 x28 x17 x21 x25 x29 + xtmp[2] = _mm256_permutex2var_epi32(x256[4],perm_mask1,x256[5]); // x32 x36 x40 x44 x33 x37 x41 x45 + xtmp[3] = _mm256_permutex2var_epi32(x256[6],perm_mask1,x256[7]); // x48 x52 x56 x60 x49 x53 x57 x61 + xtmp[4] = _mm256_permutex2var_epi32(x256[0],perm_mask2,x256[1]); // x2 x6 x10 x14 x3 x7 x11 x15 + xtmp[5] = _mm256_permutex2var_epi32(x256[2],perm_mask2,x256[3]); // x18 x22 x26 x30 x19 x23 x27 x31 + xtmp[6] = _mm256_permutex2var_epi32(x256[4],perm_mask2,x256[5]); // x34 x38 x42 x46 x35 x39 x43 x47 + xtmp[7] = _mm256_permutex2var_epi32(x256[6],perm_mask2,x256[7]); // x50 x54 x58 x62 x51 x55 x59 x63 +#endif + dft16_simd256((int16_t*)(xtmp),(int16_t*)ytmp,scale16); // [y0 y1 y2 y3 y4 y5 y6 y7] // [y8 y9 y10 y11 y12 y13 y14 y15] // [y16 y17 y18 y19 y20 y21 y22 y23] @@ -1598,7 +1642,7 @@ void dft64(int16_t *x,int16_t *y,unsigned char scale) print_shorts256("ytmp2",(int16_t*)(ytmp+2)); print_shorts256("ytmp3",(int16_t*)(ytmp+3)); */ - dft16_simd256((int16_t*)(xtmp+4),(int16_t*)(ytmp+4)); + dft16_simd256((int16_t*)(xtmp+4),(int16_t*)(ytmp+4),scale16); // [y32 y33 y34 y35 y36 y37 y38 y39] // [y40 y41 y42 y43 y44 y45 y46 y47] // [y48 y49 y50 y51 y52 y53 y54 
y55] @@ -1649,25 +1693,24 @@ void dft64(int16_t *x,int16_t *y,unsigned char scale) #endif - if (scale>0) { - y256[0] = shiftright_int16_simd256(y256[0], 1); - y256[1] = shiftright_int16_simd256(y256[1], 1); - y256[2] = shiftright_int16_simd256(y256[2], 1); - y256[3] = shiftright_int16_simd256(y256[3], 1); - y256[4] = shiftright_int16_simd256(y256[4], 1); - y256[5] = shiftright_int16_simd256(y256[5], 1); - y256[6] = shiftright_int16_simd256(y256[6], 1); - y256[7] = shiftright_int16_simd256(y256[7], 1); + if (scale && *scale>0) { + unsigned int scalec=*scale; + y256[0] = shiftright_int16_simd256(y256[0],scalec); + y256[1] = shiftright_int16_simd256(y256[1],scalec); + y256[2] = shiftright_int16_simd256(y256[2],scalec); + y256[3] = shiftright_int16_simd256(y256[3],scalec); + y256[4] = shiftright_int16_simd256(y256[4],scalec); + y256[5] = shiftright_int16_simd256(y256[5],scalec); + y256[6] = shiftright_int16_simd256(y256[6],scalec); + y256[7] = shiftright_int16_simd256(y256[7],scalec); } } -void idft64(int16_t *x,int16_t *y,unsigned char scale) +void idft64(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[16],ytmp[16],*tw64a_256=(simd256_q15_t *)tw64,*tw64b_256=(simd256_q15_t *)tw64c,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y; - register simd256_q15_t xintl0,xintl1,xintl2,xintl3,xintl4,xintl5,xintl6,xintl7; - simd256_q15_t const perm_mask = simde_mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0); #ifdef D64STATS @@ -1684,24 +1727,37 @@ void idft64(int16_t *x,int16_t *y,unsigned char scale) start_meas(&ts_d); #endif - xintl0 = simde_mm256_permutevar8x32_epi32(x256[0],perm_mask); // x0 x4 x1 x5 x2 x6 x3 x7 - xintl1 = simde_mm256_permutevar8x32_epi32(x256[1],perm_mask); // x8 x12 x9 x13 x10 x14 x11 x15 - xintl2 = simde_mm256_permutevar8x32_epi32(x256[2],perm_mask); // x16 x20 x17 x21 x18 x22 x19 x23 - xintl3 = simde_mm256_permutevar8x32_epi32(x256[3],perm_mask); // x24 x28 x25 x29 x26 x30 x27 x31 - xintl4 = simde_mm256_permutevar8x32_epi32(x256[4],perm_mask); 
// x24 x28 x25 x29 x26 x30 x27 x31 - xintl5 = simde_mm256_permutevar8x32_epi32(x256[5],perm_mask); // x24 x28 x25 x29 x26 x30 x27 x31 - xintl6 = simde_mm256_permutevar8x32_epi32(x256[6],perm_mask); // x24 x28 x25 x29 x26 x30 x27 x31 - xintl7 = simde_mm256_permutevar8x32_epi32(x256[7],perm_mask); // x24 x28 x25 x29 x26 x30 x27 x31 - +#ifndef __AVX512VBMI__ + simd256_q15_t xintl0,xintl1,xintl2,xintl3,xintl4,xintl5,xintl6,xintl7; + simd256_q15_t const perm_mask = simde_mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0); + xintl0 = simde_mm256_permutevar8x32_epi32(x256[0],perm_mask); // x0 x4 x2 x6 x1 x5 x3 x7 + xintl1 = simde_mm256_permutevar8x32_epi32(x256[1],perm_mask); // x8 x12 x10 x14 x9 x13 x11 x15 + xintl2 = simde_mm256_permutevar8x32_epi32(x256[2],perm_mask); // x16 x20 x18 x22 x17 x21 x19 x23 + xintl3 = simde_mm256_permutevar8x32_epi32(x256[3],perm_mask); // x24 x28 x26 x30 x25 x29 x27 x31 + xintl4 = simde_mm256_permutevar8x32_epi32(x256[4],perm_mask); // x32 x36 x34 x38 x33 x37 x35 x39 + xintl5 = simde_mm256_permutevar8x32_epi32(x256[5],perm_mask); + xintl6 = simde_mm256_permutevar8x32_epi32(x256[6],perm_mask); + xintl7 = simde_mm256_permutevar8x32_epi32(x256[7],perm_mask); xtmp[0] = simde_mm256_unpacklo_epi64(xintl0,xintl1); // x0 x4 x8 x12 x1 x5 x9 x13 xtmp[4] = simde_mm256_unpackhi_epi64(xintl0,xintl1); // x2 x6 x10 x14 x3 x7 x11 x15 - xtmp[1] = simde_mm256_unpacklo_epi64(xintl2,xintl3); // x16 x20 x24 x28 x17 x21 x25 x29 - xtmp[5] = simde_mm256_unpackhi_epi64(xintl2,xintl3); // x18 x22 x26 x30 x19 x23 x27 x31 - xtmp[2] = simde_mm256_unpacklo_epi64(xintl4,xintl5); // x32 x36 x40 x44 x33 x37 x41 x45 - xtmp[6] = simde_mm256_unpackhi_epi64(xintl4,xintl5); // x34 x38 x42 x46 x35 x39 x43 x47 - xtmp[3] = simde_mm256_unpacklo_epi64(xintl6,xintl7); // x48 x52 x56 x60 x49 x53 x57 x61 - xtmp[7] = simde_mm256_unpackhi_epi64(xintl6,xintl7); // x50 x54 x58 x62 x51 x55 x59 x63 - + xtmp[1] = simde_mm256_unpacklo_epi64(xintl2,xintl3); + xtmp[5] =
simde_mm256_unpackhi_epi64(xintl2,xintl3); + xtmp[2] = simde_mm256_unpacklo_epi64(xintl4,xintl5); + xtmp[6] = simde_mm256_unpackhi_epi64(xintl4,xintl5); + xtmp[3] = simde_mm256_unpacklo_epi64(xintl6,xintl7); + xtmp[7] = simde_mm256_unpackhi_epi64(xintl6,xintl7); +#else + __m256i const perm_mask1 = _mm256_set_epi32(13, 9, 5, 1, 12, 8, 4, 0); + __m256i const perm_mask2 = _mm256_set_epi32(15, 11, 7, 3, 14, 10, 6, 2); + xtmp[0] = _mm256_permutex2var_epi32(x256[0],perm_mask1,x256[1]); // x0 x4 x8 x12 x1 x5 x9 x13 + xtmp[1] = _mm256_permutex2var_epi32(x256[2],perm_mask1,x256[3]); // x16 x20 x24 x28 x17 x21 x25 x29 + xtmp[2] = _mm256_permutex2var_epi32(x256[4],perm_mask1,x256[5]); // x32 x36 x40 x44 x33 x37 x41 x45 + xtmp[3] = _mm256_permutex2var_epi32(x256[6],perm_mask1,x256[7]); // x48 x52 x56 x60 x49 x53 x57 x61 + xtmp[4] = _mm256_permutex2var_epi32(x256[0],perm_mask2,x256[1]); // x2 x6 x10 x14 x3 x7 x11 x15 + xtmp[5] = _mm256_permutex2var_epi32(x256[2],perm_mask2,x256[3]); // x18 x22 x26 x30 x19 x23 x27 x31 + xtmp[6] = _mm256_permutex2var_epi32(x256[4],perm_mask2,x256[5]); // x34 x38 x42 x46 x35 x39 x43 x47 + xtmp[7] = _mm256_permutex2var_epi32(x256[6],perm_mask2,x256[7]); // x50 x54 x58 x62 x51 x55 x59 x63 +#endif idft16_simd256((int16_t*)(xtmp),(int16_t*)ytmp); // [y0 y1 y2 y3 y16 y17 y18 y19] @@ -1746,15 +1802,16 @@ void idft64(int16_t *x,int16_t *y,unsigned char scale) #endif - if (scale>0) { - y256[0] = shiftright_int16_simd256(y256[0],3); - y256[1] = shiftright_int16_simd256(y256[1],3); - y256[2] = shiftright_int16_simd256(y256[2],3); - y256[3] = shiftright_int16_simd256(y256[3],3); - y256[4] = shiftright_int16_simd256(y256[4],3); - y256[5] = shiftright_int16_simd256(y256[5],3); - y256[6] = shiftright_int16_simd256(y256[6],3); - y256[7] = shiftright_int16_simd256(y256[7],3); + if (scale && *scale>0) { + unsigned int scalec = *scale; + y256[0] = shiftright_int16_simd256(y256[0],scalec); + y256[1] = shiftright_int16_simd256(y256[1],scalec); + y256[2] =
shiftright_int16_simd256(y256[2],scalec); + y256[3] = shiftright_int16_simd256(y256[3],scalec); + y256[4] = shiftright_int16_simd256(y256[4],scalec); + y256[5] = shiftright_int16_simd256(y256[5],scalec); + y256[6] = shiftright_int16_simd256(y256[6],scalec); + y256[7] = shiftright_int16_simd256(y256[7],scalec); } } @@ -1765,7 +1822,7 @@ static const int16_t tw128a[128] __attribute__((aligned(32))) = { 32767,0,32727, static const int16_t tw128b[128] __attribute__((aligned(32))) = {0,32767,-1608,32727,-3212,32609,-4808,32412,-6393,32137,-7962,31785,-9512,31356,-11039,30851,-12540,30272,-14010,29621,-15447,28897,-16846,28105,-18205,27244,-19520,26318,-20788,25329,-22005,24278,-23170,23169,-24279,22004,-25330,20787,-26319,19519,-27245,18204,-28106,16845,-28898,15446,-29622,14009,-30273,12539,-30852,11038,-31357,9511,-31786,7961,-32138,6392,-32413,4807,-32610,3211,-32728,1607,-32767,0,-32728,-1608,-32610,-3212,-32413,-4808,-32138,-6393,-31786,-7962,-31357,-9512,-30852,-11039,-30273,-12540,-29622,-14010,-28898,-15447,-28106,-16846,-27245,-18205,-26319,-19520,-25330,-20788,-24279,-22005,-23170,-23170,-22005,-24279,-20788,-25330,-19520,-26319,-18205,-27245,-16846,-28106,-15447,-28898,-14010,-29622,-12540,-30273,-11039,-30852,-9512,-31357,-7962,-31786,-6393,-32138,-4808,-32413,-3212,-32610,-1608,-32728}; -void dft128(int16_t *x,int16_t *y,unsigned char scale) +void dft128(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[16],*x256 = (simd256_q15_t *)x; @@ -1789,8 +1846,10 @@ void dft128(int16_t *x,int16_t *y,unsigned char scale) LOG_M("dft128inb_256.m","dftinb",xtmp+8,64,1,1); } #endif - dft64((int16_t*)(xtmp),(int16_t*)ytmp,1); - dft64((int16_t*)(xtmp+8),(int16_t*)(ytmp+8),1); + unsigned int *scale64=NULL; + if (scale) scale64=scale+1; + dft64((int16_t*)(xtmp),(int16_t*)ytmp,scale64); + dft64((int16_t*)(xtmp+8),(int16_t*)(ytmp+8),scale64); #ifndef MR_MAIN if (LOG_DUMPFLAG(DEBUG_DFT)) { LOG_M("dft128outa_256.m","dftouta",ytmp,64,1,1); @@ -1808,25 +1867,45 @@ 
void dft128(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { - - y256[0] = mulhi_int16_simd256(y256[0],ONE_OVER_SQRT2_Q15_256); - y256[1] = mulhi_int16_simd256(y256[1],ONE_OVER_SQRT2_Q15_256); - y256[2] = mulhi_int16_simd256(y256[2],ONE_OVER_SQRT2_Q15_256); - y256[3] = mulhi_int16_simd256(y256[3],ONE_OVER_SQRT2_Q15_256); - y256[4] = mulhi_int16_simd256(y256[4],ONE_OVER_SQRT2_Q15_256); - y256[5] = mulhi_int16_simd256(y256[5],ONE_OVER_SQRT2_Q15_256); - y256[6] = mulhi_int16_simd256(y256[6],ONE_OVER_SQRT2_Q15_256); - y256[7] = mulhi_int16_simd256(y256[7],ONE_OVER_SQRT2_Q15_256); - y256[8] = mulhi_int16_simd256(y256[8],ONE_OVER_SQRT2_Q15_256); - y256[9] = mulhi_int16_simd256(y256[9],ONE_OVER_SQRT2_Q15_256); - y256[10] = mulhi_int16_simd256(y256[10],ONE_OVER_SQRT2_Q15_256); - y256[11] = mulhi_int16_simd256(y256[11],ONE_OVER_SQRT2_Q15_256); - y256[12] = mulhi_int16_simd256(y256[12],ONE_OVER_SQRT2_Q15_256); - y256[13] = mulhi_int16_simd256(y256[13],ONE_OVER_SQRT2_Q15_256); - y256[14] = mulhi_int16_simd256(y256[14],ONE_OVER_SQRT2_Q15_256); - y256[15] = mulhi_int16_simd256(y256[15],ONE_OVER_SQRT2_Q15_256); - + if (scale && *scale>0) { + + if (*scale>1) { + uint32_t scale2=*scale-1; + y256[0] = mulhi_int16_simd256(shiftright_int16_simd256(y256[0],scale2),ONE_OVER_SQRT2_Q15_256); + y256[1] = mulhi_int16_simd256(shiftright_int16_simd256(y256[1],scale2),ONE_OVER_SQRT2_Q15_256); + y256[2] = mulhi_int16_simd256(shiftright_int16_simd256(y256[2],scale2),ONE_OVER_SQRT2_Q15_256); + y256[3] = mulhi_int16_simd256(shiftright_int16_simd256(y256[3],scale2),ONE_OVER_SQRT2_Q15_256); + y256[4] = mulhi_int16_simd256(shiftright_int16_simd256(y256[4],scale2),ONE_OVER_SQRT2_Q15_256); + y256[5] = mulhi_int16_simd256(shiftright_int16_simd256(y256[5],scale2),ONE_OVER_SQRT2_Q15_256); + y256[6] = mulhi_int16_simd256(shiftright_int16_simd256(y256[6],scale2),ONE_OVER_SQRT2_Q15_256); + y256[7] = mulhi_int16_simd256(shiftright_int16_simd256(y256[7],scale2),ONE_OVER_SQRT2_Q15_256); + 
y256[8] = mulhi_int16_simd256(shiftright_int16_simd256(y256[8],scale2),ONE_OVER_SQRT2_Q15_256); + y256[9] = mulhi_int16_simd256(shiftright_int16_simd256(y256[9],scale2),ONE_OVER_SQRT2_Q15_256); + y256[10] = mulhi_int16_simd256(shiftright_int16_simd256(y256[10],scale2),ONE_OVER_SQRT2_Q15_256); + y256[11] = mulhi_int16_simd256(shiftright_int16_simd256(y256[11],scale2),ONE_OVER_SQRT2_Q15_256); + y256[12] = mulhi_int16_simd256(shiftright_int16_simd256(y256[12],scale2),ONE_OVER_SQRT2_Q15_256); + y256[13] = mulhi_int16_simd256(shiftright_int16_simd256(y256[13],scale2),ONE_OVER_SQRT2_Q15_256); + y256[14] = mulhi_int16_simd256(shiftright_int16_simd256(y256[14],scale2),ONE_OVER_SQRT2_Q15_256); + y256[15] = mulhi_int16_simd256(shiftright_int16_simd256(y256[15],scale2),ONE_OVER_SQRT2_Q15_256); + } + else { + y256[0] = mulhi_int16_simd256(y256[0],ONE_OVER_SQRT2_Q15_256); + y256[1] = mulhi_int16_simd256(y256[1],ONE_OVER_SQRT2_Q15_256); + y256[2] = mulhi_int16_simd256(y256[2],ONE_OVER_SQRT2_Q15_256); + y256[3] = mulhi_int16_simd256(y256[3],ONE_OVER_SQRT2_Q15_256); + y256[4] = mulhi_int16_simd256(y256[4],ONE_OVER_SQRT2_Q15_256); + y256[5] = mulhi_int16_simd256(y256[5],ONE_OVER_SQRT2_Q15_256); + y256[6] = mulhi_int16_simd256(y256[6],ONE_OVER_SQRT2_Q15_256); + y256[7] = mulhi_int16_simd256(y256[7],ONE_OVER_SQRT2_Q15_256); + y256[8] = mulhi_int16_simd256(y256[8],ONE_OVER_SQRT2_Q15_256); + y256[9] = mulhi_int16_simd256(y256[9],ONE_OVER_SQRT2_Q15_256); + y256[10] = mulhi_int16_simd256(y256[10],ONE_OVER_SQRT2_Q15_256); + y256[11] = mulhi_int16_simd256(y256[11],ONE_OVER_SQRT2_Q15_256); + y256[12] = mulhi_int16_simd256(y256[12],ONE_OVER_SQRT2_Q15_256); + y256[13] = mulhi_int16_simd256(y256[13],ONE_OVER_SQRT2_Q15_256); + y256[14] = mulhi_int16_simd256(y256[14],ONE_OVER_SQRT2_Q15_256); + y256[15] = mulhi_int16_simd256(y256[15],ONE_OVER_SQRT2_Q15_256); + } } #ifndef MR_MAIN if (LOG_DUMPFLAG(DEBUG_DFT)) { @@ -1836,7 +1915,7 @@ void dft128(int16_t *x,int16_t *y,unsigned char scale) #endif } 
-void idft128(int16_t *x,int16_t *y,unsigned char scale) +void idft128(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[16],*x256 = (simd256_q15_t *)x; @@ -1856,8 +1935,10 @@ void idft128(int16_t *x,int16_t *y,unsigned char scale) transpose4_ooff_simd256(x256+12,xtmp+6,8); transpose4_ooff_simd256(x256+14,xtmp+7,8); - idft64((int16_t*)(xtmp),(int16_t*)ytmp,1); - idft64((int16_t*)(xtmp+8),(int16_t*)(ytmp+8),1); + unsigned int *scale64=NULL; + if (scale) scale64=scale+1; + idft64((int16_t*)(xtmp),(int16_t*)ytmp,scale64); + idft64((int16_t*)(xtmp+8),(int16_t*)(ytmp+8),scale64); for (i=0; i<8; i++) { @@ -1869,25 +1950,45 @@ void idft128(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { - - y256[0] = mulhi_int16_simd256(y256[0],ONE_OVER_SQRT2_Q15_256); - y256[1] = mulhi_int16_simd256(y256[1],ONE_OVER_SQRT2_Q15_256); - y256[2] = mulhi_int16_simd256(y256[2],ONE_OVER_SQRT2_Q15_256); - y256[3] = mulhi_int16_simd256(y256[3],ONE_OVER_SQRT2_Q15_256); - y256[4] = mulhi_int16_simd256(y256[4],ONE_OVER_SQRT2_Q15_256); - y256[5] = mulhi_int16_simd256(y256[5],ONE_OVER_SQRT2_Q15_256); - y256[6] = mulhi_int16_simd256(y256[6],ONE_OVER_SQRT2_Q15_256); - y256[7] = mulhi_int16_simd256(y256[7],ONE_OVER_SQRT2_Q15_256); - y256[8] = mulhi_int16_simd256(y256[8],ONE_OVER_SQRT2_Q15_256); - y256[9] = mulhi_int16_simd256(y256[9],ONE_OVER_SQRT2_Q15_256); - y256[10] = mulhi_int16_simd256(y256[10],ONE_OVER_SQRT2_Q15_256); - y256[11] = mulhi_int16_simd256(y256[11],ONE_OVER_SQRT2_Q15_256); - y256[12] = mulhi_int16_simd256(y256[12],ONE_OVER_SQRT2_Q15_256); - y256[13] = mulhi_int16_simd256(y256[13],ONE_OVER_SQRT2_Q15_256); - y256[14] = mulhi_int16_simd256(y256[14],ONE_OVER_SQRT2_Q15_256); - y256[15] = mulhi_int16_simd256(y256[15],ONE_OVER_SQRT2_Q15_256); - + if (scale && *scale>0) { + + if (*scale>1) { + uint32_t scale2=*scale-1; + y256[0] = mulhi_int16_simd256(shiftright_int16_simd256(y256[0],scale2),ONE_OVER_SQRT2_Q15_256); + y256[1] = 
mulhi_int16_simd256(shiftright_int16_simd256(y256[1],scale2),ONE_OVER_SQRT2_Q15_256); + y256[2] = mulhi_int16_simd256(shiftright_int16_simd256(y256[2],scale2),ONE_OVER_SQRT2_Q15_256); + y256[3] = mulhi_int16_simd256(shiftright_int16_simd256(y256[3],scale2),ONE_OVER_SQRT2_Q15_256); + y256[4] = mulhi_int16_simd256(shiftright_int16_simd256(y256[4],scale2),ONE_OVER_SQRT2_Q15_256); + y256[5] = mulhi_int16_simd256(shiftright_int16_simd256(y256[5],scale2),ONE_OVER_SQRT2_Q15_256); + y256[6] = mulhi_int16_simd256(shiftright_int16_simd256(y256[6],scale2),ONE_OVER_SQRT2_Q15_256); + y256[7] = mulhi_int16_simd256(shiftright_int16_simd256(y256[7],scale2),ONE_OVER_SQRT2_Q15_256); + y256[8] = mulhi_int16_simd256(shiftright_int16_simd256(y256[8],scale2),ONE_OVER_SQRT2_Q15_256); + y256[9] = mulhi_int16_simd256(shiftright_int16_simd256(y256[9],scale2),ONE_OVER_SQRT2_Q15_256); + y256[10] = mulhi_int16_simd256(shiftright_int16_simd256(y256[10],scale2),ONE_OVER_SQRT2_Q15_256); + y256[11] = mulhi_int16_simd256(shiftright_int16_simd256(y256[11],scale2),ONE_OVER_SQRT2_Q15_256); + y256[12] = mulhi_int16_simd256(shiftright_int16_simd256(y256[12],scale2),ONE_OVER_SQRT2_Q15_256); + y256[13] = mulhi_int16_simd256(shiftright_int16_simd256(y256[13],scale2),ONE_OVER_SQRT2_Q15_256); + y256[14] = mulhi_int16_simd256(shiftright_int16_simd256(y256[14],scale2),ONE_OVER_SQRT2_Q15_256); + y256[15] = mulhi_int16_simd256(shiftright_int16_simd256(y256[15],scale2),ONE_OVER_SQRT2_Q15_256); + } + else { + y256[0] = mulhi_int16_simd256(y256[0],ONE_OVER_SQRT2_Q15_256); + y256[1] = mulhi_int16_simd256(y256[1],ONE_OVER_SQRT2_Q15_256); + y256[2] = mulhi_int16_simd256(y256[2],ONE_OVER_SQRT2_Q15_256); + y256[3] = mulhi_int16_simd256(y256[3],ONE_OVER_SQRT2_Q15_256); + y256[4] = mulhi_int16_simd256(y256[4],ONE_OVER_SQRT2_Q15_256); + y256[5] = mulhi_int16_simd256(y256[5],ONE_OVER_SQRT2_Q15_256); + y256[6] = mulhi_int16_simd256(y256[6],ONE_OVER_SQRT2_Q15_256); + y256[7] = 
mulhi_int16_simd256(y256[7],ONE_OVER_SQRT2_Q15_256); + y256[8] = mulhi_int16_simd256(y256[8],ONE_OVER_SQRT2_Q15_256); + y256[9] = mulhi_int16_simd256(y256[9],ONE_OVER_SQRT2_Q15_256); + y256[10] = mulhi_int16_simd256(y256[10],ONE_OVER_SQRT2_Q15_256); + y256[11] = mulhi_int16_simd256(y256[11],ONE_OVER_SQRT2_Q15_256); + y256[12] = mulhi_int16_simd256(y256[12],ONE_OVER_SQRT2_Q15_256); + y256[13] = mulhi_int16_simd256(y256[13],ONE_OVER_SQRT2_Q15_256); + y256[14] = mulhi_int16_simd256(y256[14],ONE_OVER_SQRT2_Q15_256); + y256[15] = mulhi_int16_simd256(y256[15],ONE_OVER_SQRT2_Q15_256); + } } } @@ -1906,7 +2007,7 @@ static const int16_t tw256b[384] __attribute__((aligned(32))) = {0,32767,-805,32 0,32767,-1608,32727,-3212,32609,-4808,32412,-6393,32137,-7962,31785,-9512,31356,-11039,30851,-12540,30272,-14010,29621,-15447,28897,-16846,28105,-18205,27244,-19520,26318,-20788,25329,-22005,24278,-23170,23169,-24279,22004,-25330,20787,-26319,19519,-27245,18204,-28106,16845,-28898,15446,-29622,14009,-30273,12539,-30852,11038,-31357,9511,-31786,7961,-32138,6392,-32413,4807,-32610,3211,-32728,1607,-32767,0,-32728,-1608,-32610,-3212,-32413,-4808,-32138,-6393,-31786,-7962,-31357,-9512,-30852,-11039,-30273,-12540,-29622,-14010,-28898,-15447,-28106,-16846,-27245,-18205,-26319,-19520,-25330,-20788,-24279,-22005,-23170,-23170,-22005,-24279,-20788,-25330,-19520,-26319,-18205,-27245,-16846,-28106,-15447,-28898,-14010,-29622,-12540,-30273,-11039,-30852,-9512,-31357,-7962,-31786,-6393,-32138,-4808,-32413,-3212,-32610,-1608,-32728, 
0,32767,-2411,32678,-4808,32412,-7180,31970,-9512,31356,-11793,30571,-14010,29621,-16151,28510,-18205,27244,-20160,25831,-22005,24278,-23732,22594,-25330,20787,-26790,18867,-28106,16845,-29269,14732,-30273,12539,-31114,10278,-31786,7961,-32285,5601,-32610,3211,-32758,804,-32728,-1608,-32521,-4012,-32138,-6393,-31581,-8740,-30852,-11039,-29956,-13279,-28898,-15447,-27684,-17531,-26319,-19520,-24812,-21403,-23170,-23170,-21403,-24812,-19520,-26319,-17531,-27684,-15447,-28898,-13279,-29956,-11039,-30852,-8740,-31581,-6393,-32138,-4012,-32521,-1608,-32728,804,-32758,3211,-32610,5601,-32285,7961,-31786,10278,-31114,12539,-30273,14732,-29269,16845,-28106,18867,-26790,20787,-25330,22594,-23732,24278,-22005,25831,-20160,27244,-18205,28510,-16151,29621,-14010,30571,-11793,31356,-9512,31970,-7180,32412,-4808,32678,-2411 }; -void dft256(int16_t *x,int16_t *y,unsigned char scale) +void dft256(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[32],ytmp[32],*tw256a_256p=(simd256_q15_t *)tw256a,*tw256b_256p=(simd256_q15_t *)tw256b,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y; @@ -1928,11 +2029,13 @@ void dft256(int16_t *x,int16_t *y,unsigned char scale) print_shorts256(vname,(int16_t*)(xtmp+i)); } exit(-1);*/ - - dft64((int16_t*)(xtmp),(int16_t*)(ytmp),1); - dft64((int16_t*)(xtmp+8),(int16_t*)(ytmp+8),1); - dft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),1); - dft64((int16_t*)(xtmp+24),(int16_t*)(ytmp+24),1); + + unsigned int *scale64=NULL; + if (scale) scale64=scale+1; + dft64((int16_t*)(xtmp),(int16_t*)(ytmp),scale64); + dft64((int16_t*)(xtmp+8),(int16_t*)(ytmp+8),scale64); + dft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),scale64); + dft64((int16_t*)(xtmp+24),(int16_t*)(ytmp+24),scale64); bfly4_16_256(ytmpp,ytmpp+8,ytmpp+16,ytmpp+24, @@ -1968,25 +2071,25 @@ void dft256(int16_t *x,int16_t *y,unsigned char scale) tw256a_256p+7,tw256a_256p+15,tw256a_256p+23, tw256b_256p+7,tw256b_256p+15,tw256b_256p+23); - if (scale>0) { - + if (scale && 
*scale>0) { + unsigned int scalec=*scale; for (i=0; i<2; i++) { - y256[0] = shiftright_int16_simd256(y256[0],1); - y256[1] = shiftright_int16_simd256(y256[1],1); - y256[2] = shiftright_int16_simd256(y256[2],1); - y256[3] = shiftright_int16_simd256(y256[3],1); - y256[4] = shiftright_int16_simd256(y256[4],1); - y256[5] = shiftright_int16_simd256(y256[5],1); - y256[6] = shiftright_int16_simd256(y256[6],1); - y256[7] = shiftright_int16_simd256(y256[7],1); - y256[8] = shiftright_int16_simd256(y256[8],1); - y256[9] = shiftright_int16_simd256(y256[9],1); - y256[10] = shiftright_int16_simd256(y256[10],1); - y256[11] = shiftright_int16_simd256(y256[11],1); - y256[12] = shiftright_int16_simd256(y256[12],1); - y256[13] = shiftright_int16_simd256(y256[13],1); - y256[14] = shiftright_int16_simd256(y256[14],1); - y256[15] = shiftright_int16_simd256(y256[15],1); + y256[0] = shiftright_int16_simd256(y256[0],scalec); + y256[1] = shiftright_int16_simd256(y256[1],scalec); + y256[2] = shiftright_int16_simd256(y256[2],scalec); + y256[3] = shiftright_int16_simd256(y256[3],scalec); + y256[4] = shiftright_int16_simd256(y256[4],scalec); + y256[5] = shiftright_int16_simd256(y256[5],scalec); + y256[6] = shiftright_int16_simd256(y256[6],scalec); + y256[7] = shiftright_int16_simd256(y256[7],scalec); + y256[8] = shiftright_int16_simd256(y256[8],scalec); + y256[9] = shiftright_int16_simd256(y256[9],scalec); + y256[10] = shiftright_int16_simd256(y256[10],scalec); + y256[11] = shiftright_int16_simd256(y256[11],scalec); + y256[12] = shiftright_int16_simd256(y256[12],scalec); + y256[13] = shiftright_int16_simd256(y256[13],scalec); + y256[14] = shiftright_int16_simd256(y256[14],scalec); + y256[15] = shiftright_int16_simd256(y256[15],scalec); y256+=16; } @@ -1995,7 +2098,7 @@ void dft256(int16_t *x,int16_t *y,unsigned char scale) } -void idft256(int16_t *x,int16_t *y,unsigned char scale) +void idft256(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t 
xtmp[32],ytmp[32],*tw256_256p=(simd256_q15_t *)tw256,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y; @@ -2011,10 +2114,12 @@ void idft256(int16_t *x,int16_t *y,unsigned char scale) transpose16_ooff_simd256(x256+24,xtmp+6,8); transpose16_ooff_simd256(x256+28,xtmp+7,8); - idft64((int16_t*)(xtmp),(int16_t*)(ytmp),1); - idft64((int16_t*)(xtmp+8),(int16_t*)(ytmp+8),1); - idft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),1); - idft64((int16_t*)(xtmp+24),(int16_t*)(ytmp+24),1); + unsigned int *scale64=NULL; + if (scale) scale64=scale+1; + idft64((int16_t*)(xtmp),(int16_t*)(ytmp),scale64); + idft64((int16_t*)(xtmp+8),(int16_t*)(ytmp+8),scale64); + idft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),scale64); + idft64((int16_t*)(xtmp+24),(int16_t*)(ytmp+24),scale64); ibfly4_256(ytmpp,ytmpp+8,ytmpp+16,ytmpp+24, @@ -2050,25 +2155,25 @@ void idft256(int16_t *x,int16_t *y,unsigned char scale) tw256_256p+7,tw256_256p+15,tw256_256p+23); - if (scale>0) { - + if (scale && *scale>0) { + unsigned int scalec=*scale; for (i=0; i<2; i++) { - y256[0] = shiftright_int16_simd256(y256[0],1); - y256[1] = shiftright_int16_simd256(y256[1],1); - y256[2] = shiftright_int16_simd256(y256[2],1); - y256[3] = shiftright_int16_simd256(y256[3],1); - y256[4] = shiftright_int16_simd256(y256[4],1); - y256[5] = shiftright_int16_simd256(y256[5],1); - y256[6] = shiftright_int16_simd256(y256[6],1); - y256[7] = shiftright_int16_simd256(y256[7],1); - y256[8] = shiftright_int16_simd256(y256[8],1); - y256[9] = shiftright_int16_simd256(y256[9],1); - y256[10] = shiftright_int16_simd256(y256[10],1); - y256[11] = shiftright_int16_simd256(y256[11],1); - y256[12] = shiftright_int16_simd256(y256[12],1); - y256[13] = shiftright_int16_simd256(y256[13],1); - y256[14] = shiftright_int16_simd256(y256[14],1); - y256[15] = shiftright_int16_simd256(y256[15],1); + y256[0] = shiftright_int16_simd256(y256[0],scalec); + y256[1] = shiftright_int16_simd256(y256[1],scalec); + y256[2] = 
shiftright_int16_simd256(y256[2],scalec); + y256[3] = shiftright_int16_simd256(y256[3],scalec); + y256[4] = shiftright_int16_simd256(y256[4],scalec); + y256[5] = shiftright_int16_simd256(y256[5],scalec); + y256[6] = shiftright_int16_simd256(y256[6],scalec); + y256[7] = shiftright_int16_simd256(y256[7],scalec); + y256[8] = shiftright_int16_simd256(y256[8],scalec); + y256[9] = shiftright_int16_simd256(y256[9],scalec); + y256[10] = shiftright_int16_simd256(y256[10],scalec); + y256[11] = shiftright_int16_simd256(y256[11],scalec); + y256[12] = shiftright_int16_simd256(y256[12],scalec); + y256[13] = shiftright_int16_simd256(y256[13],scalec); + y256[14] = shiftright_int16_simd256(y256[14],scalec); + y256[15] = shiftright_int16_simd256(y256[15],scalec); y256+=16; } @@ -2081,7 +2186,7 @@ static const int16_t tw512[512] __attribute__((aligned(32))) = { 32767,0,32764,-403,32757,-805,32744,-1207,32727,-1608,32705,-2010,32678,-2411,32646,-2812,32609,-3212,32567,-3612,32520,-4012,32468,-4410,32412,-4808,32350,-5206,32284,-5602,32213,-5998,32137,-6393,32056,-6787,31970,-7180,31880,-7572,31785,-7962,31684,-8352,31580,-8740,31470,-9127,31356,-9512,31236,-9896,31113,-10279,30984,-10660,30851,-11039,30713,-11417,30571,-11793,30424,-12167,30272,-12540,30116,-12910,29955,-13279,29790,-13646,29621,-14010,29446,-14373,29268,-14733,29085,-15091,28897,-15447,28706,-15800,28510,-16151,28309,-16500,28105,-16846,27896,-17190,27683,-17531,27466,-17869,27244,-18205,27019,-18538,26789,-18868,26556,-19195,26318,-19520,26077,-19841,25831,-20160,25582,-20475,25329,-20788,25072,-21097,24811,-21403,24546,-21706,24278,-22005,24006,-22302,23731,-22595,23452,-22884,23169,-23170,22883,-23453,22594,-23732,22301,-24007,22004,-24279,21705,-24547,21402,-24812,21096,-25073,20787,-25330,20474,-25583,20159,-25832,19840,-26078,19519,-26319,19194,-26557,18867,-26790,18537,-27020,18204,-27245,17868,-27467,17530,-27684,17189,-27897,16845,-28106,16499,-28310,16150,-28511,15799,-28707,15446,-28898,15090,-29086,14732,-
29269,14372,-29447,14009,-29622,13645,-29791,13278,-29956,12909,-30117,12539,-30273,12166,-30425,11792,-30572,11416,-30714,11038,-30852,10659,-30985,10278,-31114,9895,-31237,9511,-31357,9126,-31471,8739,-31581,8351,-31685,7961,-31786,7571,-31881,7179,-31971,6786,-32057,6392,-32138,5997,-32214,5601,-32285,5205,-32351,4807,-32413,4409,-32469,4011,-32521,3611,-32568,3211,-32610,2811,-32647,2410,-32679,2009,-32706,1607,-32728,1206,-32745,804,-32758,402,-32765,0,-32767,-403,-32765,-805,-32758,-1207,-32745,-1608,-32728,-2010,-32706,-2411,-32679,-2812,-32647,-3212,-32610,-3612,-32568,-4012,-32521,-4410,-32469,-4808,-32413,-5206,-32351,-5602,-32285,-5998,-32214,-6393,-32138,-6787,-32057,-7180,-31971,-7572,-31881,-7962,-31786,-8352,-31685,-8740,-31581,-9127,-31471,-9512,-31357,-9896,-31237,-10279,-31114,-10660,-30985,-11039,-30852,-11417,-30714,-11793,-30572,-12167,-30425,-12540,-30273,-12910,-30117,-13279,-29956,-13646,-29791,-14010,-29622,-14373,-29447,-14733,-29269,-15091,-29086,-15447,-28898,-15800,-28707,-16151,-28511,-16500,-28310,-16846,-28106,-17190,-27897,-17531,-27684,-17869,-27467,-18205,-27245,-18538,-27020,-18868,-26790,-19195,-26557,-19520,-26319,-19841,-26078,-20160,-25832,-20475,-25583,-20788,-25330,-21097,-25073,-21403,-24812,-21706,-24547,-22005,-24279,-22302,-24007,-22595,-23732,-22884,-23453,-23170,-23170,-23453,-22884,-23732,-22595,-24007,-22302,-24279,-22005,-24547,-21706,-24812,-21403,-25073,-21097,-25330,-20788,-25583,-20475,-25832,-20160,-26078,-19841,-26319,-19520,-26557,-19195,-26790,-18868,-27020,-18538,-27245,-18205,-27467,-17869,-27684,-17531,-27897,-17190,-28106,-16846,-28310,-16500,-28511,-16151,-28707,-15800,-28898,-15447,-29086,-15091,-29269,-14733,-29447,-14373,-29622,-14010,-29791,-13646,-29956,-13279,-30117,-12910,-30273,-12540,-30425,-12167,-30572,-11793,-30714,-11417,-30852,-11039,-30985,-10660,-31114,-10279,-31237,-9896,-31357,-9512,-31471,-9127,-31581,-8740,-31685,-8352,-31786,-7962,-31881,-7572,-31971,-7180,-32057,-6787,-32138,-6393,
-32214,-5998,-32285,-5602,-32351,-5206,-32413,-4808,-32469,-4410,-32521,-4012,-32568,-3612,-32610,-3212,-32647,-2812,-32679,-2411,-32706,-2010,-32728,-1608,-32745,-1207,-32758,-805,-32765,-403 }; -void dft512(int16_t *x,int16_t *y,unsigned char scale) +void dft512(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[64],*x256 = (simd256_q15_t *)x; @@ -2124,9 +2229,12 @@ void dft512(int16_t *x,int16_t *y,unsigned char scale) transpose4_ooff_simd256(x256+58,xtmp+29,32); transpose4_ooff_simd256(x256+60,xtmp+30,32); transpose4_ooff_simd256(x256+62,xtmp+31,32); - - dft256((int16_t*)(xtmp),(int16_t*)ytmp,1); - dft256((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),1); + + + unsigned int *scale256=NULL; + if (scale) scale256=scale+1; + dft256((int16_t*)(xtmp),(int16_t*)ytmp,scale256); + dft256((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),scale256); for (i=0; i<32; i++) { @@ -2138,32 +2246,56 @@ void dft512(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { - - for (i=0;i<4;i++) { - y256[0] = mulhi_int16_simd256(y256[0],ONE_OVER_SQRT2_Q15_256); - y256[1] = mulhi_int16_simd256(y256[1],ONE_OVER_SQRT2_Q15_256); - y256[2] = mulhi_int16_simd256(y256[2],ONE_OVER_SQRT2_Q15_256); - y256[3] = mulhi_int16_simd256(y256[3],ONE_OVER_SQRT2_Q15_256); - y256[4] = mulhi_int16_simd256(y256[4],ONE_OVER_SQRT2_Q15_256); - y256[5] = mulhi_int16_simd256(y256[5],ONE_OVER_SQRT2_Q15_256); - y256[6] = mulhi_int16_simd256(y256[6],ONE_OVER_SQRT2_Q15_256); - y256[7] = mulhi_int16_simd256(y256[7],ONE_OVER_SQRT2_Q15_256); - y256[8] = mulhi_int16_simd256(y256[8],ONE_OVER_SQRT2_Q15_256); - y256[9] = mulhi_int16_simd256(y256[9],ONE_OVER_SQRT2_Q15_256); - y256[10] = mulhi_int16_simd256(y256[10],ONE_OVER_SQRT2_Q15_256); - y256[11] = mulhi_int16_simd256(y256[11],ONE_OVER_SQRT2_Q15_256); - y256[12] = mulhi_int16_simd256(y256[12],ONE_OVER_SQRT2_Q15_256); - y256[13] = mulhi_int16_simd256(y256[13],ONE_OVER_SQRT2_Q15_256); - y256[14] = mulhi_int16_simd256(y256[14],ONE_OVER_SQRT2_Q15_256); - 
y256[15] = mulhi_int16_simd256(y256[15],ONE_OVER_SQRT2_Q15_256); - y256+=16; + if (scale && *scale>0) { + y256p = y256; + if (*scale>1) { + uint32_t scale2=*scale-1; + for (i=0; i<4; i++) { + y256p[0] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[0],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[1] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[1],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[2] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[2],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[3] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[3],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[4] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[4],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[5] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[5],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[6] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[6],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[7] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[7],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[8] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[8],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[9] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[9],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[10] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[10],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[11] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[11],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[12] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[12],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[13] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[13],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[14] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[14],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[15] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[15],scale2),ONE_OVER_SQRT2_Q15_256); + y256p+=16; + } + } + else { + for (i=0;i<4;i++) { + y256p[0] = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_256); + y256p[1] = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_256); + y256p[2] 
= mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_256); + y256p[3] = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_256); + y256p[4] = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_256); + y256p[5] = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_256); + y256p[6] = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_256); + y256p[7] = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_256); + y256p[8] = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_256); + y256p[9] = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_256); + y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_256); + y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_256); + y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_256); + y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_256); + y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_256); + y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_256); + y256p+=16; + } } } } -void idft512(int16_t *x,int16_t *y,unsigned char scale) +void idft512(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[64],*x256 = (simd256_q15_t *)x; @@ -2207,8 +2339,10 @@ void idft512(int16_t *x,int16_t *y,unsigned char scale) transpose4_ooff_simd256(x256+60,xtmp+30,32); transpose4_ooff_simd256(x256+62,xtmp+31,32); - idft256((int16_t*)(xtmp),(int16_t*)ytmp,1); - idft256((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),1); + unsigned int *scale256=NULL; + if (scale) scale256=scale+1; + idft256((int16_t*)(xtmp),(int16_t*)ytmp,scale256); + idft256((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),scale256); for (i=0; i<32; i++) { @@ -2220,34 +2354,57 @@ void idft512(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { - - for (i=0;i<4;i++) { - y256[0] = mulhi_int16_simd256(y256[0],ONE_OVER_SQRT2_Q15_256); - y256[1] = mulhi_int16_simd256(y256[1],ONE_OVER_SQRT2_Q15_256); - y256[2] = mulhi_int16_simd256(y256[2],ONE_OVER_SQRT2_Q15_256); - y256[3] = mulhi_int16_simd256(y256[3],ONE_OVER_SQRT2_Q15_256); - y256[4] 
= mulhi_int16_simd256(y256[4],ONE_OVER_SQRT2_Q15_256); - y256[5] = mulhi_int16_simd256(y256[5],ONE_OVER_SQRT2_Q15_256); - y256[6] = mulhi_int16_simd256(y256[6],ONE_OVER_SQRT2_Q15_256); - y256[7] = mulhi_int16_simd256(y256[7],ONE_OVER_SQRT2_Q15_256); - y256[8] = mulhi_int16_simd256(y256[8],ONE_OVER_SQRT2_Q15_256); - y256[9] = mulhi_int16_simd256(y256[9],ONE_OVER_SQRT2_Q15_256); - y256[10] = mulhi_int16_simd256(y256[10],ONE_OVER_SQRT2_Q15_256); - y256[11] = mulhi_int16_simd256(y256[11],ONE_OVER_SQRT2_Q15_256); - y256[12] = mulhi_int16_simd256(y256[12],ONE_OVER_SQRT2_Q15_256); - y256[13] = mulhi_int16_simd256(y256[13],ONE_OVER_SQRT2_Q15_256); - y256[14] = mulhi_int16_simd256(y256[14],ONE_OVER_SQRT2_Q15_256); - y256[15] = mulhi_int16_simd256(y256[15],ONE_OVER_SQRT2_Q15_256); - y256+=16; + if (scale && *scale>0) { + y256p = y256; + if (*scale>1) { + uint32_t scale2=*scale-1; + for (i=0; i<4; i++) { + y256p[0] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[0],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[1] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[1],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[2] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[2],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[3] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[3],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[4] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[4],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[5] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[5],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[6] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[6],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[7] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[7],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[8] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[8],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[9] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[9],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[10] = 
mulhi_int16_simd256(shiftright_int16_simd256(y256p[10],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[11] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[11],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[12] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[12],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[13] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[13],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[14] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[14],scale2),ONE_OVER_SQRT2_Q15_256); + y256p[15] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[15],scale2),ONE_OVER_SQRT2_Q15_256); + y256p+=16; + } + } + else { + for (i=0; i<4; i++) { + y256p[0] = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_256); + y256p[1] = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_256); + y256p[2] = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_256); + y256p[3] = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_256); + y256p[4] = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_256); + y256p[5] = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_256); + y256p[6] = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_256); + y256p[7] = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_256); + y256p[8] = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_256); + y256p[9] = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_256); + y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_256); + y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_256); + y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_256); + y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_256); + y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_256); + y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_256); + y256p+=16; + } } } - } int16_t tw1024[1536] __attribute__((aligned(32))); -void dft1024(int16_t *x,int16_t *y,unsigned char scale) +void dft1024(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[128],ytmp[128],*tw1024_256p=(simd256_q15_t 
*)tw1024,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y; @@ -2259,10 +2416,12 @@ void dft1024(int16_t *x,int16_t *y,unsigned char scale) } - dft256((int16_t*)(xtmp),(int16_t*)(ytmp),1); - dft256((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),1); - dft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),1); - dft256((int16_t*)(xtmp+96),(int16_t*)(ytmp+96),1); + unsigned int *scale256=NULL; + if (scale) scale256=scale+1; + dft256((int16_t*)(xtmp),(int16_t*)(ytmp),scale256); + dft256((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),scale256); + dft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),scale256); + dft256((int16_t*)(xtmp+96),(int16_t*)(ytmp+96),scale256); for (i=0; i<32; i++) { bfly4_256(ytmpp,ytmpp+32,ytmpp+64,ytmpp+96, @@ -2273,25 +2432,25 @@ void dft1024(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { - + if (scale && *scale>0) { + unsigned int scalec=*scale; for (i=0; i<8; i++) { - y256[0] = shiftright_int16_simd256(y256[0],1); - y256[1] = shiftright_int16_simd256(y256[1],1); - y256[2] = shiftright_int16_simd256(y256[2],1); - y256[3] = shiftright_int16_simd256(y256[3],1); - y256[4] = shiftright_int16_simd256(y256[4],1); - y256[5] = shiftright_int16_simd256(y256[5],1); - y256[6] = shiftright_int16_simd256(y256[6],1); - y256[7] = shiftright_int16_simd256(y256[7],1); - y256[8] = shiftright_int16_simd256(y256[8],1); - y256[9] = shiftright_int16_simd256(y256[9],1); - y256[10] = shiftright_int16_simd256(y256[10],1); - y256[11] = shiftright_int16_simd256(y256[11],1); - y256[12] = shiftright_int16_simd256(y256[12],1); - y256[13] = shiftright_int16_simd256(y256[13],1); - y256[14] = shiftright_int16_simd256(y256[14],1); - y256[15] = shiftright_int16_simd256(y256[15],1); + y256[0] = shiftright_int16_simd256(y256[0],scalec); + y256[1] = shiftright_int16_simd256(y256[1],scalec); + y256[2] = shiftright_int16_simd256(y256[2],scalec); + y256[3] = shiftright_int16_simd256(y256[3],scalec); + y256[4] = shiftright_int16_simd256(y256[4],scalec); + 
y256[5] = shiftright_int16_simd256(y256[5],scalec); + y256[6] = shiftright_int16_simd256(y256[6],scalec); + y256[7] = shiftright_int16_simd256(y256[7],scalec); + y256[8] = shiftright_int16_simd256(y256[8],scalec); + y256[9] = shiftright_int16_simd256(y256[9],scalec); + y256[10] = shiftright_int16_simd256(y256[10],scalec); + y256[11] = shiftright_int16_simd256(y256[11],scalec); + y256[12] = shiftright_int16_simd256(y256[12],scalec); + y256[13] = shiftright_int16_simd256(y256[13],scalec); + y256[14] = shiftright_int16_simd256(y256[14],scalec); + y256[15] = shiftright_int16_simd256(y256[15],scalec); y256+=16; } @@ -2300,7 +2459,7 @@ void dft1024(int16_t *x,int16_t *y,unsigned char scale) } -void idft1024(int16_t *x,int16_t *y,unsigned char scale) +void idft1024(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[128],ytmp[128],*tw1024_256p=(simd256_q15_t *)tw1024,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y; @@ -2312,10 +2471,12 @@ void idft1024(int16_t *x,int16_t *y,unsigned char scale) } - idft256((int16_t*)(xtmp),(int16_t*)(ytmp),1); - idft256((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),1); - idft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),1); - idft256((int16_t*)(xtmp+96),(int16_t*)(ytmp+96),1); + unsigned int *scale256=NULL; + if (scale) scale256=scale+1; + idft256((int16_t*)(xtmp),(int16_t*)(ytmp),scale256); + idft256((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),scale256); + idft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),scale256); + idft256((int16_t*)(xtmp+96),(int16_t*)(ytmp+96),scale256); for (i=0; i<32; i++) { ibfly4_256(ytmpp,ytmpp+32,ytmpp+64,ytmpp+96, @@ -2326,25 +2487,25 @@ void idft1024(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { - + if (scale && *scale>0) { + unsigned int scalec=*scale; for (i=0; i<8; i++) { - y256[0] = shiftright_int16_simd256(y256[0],1); - y256[1] = shiftright_int16_simd256(y256[1],1); - y256[2] = shiftright_int16_simd256(y256[2],1); - y256[3] = 
shiftright_int16_simd256(y256[3],1); - y256[4] = shiftright_int16_simd256(y256[4],1); - y256[5] = shiftright_int16_simd256(y256[5],1); - y256[6] = shiftright_int16_simd256(y256[6],1); - y256[7] = shiftright_int16_simd256(y256[7],1); - y256[8] = shiftright_int16_simd256(y256[8],1); - y256[9] = shiftright_int16_simd256(y256[9],1); - y256[10] = shiftright_int16_simd256(y256[10],1); - y256[11] = shiftright_int16_simd256(y256[11],1); - y256[12] = shiftright_int16_simd256(y256[12],1); - y256[13] = shiftright_int16_simd256(y256[13],1); - y256[14] = shiftright_int16_simd256(y256[14],1); - y256[15] = shiftright_int16_simd256(y256[15],1); + y256[0] = shiftright_int16_simd256(y256[0],scalec); + y256[1] = shiftright_int16_simd256(y256[1],scalec); + y256[2] = shiftright_int16_simd256(y256[2],scalec); + y256[3] = shiftright_int16_simd256(y256[3],scalec); + y256[4] = shiftright_int16_simd256(y256[4],scalec); + y256[5] = shiftright_int16_simd256(y256[5],scalec); + y256[6] = shiftright_int16_simd256(y256[6],scalec); + y256[7] = shiftright_int16_simd256(y256[7],scalec); + y256[8] = shiftright_int16_simd256(y256[8],scalec); + y256[9] = shiftright_int16_simd256(y256[9],scalec); + y256[10] = shiftright_int16_simd256(y256[10],scalec); + y256[11] = shiftright_int16_simd256(y256[11],scalec); + y256[12] = shiftright_int16_simd256(y256[12],scalec); + y256[13] = shiftright_int16_simd256(y256[13],scalec); + y256[14] = shiftright_int16_simd256(y256[14],scalec); + y256[15] = shiftright_int16_simd256(y256[15],scalec); y256+=16; } @@ -2355,7 +2516,7 @@ void idft1024(int16_t *x,int16_t *y,unsigned char scale) int16_t tw2048[2048] __attribute__((aligned(32))); -void dft2048(int16_t *x,int16_t *y,unsigned char scale) +void dft2048(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[256],*xtmpp,*x256 = (simd256_q15_t *)x; @@ -2404,8 +2565,10 @@ void dft2048(int16_t *x,int16_t *y,unsigned char scale) xtmpp+=32; } - dft1024((int16_t*)(xtmp),(int16_t*)ytmp,1); - 
dft1024((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),1); + unsigned int *scale1024=NULL; + if (scale) scale1024=scale+1; + dft1024((int16_t*)(xtmp),(int16_t*)ytmp,scale1024); + dft1024((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),scale1024); for (i=0; i<128; i++) { @@ -2417,33 +2580,55 @@ void dft2048(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { y256p = y256; - - for (i=0; i<16; i++) { - y256p[0] = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128); - y256p[1] = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128); - y256p[2] = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128); - y256p[3] = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128); - y256p[4] = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128); - y256p[5] = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128); - y256p[6] = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128); - y256p[7] = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128); - y256p[8] = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128); - y256p[9] = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128); - y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128); - y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128); - y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128); - y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128); - y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128); - y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128); - y256p+=16; + if (*scale>1) { + uint32_t scale2=*scale-1; + for (i=0; i<16; i++) { + y256p[0] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[0],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[1] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[1],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[2] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[2],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[3] = 
mulhi_int16_simd256(shiftright_int16_simd256(y256p[3],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[4] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[4],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[5] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[5],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[6] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[6],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[7] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[7],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[8] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[8],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[9] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[9],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[10] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[10],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[11] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[11],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[12] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[12],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[13] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[13],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[14] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[14],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[15] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[15],scale2),ONE_OVER_SQRT2_Q15_128); + y256p+=16; + } + } + else { + for (i=0; i<16; i++) { + y256p[0] = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128); + y256p[1] = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128); + y256p[2] = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128); + y256p[3] = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128); + y256p[4] = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128); + y256p[5] = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128); + y256p[6] = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128); + y256p[7] = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128); + y256p[8] = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128); + y256p[9] = 
mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128); + y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128); + y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128); + y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128); + y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128); + y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128); + y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128); + y256p+=16; + } } } - } -void idft2048(int16_t *x,int16_t *y,unsigned char scale) +void idft2048(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[256],*xtmpp,*x256 = (simd256_q15_t *)x; @@ -2491,8 +2676,10 @@ void idft2048(int16_t *x,int16_t *y,unsigned char scale) xtmpp+=32; } - idft1024((int16_t*)(xtmp),(int16_t*)ytmp,1); - idft1024((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),1); + unsigned int *scale1024=NULL; + if (scale) scale1024=scale+1; + idft1024((int16_t*)(xtmp),(int16_t*)ytmp,scale1024); + idft1024((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),scale1024); for (i=0; i<128; i++) { @@ -2504,27 +2691,50 @@ void idft2048(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { y256p = y256; - - for (i=0; i<16; i++) { - y256p[0] = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128); - y256p[1] = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128); - y256p[2] = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128); - y256p[3] = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128); - y256p[4] = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128); - y256p[5] = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128); - y256p[6] = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128); - y256p[7] = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128); - y256p[8] = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128); - y256p[9] = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128); - y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128); - 
y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128); - y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128); - y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128); - y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128); - y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128); - y256p+=16; + if (*scale>1) { + uint32_t scale2=*scale-1; + for (i=0; i<16; i++) { + y256p[0] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[0],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[1] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[1],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[2] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[2],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[3] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[3],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[4] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[4],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[5] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[5],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[6] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[6],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[7] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[7],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[8] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[8],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[9] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[9],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[10] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[10],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[11] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[11],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[12] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[12],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[13] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[13],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[14] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[14],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[15] = 
mulhi_int16_simd256(shiftright_int16_simd256(y256p[15],scale2),ONE_OVER_SQRT2_Q15_128); + y256p+=16; + } + } + else { + for (i=0; i<16; i++) { + y256p[0] = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128); + y256p[1] = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128); + y256p[2] = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128); + y256p[3] = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128); + y256p[4] = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128); + y256p[5] = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128); + y256p[6] = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128); + y256p[7] = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128); + y256p[8] = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128); + y256p[9] = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128); + y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128); + y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128); + y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128); + y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128); + y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128); + y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128); + y256p+=16; + } } } @@ -2532,7 +2742,7 @@ void idft2048(int16_t *x,int16_t *y,unsigned char scale) int16_t tw4096[3*2*1024]; -void dft4096(int16_t *x,int16_t *y,unsigned char scale) +void dft4096(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[512],ytmp[512],*tw4096_256p=(simd256_q15_t *)tw4096,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y; @@ -2544,10 +2754,12 @@ void dft4096(int16_t *x,int16_t *y,unsigned char scale) } - dft1024((int16_t*)(xtmp),(int16_t*)(ytmp),1); - dft1024((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),1); - dft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),1); - dft1024((int16_t*)(xtmp+384),(int16_t*)(ytmp+384),1); + unsigned int *scale1024=NULL; + if (scale) scale1024=scale+1; + 
dft1024((int16_t*)(xtmp),(int16_t*)(ytmp),scale1024); + dft1024((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),scale1024); + dft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),scale1024); + dft1024((int16_t*)(xtmp+384),(int16_t*)(ytmp+384),scale1024); for (i=0; i<128; i++) { bfly4_256(ytmpp,ytmpp+128,ytmpp+256,ytmpp+384, @@ -2558,25 +2770,25 @@ void dft4096(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { - + if (scale && *scale>0) { + unsigned int scalec=*scale; for (i=0; i<32; i++) { - y256[0] = shiftright_int16_simd256(y256[0],1); - y256[1] = shiftright_int16_simd256(y256[1],1); - y256[2] = shiftright_int16_simd256(y256[2],1); - y256[3] = shiftright_int16_simd256(y256[3],1); - y256[4] = shiftright_int16_simd256(y256[4],1); - y256[5] = shiftright_int16_simd256(y256[5],1); - y256[6] = shiftright_int16_simd256(y256[6],1); - y256[7] = shiftright_int16_simd256(y256[7],1); - y256[8] = shiftright_int16_simd256(y256[8],1); - y256[9] = shiftright_int16_simd256(y256[9],1); - y256[10] = shiftright_int16_simd256(y256[10],1); - y256[11] = shiftright_int16_simd256(y256[11],1); - y256[12] = shiftright_int16_simd256(y256[12],1); - y256[13] = shiftright_int16_simd256(y256[13],1); - y256[14] = shiftright_int16_simd256(y256[14],1); - y256[15] = shiftright_int16_simd256(y256[15],1); + y256[0] = shiftright_int16_simd256(y256[0],scalec); + y256[1] = shiftright_int16_simd256(y256[1],scalec); + y256[2] = shiftright_int16_simd256(y256[2],scalec); + y256[3] = shiftright_int16_simd256(y256[3],scalec); + y256[4] = shiftright_int16_simd256(y256[4],scalec); + y256[5] = shiftright_int16_simd256(y256[5],scalec); + y256[6] = shiftright_int16_simd256(y256[6],scalec); + y256[7] = shiftright_int16_simd256(y256[7],scalec); + y256[8] = shiftright_int16_simd256(y256[8],scalec); + y256[9] = shiftright_int16_simd256(y256[9],scalec); + y256[10] = shiftright_int16_simd256(y256[10],scalec); + y256[11] = shiftright_int16_simd256(y256[11],scalec); + y256[12] = 
shiftright_int16_simd256(y256[12],scalec); + y256[13] = shiftright_int16_simd256(y256[13],scalec); + y256[14] = shiftright_int16_simd256(y256[14],scalec); + y256[15] = shiftright_int16_simd256(y256[15],scalec); y256+=16; } @@ -2585,7 +2797,7 @@ void dft4096(int16_t *x,int16_t *y,unsigned char scale) } -void idft4096(int16_t *x,int16_t *y,unsigned char scale) +void idft4096(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[512],ytmp[512],*tw4096_256p=(simd256_q15_t *)tw4096,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y; @@ -2597,10 +2809,12 @@ void idft4096(int16_t *x,int16_t *y,unsigned char scale) } - idft1024((int16_t*)(xtmp),(int16_t*)(ytmp),1); - idft1024((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),1); - idft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),1); - idft1024((int16_t*)(xtmp+384),(int16_t*)(ytmp+384),1); + unsigned int *scale1024=NULL; + if (scale) scale1024=scale+1; + idft1024((int16_t*)(xtmp),(int16_t*)(ytmp),scale1024); + idft1024((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),scale1024); + idft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),scale1024); + idft1024((int16_t*)(xtmp+384),(int16_t*)(ytmp+384),scale1024); for (i=0; i<128; i++) { ibfly4_256(ytmpp,ytmpp+128,ytmpp+256,ytmpp+384, @@ -2611,25 +2825,25 @@ void idft4096(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { - + if (scale && *scale>0) { + unsigned int scalec=*scale; for (i=0; i<32; i++) { - y256[0] = shiftright_int16_simd256(y256[0],1); - y256[1] = shiftright_int16_simd256(y256[1],1); - y256[2] = shiftright_int16_simd256(y256[2],1); - y256[3] = shiftright_int16_simd256(y256[3],1); - y256[4] = shiftright_int16_simd256(y256[4],1); - y256[5] = shiftright_int16_simd256(y256[5],1); - y256[6] = shiftright_int16_simd256(y256[6],1); - y256[7] = shiftright_int16_simd256(y256[7],1); - y256[8] = shiftright_int16_simd256(y256[8],1); - y256[9] = shiftright_int16_simd256(y256[9],1); - y256[10] = shiftright_int16_simd256(y256[10],1); - 
y256[11] = shiftright_int16_simd256(y256[11],1); - y256[12] = shiftright_int16_simd256(y256[12],1); - y256[13] = shiftright_int16_simd256(y256[13],1); - y256[14] = shiftright_int16_simd256(y256[14],1); - y256[15] = shiftright_int16_simd256(y256[15],1); + y256[0] = shiftright_int16_simd256(y256[0],scalec); + y256[1] = shiftright_int16_simd256(y256[1],scalec); + y256[2] = shiftright_int16_simd256(y256[2],scalec); + y256[3] = shiftright_int16_simd256(y256[3],scalec); + y256[4] = shiftright_int16_simd256(y256[4],scalec); + y256[5] = shiftright_int16_simd256(y256[5],scalec); + y256[6] = shiftright_int16_simd256(y256[6],scalec); + y256[7] = shiftright_int16_simd256(y256[7],scalec); + y256[8] = shiftright_int16_simd256(y256[8],scalec); + y256[9] = shiftright_int16_simd256(y256[9],scalec); + y256[10] = shiftright_int16_simd256(y256[10],scalec); + y256[11] = shiftright_int16_simd256(y256[11],scalec); + y256[12] = shiftright_int16_simd256(y256[12],scalec); + y256[13] = shiftright_int16_simd256(y256[13],scalec); + y256[14] = shiftright_int16_simd256(y256[14],scalec); + y256[15] = shiftright_int16_simd256(y256[15],scalec); y256+=16; } @@ -2640,7 +2854,7 @@ void idft4096(int16_t *x,int16_t *y,unsigned char scale) int16_t tw8192[2*4096] __attribute__((aligned(32))); -void dft8192(int16_t *x,int16_t *y,unsigned char scale) +void dft8192(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[1024],*xtmpp,*x256 = (simd256_q15_t *)x; @@ -2689,8 +2903,10 @@ void dft8192(int16_t *x,int16_t *y,unsigned char scale) xtmpp+=32; } - dft4096((int16_t*)(xtmp),(int16_t*)ytmp,1); - dft4096((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),1); + unsigned int *scale4096=NULL; + if (scale) scale4096=scale+1; + dft4096((int16_t*)(xtmp),(int16_t*)ytmp,scale4096); + dft4096((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),scale4096); for (i=0; i<512; i++) { @@ -2702,33 +2918,56 @@ void dft8192(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { y256p = y256; - 
- for (i=0; i<64; i++) { - y256p[0] = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128); - y256p[1] = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128); - y256p[2] = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128); - y256p[3] = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128); - y256p[4] = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128); - y256p[5] = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128); - y256p[6] = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128); - y256p[7] = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128); - y256p[8] = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128); - y256p[9] = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128); - y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128); - y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128); - y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128); - y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128); - y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128); - y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128); - y256p+=16; + if (*scale > 1) { + uint32_t scale2=*scale-1; + for (i=0; i<64; i++) { + y256p[0] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[0],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[1] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[1],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[2] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[2],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[3] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[3],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[4] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[4],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[5] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[5],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[6] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[6],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[7] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[7],scale2),ONE_OVER_SQRT2_Q15_128); + 
y256p[8] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[8],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[9] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[9],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[10] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[10],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[11] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[11],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[12] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[12],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[13] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[13],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[14] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[14],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[15] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[15],scale2),ONE_OVER_SQRT2_Q15_128); + y256p+=16; + } + } + else { + for (i=0; i<64; i++) { + y256p[0] = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128); + y256p[1] = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128); + y256p[2] = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128); + y256p[3] = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128); + y256p[4] = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128); + y256p[5] = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128); + y256p[6] = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128); + y256p[7] = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128); + y256p[8] = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128); + y256p[9] = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128); + y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128); + y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128); + y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128); + y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128); + y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128); + y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128); + y256p+=16; + } } } } -void idft8192(int16_t 
*x,int16_t *y,unsigned char scale) +void idft8192(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[1024],*xtmpp,*x256 = (simd256_q15_t *)x; @@ -2776,8 +3015,10 @@ void idft8192(int16_t *x,int16_t *y,unsigned char scale) xtmpp+=32; } - idft4096((int16_t*)(xtmp),(int16_t*)ytmp,1); - idft4096((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),1); + unsigned int *scale4096=NULL; + if (scale) scale4096=scale+1; + idft4096((int16_t*)(xtmp),(int16_t*)ytmp,scale4096); + idft4096((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),scale4096); for (i=0; i<512; i++) { @@ -2789,27 +3030,50 @@ void idft8192(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { y256p = y256; - - for (i=0; i<64; i++) { - y256p[0] = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128); - y256p[1] = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128); - y256p[2] = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128); - y256p[3] = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128); - y256p[4] = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128); - y256p[5] = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128); - y256p[6] = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128); - y256p[7] = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128); - y256p[8] = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128); - y256p[9] = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128); - y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128); - y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128); - y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128); - y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128); - y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128); - y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128); - y256p+=16; + if (*scale > 1) { + uint32_t scale2=*scale-1; + for (i=0; i<64; i++) { + y256p[0] = 
mulhi_int16_simd256(shiftright_int16_simd256(y256p[0],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[1] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[1],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[2] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[2],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[3] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[3],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[4] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[4],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[5] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[5],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[6] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[6],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[7] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[7],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[8] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[8],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[9] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[9],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[10] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[10],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[11] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[11],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[12] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[12],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[13] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[13],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[14] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[14],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[15] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[15],scale2),ONE_OVER_SQRT2_Q15_128); + y256p+=16; + } + } + else { + for (i=0; i<64; i++) { + y256p[0] = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128); + y256p[1] = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128); + y256p[2] = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128); + y256p[3] = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128); + y256p[4] = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128); + y256p[5] 
= mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128); + y256p[6] = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128); + y256p[7] = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128); + y256p[8] = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128); + y256p[9] = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128); + y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128); + y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128); + y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128); + y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128); + y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128); + y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128); + y256p+=16; + } } } @@ -2817,7 +3081,7 @@ void idft8192(int16_t *x,int16_t *y,unsigned char scale) int16_t tw16384[3*2*4096] __attribute__((aligned(32))); -void dft16384(int16_t *x,int16_t *y,unsigned char scale) +void dft16384(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[2048],ytmp[2048],*tw16384_256p=(simd256_q15_t *)tw16384,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y; @@ -2829,10 +3093,12 @@ void dft16384(int16_t *x,int16_t *y,unsigned char scale) } - dft4096((int16_t*)(xtmp),(int16_t*)(ytmp),1); - dft4096((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),1); - dft4096((int16_t*)(xtmp+1024),(int16_t*)(ytmp+1024),1); - dft4096((int16_t*)(xtmp+1536),(int16_t*)(ytmp+1536),1); + unsigned int *scale4096=NULL; + if (scale) scale4096=scale+1; + dft4096((int16_t*)(xtmp),(int16_t*)(ytmp),scale4096); + dft4096((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),scale4096); + dft4096((int16_t*)(xtmp+1024),(int16_t*)(ytmp+1024),scale4096); + dft4096((int16_t*)(xtmp+1536),(int16_t*)(ytmp+1536),scale4096); for (i=0; i<512; i++) { bfly4_256(ytmpp,ytmpp+512,ytmpp+1024,ytmpp+1536, @@ -2843,25 +3109,25 @@ void dft16384(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { - + if (scale && *scale>0) { + 
unsigned int scalec=*scale; for (i=0; i<128; i++) { - y256[0] = shiftright_int16_simd256(y256[0],1); - y256[1] = shiftright_int16_simd256(y256[1],1); - y256[2] = shiftright_int16_simd256(y256[2],1); - y256[3] = shiftright_int16_simd256(y256[3],1); - y256[4] = shiftright_int16_simd256(y256[4],1); - y256[5] = shiftright_int16_simd256(y256[5],1); - y256[6] = shiftright_int16_simd256(y256[6],1); - y256[7] = shiftright_int16_simd256(y256[7],1); - y256[8] = shiftright_int16_simd256(y256[8],1); - y256[9] = shiftright_int16_simd256(y256[9],1); - y256[10] = shiftright_int16_simd256(y256[10],1); - y256[11] = shiftright_int16_simd256(y256[11],1); - y256[12] = shiftright_int16_simd256(y256[12],1); - y256[13] = shiftright_int16_simd256(y256[13],1); - y256[14] = shiftright_int16_simd256(y256[14],1); - y256[15] = shiftright_int16_simd256(y256[15],1); + y256[0] = shiftright_int16_simd256(y256[0],scalec); + y256[1] = shiftright_int16_simd256(y256[1],scalec); + y256[2] = shiftright_int16_simd256(y256[2],scalec); + y256[3] = shiftright_int16_simd256(y256[3],scalec); + y256[4] = shiftright_int16_simd256(y256[4],scalec); + y256[5] = shiftright_int16_simd256(y256[5],scalec); + y256[6] = shiftright_int16_simd256(y256[6],scalec); + y256[7] = shiftright_int16_simd256(y256[7],scalec); + y256[8] = shiftright_int16_simd256(y256[8],scalec); + y256[9] = shiftright_int16_simd256(y256[9],scalec); + y256[10] = shiftright_int16_simd256(y256[10],scalec); + y256[11] = shiftright_int16_simd256(y256[11],scalec); + y256[12] = shiftright_int16_simd256(y256[12],scalec); + y256[13] = shiftright_int16_simd256(y256[13],scalec); + y256[14] = shiftright_int16_simd256(y256[14],scalec); + y256[15] = shiftright_int16_simd256(y256[15],scalec); y256+=16; } @@ -2870,7 +3136,7 @@ void dft16384(int16_t *x,int16_t *y,unsigned char scale) } -void idft16384(int16_t *x,int16_t *y,unsigned char scale) +void idft16384(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t 
xtmp[2048],ytmp[2048],*tw16384_256p=(simd256_q15_t *)tw16384,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y; @@ -2882,10 +3148,12 @@ void idft16384(int16_t *x,int16_t *y,unsigned char scale) } - idft4096((int16_t*)(xtmp),(int16_t*)(ytmp),1); - idft4096((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),1); - idft4096((int16_t*)(xtmp+1024),(int16_t*)(ytmp+1024),1); - idft4096((int16_t*)(xtmp+1536),(int16_t*)(ytmp+1536),1); + unsigned int *scale4096=NULL; + if (scale) scale4096=scale+1; + idft4096((int16_t*)(xtmp),(int16_t*)(ytmp),scale4096); + idft4096((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),scale4096); + idft4096((int16_t*)(xtmp+1024),(int16_t*)(ytmp+1024),scale4096); + idft4096((int16_t*)(xtmp+1536),(int16_t*)(ytmp+1536),scale4096); for (i=0; i<512; i++) { ibfly4_256(ytmpp,ytmpp+512,ytmpp+1024,ytmpp+1536, @@ -2896,25 +3164,25 @@ void idft16384(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { - + if (scale && *scale>0) { + unsigned int scalec=*scale; for (i=0; i<128; i++) { - y256[0] = shiftright_int16_simd256(y256[0],1); - y256[1] = shiftright_int16_simd256(y256[1],1); - y256[2] = shiftright_int16_simd256(y256[2],1); - y256[3] = shiftright_int16_simd256(y256[3],1); - y256[4] = shiftright_int16_simd256(y256[4],1); - y256[5] = shiftright_int16_simd256(y256[5],1); - y256[6] = shiftright_int16_simd256(y256[6],1); - y256[7] = shiftright_int16_simd256(y256[7],1); - y256[8] = shiftright_int16_simd256(y256[8],1); - y256[9] = shiftright_int16_simd256(y256[9],1); - y256[10] = shiftright_int16_simd256(y256[10],1); - y256[11] = shiftright_int16_simd256(y256[11],1); - y256[12] = shiftright_int16_simd256(y256[12],1); - y256[13] = shiftright_int16_simd256(y256[13],1); - y256[14] = shiftright_int16_simd256(y256[14],1); - y256[15] = shiftright_int16_simd256(y256[15],1); + y256[0] = shiftright_int16_simd256(y256[0],scalec); + y256[1] = shiftright_int16_simd256(y256[1],scalec); + y256[2] = shiftright_int16_simd256(y256[2],scalec); + 
y256[3] = shiftright_int16_simd256(y256[3],scalec); + y256[4] = shiftright_int16_simd256(y256[4],scalec); + y256[5] = shiftright_int16_simd256(y256[5],scalec); + y256[6] = shiftright_int16_simd256(y256[6],scalec); + y256[7] = shiftright_int16_simd256(y256[7],scalec); + y256[8] = shiftright_int16_simd256(y256[8],scalec); + y256[9] = shiftright_int16_simd256(y256[9],scalec); + y256[10] = shiftright_int16_simd256(y256[10],scalec); + y256[11] = shiftright_int16_simd256(y256[11],scalec); + y256[12] = shiftright_int16_simd256(y256[12],scalec); + y256[13] = shiftright_int16_simd256(y256[13],scalec); + y256[14] = shiftright_int16_simd256(y256[14],scalec); + y256[15] = shiftright_int16_simd256(y256[15],scalec); y256+=16; } @@ -2925,7 +3193,7 @@ void idft16384(int16_t *x,int16_t *y,unsigned char scale) int16_t tw32768[2*16384] __attribute__((aligned(32))); -void dft32768(int16_t *x,int16_t *y,unsigned char scale) +void dft32768(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[4096],*xtmpp,*x256 = (simd256_q15_t *)x; @@ -2974,8 +3242,10 @@ void dft32768(int16_t *x,int16_t *y,unsigned char scale) xtmpp+=32; } - dft16384((int16_t*)(xtmp),(int16_t*)ytmp,1); - dft16384((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),1); + unsigned int *scale16384=NULL; + if (scale) scale16384=scale+1; + dft16384((int16_t*)(xtmp),(int16_t*)ytmp,scale16384); + dft16384((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),scale16384); for (i=0; i<2048; i++) { @@ -2987,33 +3257,56 @@ void dft32768(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { y256p = y256; - - for (i=0; i<64; i++) { - y256p[0] = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128); - y256p[1] = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128); - y256p[2] = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128); - y256p[3] = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128); - y256p[4] = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128); - y256p[5] = 
mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128); - y256p[6] = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128); - y256p[7] = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128); - y256p[8] = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128); - y256p[9] = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128); - y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128); - y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128); - y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128); - y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128); - y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128); - y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128); - y256p+=16; + if (*scale > 1) { + uint32_t scale2=*scale-1; + for (i=0; i<256; i++) { + y256p[0] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[0],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[1] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[1],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[2] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[2],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[3] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[3],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[4] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[4],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[5] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[5],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[6] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[6],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[7] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[7],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[8] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[8],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[9] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[9],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[10] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[10],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[11] = 
mulhi_int16_simd256(shiftright_int16_simd256(y256p[11],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[12] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[12],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[13] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[13],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[14] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[14],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[15] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[15],scale2),ONE_OVER_SQRT2_Q15_128); + y256p+=16; + } + } + else { + for (i=0; i<256; i++) { + y256p[0] = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128); + y256p[1] = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128); + y256p[2] = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128); + y256p[3] = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128); + y256p[4] = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128); + y256p[5] = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128); + y256p[6] = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128); + y256p[7] = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128); + y256p[8] = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128); + y256p[9] = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128); + y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128); + y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128); + y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128); + y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128); + y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128); + y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128); + y256p+=16; + } } } } -void idft32768(int16_t *x,int16_t *y,unsigned char scale) +void idft32768(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[4096],*xtmpp,*x256 = (simd256_q15_t *)x; @@ -3061,8 +3354,10 @@ void idft32768(int16_t *x,int16_t *y,unsigned char scale) xtmpp+=32; } - idft16384((int16_t*)(xtmp),(int16_t*)ytmp,1); - 
idft16384((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),1); + unsigned int *scale16384=NULL; + if (scale) scale16384=scale+1; + idft16384((int16_t*)(xtmp),(int16_t*)ytmp,scale16384); + idft16384((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),scale16384); for (i=0; i<2048; i++) { @@ -3074,36 +3369,58 @@ void idft32768(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { y256p = y256; - - for (i=0; i<256; i++) { - y256p[0] = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128); - y256p[1] = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128); - y256p[2] = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128); - y256p[3] = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128); - y256p[4] = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128); - y256p[5] = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128); - y256p[6] = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128); - y256p[7] = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128); - y256p[8] = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128); - y256p[9] = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128); - y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128); - y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128); - y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128); - y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128); - y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128); - y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128); - y256p+=16; + if (*scale > 1) { + uint32_t scale2=*scale-1; + for (i=0; i<256; i++) { + y256p[0] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[0],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[1] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[1],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[2] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[2],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[3] = 
mulhi_int16_simd256(shiftright_int16_simd256(y256p[3],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[4] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[4],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[5] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[5],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[6] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[6],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[7] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[7],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[8] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[8],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[9] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[9],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[10] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[10],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[11] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[11],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[12] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[12],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[13] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[13],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[14] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[14],scale2),ONE_OVER_SQRT2_Q15_128); + y256p[15] = mulhi_int16_simd256(shiftright_int16_simd256(y256p[15],scale2),ONE_OVER_SQRT2_Q15_128); + y256p+=16; + } + } + else { + for (i=0; i<256; i++) { + y256p[0] = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128); + y256p[1] = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128); + y256p[2] = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128); + y256p[3] = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128); + y256p[4] = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128); + y256p[5] = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128); + y256p[6] = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128); + y256p[7] = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128); + y256p[8] = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128); + y256p[9] = 
mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128); + y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128); + y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128); + y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128); + y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128); + y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128); + y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128); + y256p+=16; + } } } - } int16_t twa768[512],twb768[512]; // 256 x 3 -void idft768(int16_t *input, int16_t *output, unsigned char scale) +void idft768(int16_t *input, int16_t *output, unsigned int *scale) { int i,i2,j; uint32_t tmp[3][256]__attribute__((aligned(32))); @@ -3117,9 +3434,11 @@ void idft768(int16_t *input, int16_t *output, unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - idft256((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft256((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft256((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale256=NULL; + if (scale) scale256=scale+1; + idft256((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale256); + idft256((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale256); + idft256((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale256); for (i=0,i2=0; i<512; i+=8,i2+=4) { ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), @@ -3128,7 +3447,7 @@ void idft768(int16_t *input, int16_t *output, unsigned char scale) } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<12; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3152,7 +3471,7 @@ void idft768(int16_t *input, int16_t *output, unsigned char scale) } -void dft768(int16_t *input, int16_t *output, unsigned char scale) +void dft768(int16_t *input, int16_t *output, unsigned int *scale) { int i,i2,j; uint32_t tmp[3][256] __attribute__((aligned(32))); @@ -3166,9 +3485,11 @@ void 
dft768(int16_t *input, int16_t *output, unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - dft256((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - dft256((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - dft256((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale256=NULL; + if (scale) scale256=scale+1; + dft256((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale256); + dft256((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale256); + dft256((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale256); /* for (i=1; i<512; i++) { @@ -3189,7 +3510,7 @@ void dft768(int16_t *input, int16_t *output, unsigned char scale) (simd_q15_t*)(twa768+i),(simd_q15_t*)(twb768+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<12; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3215,7 +3536,7 @@ void dft768(int16_t *input, int16_t *output, unsigned char scale) int16_t twa1536[1024],twb1536[1024]; // 512 x 3 -void idft1536(int16_t *input, int16_t *output, unsigned char scale) +void idft1536(int16_t *input, int16_t *output, unsigned int *scale) { int i,i2,j; uint32_t tmp[3][512 ]__attribute__((aligned(32))); @@ -3229,9 +3550,11 @@ void idft1536(int16_t *input, int16_t *output, unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - idft512((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft512((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft512((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale512=NULL; + if (scale) scale512=scale+1; + idft512((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale512); + idft512((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale512); + idft512((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale512); for (i=0,i2=0; i<1024; i+=8,i2+=4) { ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), @@ -3240,7 +3563,7 @@ void idft1536(int16_t *input, int16_t *output, unsigned char scale) } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<24; 
i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3264,7 +3587,7 @@ void idft1536(int16_t *input, int16_t *output, unsigned char scale) } -void dft1536(int16_t *input, int16_t *output, unsigned char scale) +void dft1536(int16_t *input, int16_t *output, unsigned int *scale) { int i,i2,j; uint32_t tmp[3][512] __attribute__((aligned(32))); @@ -3278,9 +3601,11 @@ void dft1536(int16_t *input, int16_t *output, unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - dft512((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - dft512((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - dft512((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale512=NULL; + if (scale) scale512=scale+1; + dft512((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale512); + dft512((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale512); + dft512((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale512); /* for (i=1; i<512; i++) { @@ -3301,7 +3626,7 @@ void dft1536(int16_t *input, int16_t *output, unsigned char scale) (simd_q15_t*)(twa1536+i),(simd_q15_t*)(twb1536+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<24; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3328,7 +3653,7 @@ void dft1536(int16_t *input, int16_t *output, unsigned char scale) int16_t twa3072[2048] __attribute__((aligned(32))); int16_t twb3072[2048] __attribute__((aligned(32))); // 1024 x 3 -void dft3072(int16_t *input, int16_t *output,unsigned char scale) +void dft3072(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][1024] __attribute__((aligned(32))); @@ -3342,9 +3667,11 @@ void dft3072(int16_t *input, int16_t *output,unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - dft1024((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - dft1024((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - dft1024((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int 
*scale1024=NULL; + if (scale) scale1024=scale+1; + dft1024((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale1024); + dft1024((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale1024); + dft1024((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale1024); for (i=0,i2=0; i<2048; i+=8,i2+=4) { bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]), @@ -3352,7 +3679,7 @@ void dft3072(int16_t *input, int16_t *output,unsigned char scale) (simd_q15_t*)(twa3072+i),(simd_q15_t*)(twb3072+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<48; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3376,7 +3703,7 @@ void dft3072(int16_t *input, int16_t *output,unsigned char scale) } -void idft3072(int16_t *input, int16_t *output,unsigned char scale) +void idft3072(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][1024]__attribute__((aligned(32))); @@ -3389,9 +3716,11 @@ void idft3072(int16_t *input, int16_t *output,unsigned char scale) tmp[1][i] = ((uint32_t *)input)[j++]; tmp[2][i] = ((uint32_t *)input)[j++]; } - idft1024((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft1024((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft1024((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale1024=NULL; + if (scale) scale1024=scale+1; + idft1024((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale1024); + idft1024((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale1024); + idft1024((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale1024); for (i=0,i2=0; i<2048; i+=8,i2+=4) { ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), @@ -3400,7 +3729,7 @@ void idft3072(int16_t *input, int16_t *output,unsigned char scale) } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<48; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3428,7 +3757,7 @@ void 
idft3072(int16_t *input, int16_t *output,unsigned char scale) int16_t twa6144[4096] __attribute__((aligned(32))); int16_t twb6144[4096] __attribute__((aligned(32))); -void idft6144(int16_t *input, int16_t *output,unsigned char scale) +void idft6144(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][2048] __attribute__((aligned(32))); @@ -3442,9 +3771,11 @@ void idft6144(int16_t *input, int16_t *output,unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - idft2048((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft2048((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft2048((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale2048=NULL; + if (scale) scale2048=scale+1; + idft2048((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale2048); + idft2048((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale2048); + idft2048((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale2048); #ifndef MR_MAIN if (LOG_DUMPFLAG(DEBUG_DFT)) { LOG_M("idft6144in.m","in",input,6144,1,1); @@ -3460,7 +3791,7 @@ void idft6144(int16_t *input, int16_t *output,unsigned char scale) } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<96; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3485,7 +3816,7 @@ void idft6144(int16_t *input, int16_t *output,unsigned char scale) } -void dft6144(int16_t *input, int16_t *output,unsigned char scale) +void dft6144(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][2048] __attribute__((aligned(32))); @@ -3499,9 +3830,11 @@ void dft6144(int16_t *input, int16_t *output,unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - dft2048((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - dft2048((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - dft2048((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale2048=NULL; + if (scale) scale2048=scale+1; + dft2048((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale2048); + 
dft2048((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale2048); + dft2048((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale2048); /* for (i=1; i<2048; i++) { @@ -3522,7 +3855,7 @@ void dft6144(int16_t *input, int16_t *output,unsigned char scale) (simd_q15_t*)(twa6144+i),(simd_q15_t*)(twb6144+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<96; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3545,10 +3878,23 @@ void dft6144(int16_t *input, int16_t *output,unsigned char scale) } } +int16_t twa9216[6144] __attribute__((aligned(32))); +int16_t twb9216[6144] __attribute__((aligned(32))); +// 3072 x 3 +void dft9216(int16_t *input, int16_t *output,uint32_t *scale) { + + AssertFatal(1==0,"Need to do this ..\n"); +} + +void idft9216(int16_t *input, int16_t *output,uint32_t *scale) { + + AssertFatal(1==0,"Need to do this ..\n"); +} + int16_t twa12288[8192] __attribute__((aligned(32))); int16_t twb12288[8192] __attribute__((aligned(32))); // 4096 x 3 -void dft12288(int16_t *input, int16_t *output,unsigned char scale) +void dft12288(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][4096] __attribute__((aligned(32))); @@ -3562,9 +3908,11 @@ void dft12288(int16_t *input, int16_t *output,unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - dft4096((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale); - dft4096((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale); - dft4096((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale); + unsigned int *scale4096=NULL; + if (scale) scale4096=scale+1; + dft4096((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale4096); + dft4096((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale4096); + dft4096((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale4096); /* for (i=1; i<4096; i++) { tmpo[0][i] = tmpo[0][i<<1]; @@ -3584,7 +3932,7 @@ void dft12288(int16_t *input, int16_t *output,unsigned char scale) (simd_q15_t*)(twa12288+i),(simd_q15_t*)(twb12288+i)); } - if 
(scale==1) { + if (scale && *scale>0) { for (i=0; i<192; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3607,7 +3955,7 @@ void dft12288(int16_t *input, int16_t *output,unsigned char scale) } } -void idft12288(int16_t *input, int16_t *output,unsigned char scale) +void idft12288(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][4096] __attribute__((aligned(32))); @@ -3623,9 +3971,11 @@ void idft12288(int16_t *input, int16_t *output,unsigned char scale) - idft4096((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale); - idft4096((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale); - idft4096((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale); + unsigned int *scale4096=NULL; + if (scale) scale4096=scale+1; + idft4096((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale4096); + idft4096((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale4096); + idft4096((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale4096); #ifndef MR_MAIN if (LOG_DUMPFLAG(DEBUG_DFT)) { LOG_M("idft12288in.m","in",input,12288,1,1); @@ -3640,7 +3990,7 @@ void idft12288(int16_t *input, int16_t *output,unsigned char scale) (simd_q15_t*)(twa12288+i),(simd_q15_t*)(twb12288+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<192; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3671,7 +4021,7 @@ void idft12288(int16_t *input, int16_t *output,unsigned char scale) int16_t twa18432[12288] __attribute__((aligned(32))); int16_t twb18432[12288] __attribute__((aligned(32))); // 6144 x 3 -void dft18432(int16_t *input, int16_t *output,unsigned char scale) { +void dft18432(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][6144] __attribute__((aligned(32))); @@ -3685,16 +4035,18 @@ void dft18432(int16_t *input, int16_t *output,unsigned char scale) { tmp[2][i] = ((uint32_t *)input)[j++]; } - 
dft6144((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale); - dft6144((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale); - dft6144((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale); + unsigned int *scale6144=NULL; + if (scale) scale6144=scale+1; + dft6144((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale6144); + dft6144((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale6144); + dft6144((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale6144); for (i=0,i2=0; i<12288; i+=8,i2+=4) { bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]), (simd_q15_t*)(output+i),(simd_q15_t*)(output+12288+i),(simd_q15_t*)(output+24576+i), (simd_q15_t*)(twa18432+i),(simd_q15_t*)(twb18432+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<288; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3717,7 +4069,7 @@ void dft18432(int16_t *input, int16_t *output,unsigned char scale) { } } -void idft18432(int16_t *input, int16_t *output,unsigned char scale) { +void idft18432(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][6144] __attribute__((aligned(32))); @@ -3731,16 +4083,18 @@ void idft18432(int16_t *input, int16_t *output,unsigned char scale) { tmp[2][i] = ((uint32_t *)input)[j++]; } - idft6144((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale); - idft6144((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale); - idft6144((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale); + unsigned int *scale6144=NULL; + if (scale) scale6144=scale+1; + idft6144((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale6144); + idft6144((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale6144); + idft6144((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale6144); for (i=0,i2=0; i<12288; i+=8,i2+=4) { ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]), (simd_q15_t*)(output+i),(simd_q15_t*)(output+12288+i),(simd_q15_t*)(output+24576+i), (simd_q15_t*)(twa18432+i),(simd_q15_t*)(twb18432+i)); } 
- if (scale==1) { + if (scale && *scale>0) { for (i=0; i<288; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3767,7 +4121,7 @@ void idft18432(int16_t *input, int16_t *output,unsigned char scale) { int16_t twa24576[16384] __attribute__((aligned(32))); int16_t twb24576[16384] __attribute__((aligned(32))); // 8192 x 3 -void dft24576(int16_t *input, int16_t *output,unsigned char scale) +void dft24576(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][8192] __attribute__((aligned(32))); @@ -3781,9 +4135,11 @@ void dft24576(int16_t *input, int16_t *output,unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - dft8192((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - dft8192((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - dft8192((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale8192=NULL; + if (scale) scale8192=scale+1; + dft8192((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale8192); + dft8192((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale8192); + dft8192((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale8192); /* for (i=1; i<8192; i++) { tmpo[0][i] = tmpo[0][i<<1]; @@ -3804,7 +4160,7 @@ void dft24576(int16_t *input, int16_t *output,unsigned char scale) } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<384; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3832,7 +4188,7 @@ void dft24576(int16_t *input, int16_t *output,unsigned char scale) #endif } -void idft24576(int16_t *input, int16_t *output,unsigned char scale) +void idft24576(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][8192] __attribute__((aligned(32))); @@ -3846,9 +4202,11 @@ void idft24576(int16_t *input, int16_t *output,unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - idft8192((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - 
idft8192((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft8192((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale8192=NULL; + if (scale) scale8192=scale+1; + idft8192((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale8192); + idft8192((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale8192); + idft8192((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale8192); #ifndef MR_MAIN if (LOG_DUMPFLAG(DEBUG_DFT)) { LOG_M("idft24576in.m","in",input,24576,1,1); @@ -3862,7 +4220,7 @@ void idft24576(int16_t *input, int16_t *output,unsigned char scale) (simd_q15_t*)(output+i),(simd_q15_t*)(output+16384+i),(simd_q15_t*)(output+32768+i), (simd_q15_t*)(twa24576+i),(simd_q15_t*)(twb24576+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<384; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3894,7 +4252,7 @@ int16_t twa36864[24576] __attribute__((aligned(32))); int16_t twb36864[24576] __attribute__((aligned(32))); // 12288 x 3 -void dft36864(int16_t *input, int16_t *output,uint8_t scale) { +void dft36864(int16_t *input, int16_t *output,uint32_t *scale) { int i,i2,j; uint32_t tmp[3][12288] __attribute__((aligned(32))); @@ -3908,9 +4266,11 @@ void dft36864(int16_t *input, int16_t *output,uint8_t scale) { tmp[2][i] = ((uint32_t *)input)[j++]; } - dft12288((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - dft12288((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - dft12288((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale12288=NULL; + if (scale) scale12288=scale+1; + dft12288((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale12288); + dft12288((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale12288); + dft12288((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale12288); #ifndef MR_MAIN if (LOG_DUMPFLAG(DEBUG_DFT)) { LOG_M("dft36864out0.m","o0",tmpo[0],12288,1,1); @@ -3924,7 +4284,7 @@ void dft36864(int16_t *input, int16_t *output,uint8_t scale) { (simd_q15_t*)(twa36864+i),(simd_q15_t*)(twb36864+i)); } - if (scale==1) { + 
if (scale && *scale>0) { for (i=0; i<576; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3952,7 +4312,7 @@ void dft36864(int16_t *input, int16_t *output,uint8_t scale) { #endif } -void idft36864(int16_t *input, int16_t *output,uint8_t scale) { +void idft36864(int16_t *input, int16_t *output,uint32_t *scale) { int i,i2,j; uint32_t tmp[3][12288] __attribute__((aligned(32))); @@ -3966,16 +4326,18 @@ void idft36864(int16_t *input, int16_t *output,uint8_t scale) { tmp[2][i] = ((uint32_t *)input)[j++]; } - idft12288((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft12288((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft12288((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale12288=NULL; + if (scale) scale12288=scale+1; + idft12288((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale12288); + idft12288((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale12288); + idft12288((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale12288); for (i=0,i2=0; i<24576; i+=8,i2+=4) { ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), (simd_q15_t*)(output+i),(simd_q15_t*)(output+24576+i),(simd_q15_t*)(output+49152+i), (simd_q15_t*)(twa36864+i),(simd_q15_t*)(twb36864+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<576; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -4002,7 +4364,7 @@ int16_t twa49152[32768] __attribute__((aligned(32))); int16_t twb49152[32768] __attribute__((aligned(32))); // 16384 x 3 -void dft49152(int16_t *input, int16_t *output,uint8_t scale) { +void dft49152(int16_t *input, int16_t *output,uint32_t *scale) { int i,i2,j; uint32_t tmp[3][16384] __attribute__((aligned(32))); @@ -4016,16 +4378,18 @@ void dft49152(int16_t *input, int16_t *output,uint8_t scale) { tmp[2][i] = ((uint32_t *)input)[j++]; } - dft16384((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - 
dft16384((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - dft16384((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale16384=NULL; + if (scale) scale16384=scale+1; + dft16384((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale16384); + dft16384((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale16384); + dft16384((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale16384); for (i=0,i2=0; i<32768; i+=8,i2+=4) { bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), (simd_q15_t*)(output+i),(simd_q15_t*)(output+32768+i),(simd_q15_t*)(output+65536+i), (simd_q15_t*)(twa49152+i),(simd_q15_t*)(twb49152+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<768; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -4048,7 +4412,7 @@ void dft49152(int16_t *input, int16_t *output,uint8_t scale) { } } -void idft49152(int16_t *input, int16_t *output,uint8_t scale) { +void idft49152(int16_t *input, int16_t *output,uint32_t *scale) { int i,i2,j; uint32_t tmp[3][16384] __attribute__((aligned(32))); @@ -4062,16 +4426,18 @@ void idft49152(int16_t *input, int16_t *output,uint8_t scale) { tmp[2][i] = ((uint32_t *)input)[j++]; } - idft16384((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft16384((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft16384((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale16384=NULL; + if (scale) scale16384=scale+1; + idft16384((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale16384); + idft16384((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale16384); + idft16384((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale16384); for (i=0,i2=0; i<32768; i+=8,i2+=4) { ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), (simd_q15_t*)(output+i),(simd_q15_t*)(output+32768+i),(simd_q15_t*)(output+65536+i), (simd_q15_t*)(twa49152+i),(simd_q15_t*)(twb49152+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<768; i++) { y128p[0] 
= mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -4096,7 +4462,7 @@ void idft49152(int16_t *input, int16_t *output,uint8_t scale) { int16_t tw65536[3*2*16384] __attribute__((aligned(32))); -void idft65536(int16_t *x,int16_t *y,unsigned char scale) +void idft65536(int16_t *x,int16_t *y,unsigned int *scale) { simd256_q15_t xtmp[8192],ytmp[8192],*tw65536_256p=(simd256_q15_t *)tw65536,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y; @@ -4108,10 +4474,12 @@ void idft65536(int16_t *x,int16_t *y,unsigned char scale) } - idft16384((int16_t*)(xtmp),(int16_t*)(ytmp),1); - idft16384((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),1); - idft16384((int16_t*)(xtmp+4096),(int16_t*)(ytmp+4096),1); - idft16384((int16_t*)(xtmp+6144),(int16_t*)(ytmp+6144),1); + unsigned int *scale16384=NULL; + if (scale) scale16384=scale+1; + idft16384((int16_t*)(xtmp),(int16_t*)(ytmp),scale16384); + idft16384((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),scale16384); + idft16384((int16_t*)(xtmp+4096),(int16_t*)(ytmp+4096),scale16384); + idft16384((int16_t*)(xtmp+6144),(int16_t*)(ytmp+6144),scale16384); for (i=0; i<2048; i++) { ibfly4_256(ytmpp,ytmpp+2048,ytmpp+4096,ytmpp+6144, @@ -4122,25 +4490,25 @@ void idft65536(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { - + if (scale && *scale>0) { + unsigned int scalec=*scale; for (i=0; i<512; i++) { - y256[0] = shiftright_int16_simd256(y256[0],scale); - y256[1] = shiftright_int16_simd256(y256[1],scale); - y256[2] = shiftright_int16_simd256(y256[2],scale); - y256[3] = shiftright_int16_simd256(y256[3],scale); - y256[4] = shiftright_int16_simd256(y256[4],scale); - y256[5] = shiftright_int16_simd256(y256[5],scale); - y256[6] = shiftright_int16_simd256(y256[6],scale); - y256[7] = shiftright_int16_simd256(y256[7],scale); - y256[8] = shiftright_int16_simd256(y256[8],scale); - y256[9] = shiftright_int16_simd256(y256[9],scale); - y256[10] = 
shiftright_int16_simd256(y256[10],scale); - y256[11] = shiftright_int16_simd256(y256[11],scale); - y256[12] = shiftright_int16_simd256(y256[12],scale); - y256[13] = shiftright_int16_simd256(y256[13],scale); - y256[14] = shiftright_int16_simd256(y256[14],scale); - y256[15] = shiftright_int16_simd256(y256[15],scale); + y256[0] = shiftright_int16_simd256(y256[0],scalec); + y256[1] = shiftright_int16_simd256(y256[1],scalec); + y256[2] = shiftright_int16_simd256(y256[2],scalec); + y256[3] = shiftright_int16_simd256(y256[3],scalec); + y256[4] = shiftright_int16_simd256(y256[4],scalec); + y256[5] = shiftright_int16_simd256(y256[5],scalec); + y256[6] = shiftright_int16_simd256(y256[6],scalec); + y256[7] = shiftright_int16_simd256(y256[7],scalec); + y256[8] = shiftright_int16_simd256(y256[8],scalec); + y256[9] = shiftright_int16_simd256(y256[9],scalec); + y256[10] = shiftright_int16_simd256(y256[10],scalec); + y256[11] = shiftright_int16_simd256(y256[11],scalec); + y256[12] = shiftright_int16_simd256(y256[12],scalec); + y256[13] = shiftright_int16_simd256(y256[13],scalec); + y256[14] = shiftright_int16_simd256(y256[14],scalec); + y256[15] = shiftright_int16_simd256(y256[15],scalec); y256+=16; } @@ -4149,10 +4517,24 @@ void idft65536(int16_t *x,int16_t *y,unsigned char scale) } +int16_t twa73728[49152] __attribute__((aligned(32))); +int16_t twb73728[49152] __attribute__((aligned(32))); +// 24576 x 3 -- TODO: dft73728/idft73728 are placeholders; each body is a single AssertFatal(1==0,...) and the twa73728/twb73728 tables are not used in this hunk +void dft73728(int16_t *input, int16_t *output,uint32_t *scale) { + + AssertFatal(1==0,"Need to do this ..\n"); +} + +void idft73728(int16_t *input, int16_t *output,uint32_t *scale) { + + AssertFatal(1==0,"Need to do this ..\n"); +} + + int16_t twa98304[65536] __attribute__((aligned(32))); int16_t twb98304[65536] __attribute__((aligned(32))); // 32768 x 3 -void dft98304(int16_t *input, int16_t *output,uint8_t scale) { +void dft98304(int16_t *input, int16_t *output,uint32_t *scale) { int i,i2,j; uint32_t tmp[3][32768] __attribute__((aligned(32))); @@ -4166,16 +4548,18 @@ void
dft98304(int16_t *input, int16_t *output,uint8_t scale) { tmp[2][i] = ((uint32_t *)input)[j++]; } - dft32768((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - dft32768((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - dft32768((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale32768=NULL; + if (scale) scale32768=scale+1; + dft32768((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale32768); + dft32768((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale32768); + dft32768((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale32768); for (i=0,i2=0; i<65536; i+=8,i2+=4) { bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), (simd_q15_t*)(output+i),(simd_q15_t*)(output+65536+i),(simd_q15_t*)(output+131072+i), (simd_q15_t*)(twa98304+i),(simd_q15_t*)(twb98304+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<1536; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -4198,7 +4582,7 @@ void dft98304(int16_t *input, int16_t *output,uint8_t scale) { } } -void idft98304(int16_t *input, int16_t *output,uint8_t scale) { +void idft98304(int16_t *input, int16_t *output,uint32_t *scale) { int i,i2,j; uint32_t tmp[3][32768] __attribute__((aligned(32))); @@ -4212,16 +4596,18 @@ void idft98304(int16_t *input, int16_t *output,uint8_t scale) { tmp[2][i] = ((uint32_t *)input)[j++]; } - idft32768((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft32768((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft32768((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale32768=NULL; + if (scale) scale32768=scale+1; + idft32768((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale32768); + idft32768((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale32768); + idft32768((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale32768); for (i=0,i2=0; i<65536; i+=8,i2+=4) { ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), 
(simd_q15_t*)(output+i),(simd_q15_t*)(output+65536+i),(simd_q15_t*)(output+131072+i), (simd_q15_t*)(twa98304+i),(simd_q15_t*)(twb98304+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<1536; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -4366,7 +4752,7 @@ __attribute__((always_inline)) static inline void dft12f(simd_q15_t *x0, -void dft12(int16_t *x,int16_t *y ,unsigned char scale_flag) +void dft12(int16_t *x,int16_t *y ,unsigned int *scale_flag) { simd_q15_t *x128 = (simd_q15_t *)x,*y128 = (simd_q15_t *)y; @@ -4568,7 +4954,7 @@ void dft12_simd256(int16_t *x,int16_t *y) static int16_t tw24[88]__attribute__((aligned(32))); -void dft24(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft24(int16_t *x,int16_t *y,unsigned int *scale_flag) { simd_q15_t *x128=(simd_q15_t *)x; @@ -4648,7 +5034,7 @@ void dft24(int16_t *x,int16_t *y,unsigned char scale_flag) // msg("dft24e\n"); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[1]); for (i=0; i<24; i++) { @@ -4661,7 +5047,7 @@ void dft24(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t twa36[88]__attribute__((aligned(32))); static int16_t twb36[88]__attribute__((aligned(32))); -void dft36(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft36(int16_t *x,int16_t *y,unsigned int *scale_flag) { simd_q15_t *x128=(simd_q15_t *)x; @@ -4768,7 +5154,7 @@ void dft36(int16_t *x,int16_t *y,unsigned char scale_flag) twb128+k); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[2]); for (i=0; i<36; i++) { @@ -4782,7 +5168,7 @@ static int16_t twa48[88]__attribute__((aligned(32))); static int16_t twb48[88]__attribute__((aligned(32))); static int16_t twc48[88]__attribute__((aligned(32))); -void dft48(int16_t *x, int16_t *y,unsigned char scale_flag) +void dft48(int16_t *x, int16_t *y,unsigned int *scale_flag) { simd_q15_t 
*x128=(simd_q15_t *)x; @@ -4926,7 +5312,7 @@ void dft48(int16_t *x, int16_t *y,unsigned char scale_flag) } - if (scale_flag == 1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[3]); for (i=0; i<48; i++) { @@ -4941,7 +5327,7 @@ static int16_t twb60[88]__attribute__((aligned(32))); static int16_t twc60[88]__attribute__((aligned(32))); static int16_t twd60[88]__attribute__((aligned(32))); -void dft60(int16_t *x,int16_t *y,unsigned char scale) +void dft60(int16_t *x,int16_t *y,unsigned int *scale) { simd_q15_t *x128=(simd_q15_t *)x; @@ -5107,7 +5493,7 @@ void dft60(int16_t *x,int16_t *y,unsigned char scale) twd128+k); } - if (scale == 1) { + if (scale) { const simd_q15_t norm128 = set1_int16(dft_norm_table[4]); for (i=0; i<60; i++) { @@ -5120,7 +5506,7 @@ void dft60(int16_t *x,int16_t *y,unsigned char scale) static int16_t tw72[280]__attribute__((aligned(32))); -void dft72(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft72(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -5136,8 +5522,8 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+36] = x128[j+1]; // odd inputs } - dft36((int16_t *)x2128,(int16_t *)ytmp128,1); - dft36((int16_t *)(x2128+36),(int16_t *)(ytmp128+36),1); + dft36((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft36((int16_t *)(x2128+36),(int16_t *)(ytmp128+36),scale_flag); bfly2_tw1(ytmp128,ytmp128+36,y128,y128+36); @@ -5149,7 +5535,7 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag) tw128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[5]); for (i=0; i<72; i++) { @@ -5161,7 +5547,7 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t tw96[376]__attribute__((aligned(32))); -void dft96(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft96(int16_t *x,int16_t *y,unsigned int *scale_flag) { @@ -5192,7 +5578,7 @@ void dft96(int16_t *x,int16_t *y,unsigned char scale_flag) tw128+j); } - if 
(scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[6]); for (i=0; i<96; i++) { @@ -5205,7 +5591,7 @@ void dft96(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t twa108[280]__attribute__((aligned(32))); static int16_t twb108[280]__attribute__((aligned(32))); -void dft108(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft108(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5240,7 +5626,7 @@ void dft108(int16_t *x,int16_t *y,unsigned char scale_flag) } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[7]); for (i=0; i<108; i++) { @@ -5251,7 +5637,7 @@ void dft108(int16_t *x,int16_t *y,unsigned char scale_flag) } static int16_t tw120[472]__attribute__((aligned(32))); -void dft120(int16_t *x,int16_t *y, unsigned char scale_flag) +void dft120(int16_t *x,int16_t *y, unsigned int *scale_flag) { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5279,7 +5665,7 @@ void dft120(int16_t *x,int16_t *y, unsigned char scale_flag) tw128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[8]); for (i=0; i<120; i++) { @@ -5292,7 +5678,7 @@ void dft120(int16_t *x,int16_t *y, unsigned char scale_flag) static int16_t twa144[376]__attribute__((aligned(32))); static int16_t twb144[376]__attribute__((aligned(32))); -void dft144(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft144(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5310,9 +5696,9 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+96] = x128[j+2]; } - dft48((int16_t *)x2128,(int16_t *)ytmp128,1); - dft48((int16_t *)(x2128+48),(int16_t *)(ytmp128+48),1); - dft48((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),1); + dft48((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft48((int16_t *)(x2128+48),(int16_t *)(ytmp128+48),scale_flag); + dft48((int16_t 
*)(x2128+96),(int16_t *)(ytmp128+96),scale_flag); bfly3_tw1(ytmp128,ytmp128+48,ytmp128+96,y128,y128+48,y128+96); @@ -5327,7 +5713,7 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag) twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[9]); for (i=0; i<144; i++) { @@ -5340,7 +5726,7 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t twa180[472]__attribute__((aligned(32))); static int16_t twb180[472]__attribute__((aligned(32))); -void dft180(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft180(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -5359,9 +5745,9 @@ void dft180(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+120] = x128[j+2]; } - dft60((int16_t *)x2128,(int16_t *)ytmp128,1); - dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),1); - dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1); + dft60((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),scale_flag); + dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag); bfly3_tw1(ytmp128,ytmp128+60,ytmp128+120,y128,y128+60,y128+120); @@ -5376,7 +5762,7 @@ void dft180(int16_t *x,int16_t *y,unsigned char scale_flag) twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[10]); for (i=0; i<180; i++) { @@ -5390,7 +5776,7 @@ static int16_t twa192[376]__attribute__((aligned(32))); static int16_t twb192[376]__attribute__((aligned(32))); static int16_t twc192[376]__attribute__((aligned(32))); -void dft192(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft192(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -5411,10 +5797,10 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+144] = x128[j+3]; } - dft48((int16_t *)x2128,(int16_t *)ytmp128,1); - dft48((int16_t *)(x2128+48),(int16_t *)(ytmp128+48),1); - dft48((int16_t *)(x2128+96),(int16_t 
*)(ytmp128+96),1); - dft48((int16_t *)(x2128+144),(int16_t *)(ytmp128+144),1); + dft48((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft48((int16_t *)(x2128+48),(int16_t *)(ytmp128+48),scale_flag); + dft48((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),scale_flag); + dft48((int16_t *)(x2128+144),(int16_t *)(ytmp128+144),scale_flag); bfly4_tw1(ytmp128,ytmp128+48,ytmp128+96,ytmp128+144,y128,y128+48,y128+96,y128+144); @@ -5432,7 +5818,7 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag) twc128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[11]); for (i=0; i<192; i++) { @@ -5445,7 +5831,7 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t twa216[568]__attribute__((aligned(32))); static int16_t twb216[568]__attribute__((aligned(32))); -void dft216(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft216(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -5464,9 +5850,9 @@ void dft216(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+144] = x128[j+2]; } - dft72((int16_t *)x2128,(int16_t *)ytmp128,1); - dft72((int16_t *)(x2128+72),(int16_t *)(ytmp128+72),1); - dft72((int16_t *)(x2128+144),(int16_t *)(ytmp128+144),1); + dft72((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft72((int16_t *)(x2128+72),(int16_t *)(ytmp128+72),scale_flag); + dft72((int16_t *)(x2128+144),(int16_t *)(ytmp128+144),scale_flag); bfly3_tw1(ytmp128,ytmp128+72,ytmp128+144,y128,y128+72,y128+144); @@ -5481,7 +5867,7 @@ void dft216(int16_t *x,int16_t *y,unsigned char scale_flag) twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[12]); for (i=0; i<216; i++) { @@ -5495,7 +5881,7 @@ static int16_t twa240[472]__attribute__((aligned(32))); static int16_t twb240[472]__attribute__((aligned(32))); static int16_t twc240[472]__attribute__((aligned(32))); -void dft240(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft240(int16_t 
*x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -5516,10 +5902,10 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+180] = x128[j+3]; } - dft60((int16_t *)x2128,(int16_t *)ytmp128,1); - dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),1); - dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1); - dft60((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),1); + dft60((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),scale_flag); + dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag); + dft60((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),scale_flag); bfly4_tw1(ytmp128,ytmp128+60,ytmp128+120,ytmp128+180,y128,y128+60,y128+120,y128+180); @@ -5537,7 +5923,7 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag) twc128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[13]); for (i=0; i<240; i++) { @@ -5550,7 +5936,7 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t twa288[760]__attribute__((aligned(32))); static int16_t twb288[760]__attribute__((aligned(32))); -void dft288(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft288(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -5569,9 +5955,9 @@ void dft288(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+192] = x128[j+2]; } - dft96((int16_t *)x2128,(int16_t *)ytmp128,1); - dft96((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),1); - dft96((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),1); + dft96((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft96((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),scale_flag); + dft96((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),scale_flag); bfly3_tw1(ytmp128,ytmp128+96,ytmp128+192,y128,y128+96,y128+192); @@ -5586,7 +5972,7 @@ void dft288(int16_t *x,int16_t *y,unsigned char scale_flag) twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = 
set1_int16(dft_norm_table[14]); for (i=0; i<288; i++) { @@ -5601,7 +5987,7 @@ static int16_t twb300[472]__attribute__((aligned(32))); static int16_t twc300[472]__attribute__((aligned(32))); static int16_t twd300[472]__attribute__((aligned(32))); -void dft300(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft300(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -5624,11 +6010,11 @@ void dft300(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+240] = x128[j+4]; } - dft60((int16_t *)x2128,(int16_t *)ytmp128,1); - dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),1); - dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1); - dft60((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),1); - dft60((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),1); + dft60((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),scale_flag); + dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag); + dft60((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),scale_flag); + dft60((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),scale_flag); bfly5_tw1(ytmp128,ytmp128+60,ytmp128+120,ytmp128+180,ytmp128+240,y128,y128+60,y128+120,y128+180,y128+240); @@ -5649,7 +6035,7 @@ void dft300(int16_t *x,int16_t *y,unsigned char scale_flag) twd128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[15]); for (i=0; i<300; i++) { @@ -5662,7 +6048,7 @@ void dft300(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t twa324[107*2*4]; static int16_t twb324[107*2*4]; -void dft324(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 3 +void dft324(int16_t *x,int16_t *y,unsigned int *scale_flag) // 108 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5680,9 +6066,9 @@ void dft324(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 3 x2128[i+216] = x128[j+2]; } - dft108((int16_t *)x2128,(int16_t *)ytmp128,1); - dft108((int16_t *)(x2128+108),(int16_t 
*)(ytmp128+108),1); - dft108((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),1); + dft108((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft108((int16_t *)(x2128+108),(int16_t *)(ytmp128+108),scale_flag); + dft108((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),scale_flag); bfly3_tw1(ytmp128,ytmp128+108,ytmp128+216,y128,y128+108,y128+216); @@ -5697,7 +6083,7 @@ void dft324(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<324; i++) { @@ -5710,7 +6096,7 @@ void dft324(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 3 static int16_t twa360[119*2*4]; static int16_t twb360[119*2*4]; -void dft360(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 3 +void dft360(int16_t *x,int16_t *y,unsigned int *scale_flag) // 120 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5728,9 +6114,9 @@ void dft360(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 3 x2128[i+240] = x128[j+2]; } - dft120((int16_t *)x2128,(int16_t *)ytmp128,1); - dft120((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1); - dft120((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),1); + dft120((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft120((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag); + dft120((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),scale_flag); bfly3_tw1(ytmp128,ytmp128+120,ytmp128+240,y128,y128+120,y128+240); @@ -5745,7 +6131,7 @@ void dft360(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<360; i++) { @@ -5759,7 +6145,7 @@ static int16_t twa384[95*2*4]; static int16_t twb384[95*2*4]; static int16_t twc384[95*2*4]; -void dft384(int16_t *x,int16_t *y,unsigned char scale_flag) // 96 x 4 +void dft384(int16_t *x,int16_t *y,unsigned int *scale_flag) // 96 x 4 { int i,j; simd_q15_t 
*x128=(simd_q15_t *)x; @@ -5779,10 +6165,10 @@ void dft384(int16_t *x,int16_t *y,unsigned char scale_flag) // 96 x 4 x2128[i+288] = x128[j+3]; } - dft96((int16_t *)x2128,(int16_t *)ytmp128,1); - dft96((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),1); - dft96((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),1); - dft96((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),1); + dft96((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft96((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),scale_flag); + dft96((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),scale_flag); + dft96((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),scale_flag); bfly4_tw1(ytmp128,ytmp128+96,ytmp128+192,ytmp128+288,y128,y128+96,y128+192,y128+288); @@ -5800,7 +6186,7 @@ void dft384(int16_t *x,int16_t *y,unsigned char scale_flag) // 96 x 4 twc128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(16384); // dft_norm_table[13]); for (i=0; i<384; i++) { @@ -5814,7 +6200,7 @@ static int16_t twa432[107*2*4]; static int16_t twb432[107*2*4]; static int16_t twc432[107*2*4]; -void dft432(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 4 +void dft432(int16_t *x,int16_t *y,unsigned int *scale_flag) // 108 x 4 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5833,10 +6219,10 @@ void dft432(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 4 x2128[i+324] = x128[j+3]; } - dft108((int16_t *)x2128,(int16_t *)ytmp128,1); - dft108((int16_t *)(x2128+108),(int16_t *)(ytmp128+108),1); - dft108((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),1); - dft108((int16_t *)(x2128+324),(int16_t *)(ytmp128+324),1); + dft108((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft108((int16_t *)(x2128+108),(int16_t *)(ytmp128+108),scale_flag); + dft108((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),scale_flag); + dft108((int16_t *)(x2128+324),(int16_t *)(ytmp128+324),scale_flag); bfly4_tw1(ytmp128,ytmp128+108,ytmp128+216,ytmp128+324,y128,y128+108,y128+216,y128+324); @@ -5854,7 +6240,7 
@@ void dft432(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 4 twc128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(16384); // dft_norm_table[13]); for (i=0; i<432; i++) { @@ -5867,7 +6253,7 @@ static int16_t twa480[119*2*4]; static int16_t twb480[119*2*4]; static int16_t twc480[119*2*4]; -void dft480(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 4 +void dft480(int16_t *x,int16_t *y,unsigned int *scale_flag) // 120 x 4 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5887,10 +6273,10 @@ void dft480(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 4 x2128[i+360] = x128[j+3]; } - dft120((int16_t *)x2128,(int16_t *)ytmp128,1); - dft120((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1); - dft120((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),1); - dft120((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),1); + dft120((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft120((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag); + dft120((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),scale_flag); + dft120((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),scale_flag); bfly4_tw1(ytmp128,ytmp128+120,ytmp128+240,ytmp128+360,y128,y128+120,y128+240,y128+360); @@ -5908,7 +6294,7 @@ void dft480(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 4 twc128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(16384); // dft_norm_table[13]); for (i=0; i<480; i++) { @@ -5922,7 +6308,7 @@ void dft480(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 4 static int16_t twa540[179*2*4]; static int16_t twb540[179*2*4]; -void dft540(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 3 +void dft540(int16_t *x,int16_t *y,unsigned int *scale_flag) // 180 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5940,9 +6326,9 @@ void dft540(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 3 x2128[i+360] = x128[j+2]; } - dft180((int16_t *)x2128,(int16_t 
*)ytmp128,1); - dft180((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),1); - dft180((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),1); + dft180((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft180((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),scale_flag); + dft180((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),scale_flag); bfly3_tw1(ytmp128,ytmp128+180,ytmp128+360,y128,y128+180,y128+360); @@ -5957,7 +6343,7 @@ void dft540(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<540; i++) { @@ -5970,7 +6356,7 @@ void dft540(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 3 static int16_t twa576[191*2*4]; static int16_t twb576[191*2*4]; -void dft576(int16_t *x,int16_t *y,unsigned char scale_flag) // 192 x 3 +void dft576(int16_t *x,int16_t *y,unsigned int *scale_flag) // 192 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5989,9 +6375,9 @@ void dft576(int16_t *x,int16_t *y,unsigned char scale_flag) // 192 x 3 } - dft192((int16_t *)x2128,(int16_t *)ytmp128,1); - dft192((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),1); - dft192((int16_t *)(x2128+384),(int16_t *)(ytmp128+384),1); + dft192((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft192((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),scale_flag); + dft192((int16_t *)(x2128+384),(int16_t *)(ytmp128+384),scale_flag); bfly3_tw1(ytmp128,ytmp128+192,ytmp128+384,y128,y128+192,y128+384); @@ -6006,7 +6392,7 @@ void dft576(int16_t *x,int16_t *y,unsigned char scale_flag) // 192 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<576; i++) { @@ -6019,7 +6405,7 @@ void dft576(int16_t *x,int16_t *y,unsigned char scale_flag) // 192 x 3 static int16_t twa600[299*2*4]; -void dft600(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 2 +void dft600(int16_t *x,int16_t *y,unsigned int *scale_flag) 
// 300 x 2 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6034,8 +6420,8 @@ void dft600(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 2 x2128[i+300] = x128[j+1]; } - dft300((int16_t *)x2128,(int16_t *)ytmp128,1); - dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),1); + dft300((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),scale_flag); bfly2_tw1(ytmp128,ytmp128+300,y128,y128+300); @@ -6048,7 +6434,7 @@ void dft600(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 2 tw128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(ONE_OVER_SQRT2_Q15); for (i=0; i<600; i++) { @@ -6062,7 +6448,7 @@ void dft600(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 2 static int16_t twa648[215*2*4]; static int16_t twb648[215*2*4]; -void dft648(int16_t *x,int16_t *y,unsigned char scale_flag) // 216 x 3 +void dft648(int16_t *x,int16_t *y,unsigned int *scale_flag) // 216 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6080,9 +6466,9 @@ void dft648(int16_t *x,int16_t *y,unsigned char scale_flag) // 216 x 3 x2128[i+432] = x128[j+2]; } - dft216((int16_t *)x2128,(int16_t *)ytmp128,1); - dft216((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),1); - dft216((int16_t *)(x2128+432),(int16_t *)(ytmp128+432),1); + dft216((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft216((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),scale_flag); + dft216((int16_t *)(x2128+432),(int16_t *)(ytmp128+432),scale_flag); bfly3_tw1(ytmp128,ytmp128+216,ytmp128+432,y128,y128+216,y128+432); @@ -6097,7 +6483,7 @@ void dft648(int16_t *x,int16_t *y,unsigned char scale_flag) // 216 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<648; i++) { @@ -6113,7 +6499,7 @@ static int16_t twb720[179*2*4]; static int16_t twc720[179*2*4]; -void dft720(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 4 +void 
dft720(int16_t *x,int16_t *y,unsigned int *scale_flag) // 180 x 4 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6133,10 +6519,10 @@ void dft720(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 4 x2128[i+540] = x128[j+3]; } - dft180((int16_t *)x2128,(int16_t *)ytmp128,1); - dft180((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),1); - dft180((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),1); - dft180((int16_t *)(x2128+540),(int16_t *)(ytmp128+540),1); + dft180((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft180((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),scale_flag); + dft180((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),scale_flag); + dft180((int16_t *)(x2128+540),(int16_t *)(ytmp128+540),scale_flag); bfly4_tw1(ytmp128,ytmp128+180,ytmp128+360,ytmp128+540,y128,y128+180,y128+360,y128+540); @@ -6154,7 +6540,7 @@ void dft720(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 4 twc128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(16384); // dft_norm_table[13]); for (i=0; i<720; i++) { @@ -6168,7 +6554,7 @@ static int16_t twa768p[191*2*4]; static int16_t twb768p[191*2*4]; static int16_t twc768p[191*2*4]; -void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4; +void dft768p(int16_t *x,int16_t *y,unsigned int *scale_flag) { // 192x 4; int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6188,10 +6574,10 @@ void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4; x2128[i+576] = x128[j+3]; } - dft192((int16_t *)x2128,(int16_t *)ytmp128,1); - dft192((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),1); - dft192((int16_t *)(x2128+384),(int16_t *)(ytmp128+384),1); - dft192((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),1); + dft192((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft192((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),scale_flag); + dft192((int16_t *)(x2128+384),(int16_t *)(ytmp128+384),scale_flag); + dft192((int16_t *)(x2128+576),(int16_t 
*)(ytmp128+576),scale_flag); bfly4_tw1(ytmp128,ytmp128+192,ytmp128+384,ytmp128+576,y128,y128+192,y128+384,y128+576); @@ -6209,7 +6595,7 @@ void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4; twc128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(16384); // dft_norm_table[13]); for (i=0; i<768; i++) { @@ -6222,7 +6608,7 @@ void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4; static int16_t twa384i[256]; static int16_t twb384i[256]; // 128 x 3 -void idft384(int16_t *input, int16_t *output, unsigned char scale) +void idft384(int16_t *input, int16_t *output, unsigned int *scale) { int i,i2,j; uint32_t tmp[3][128]__attribute__((aligned(32))); @@ -6236,9 +6622,9 @@ void idft384(int16_t *input, int16_t *output, unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - idft128((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft128((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft128((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + idft128((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale); + idft128((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale); + idft128((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale); for (i=0,i2=0; i<256; i+=8,i2+=4) { ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]), @@ -6247,7 +6633,7 @@ void idft384(int16_t *input, int16_t *output, unsigned char scale) } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<6; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -6275,7 +6661,7 @@ void idft384(int16_t *input, int16_t *output, unsigned char scale) static int16_t twa864[287*2*4]; static int16_t twb864[287*2*4]; -void dft864(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 3 +void dft864(int16_t *x,int16_t *y,unsigned int *scale_flag) // 288 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6293,9 +6679,9 @@ void dft864(int16_t *x,int16_t *y,unsigned char 
scale_flag) // 288 x 3 x2128[i+576] = x128[j+2]; } - dft288((int16_t *)x2128,(int16_t *)ytmp128,1); - dft288((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),1); - dft288((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),1); + dft288((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft288((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),scale_flag); + dft288((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),scale_flag); bfly3_tw1(ytmp128,ytmp128+288,ytmp128+576,y128,y128+288,y128+576); @@ -6310,7 +6696,7 @@ void dft864(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<864; i++) { @@ -6323,7 +6709,7 @@ void dft864(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 3 static int16_t twa900[299*2*4]; static int16_t twb900[299*2*4]; -void dft900(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 3 +void dft900(int16_t *x,int16_t *y,unsigned int *scale_flag) // 300 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6341,9 +6727,9 @@ void dft900(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 3 x2128[i+600] = x128[j+2]; } - dft300((int16_t *)x2128,(int16_t *)ytmp128,1); - dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),1); - dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1); + dft300((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),scale_flag); + dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag); bfly3_tw1(ytmp128,ytmp128+300,ytmp128+600,y128,y128+300,y128+600); @@ -6358,7 +6744,7 @@ void dft900(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<900; i++) { @@ -6374,7 +6760,7 @@ static int16_t twb960[239*2*4]; static int16_t twc960[239*2*4]; -void dft960(int16_t *x,int16_t *y,unsigned char 
scale_flag) // 240 x 4 +void dft960(int16_t *x,int16_t *y,unsigned int *scale_flag) // 240 x 4 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6394,10 +6780,10 @@ void dft960(int16_t *x,int16_t *y,unsigned char scale_flag) // 240 x 4 x2128[i+720] = x128[j+3]; } - dft240((int16_t *)x2128,(int16_t *)ytmp128,1); - dft240((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),1); - dft240((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),1); - dft240((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),1); + dft240((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft240((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),scale_flag); + dft240((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),scale_flag); + dft240((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),scale_flag); bfly4_tw1(ytmp128,ytmp128+240,ytmp128+480,ytmp128+720,y128,y128+240,y128+480,y128+720); @@ -6415,7 +6801,7 @@ void dft960(int16_t *x,int16_t *y,unsigned char scale_flag) // 240 x 4 twc128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(16384); // dft_norm_table[13]); for (i=0; i<960; i++) { @@ -6429,7 +6815,7 @@ void dft960(int16_t *x,int16_t *y,unsigned char scale_flag) // 240 x 4 static int16_t twa972[323*2*4]; static int16_t twb972[323*2*4]; -void dft972(int16_t *x,int16_t *y,unsigned char scale_flag) // 324 x 3 +void dft972(int16_t *x,int16_t *y,unsigned int *scale_flag) // 324 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6447,9 +6833,9 @@ void dft972(int16_t *x,int16_t *y,unsigned char scale_flag) // 324 x 3 x2128[i+648] = x128[j+2]; } - dft324((int16_t *)x2128,(int16_t *)ytmp128,1); - dft324((int16_t *)(x2128+324),(int16_t *)(ytmp128+324),1); - dft324((int16_t *)(x2128+648),(int16_t *)(ytmp128+648),1); + dft324((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft324((int16_t *)(x2128+324),(int16_t *)(ytmp128+324),scale_flag); + dft324((int16_t *)(x2128+648),(int16_t *)(ytmp128+648),scale_flag); bfly3_tw1(ytmp128,ytmp128+324,ytmp128+648,y128,y128+324,y128+648); @@ 
-6464,7 +6850,7 @@ void dft972(int16_t *x,int16_t *y,unsigned char scale_flag) // 324 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<972; i++) { @@ -6477,7 +6863,7 @@ void dft972(int16_t *x,int16_t *y,unsigned char scale_flag) // 324 x 3 static int16_t twa1080[359*2*4]; static int16_t twb1080[359*2*4]; -void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag) // 360 x 3 +void dft1080(int16_t *x,int16_t *y,unsigned int *scale_flag) // 360 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6495,9 +6881,9 @@ void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag) // 360 x 3 x2128[i+720] = x128[j+2]; } - dft360((int16_t *)x2128,(int16_t *)ytmp128,1); - dft360((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),1); - dft360((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),1); + dft360((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft360((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),scale_flag); + dft360((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),scale_flag); bfly3_tw1(ytmp128,ytmp128+360,ytmp128+720,y128,y128+360,y128+720); @@ -6512,7 +6898,7 @@ void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag) // 360 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<1080; i++) { @@ -6526,7 +6912,7 @@ static int16_t twa1152[287*2*4]; static int16_t twb1152[287*2*4]; static int16_t twc1152[287*2*4]; -void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 4 +void dft1152(int16_t *x,int16_t *y,unsigned int *scale_flag) // 288 x 4 { int i,j; @@ -6547,10 +6933,10 @@ void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 4 x2128[i+864] = x128[j+3]; } - dft288((int16_t *)x2128,(int16_t *)ytmp128,1); - dft288((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),1); - dft288((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),1); - dft288((int16_t *)(x2128+864),(int16_t 
*)(ytmp128+864),1); + dft288((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft288((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),scale_flag); + dft288((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),scale_flag); + dft288((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),scale_flag); bfly4_tw1(ytmp128,ytmp128+288,ytmp128+576,ytmp128+864,y128,y128+288,y128+576,y128+864); @@ -6568,7 +6954,7 @@ void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 4 twc128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(16384); // dft_norm_table[13]); for (i=0; i<1152; i++) { @@ -6582,7 +6968,7 @@ int16_t twa1200[4784]; int16_t twb1200[4784]; int16_t twc1200[4784]; -void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft1200(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -6603,10 +6989,10 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+900] = x128[j+3]; } - dft300((int16_t *)x2128,(int16_t *)ytmp128,1); - dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),1); - dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1); - dft300((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),1); + dft300((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),scale_flag); + dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag); + dft300((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),scale_flag); bfly4_tw1(ytmp128,ytmp128+300,ytmp128+600,ytmp128+900,y128,y128+300,y128+600,y128+900); @@ -6624,7 +7010,7 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag) twc128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(16384); // dft_norm_table[13]); for (i=0; i<1200; i++) { y128[i] = mulhi_int16(y128[i],norm128); @@ -6637,7 +7023,7 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t twa1296[431*2*4]; static int16_t twb1296[431*2*4]; -void dft1296(int16_t 
*x,int16_t *y,unsigned char scale_flag) //432 * 3 +void dft1296(int16_t *x,int16_t *y,unsigned int *scale_flag) //432 * 3 { int i,j; @@ -6656,9 +7042,9 @@ void dft1296(int16_t *x,int16_t *y,unsigned char scale_flag) //432 * 3 x2128[i+864] = x128[j+2]; } - dft432((int16_t *)x2128,(int16_t *)ytmp128,1); - dft432((int16_t *)(x2128+432),(int16_t *)(ytmp128+432),1); - dft432((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),1); + dft432((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft432((int16_t *)(x2128+432),(int16_t *)(ytmp128+432),scale_flag); + dft432((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),scale_flag); bfly3_tw1(ytmp128,ytmp128+432,ytmp128+864,y128,y128+432,y128+864); @@ -6673,7 +7059,7 @@ void dft1296(int16_t *x,int16_t *y,unsigned char scale_flag) //432 * 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<1296; i++) { @@ -6687,7 +7073,7 @@ void dft1296(int16_t *x,int16_t *y,unsigned char scale_flag) //432 * 3 static int16_t twa1440[479*2*4]; static int16_t twb1440[479*2*4]; -void dft1440(int16_t *x,int16_t *y,unsigned char scale_flag) // 480 x 3 +void dft1440(int16_t *x,int16_t *y,unsigned int *scale_flag) // 480 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6705,9 +7091,9 @@ void dft1440(int16_t *x,int16_t *y,unsigned char scale_flag) // 480 x 3 x2128[i+960] = x128[j+2]; } - dft480((int16_t *)x2128,(int16_t *)ytmp128,1); - dft480((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),1); - dft480((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),1); + dft480((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft480((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),scale_flag); + dft480((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),scale_flag); bfly3_tw1(ytmp128,ytmp128+480,ytmp128+960,y128,y128+480,y128+960); @@ -6722,7 +7108,7 @@ void dft1440(int16_t *x,int16_t *y,unsigned char scale_flag) // 480 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t 
norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<1440; i++) { @@ -6737,7 +7123,7 @@ static int16_t twb1500[2392]__attribute__((aligned(32))); static int16_t twc1500[2392]__attribute__((aligned(32))); static int16_t twd1500[2392]__attribute__((aligned(32))); -void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft1500(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -6760,11 +7146,11 @@ void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+1200] = x128[j+4]; } - dft300((int16_t *)x2128,(int16_t *)ytmp128,1); - dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),1); - dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1); - dft300((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),1); - dft300((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),1); + dft300((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),scale_flag); + dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag); + dft300((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),scale_flag); + dft300((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),scale_flag); bfly5_tw1(ytmp128,ytmp128+300,ytmp128+600,ytmp128+900,ytmp128+1200,y128,y128+300,y128+600,y128+900,y128+1200); @@ -6785,7 +7171,7 @@ void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag) twd128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[15]); for (i=0; i<1500; i++) { @@ -6798,7 +7184,7 @@ void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t twa1620[539*2*4]; static int16_t twb1620[539*2*4]; -void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag) // 540 x 3 +void dft1620(int16_t *x,int16_t *y,unsigned int *scale_flag) // 540 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6816,9 +7202,9 @@ void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag) // 540 x 3 x2128[i+1080] = x128[j+2]; } - dft540((int16_t *)x2128,(int16_t *)ytmp128,1); - 
dft540((int16_t *)(x2128+540),(int16_t *)(ytmp128+540),1); - dft540((int16_t *)(x2128+1080),(int16_t *)(ytmp128+1080),1); + dft540((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft540((int16_t *)(x2128+540),(int16_t *)(ytmp128+540),scale_flag); + dft540((int16_t *)(x2128+1080),(int16_t *)(ytmp128+1080),scale_flag); bfly3_tw1(ytmp128,ytmp128+540,ytmp128+1080,y128,y128+540,y128+1080); @@ -6833,7 +7219,7 @@ void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag) // 540 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<1620; i++) { @@ -6846,7 +7232,7 @@ void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag) // 540 x 3 static int16_t twa1728[575*2*4]; static int16_t twb1728[575*2*4]; -void dft1728(int16_t *x,int16_t *y,unsigned char scale_flag) // 576 x 3 +void dft1728(int16_t *x,int16_t *y,unsigned int *scale_flag) // 576 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6864,9 +7250,9 @@ void dft1728(int16_t *x,int16_t *y,unsigned char scale_flag) // 576 x 3 x2128[i+1152] = x128[j+2]; } - dft576((int16_t *)x2128,(int16_t *)ytmp128,1); - dft576((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),1); - dft576((int16_t *)(x2128+1152),(int16_t *)(ytmp128+1152),1); + dft576((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft576((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),scale_flag); + dft576((int16_t *)(x2128+1152),(int16_t *)(ytmp128+1152),scale_flag); bfly3_tw1(ytmp128,ytmp128+576,ytmp128+1152,y128,y128+576,y128+1152); @@ -6881,7 +7267,7 @@ void dft1728(int16_t *x,int16_t *y,unsigned char scale_flag) // 576 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<1728; i++) { @@ -6894,7 +7280,7 @@ void dft1728(int16_t *x,int16_t *y,unsigned char scale_flag) // 576 x 3 static int16_t twa1800[599*2*4]; static int16_t twb1800[599*2*4]; -void dft1800(int16_t *x,int16_t *y,unsigned char scale_flag) // 
600 x 3 +void dft1800(int16_t *x,int16_t *y,unsigned int *scale_flag) // 600 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6912,9 +7298,9 @@ void dft1800(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 x 3 x2128[i+1200] = x128[j+2]; } - dft600((int16_t *)x2128,(int16_t *)ytmp128,1); - dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1); - dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),1); + dft600((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag); + dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),scale_flag); bfly3_tw1(ytmp128,ytmp128+600,ytmp128+1200,y128,y128+600,y128+1200); @@ -6929,7 +7315,7 @@ void dft1800(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<1800; i++) { @@ -6943,7 +7329,7 @@ static int16_t twa1920[479*2*4]; static int16_t twb1920[479*2*4]; static int16_t twc1920[479*2*4]; -void dft1920(int16_t *x,int16_t *y,unsigned char scale_flag) // 480 x 4 +void dft1920(int16_t *x,int16_t *y,unsigned int *scale_flag) // 480 x 4 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6963,10 +7349,10 @@ void dft1920(int16_t *x,int16_t *y,unsigned char scale_flag) // 480 x 4 x2128[i+1440] = x128[j+3]; } - dft480((int16_t *)x2128,(int16_t *)ytmp128,1); - dft480((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),1); - dft480((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),1); - dft480((int16_t *)(x2128+1440),(int16_t *)(ytmp128+1440),1); + dft480((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft480((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),scale_flag); + dft480((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),scale_flag); + dft480((int16_t *)(x2128+1440),(int16_t *)(ytmp128+1440),scale_flag); bfly4_tw1(ytmp128,ytmp128+480,ytmp128+960,ytmp128+1440,y128,y128+480,y128+960,y128+1440); @@ -6984,7 +7370,7 @@ void dft1920(int16_t 
*x,int16_t *y,unsigned char scale_flag) // 480 x 4 twc128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[13]); for (i=0; i<1920; i++) { y128[i] = mulhi_int16(y128[i],norm128); @@ -6996,7 +7382,7 @@ void dft1920(int16_t *x,int16_t *y,unsigned char scale_flag) // 480 x 4 static int16_t twa1944[647*2*4]; static int16_t twb1944[647*2*4]; -void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag) // 648 x 3 +void dft1944(int16_t *x,int16_t *y,unsigned int *scale_flag) // 648 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -7014,9 +7400,9 @@ void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag) // 648 x 3 x2128[i+1296] = x128[j+2]; } - dft648((int16_t *)x2128,(int16_t *)ytmp128,1); - dft648((int16_t *)(x2128+648),(int16_t *)(ytmp128+648),1); - dft648((int16_t *)(x2128+1296),(int16_t *)(ytmp128+1296),1); + dft648((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft648((int16_t *)(x2128+648),(int16_t *)(ytmp128+648),scale_flag); + dft648((int16_t *)(x2128+1296),(int16_t *)(ytmp128+1296),scale_flag); bfly3_tw1(ytmp128,ytmp128+648,ytmp128+1296,y128,y128+648,y128+1296); @@ -7031,7 +7417,7 @@ void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag) // 648 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<1944; i++) { @@ -7044,7 +7430,7 @@ void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag) // 648 x 3 static int16_t twa2160[719*2*4]; static int16_t twb2160[719*2*4]; -void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag) // 720 x 3 +void dft2160(int16_t *x,int16_t *y,unsigned int *scale_flag) // 720 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -7062,9 +7448,9 @@ void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag) // 720 x 3 x2128[i+1440] = x128[j+2]; } - dft720((int16_t *)x2128,(int16_t *)ytmp128,1); - dft720((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),1); - dft720((int16_t 
*)(x2128+1440),(int16_t *)(ytmp128+1440),1); + dft720((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft720((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),scale_flag); + dft720((int16_t *)(x2128+1440),(int16_t *)(ytmp128+1440),scale_flag); bfly3_tw1(ytmp128,ytmp128+720,ytmp128+1440,y128,y128+720,y128+1440); @@ -7079,7 +7465,7 @@ void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag) // 720 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<2160; i++) { @@ -7092,7 +7478,7 @@ void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag) // 720 x 3 static int16_t twa2304[767*2*4]; static int16_t twb2304[767*2*4]; -void dft2304(int16_t *x,int16_t *y,unsigned char scale_flag) // 768 x 3 +void dft2304(int16_t *x,int16_t *y,unsigned int *scale_flag) // 768 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -7110,9 +7496,9 @@ void dft2304(int16_t *x,int16_t *y,unsigned char scale_flag) // 768 x 3 x2128[i+1536] = x128[j+2]; } - dft768((int16_t *)x2128,(int16_t *)ytmp128,1); - dft768((int16_t *)(x2128+768),(int16_t *)(ytmp128+768),1); - dft768((int16_t *)(x2128+1536),(int16_t *)(ytmp128+1536),1); + dft768((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft768((int16_t *)(x2128+768),(int16_t *)(ytmp128+768),scale_flag); + dft768((int16_t *)(x2128+1536),(int16_t *)(ytmp128+1536),scale_flag); bfly3_tw1(ytmp128,ytmp128+768,ytmp128+1536,y128,y128+768,y128+1536); @@ -7127,7 +7513,7 @@ void dft2304(int16_t *x,int16_t *y,unsigned char scale_flag) // 768 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<2304; i++) { @@ -7141,7 +7527,7 @@ static int16_t twa2400[599*2*4]; static int16_t twb2400[599*2*4]; static int16_t twc2400[599*2*4]; -void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 x 4 +void dft2400(int16_t *x,int16_t *y,unsigned int *scale_flag) // 600 x 4 { int i,j; @@ -7162,10 +7548,10 @@ 
void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 x 4 x2128[i+1800] = x128[j+3]; } - dft600((int16_t *)x2128,(int16_t *)ytmp128,1); - dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1); - dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),1); - dft600((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),1); + dft600((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag); + dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),scale_flag); + dft600((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),scale_flag); bfly4_tw1(ytmp128,ytmp128+600,ytmp128+1200,ytmp128+1800,y128,y128+600,y128+1200,y128+1800); @@ -7183,7 +7569,7 @@ void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 x 4 twc128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[13]); for (i=0; i<2400; i++) { y128[i] = mulhi_int16(y128[i],norm128); @@ -7195,7 +7581,7 @@ void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 x 4 static int16_t twa2592[863*2*4]; static int16_t twb2592[863*2*4]; -void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag) // 864 x 3 +void dft2592(int16_t *x,int16_t *y,unsigned int *scale_flag) // 864 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -7213,9 +7599,9 @@ void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag) // 864 x 3 x2128[i+1728] = x128[j+2]; } - dft864((int16_t *)x2128,(int16_t *)ytmp128,1); - dft864((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),1); - dft864((int16_t *)(x2128+1728),(int16_t *)(ytmp128+1728),1); + dft864((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft864((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),scale_flag); + dft864((int16_t *)(x2128+1728),(int16_t *)(ytmp128+1728),scale_flag); bfly3_tw1(ytmp128,ytmp128+864,ytmp128+1728,y128,y128+864,y128+1728); @@ -7230,7 +7616,7 @@ void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag) // 864 x 3 twb128+j); } - 
if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<2592; i++) { @@ -7243,7 +7629,7 @@ void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag) // 864 x 3 static int16_t twa2700[899*2*4]; static int16_t twb2700[899*2*4]; -void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag) // 900 x 3 +void dft2700(int16_t *x,int16_t *y,unsigned int *scale_flag) // 900 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -7261,9 +7647,9 @@ void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag) // 900 x 3 x2128[i+1800] = x128[j+2]; } - dft900((int16_t *)x2128,(int16_t *)ytmp128,1); - dft900((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),1); - dft900((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),1); + dft900((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft900((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),scale_flag); + dft900((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),scale_flag); bfly3_tw1(ytmp128,ytmp128+900,ytmp128+1800,y128,y128+900,y128+1800); @@ -7278,7 +7664,7 @@ void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag) // 900 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<2700; i++) { @@ -7291,7 +7677,7 @@ void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag) // 900 x 3 static int16_t twa2880[959*2*4]; static int16_t twb2880[959*2*4]; -void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag) // 960 x 3 +void dft2880(int16_t *x,int16_t *y,unsigned int *scale_flag) // 960 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -7309,9 +7695,9 @@ void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag) // 960 x 3 x2128[i+1920] = x128[j+2]; } - dft960((int16_t *)x2128,(int16_t *)ytmp128,1); - dft960((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),1); - dft960((int16_t *)(x2128+1920),(int16_t *)(ytmp128+1920),1); + dft960((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft960((int16_t 
*)(x2128+960),(int16_t *)(ytmp128+960),scale_flag); + dft960((int16_t *)(x2128+1920),(int16_t *)(ytmp128+1920),scale_flag); bfly3_tw1(ytmp128,ytmp128+960,ytmp128+1920,y128,y128+960,y128+1920); @@ -7326,7 +7712,7 @@ void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag) // 960 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<2880; i++) { @@ -7339,7 +7725,7 @@ void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag) // 960 x 3 static int16_t twa2916[971*2*4]; static int16_t twb2916[971*2*4]; -void dft2916(int16_t *x,int16_t *y,unsigned char scale_flag) // 972 x 3 +void dft2916(int16_t *x,int16_t *y,unsigned int *scale_flag) // 972 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -7357,9 +7743,9 @@ void dft2916(int16_t *x,int16_t *y,unsigned char scale_flag) // 972 x 3 x2128[i+1944] = x128[j+2]; } - dft972((int16_t *)x2128,(int16_t *)ytmp128,1); - dft972((int16_t *)(x2128+972),(int16_t *)(ytmp128+972),1); - dft972((int16_t *)(x2128+1944),(int16_t *)(ytmp128+1944),1); + dft972((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft972((int16_t *)(x2128+972),(int16_t *)(ytmp128+972),scale_flag); + dft972((int16_t *)(x2128+1944),(int16_t *)(ytmp128+1944),scale_flag); bfly3_tw1(ytmp128,ytmp128+972,ytmp128+1944,y128,y128+972,y128+1944); @@ -7374,7 +7760,7 @@ void dft2916(int16_t *x,int16_t *y,unsigned char scale_flag) // 972 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<2916; i++) { @@ -7389,7 +7775,7 @@ static int16_t twb3000[599*8]__attribute__((aligned(32))); static int16_t twc3000[599*8]__attribute__((aligned(32))); static int16_t twd3000[599*8]__attribute__((aligned(32))); -void dft3000(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 * 5 +void dft3000(int16_t *x,int16_t *y,unsigned int *scale_flag) // 600 * 5 { int i,j; @@ -7412,11 +7798,11 @@ void dft3000(int16_t *x,int16_t 
*y,unsigned char scale_flag) // 600 * 5 x2128[i+2400] = x128[j+4]; } - dft600((int16_t *)x2128,(int16_t *)ytmp128,1); - dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1); - dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),1); - dft600((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),1); - dft600((int16_t *)(x2128+2400),(int16_t *)(ytmp128+2400),1); + dft600((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag); + dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),scale_flag); + dft600((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),scale_flag); + dft600((int16_t *)(x2128+2400),(int16_t *)(ytmp128+2400),scale_flag); bfly5_tw1(ytmp128,ytmp128+600,ytmp128+1200,ytmp128+1800,ytmp128+2400,y128,y128+600,y128+1200,y128+1800,y128+2400); @@ -7437,7 +7823,7 @@ void dft3000(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 * 5 twd128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[15]); for (i=0; i<3000; i++) { @@ -7450,7 +7836,7 @@ void dft3000(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 * 5 static int16_t twa3240[1079*2*4]; static int16_t twb3240[1079*2*4]; -void dft3240(int16_t *x,int16_t *y,unsigned char scale_flag) // 1080 x 3 +void dft3240(int16_t *x,int16_t *y,unsigned int *scale_flag) // 1080 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -7468,9 +7854,9 @@ void dft3240(int16_t *x,int16_t *y,unsigned char scale_flag) // 1080 x 3 x2128[i+2160] = x128[j+2]; } - dft1080((int16_t *)x2128,(int16_t *)ytmp128,1); - dft1080((int16_t *)(x2128+1080),(int16_t *)(ytmp128+1080),1); - dft1080((int16_t *)(x2128+2160),(int16_t *)(ytmp128+2160),1); + dft1080((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft1080((int16_t *)(x2128+1080),(int16_t *)(ytmp128+1080),scale_flag); + dft1080((int16_t *)(x2128+2160),(int16_t *)(ytmp128+2160),scale_flag); bfly3_tw1(ytmp128,ytmp128+1080,ytmp128+2160,y128,y128+1080,y128+2160); @@ 
-7485,7 +7871,7 @@ void dft3240(int16_t *x,int16_t *y,unsigned char scale_flag) // 1080 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { const simd_q15_t norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<3240; i++) { @@ -7697,7 +8083,7 @@ int dfts_autoinit(void) #ifndef MR_MAIN -void dft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsigned char scale_flag) +void dft_implementation(uint8_t sizeidx, int16_t *input,int16_t *output,unsigned int *scale_flag) { AssertFatal((sizeidx >= 0 && sizeidx<DFT_SIZE_IDXTABLESIZE),"Invalid dft size index %i\n",sizeidx); int algn=0xF; @@ -7716,7 +8102,7 @@ void dft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsign dft_ftab[sizeidx].func(input,output,scale_flag); }; -void idft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsigned char scale_flag) +void idft_implementation(uint8_t sizeidx, int16_t *input,int16_t *output,unsigned int *scale_flag) { AssertFatal((sizeidx>=0 && sizeidx<DFT_SIZE_IDXTABLESIZE),"Invalid idft size index %i\n",sizeidx); int algn=0xF; @@ -7739,9 +8125,26 @@ void idft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsig #ifdef MR_MAIN #include <string.h> #include <stdio.h> +#include "../../../common/config/config_paramdesc.h" +#include "../openair1/SIMULATION/TOOLS/sim.h" +#include "../common/utils/utils.h" + +configmodule_interface_t *uniqCfg = NULL; +extern int bitrev4096[4096],bitrev2048[2048],bitrev1024[1024],bitrev512[512],bitrev256[256],bitrev128[128]; +void init_bitrev(); +void radix2(cd_t *x, int N); +void normalize(cd_t *x,cd_t *y,int *bitrev, int N); + +void exit_function(const char *file, const char *function, const int line, const char *s, const int assert) { +exit(-1); +} + +int config_get(configmodule_interface_t *cfg, paramdef_t *params, int numparams, char *prefix) { +return(0); +} -#define LOG_M write_output -int write_output(const char *fname,const char *vname,void *data,int length,int dec,char 
format) +//#define LOG_M write_output +int write_file_matlab(const char *fname,const char *vname,void *data,int length,int dec,unsigned int format,int dummy) { FILE *fp=NULL; @@ -7895,7 +8298,36 @@ int write_output(const char *fname,const char *vname,void *data,int length,int d return 0; } +double compute_error(int16_t *x, int16_t *y, int N, int *bitrev, int idft) { + + int i; + cd_t xcd[N],ycd[N]; + + double error=0; + + for (i=0;i<N;i++) { + xcd[i].r = (double)(((int16_t *)x)[i<<1]); + xcd[i].i = (double)(((int16_t *)x)[1+(i<<1)]); + if (idft==1) xcd[i].i=-xcd[i].i; + } + + double input_lev=0; + for (i=0;i<N;i++) input_lev += pow(xcd[i].r,2.0) + pow(xcd[i].i,2.0); + input_lev/=N; + radix2(xcd,N); + normalize(xcd,ycd,bitrev,N); + if (idft==0) for (i=0;i<N;i++) error += pow((ycd[i].r - (double)((int16_t*)y)[i<<1]),2.0) + pow(ycd[i].i-(double)((int16_t*)y)[1+(i<<1)],2.0); + else for (i=0;i<N;i++) error += pow((ycd[i].r - (double)((int16_t*)y)[i<<1]),2.0) + pow(ycd[i].i+(double)((int16_t*)y)[1+(i<<1)],2.0); + return(input_lev/(error/N)); +} +void fill_gauss(c16_t *x,int N,double dBFS) { + + for (int i=0; i < N; i++) { + x[i].r = (int16_t)(gaussZiggurat(0,1.0)*SHRT_MAX*pow(10.0,dBFS*.05)); + x[i].i = (int16_t)(gaussZiggurat(0,1.0)*SHRT_MAX*pow(10.0,dBFS*.05)); + } +} int main(int argc, char**argv) { @@ -7906,8 +8338,12 @@ int main(int argc, char**argv) int i; simd_q15_t *x128=(simd_q15_t*)x,*y128=(simd_q15_t*)y; + double sqnr; + dfts_autoinit(); + init_bitrev(); + set_taus_seed(0); cpu_meas_enabled = 1; /* @@ -8081,7 +8517,6 @@ int main(int argc, char**argv) printf("\n"); memset((void*)&x[0],0,2048*4); - for (i=0; i<2048; i+=4) { ((int16_t*)x)[i<<1] = 1024; ((int16_t*)x)[1+(i<<1)] = 0; @@ -8092,18 +8527,6 @@ int main(int argc, char**argv) ((int16_t*)x)[6+(i<<1)] = 0; ((int16_t*)x)[7+(i<<1)] = -1024; } - /* - for (i=0; i<2048; i+=2) { - ((int16_t*)x)[i<<1] = 1024; - ((int16_t*)x)[1+(i<<1)] = 0; - ((int16_t*)x)[2+(i<<1)] = -1024; - ((int16_t*)x)[3+(i<<1)] = 0; - } - - 
for (i=0;i<2048*2;i++) { - ((int16_t*)x)[i] = i/2;//(int16_t)((taus()&0xffff))>>5; - } - */ memset((void*)&x[0],0,64*sizeof(int32_t)); for (i=2;i<36;i++) { if ((taus() & 1)==0) @@ -8117,7 +8540,8 @@ int main(int argc, char**argv) else ((int16_t*)x)[i] = -364; } - idft64((int16_t *)x,(int16_t *)y,1); + uint32_t scale64 = 3; + idft64((int16_t *)x,(int16_t *)y,&scale64); printf("64-point\n"); @@ -8134,14 +8558,14 @@ int main(int argc, char**argv) - idft64((int16_t *)x,(int16_t *)y,1); - idft64((int16_t *)x,(int16_t *)y,1); - idft64((int16_t *)x,(int16_t *)y,1); + idft64((int16_t *)x,(int16_t *)y,&scale64); + idft64((int16_t *)x,(int16_t *)y,&scale64); + idft64((int16_t *)x,(int16_t *)y,&scale64); reset_meas(&ts); for (i=0; i<10000000; i++) { start_meas(&ts); - idft64((int16_t *)x,(int16_t *)y,1); + idft64((int16_t *)x,(int16_t *)y,&scale64); stop_meas(&ts); } @@ -8186,12 +8610,16 @@ int main(int argc, char**argv) } reset_meas(&ts); + uint32_t scale128_tx[2] = {4,0}; for (i=0; i<10000; i++) { start_meas(&ts); - idft128((int16_t *)x,(int16_t *)y,1); + idft128((int16_t *)x,(int16_t *)y,scale128_tx); stop_meas(&ts); } + sqnr = compute_error((int16_t*)x,(int16_t*)y,128,bitrev128,1); + + printf("128 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr)); printf("\n\n128-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); LOG_M("y128.m","y128",y,128,1,1); LOG_M("x128.m","x128",x,128,1,1); @@ -8227,10 +8655,11 @@ int main(int argc, char**argv) ((int16_t*)x)[i] = -364; } reset_meas(&ts); + uint32_t scale256_tx[3]={4,0}; for (i=0; i<10000; i++) { start_meas(&ts); - idft256((int16_t *)x,(int16_t *)y,1); + idft256((int16_t *)x,(int16_t *)y,scale256_tx); stop_meas(&ts); } @@ -8238,6 +8667,9 @@ int main(int argc, char**argv) LOG_M("y256.m","y256",y,256,1,1); LOG_M("x256.m","x256",x,256,1,1); + sqnr = compute_error((int16_t*)x,(int16_t*)y,256,bitrev256,1); + + printf("256 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr)); memset((void*)&x[0],0,512*sizeof(int32_t)); for 
(i=2;i<302;i++) { if ((taus() & 1)==0) @@ -8253,15 +8685,21 @@ int main(int argc, char**argv) } reset_meas(&ts); + uint32_t scale512_tx[4]={4,1,0}; + for (i=0; i<10000; i++) { start_meas(&ts); - idft512((int16_t *)x,(int16_t *)y,1); + idft512((int16_t *)x,(int16_t *)y,scale512_tx); stop_meas(&ts); } printf("\n\n512-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); LOG_M("y512.m","y512",y,512,1,1); LOG_M("x512.m","x512",x,512,1,1); + sqnr = compute_error((int16_t*)x,(int16_t*)y,512,bitrev512,1); + + printf("512 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr)); + memset((void*)x,0,1024*sizeof(int32_t)); /* printf("X: "); for (i=0;i<64;i++) @@ -8288,9 +8726,10 @@ int main(int argc, char**argv) } reset_meas(&ts); + uint32_t scale1024_tx[4]={4,1,0}; for (i=0; i<10000; i++) { start_meas(&ts); - idft1024((int16_t *)x,(int16_t *)y,1); + idft1024((int16_t *)x,(int16_t *)y,scale1024_tx); stop_meas(&ts); } @@ -8298,6 +8737,9 @@ int main(int argc, char**argv) LOG_M("y1024.m","y1024",y,1024,1,1); LOG_M("x1024.m","x1024",x,1024,1,1); + sqnr = compute_error((int16_t*)x,(int16_t*)y,1024,bitrev1024,1); + + printf("1024 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr)); memset((void*)x,0,1536*sizeof(int32_t)); for (i=2;i<1202;i++) { @@ -8314,15 +8756,16 @@ int main(int argc, char**argv) } reset_meas(&ts); + uint32_t scale1536[4]={1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - idft1536((int16_t *)x,(int16_t *)y,1); + idft1536((int16_t *)x,(int16_t *)y,scale1536); stop_meas(&ts); } printf("\n\n1536-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); - write_output("y1536.m","y1536",y,1536,1,1); - write_output("x1536.m","x1536",x,1536,1,1); + LOG_M("y1536.m","y1536",y,1536,1,1); + LOG_M("x1536.m","x1536",x,1536,1,1); memset((void*)x,0,2048*sizeof(int32_t)); @@ -8340,9 +8783,10 @@ int main(int argc, char**argv) } reset_meas(&ts); + uint32_t scale2048_tx[4]={3,2,1,0}; for (i=0; i<10000; i++) { start_meas(&ts); - dft2048((int16_t *)x,(int16_t *)y,1); + idft2048((int16_t 
*)x,(int16_t *)y,scale2048_tx); stop_meas(&ts); } @@ -8350,6 +8794,9 @@ int main(int argc, char**argv) LOG_M("y2048.m","y2048",y,2048,1,1); LOG_M("x2048.m","x2048",x,2048,1,1); + sqnr = compute_error((int16_t*)x,(int16_t*)y,2048,bitrev2048,1); + + printf("2048 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr)); // NR 80Mhz, 217 PRB, 3/4 sampling memset((void*)x, 0, 3072*sizeof(int32_t)); for (i=2;i<2506;i++) { @@ -8367,15 +8814,16 @@ int main(int argc, char**argv) reset_meas(&ts); + uint32_t scale3072[4]={1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - idft3072((int16_t *)x,(int16_t *)y,1); + idft3072((int16_t *)x,(int16_t *)y,scale3072); stop_meas(&ts); } printf("\n\n3072-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); - write_output("y3072.m","y3072",y,3072,1,1); - write_output("x3072.m","x3072",x,3072,1,1); + LOG_M("y3072.m","y3072",y,3072,1,1); + LOG_M("x3072.m","x3072",x,3072,1,1); memset((void*)x,0,4096*sizeof(int32_t)); @@ -8393,9 +8841,10 @@ int main(int argc, char**argv) } reset_meas(&ts); + uint32_t scale4096_tx[4]={3,2,1,0}; for (i=0; i<10000; i++) { start_meas(&ts); - idft4096((int16_t *)x,(int16_t *)y,1); + idft4096((int16_t *)x,(int16_t *)y,scale4096_tx); stop_meas(&ts); } @@ -8403,9 +8852,29 @@ int main(int argc, char**argv) LOG_M("y4096.m","y4096",y,4096,1,1); LOG_M("x4096.m","x4096",x,4096,1,1); - dft4096((int16_t *)y,(int16_t *)x2,1); - LOG_M("x4096_2.m","x4096_2",x2,4096,1,1); + sqnr = compute_error((int16_t*)x,(int16_t*)y,4096,bitrev4096,1); + printf("4096 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr)); + + float sqrt2 = 0.70711; + float sqrt170 = 0.076696; + + for (i=0;i<2400;i++) { + uint32_t n=taus(); + ((int16_t*)x)[i] = (short)((1-2*(n&1))*(8-(1-2*((n>>1)&1))*(4-(1-2*((n>>2)&1))*(2-(1-2*((n>>3)&1))))))*512*sqrt170*sqrt2; + } + for (i=2*(4096-1200);i<8192;i++) { + uint32_t n=taus(); + ((int16_t*)x)[i] = (short)((1-2*(n&1))*(8-(1-2*((n>>1)&1))*(4-(1-2*((n>>2)&1))*(2-(1-2*((n>>3)&1))))))*512*sqrt170*sqrt2; + } + + uint32_t 
scale4096_tx256qam[4]={3,2,1,0}; + idft4096((int16_t *)x,(int16_t *)y,scale4096_tx256qam); + LOG_M("y4096_256qam.m","y4096_256qam",y,4096,1,1); + LOG_M("x4096_256qam.m","x4096_256qam",x,4096,1,1); + + sqnr = compute_error((int16_t*)x,(int16_t*)y,4096,bitrev4096,1); + printf("4096 point IDFT SQNR (256QAM) : %f dB\n",10*log10(sqnr)); // NR 160Mhz, 434 PRB, 3/4 sampling memset((void*)x, 0, 6144*sizeof(int32_t)); for (i=2;i<5010;i++) { @@ -8423,15 +8892,16 @@ int main(int argc, char**argv) reset_meas(&ts); + uint32_t scale6144[5]={1,1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - idft6144((int16_t *)x,(int16_t *)y,1); + idft6144((int16_t *)x,(int16_t *)y,scale6144); stop_meas(&ts); } printf("\n\n6144-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); - write_output("y6144.m","y6144",y,6144,1,1); - write_output("x6144.m","x6144",x,6144,1,1); + LOG_M("y6144.m","y6144",y,6144,1,1); + LOG_M("x6144.m","x6144",x,6144,1,1); memset((void*)x,0,8192*sizeof(int32_t)); for (i=2;i<4802;i++) { @@ -8447,9 +8917,10 @@ int main(int argc, char**argv) ((int16_t*)x)[i] = -364; } reset_meas(&ts); + uint32_t scale8192[5]={1,1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - idft8192((int16_t *)x,(int16_t *)y,1); + idft8192((int16_t *)x,(int16_t *)y,scale8192); stop_meas(&ts); } @@ -8471,9 +8942,10 @@ int main(int argc, char**argv) ((int16_t*)x)[i] = -364; } reset_meas(&ts); + uint32_t scale16384[5]={1,1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - dft16384((int16_t *)x,(int16_t *)y,1); + dft16384((int16_t *)x,(int16_t *)y,scale16384); stop_meas(&ts); } @@ -8481,82 +8953,6 @@ int main(int argc, char**argv) LOG_M("y16384.m","y16384",y,16384,1,1); LOG_M("x16384.m","x16384",x,16384,1,1); - memset((void*)x,0,1536*sizeof(int32_t)); - for (i=2;i<1202;i++) { - if ((taus() & 1)==0) - ((int16_t*)x)[i] = 364; - else - ((int16_t*)x)[i] = -364; - } - for (i=2*(1536-600);i<3072;i++) { - if ((taus() & 1)==0) - ((int16_t*)x)[i] = 364; - else - ((int16_t*)x)[i] = -364; - } - 
reset_meas(&ts); - for (i=0; i<10000; i++) { - start_meas(&ts); - idft1536((int16_t *)x,(int16_t *)y,1); - stop_meas(&ts); - } - - printf("\n\n1536-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); - LOG_M("y1536.m","y1536",y,1536,1,1); - LOG_M("x1536.m","x1536",x,1536,1,1); - - printf("\n\n1536-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); - LOG_M("y8192.m","y8192",y,8192,1,1); - LOG_M("x8192.m","x8192",x,8192,1,1); - - memset((void*)x,0,3072*sizeof(int32_t)); - for (i=2;i<1202;i++) { - if ((taus() & 1)==0) - ((int16_t*)x)[i] = 364; - else - ((int16_t*)x)[i] = -364; - } - for (i=2*(3072-600);i<3072;i++) { - if ((taus() & 1)==0) - ((int16_t*)x)[i] = 364; - else - ((int16_t*)x)[i] = -364; - } - reset_meas(&ts); - for (i=0; i<10000; i++) { - start_meas(&ts); - idft3072((int16_t *)x,(int16_t *)y,1); - stop_meas(&ts); - } - - printf("\n\n3072-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); - LOG_M("y3072.m","y3072",y,3072,1,1); - LOG_M("x3072.m","x3072",x,3072,1,1); - - memset((void*)x,0,6144*sizeof(int32_t)); - for (i=2;i<4802;i++) { - if ((taus() & 1)==0) - ((int16_t*)x)[i] = 364; - else - ((int16_t*)x)[i] = -364; - } - for (i=2*(6144-2400);i<12288;i++) { - if ((taus() & 1)==0) - ((int16_t*)x)[i] = 364; - else - ((int16_t*)x)[i] = -364; - } - reset_meas(&ts); - for (i=0; i<10000; i++) { - start_meas(&ts); - idft6144((int16_t *)x,(int16_t *)y,1); - stop_meas(&ts); - } - - printf("\n\n6144-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); - LOG_M("y6144.m","y6144",y,6144,1,1); - LOG_M("x6144.m","x6144",x,6144,1,1); - memset((void*)x,0,12288*sizeof(int32_t)); for (i=2;i<9602;i++) { if ((taus() & 1)==0) @@ -8571,9 +8967,10 @@ int main(int argc, char**argv) ((int16_t*)x)[i] = -364; } reset_meas(&ts); + uint32_t scale12288[5]={1,1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - idft12288((int16_t *)x,(int16_t *)y,1); + idft12288((int16_t *)x,(int16_t *)y,scale12288); stop_meas(&ts); } @@ -8595,9 +8992,11 @@ int main(int argc, char**argv) 
((int16_t*)x)[i] = -364; } reset_meas(&ts); + + uint32_t scale18432[6]={1,1,1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - idft18432((int16_t *)x,(int16_t *)y,1); + idft18432((int16_t *)x,(int16_t *)y,scale18432); stop_meas(&ts); } @@ -8619,9 +9018,11 @@ int main(int argc, char**argv) ((int16_t*)x)[i] = -364; } reset_meas(&ts); + + uint32_t scale24576[6]={1,1,1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - idft24576((int16_t *)x,(int16_t *)y,1); + idft24576((int16_t *)x,(int16_t *)y,scale24576); stop_meas(&ts); } @@ -8644,9 +9045,10 @@ int main(int argc, char**argv) ((int16_t*)x)[i] = -364; } reset_meas(&ts); + uint32_t scale36864[6] = {1,1,1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - dft36864((int16_t *)x,(int16_t *)y,1); + dft36864((int16_t *)x,(int16_t *)y,scale36864); stop_meas(&ts); } @@ -8669,9 +9071,10 @@ int main(int argc, char**argv) ((int16_t*)x)[i] = -364; } reset_meas(&ts); + uint32_t scale49152[6]={1,1,1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - idft49152((int16_t *)x,(int16_t *)y,1); + idft49152((int16_t *)x,(int16_t *)y,scale49152); stop_meas(&ts); } @@ -8679,6 +9082,268 @@ int main(int argc, char**argv) LOG_M("y49152.m","y49152",y,49152,1,1); LOG_M("x49152.m","x49152",x,49152,1,1); + memset((void*)x,0,128*sizeof(int32_t)); + for (i=0;i<128;i++) { + ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/128)); + ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/128)); + } + uint32_t scale128_rx[3]={1,3,0}; + dft128((int16_t*)x,(int16_t*)y,scale128_rx); + LOG_M("x128_exp.m","x128_exp",x,128,1,1); + LOG_M("y128_exp.m","y128_exp",y,128,1,1); + sqnr = compute_error((int16_t*)x,(int16_t*)y,128,bitrev128,0); + + printf("128 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr)); + + // scaling optimization + for (double dBFS = -80;dBFS < 0; dBFS+=2) { + int scale128_min[3]; + double sqnr,max_sqnr=-99; + double input_lev=0; + int inputcnt=0; + for (scale128_rx[0]=1;scale128_rx[0]<=4;scale128_rx[0]++) + for 
(scale128_rx[1]=0;scale128_rx[1]<=4-scale128_rx[0];scale128_rx[1]++) { + scale128_rx[2]=4-scale128_rx[0]-scale128_rx[1]; + sqnr=0; + int n; + for (n=0;n<16384/128;n++) { + fill_gauss((c16_t*)x,128,dBFS); + for (i=0;i<128;i++) input_lev += pow((double)(((c16_t*)x)[i].r),2.0) + pow((double)(((c16_t*)x)[i].i),2.0); + dft128((int16_t*)x,(int16_t*)y,scale128_rx); + sqnr += compute_error((int16_t*)x,(int16_t*)y,128,bitrev128,0); + } + sqnr/=n; + inputcnt+=(n*128); + if (sqnr>max_sqnr) { + max_sqnr = sqnr; + scale128_min[0]=scale128_rx[0]; scale128_min[1]=scale128_rx[1]; scale128_min[2]=scale128_rx[2]; + } + } + printf("128-point dBFS %f(input lev %f) dB SQNR %f(%f): (%d,%d,%d)\n",dBFS,10*log10(input_lev/inputcnt),10*log10(max_sqnr),max_sqnr,scale128_min[0],scale128_min[1],scale128_min[2]); + } + + memset((void*)x,0,256*sizeof(int32_t)); + for (i=0;i<256;i++) { + ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/256)); + ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/256)); + } + uint32_t scale256_rx[3]={0,2,2}; + dft256((int16_t*)x,(int16_t*)y,scale256_rx); + LOG_M("x256_exp.m","x256_exp",x,256,1,1); + LOG_M("y256_exp.m","y256_exp",y,256,1,1); + sqnr = compute_error((int16_t*)x,(int16_t*)y,256,bitrev256,0); + + printf("256 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr)); + + // scaling optimization + for (double dBFS = -80;dBFS < 0; dBFS+=2) { + int scale256_min[3]; + double sqnr,max_sqnr=-99; + double input_lev=0; + int inputcnt=0; + for (scale256_rx[0]=0;scale256_rx[0]<=4;scale256_rx[0]++) + for (scale256_rx[1]=0;scale256_rx[1]<=4-scale256_rx[0];scale256_rx[1]++) { + scale256_rx[2]=4-scale256_rx[0]-scale256_rx[1]; + sqnr=0; + int n; + for (n=0;n<16384/256;n++) { + fill_gauss((c16_t*)x,256,dBFS); + for (i=0;i<256;i++) input_lev += pow((double)(((c16_t*)x)[i].r),2.0) + pow((double)(((c16_t*)x)[i].i),2.0); + dft256((int16_t*)x,(int16_t*)y,scale256_rx); + sqnr += compute_error((int16_t*)x,(int16_t*)y,256,bitrev256,0); + } + sqnr/=n; + 
inputcnt+=(n*256); + if (sqnr>max_sqnr) { + max_sqnr = sqnr; + scale256_min[0]=scale256_rx[0]; scale256_min[1]=scale256_rx[1]; scale256_min[2]=scale256_rx[2]; + } + } + printf("256-point dBFS %f(input lev %f) dB SQNR %f(%f): (%d,%d,%d)\n",dBFS,10*log10(input_lev/inputcnt),10*log10(max_sqnr),max_sqnr,scale256_min[0],scale256_min[1],scale256_min[2]); + } + + memset((void*)x,0,512*sizeof(int32_t)); + for (i=0;i<512;i++) { + ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/512)); + ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/512)); + } + uint32_t scale512_rx[4]={1,1,1,2}; + + dft512((int16_t*)x,(int16_t*)y,scale512_rx); + LOG_M("x512_exp.m","x512_exp",x,512,1,1); + LOG_M("y512_exp.m","y512_exp",y,512,1,1); + + sqnr = compute_error((int16_t*)x,(int16_t*)y,512,bitrev512,0); + + printf("512 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr)); + // scaling optimization + for (double dBFS = -80;dBFS < 0; dBFS+=2) { + int scale512_min[4]; + double sqnr,max_sqnr=-99; + double input_lev=0; + int inputcnt=0; + for (scale512_rx[0]=1;scale512_rx[0]<=5;scale512_rx[0]++) + for (scale512_rx[1]=0;scale512_rx[1]<=5-scale512_rx[0];scale512_rx[1]++) + for (scale512_rx[2]=0;scale512_rx[2]<=5-scale512_rx[0]-scale512_rx[1];scale512_rx[2]++) { + scale512_rx[3]=5-scale512_rx[0]-scale512_rx[1]-scale512_rx[2]; + sqnr=0; + int n; + for (n=0;n<16384/512;n++) { + fill_gauss((c16_t*)x,512,dBFS); + for (i=0;i<512;i++) input_lev += pow((double)(((c16_t*)x)[i].r),2.0) + pow((double)(((c16_t*)x)[i].i),2.0); + dft512((int16_t*)x,(int16_t*)y,scale512_rx); + sqnr += compute_error((int16_t*)x,(int16_t*)y,512,bitrev512,0); + } + sqnr/=n; + inputcnt+=(n*512); + if (sqnr>max_sqnr) { + max_sqnr = sqnr; + scale512_min[0]=scale512_rx[0]; scale512_min[1]=scale512_rx[1]; scale512_min[2]=scale512_rx[2]; scale512_min[3]=scale512_rx[3]; + } + } + printf("512-point dBFS %f(input lev %f) dB SQNR %f(%f): 
(%d,%d,%d,%d)\n",dBFS,10*log10(input_lev/inputcnt),10*log10(max_sqnr),max_sqnr,scale512_min[0],scale512_min[1],scale512_min[2],scale512_min[3]); + } + + memset((void*)x,0,1024*sizeof(int32_t)); + uint32_t scale1024_rx[4]={1,2,2,0}; + for (i=0;i<1024;i++) { + ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/1024)); + ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/1024)); + } + dft1024((int16_t*)x,(int16_t*)y,scale1024_rx); + LOG_M("x1024_exp.m","x1024_exp",x,1024,1,1); + LOG_M("y1024_exp.m","y1024_exp",y,1024,1,1); + + sqnr = compute_error((int16_t*)x,(int16_t*)y,1024,bitrev1024,0); + + printf("1024 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr)); + // scaling optimization + for (double dBFS = -80;dBFS < 0; dBFS+=2) { + int scale1024_min[4]; + double sqnr,max_sqnr=-99; + double input_lev=0; + int inputcnt=0; + for (scale1024_rx[0]=0;scale1024_rx[0]<=5;scale1024_rx[0]++) + for (scale1024_rx[1]=0;scale1024_rx[1]<=5-scale1024_rx[0];scale1024_rx[1]++) + for (scale1024_rx[2]=0;scale1024_rx[2]<=5-scale1024_rx[0]-scale1024_rx[1];scale1024_rx[2]++) { + scale1024_rx[3]=5-scale1024_rx[0]-scale1024_rx[1]-scale1024_rx[2]; + sqnr=0; + int n; + for (n=0;n<16384/1024;n++) { + fill_gauss((c16_t*)x,1024,dBFS); + for (i=0;i<1024;i++) input_lev += pow((double)(((c16_t*)x)[i].r),2.0) + pow((double)(((c16_t*)x)[i].i),2.0); + dft1024((int16_t*)x,(int16_t*)y,scale1024_rx); + sqnr += compute_error((int16_t*)x,(int16_t*)y,1024,bitrev1024,0); + } + sqnr/=n; + inputcnt+=(n*1024); + if (sqnr>max_sqnr) { + max_sqnr = sqnr; + scale1024_min[0]=scale1024_rx[0]; scale1024_min[1]=scale1024_rx[1]; scale1024_min[2]=scale1024_rx[2]; scale1024_min[3]=scale1024_rx[3]; + } + } + printf("1024-point dBFS %f(input lev %f) dB SQNR %f(%f): (%d,%d,%d,%d)\n",dBFS,10*log10(input_lev/inputcnt),10*log10(max_sqnr),max_sqnr,scale1024_min[0],scale1024_min[1],scale1024_min[2],scale1024_min[3]); + } + memset((void*)x,0,1536*sizeof(int32_t)); + for (i=0;i<1536;i++) { + ((int16_t*)x)[i<<1] = 
(int16_t)(364 * cos(2*M_PI*3*i/1536)); + ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/1536)); + } + dft1536((int16_t*)x,(int16_t*)y,scale1536); + LOG_M("x1536_exp.m","x1536_exp",x,1536,1,1); + LOG_M("y1536_exp.m","y1536_exp",y,1536,1,1); + + memset((void*)x,0,2048*sizeof(int32_t)); + for (i=0;i<2048;i++) { + ((int16_t*)x)[i<<1] = (int16_t)(384 * cos(2*M_PI*3*i/2048)); + ((int16_t*)x)[1+(i<<1)] = (int16_t)(384 * sin(2*M_PI*3*i/2048)); + } + uint32_t scale2048_rx[5]={1,1,1,1,2}; + + dft2048((int16_t*)x,(int16_t*)y,scale2048_rx); + LOG_M("x2048_exp.m","x2048_exp",x,2048,1,1); + LOG_M("y2048_exp.m","y2048_exp",y,2048,1,1); + + sqnr = compute_error((int16_t*)x,(int16_t*)y,2048,bitrev2048,0); + + printf("2048 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr)); + // scaling optimization + for (double dBFS = -80;dBFS < 0; dBFS+=2) { + int scale2048_min[5]; + double sqnr,max_sqnr=-99; + double input_lev=0; + int inputcnt=0; + for (scale2048_rx[0]=1;scale2048_rx[0]<=6;scale2048_rx[0]++) + for (scale2048_rx[1]=0;scale2048_rx[1]<=6-scale2048_rx[0];scale2048_rx[1]++) + for (scale2048_rx[2]=0;scale2048_rx[2]<=6-scale2048_rx[0]-scale2048_rx[1];scale2048_rx[2]++) + for (scale2048_rx[3]=0;scale2048_rx[3]<=6-scale2048_rx[0]-scale2048_rx[1]-scale2048_rx[2];scale2048_rx[3]++) { + scale2048_rx[4]=6-scale2048_rx[0]-scale2048_rx[1]-scale2048_rx[2]-scale2048_rx[3]; + sqnr=0; + int n; + for (n=0;n<16384/2048;n++) { + fill_gauss((c16_t*)x,2048,dBFS); + for (i=0;i<2048;i++) input_lev += pow((double)(((c16_t*)x)[i].r),2.0) + pow((double)(((c16_t*)x)[i].i),2.0); + dft2048((int16_t*)x,(int16_t*)y,scale2048_rx); + sqnr += compute_error((int16_t*)x,(int16_t*)y,2048,bitrev2048,0); + } + sqnr/=n; + inputcnt+=(n*2048); + if (sqnr>max_sqnr) { + max_sqnr = sqnr; + scale2048_min[0]=scale2048_rx[0]; scale2048_min[1]=scale2048_rx[1]; scale2048_min[2]=scale2048_rx[2]; scale2048_min[3]=scale2048_rx[3]; scale2048_min[4]=scale2048_rx[4]; + } + } + printf("2048-point dBFS %f(input lev %f) 
dB SQNR %f(%f): (%d,%d,%d,%d,%d)\n",dBFS,10*log10(input_lev/inputcnt),10*log10(max_sqnr),max_sqnr,scale2048_min[0],scale2048_min[1],scale2048_min[2],scale2048_min[3],scale2048_min[4]); + } + memset((void*)x,0,3072*sizeof(int32_t)); + for (i=0;i<3072;i++) { + ((int16_t*)x)[i<<1] = (int16_t)(200 * cos(2*M_PI*3*i/3072)); + ((int16_t*)x)[1+(i<<1)] = (int16_t)(200 * sin(2*M_PI*3*i/3072)); + } + dft3072((int16_t*)x,(int16_t*)y,scale3072); + LOG_M("x3072_exp.m","x3072_exp",x,3072,1,1); + LOG_M("y3072_exp.m","y3072_exp",y,3072,1,1); + + memset((void*)x,0,4096*sizeof(int32_t)); + for (i=0;i<4096;i++) { + ((int16_t*)x)[i<<1] = (int16_t)(384 * cos(2*M_PI*331*i/4096)); + ((int16_t*)x)[1+(i<<1)] = (int16_t)(384 * sin(2*M_PI*331*i/4096)); + } + uint32_t scale4096_rx[5]={0,1,1,2,2}; + dft4096((int16_t*)x,(int16_t*)y,scale4096_rx); + LOG_M("x4096_exp.m","x4096_exp",x,4096,1,1); + LOG_M("y4096_exp.m","y4096_exp",y,4096,1,1); + + sqnr = compute_error((int16_t*)x,(int16_t*)y,4096,bitrev4096,0); + printf("4096 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr)); + // scaling optimization + for (double dBFS = -80;dBFS < 0; dBFS+=2) { + int scale4096_min[5]; + double sqnr,max_sqnr=-99; + double input_lev=0; + int inputcnt=0; + for (scale4096_rx[0]=0;scale4096_rx[0]<=6;scale4096_rx[0]++) + for (scale4096_rx[1]=0;scale4096_rx[1]<=6-scale4096_rx[0];scale4096_rx[1]++) + for (scale4096_rx[2]=0;scale4096_rx[2]<=6-scale4096_rx[0]-scale4096_rx[1];scale4096_rx[2]++) + for (scale4096_rx[3]=0;scale4096_rx[3]<=6-scale4096_rx[0]-scale4096_rx[1]-scale4096_rx[2];scale4096_rx[3]++) { + scale4096_rx[4]=6-scale4096_rx[0]-scale4096_rx[1]-scale4096_rx[2]-scale4096_rx[3]; + sqnr=0; + int n; + for (n=0;n<16384/4096;n++) { + fill_gauss((c16_t*)x,4096,dBFS); + for (i=0;i<4096;i++) input_lev += pow((double)(((c16_t*)x)[i].r),2.0) + pow((double)(((c16_t*)x)[i].i),2.0); + dft4096((int16_t*)x,(int16_t*)y,scale4096_rx); + sqnr += compute_error((int16_t*)x,(int16_t*)y,4096,bitrev4096,0); + } + sqnr/=n; + 
inputcnt+=(n*4096); + if (sqnr>max_sqnr) { + max_sqnr = sqnr; + scale4096_min[0]=scale4096_rx[0]; scale4096_min[1]=scale4096_rx[1]; scale4096_min[2]=scale4096_rx[2]; scale4096_min[3]=scale4096_rx[3];scale4096_min[4]=scale4096_rx[4]; + } + } + printf("4096-point dBFS %f(input lev %f) dB SQNR %f(%f): (%d,%d,%d,%d,%d)\n",dBFS,10*log10(input_lev/inputcnt),10*log10(max_sqnr),max_sqnr,scale4096_min[0],scale4096_min[1],scale4096_min[2],scale4096_min[3],scale4096_min[4]); + } return(0); } diff --git a/openair1/PHY/TOOLS/oai_dfts_neon.c b/openair1/PHY/TOOLS/oai_dfts_neon.c index ddf8a59bf524dd99c0a44cb08d535c86a115abeb..b746a3d628d9398a3278ae46653a2dd364aca3d4 100644 --- a/openair1/PHY/TOOLS/oai_dfts_neon.c +++ b/openair1/PHY/TOOLS/oai_dfts_neon.c @@ -925,7 +925,7 @@ const static int16_t tw64c[96] __attribute__((aligned(32))) = { #define set1_int16(a) vdupq_n_s16(a) #define mulhi_int16(a,b) vqdmulhq_s16(a,b); -void dft64(int16_t *x,int16_t *y,unsigned char scale) +void dft64(int16_t *x,int16_t *y,unsigned int *scale) { simd_q15_t xtmp[16],ytmp[16],*tw64a_128=(simd_q15_t *)tw64a,*tw64b_128=(simd_q15_t *)tw64b,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y; @@ -1018,23 +1018,24 @@ void dft64(int16_t *x,int16_t *y,unsigned char scale) #endif - if (scale>0) { - y128[0] = shiftright_int16(y128[0],3); - y128[1] = shiftright_int16(y128[1],3); - y128[2] = shiftright_int16(y128[2],3); - y128[3] = shiftright_int16(y128[3],3); - y128[4] = shiftright_int16(y128[4],3); - y128[5] = shiftright_int16(y128[5],3); - y128[6] = shiftright_int16(y128[6],3); - y128[7] = shiftright_int16(y128[7],3); - y128[8] = shiftright_int16(y128[8],3); - y128[9] = shiftright_int16(y128[9],3); - y128[10] = shiftright_int16(y128[10],3); - y128[11] = shiftright_int16(y128[11],3); - y128[12] = shiftright_int16(y128[12],3); - y128[13] = shiftright_int16(y128[13],3); - y128[14] = shiftright_int16(y128[14],3); - y128[15] = shiftright_int16(y128[15],3); + if (scale && *scale>0) { + unsigned int scalec=*scale; + y128[0] 
= shiftright_int16(y128[0],scalec); + y128[1] = shiftright_int16(y128[1],scalec); + y128[2] = shiftright_int16(y128[2],scalec); + y128[3] = shiftright_int16(y128[3],scalec); + y128[4] = shiftright_int16(y128[4],scalec); + y128[5] = shiftright_int16(y128[5],scalec); + y128[6] = shiftright_int16(y128[6],scalec); + y128[7] = shiftright_int16(y128[7],scalec); + y128[8] = shiftright_int16(y128[8],scalec); + y128[9] = shiftright_int16(y128[9],scalec); + y128[10] = shiftright_int16(y128[10],scalec); + y128[11] = shiftright_int16(y128[11],scalec); + y128[12] = shiftright_int16(y128[12],scalec); + y128[13] = shiftright_int16(y128[13],scalec); + y128[14] = shiftright_int16(y128[14],scalec); + y128[15] = shiftright_int16(y128[15],scalec); } @@ -1042,7 +1043,7 @@ void dft64(int16_t *x,int16_t *y,unsigned char scale) } -void idft64(int16_t *x,int16_t *y,unsigned char scale) +void idft64(int16_t *x,int16_t *y,unsigned int *scale) { simd_q15_t xtmp[16],ytmp[16],*tw64a_128=(simd_q15_t *)tw64,*tw64b_128=(simd_q15_t *)tw64c,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y; @@ -1111,24 +1112,24 @@ void idft64(int16_t *x,int16_t *y,unsigned char scale) #endif - if (scale>0) { - - y128[0] = shiftright_int16(y128[0],3); - y128[1] = shiftright_int16(y128[1],3); - y128[2] = shiftright_int16(y128[2],3); - y128[3] = shiftright_int16(y128[3],3); - y128[4] = shiftright_int16(y128[4],3); - y128[5] = shiftright_int16(y128[5],3); - y128[6] = shiftright_int16(y128[6],3); - y128[7] = shiftright_int16(y128[7],3); - y128[8] = shiftright_int16(y128[8],3); - y128[9] = shiftright_int16(y128[9],3); - y128[10] = shiftright_int16(y128[10],3); - y128[11] = shiftright_int16(y128[11],3); - y128[12] = shiftright_int16(y128[12],3); - y128[13] = shiftright_int16(y128[13],3); - y128[14] = shiftright_int16(y128[14],3); - y128[15] = shiftright_int16(y128[15],3); + if (scale && *scale>0) { + unsigned int scalec=*scale; + y128[0] = shiftright_int16(y128[0],scalec); + y128[1] = shiftright_int16(y128[1],scalec); + y128[2] 
= shiftright_int16(y128[2],scalec); + y128[3] = shiftright_int16(y128[3],scalec); + y128[4] = shiftright_int16(y128[4],scalec); + y128[5] = shiftright_int16(y128[5],scalec); + y128[6] = shiftright_int16(y128[6],scalec); + y128[7] = shiftright_int16(y128[7],scalec); + y128[8] = shiftright_int16(y128[8],scalec); + y128[9] = shiftright_int16(y128[9],scalec); + y128[10] = shiftright_int16(y128[10],scalec); + y128[11] = shiftright_int16(y128[11],scalec); + y128[12] = shiftright_int16(y128[12],scalec); + y128[13] = shiftright_int16(y128[13],scalec); + y128[14] = shiftright_int16(y128[14],scalec); + y128[15] = shiftright_int16(y128[15],scalec); } @@ -1145,7 +1146,7 @@ int16_t tw128b[128] __attribute__((aligned(32))) = {0,32767,-1608,32727,-3212,32 int16_t tw128c[128] __attribute__((aligned(32))) = {0,32767,1608,32727,3212,32609,4808,32412,6393,32137,7962,31785,9512,31356,11039,30851,12540,30272,14010,29621,15447,28897,16846,28105,18205,27244,19520,26318,20788,25329,22005,24278,23170,23169,24279,22004,25330,20787,26319,19519,27245,18204,28106,16845,28898,15446,29622,14009,30273,12539,30852,11038,31357,9511,31786,7961,32138,6392,32413,4807,32610,3211,32728,1607,32767,0,32728,-1608,32610,-3212,32413,-4808,32138,-6393,31786,-7962,31357,-9512,30852,-11039,30273,-12540,29622,-14010,28898,-15447,28106,-16846,27245,-18205,26319,-19520,25330,-20788,24279,-22005,23170,-23170,22005,-24279,20788,-25330,19520,-26319,18205,-27245,16846,-28106,15447,-28898,14010,-29622,12540,-30273,11039,-30852,9512,-31357,7962,-31786,6393,-32138,4808,-32413,3212,-32610,1608,-32728}; -void dft128(int16_t *x,int16_t *y,unsigned char scale) +void dft128(int16_t *x,int16_t *y,unsigned int *scale) { simdshort_q15_t xtmp[64],*x64 = (simdshort_q15_t *)x; @@ -1188,8 +1189,10 @@ void dft128(int16_t *x,int16_t *y,unsigned char scale) transpose4_ooff(x64+60,xtmp+30,32); transpose4_ooff(x64+62,xtmp+31,32); - dft64((int16_t*)(xtmp),(int16_t*)ytmp,1); - dft64((int16_t*)(xtmp+32),(int16_t*)(ytmp+16),1); + unsigned 
int *scale64=NULL; + if (scale) scale64=scale+1; + dft64((int16_t*)(xtmp),(int16_t*)ytmp,scale64); + dft64((int16_t*)(xtmp+32),(int16_t*)(ytmp+16),scale64); #ifndef MR_MAIN if (LOG_DUMPFLAG(DEBUG_DFT)) { LOG_M("dft128a.m","dfta",ytmp,64,1,1); @@ -1207,41 +1210,76 @@ void dft128(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { - - y128[0] = mulhi_int16(y128[0],ONE_OVER_SQRT2_Q15_128); - y128[1] = mulhi_int16(y128[1],ONE_OVER_SQRT2_Q15_128); - y128[2] = mulhi_int16(y128[2],ONE_OVER_SQRT2_Q15_128); - y128[3] = mulhi_int16(y128[3],ONE_OVER_SQRT2_Q15_128); - y128[4] = mulhi_int16(y128[4],ONE_OVER_SQRT2_Q15_128); - y128[5] = mulhi_int16(y128[5],ONE_OVER_SQRT2_Q15_128); - y128[6] = mulhi_int16(y128[6],ONE_OVER_SQRT2_Q15_128); - y128[7] = mulhi_int16(y128[7],ONE_OVER_SQRT2_Q15_128); - y128[8] = mulhi_int16(y128[8],ONE_OVER_SQRT2_Q15_128); - y128[9] = mulhi_int16(y128[9],ONE_OVER_SQRT2_Q15_128); - y128[10] = mulhi_int16(y128[10],ONE_OVER_SQRT2_Q15_128); - y128[11] = mulhi_int16(y128[11],ONE_OVER_SQRT2_Q15_128); - y128[12] = mulhi_int16(y128[12],ONE_OVER_SQRT2_Q15_128); - y128[13] = mulhi_int16(y128[13],ONE_OVER_SQRT2_Q15_128); - y128[14] = mulhi_int16(y128[14],ONE_OVER_SQRT2_Q15_128); - y128[15] = mulhi_int16(y128[15],ONE_OVER_SQRT2_Q15_128); - y128[16] = mulhi_int16(y128[16],ONE_OVER_SQRT2_Q15_128); - y128[17] = mulhi_int16(y128[17],ONE_OVER_SQRT2_Q15_128); - y128[18] = mulhi_int16(y128[18],ONE_OVER_SQRT2_Q15_128); - y128[19] = mulhi_int16(y128[19],ONE_OVER_SQRT2_Q15_128); - y128[20] = mulhi_int16(y128[20],ONE_OVER_SQRT2_Q15_128); - y128[21] = mulhi_int16(y128[21],ONE_OVER_SQRT2_Q15_128); - y128[22] = mulhi_int16(y128[22],ONE_OVER_SQRT2_Q15_128); - y128[23] = mulhi_int16(y128[23],ONE_OVER_SQRT2_Q15_128); - y128[24] = mulhi_int16(y128[24],ONE_OVER_SQRT2_Q15_128); - y128[25] = mulhi_int16(y128[25],ONE_OVER_SQRT2_Q15_128); - y128[26] = mulhi_int16(y128[26],ONE_OVER_SQRT2_Q15_128); - y128[27] = mulhi_int16(y128[27],ONE_OVER_SQRT2_Q15_128); - y128[28] = 
mulhi_int16(y128[28],ONE_OVER_SQRT2_Q15_128); - y128[29] = mulhi_int16(y128[29],ONE_OVER_SQRT2_Q15_128); - y128[30] = mulhi_int16(y128[30],ONE_OVER_SQRT2_Q15_128); - y128[31] = mulhi_int16(y128[31],ONE_OVER_SQRT2_Q15_128); - + if (scale && *scale>0) { + if (*scale>1) { + uint32_t scale2=*scale-1; + y128[0] = mulhi_int16(shiftright_int16(y128[0],scale2),ONE_OVER_SQRT2_Q15_128); + y128[1] = mulhi_int16(shiftright_int16(y128[1],scale2),ONE_OVER_SQRT2_Q15_128); + y128[2] = mulhi_int16(shiftright_int16(y128[2],scale2),ONE_OVER_SQRT2_Q15_128); + y128[3] = mulhi_int16(shiftright_int16(y128[3],scale2),ONE_OVER_SQRT2_Q15_128); + y128[4] = mulhi_int16(shiftright_int16(y128[4],scale2),ONE_OVER_SQRT2_Q15_128); + y128[5] = mulhi_int16(shiftright_int16(y128[5],scale2),ONE_OVER_SQRT2_Q15_128); + y128[6] = mulhi_int16(shiftright_int16(y128[6],scale2),ONE_OVER_SQRT2_Q15_128); + y128[7] = mulhi_int16(shiftright_int16(y128[7],scale2),ONE_OVER_SQRT2_Q15_128); + y128[8] = mulhi_int16(shiftright_int16(y128[8],scale2),ONE_OVER_SQRT2_Q15_128); + y128[9] = mulhi_int16(shiftright_int16(y128[9],scale2),ONE_OVER_SQRT2_Q15_128); + y128[10] = mulhi_int16(shiftright_int16(y128[10],scale2),ONE_OVER_SQRT2_Q15_128); + y128[11] = mulhi_int16(shiftright_int16(y128[11],scale2),ONE_OVER_SQRT2_Q15_128); + y128[12] = mulhi_int16(shiftright_int16(y128[12],scale2),ONE_OVER_SQRT2_Q15_128); + y128[13] = mulhi_int16(shiftright_int16(y128[13],scale2),ONE_OVER_SQRT2_Q15_128); + y128[14] = mulhi_int16(shiftright_int16(y128[14],scale2),ONE_OVER_SQRT2_Q15_128); + y128[15] = mulhi_int16(shiftright_int16(y128[15],scale2),ONE_OVER_SQRT2_Q15_128); + y128[16] = mulhi_int16(shiftright_int16(y128[16],scale2),ONE_OVER_SQRT2_Q15_128); + y128[17] = mulhi_int16(shiftright_int16(y128[17],scale2),ONE_OVER_SQRT2_Q15_128); + y128[18] = mulhi_int16(shiftright_int16(y128[18],scale2),ONE_OVER_SQRT2_Q15_128); + y128[19] = mulhi_int16(shiftright_int16(y128[19],scale2),ONE_OVER_SQRT2_Q15_128); + y128[20] = 
mulhi_int16(shiftright_int16(y128[20],scale2),ONE_OVER_SQRT2_Q15_128); + y128[21] = mulhi_int16(shiftright_int16(y128[21],scale2),ONE_OVER_SQRT2_Q15_128); + y128[22] = mulhi_int16(shiftright_int16(y128[22],scale2),ONE_OVER_SQRT2_Q15_128); + y128[23] = mulhi_int16(shiftright_int16(y128[23],scale2),ONE_OVER_SQRT2_Q15_128); + y128[24] = mulhi_int16(shiftright_int16(y128[24],scale2),ONE_OVER_SQRT2_Q15_128); + y128[25] = mulhi_int16(shiftright_int16(y128[25],scale2),ONE_OVER_SQRT2_Q15_128); + y128[26] = mulhi_int16(shiftright_int16(y128[26],scale2),ONE_OVER_SQRT2_Q15_128); + y128[27] = mulhi_int16(shiftright_int16(y128[27],scale2),ONE_OVER_SQRT2_Q15_128); + y128[28] = mulhi_int16(shiftright_int16(y128[28],scale2),ONE_OVER_SQRT2_Q15_128); + y128[29] = mulhi_int16(shiftright_int16(y128[29],scale2),ONE_OVER_SQRT2_Q15_128); + y128[30] = mulhi_int16(shiftright_int16(y128[30],scale2),ONE_OVER_SQRT2_Q15_128); + y128[31] = mulhi_int16(shiftright_int16(y128[31],scale2),ONE_OVER_SQRT2_Q15_128); + } + else { + y128[0] = mulhi_int16(y128[0],ONE_OVER_SQRT2_Q15_128); + y128[1] = mulhi_int16(y128[1],ONE_OVER_SQRT2_Q15_128); + y128[2] = mulhi_int16(y128[2],ONE_OVER_SQRT2_Q15_128); + y128[3] = mulhi_int16(y128[3],ONE_OVER_SQRT2_Q15_128); + y128[4] = mulhi_int16(y128[4],ONE_OVER_SQRT2_Q15_128); + y128[5] = mulhi_int16(y128[5],ONE_OVER_SQRT2_Q15_128); + y128[6] = mulhi_int16(y128[6],ONE_OVER_SQRT2_Q15_128); + y128[7] = mulhi_int16(y128[7],ONE_OVER_SQRT2_Q15_128); + y128[8] = mulhi_int16(y128[8],ONE_OVER_SQRT2_Q15_128); + y128[9] = mulhi_int16(y128[9],ONE_OVER_SQRT2_Q15_128); + y128[10] = mulhi_int16(y128[10],ONE_OVER_SQRT2_Q15_128); + y128[11] = mulhi_int16(y128[11],ONE_OVER_SQRT2_Q15_128); + y128[12] = mulhi_int16(y128[12],ONE_OVER_SQRT2_Q15_128); + y128[13] = mulhi_int16(y128[13],ONE_OVER_SQRT2_Q15_128); + y128[14] = mulhi_int16(y128[14],ONE_OVER_SQRT2_Q15_128); + y128[15] = mulhi_int16(y128[15],ONE_OVER_SQRT2_Q15_128); + y128[16] = mulhi_int16(y128[16],ONE_OVER_SQRT2_Q15_128); + 
y128[17] = mulhi_int16(y128[17],ONE_OVER_SQRT2_Q15_128); + y128[18] = mulhi_int16(y128[18],ONE_OVER_SQRT2_Q15_128); + y128[19] = mulhi_int16(y128[19],ONE_OVER_SQRT2_Q15_128); + y128[20] = mulhi_int16(y128[20],ONE_OVER_SQRT2_Q15_128); + y128[21] = mulhi_int16(y128[21],ONE_OVER_SQRT2_Q15_128); + y128[22] = mulhi_int16(y128[22],ONE_OVER_SQRT2_Q15_128); + y128[23] = mulhi_int16(y128[23],ONE_OVER_SQRT2_Q15_128); + y128[24] = mulhi_int16(y128[24],ONE_OVER_SQRT2_Q15_128); + y128[25] = mulhi_int16(y128[25],ONE_OVER_SQRT2_Q15_128); + y128[26] = mulhi_int16(y128[26],ONE_OVER_SQRT2_Q15_128); + y128[27] = mulhi_int16(y128[27],ONE_OVER_SQRT2_Q15_128); + y128[28] = mulhi_int16(y128[28],ONE_OVER_SQRT2_Q15_128); + y128[29] = mulhi_int16(y128[29],ONE_OVER_SQRT2_Q15_128); + y128[30] = mulhi_int16(y128[30],ONE_OVER_SQRT2_Q15_128); + y128[31] = mulhi_int16(y128[31],ONE_OVER_SQRT2_Q15_128); + } } #ifndef MR_MAIN @@ -1252,7 +1290,7 @@ void dft128(int16_t *x,int16_t *y,unsigned char scale) #endif } -void idft128(int16_t *x,int16_t *y,unsigned char scale) +void idft128(int16_t *x,int16_t *y,unsigned int *scale) { simdshort_q15_t xtmp[64],*x64 = (simdshort_q15_t *)x; @@ -1295,8 +1333,10 @@ void idft128(int16_t *x,int16_t *y,unsigned char scale) transpose4_ooff(x64+60,xtmp+30,32); transpose4_ooff(x64+62,xtmp+31,32); - idft64((int16_t*)(xtmp),(int16_t*)ytmp,1); - idft64((int16_t*)(xtmp+32),(int16_t*)(ytmp+16),1); + unsigned int *scale64=NULL; + if (scale) scale64=scale+1; + idft64((int16_t*)(xtmp),(int16_t*)ytmp,scale64); + idft64((int16_t*)(xtmp+32),(int16_t*)(ytmp+16),scale64); for (i=0; i<16; i++) { @@ -1308,41 +1348,76 @@ void idft128(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { - - y128[0] = mulhi_int16(y128[0],ONE_OVER_SQRT2_Q15_128); - y128[1] = mulhi_int16(y128[1],ONE_OVER_SQRT2_Q15_128); - y128[2] = mulhi_int16(y128[2],ONE_OVER_SQRT2_Q15_128); - y128[3] = mulhi_int16(y128[3],ONE_OVER_SQRT2_Q15_128); - y128[4] = mulhi_int16(y128[4],ONE_OVER_SQRT2_Q15_128); - 
y128[5] = mulhi_int16(y128[5],ONE_OVER_SQRT2_Q15_128); - y128[6] = mulhi_int16(y128[6],ONE_OVER_SQRT2_Q15_128); - y128[7] = mulhi_int16(y128[7],ONE_OVER_SQRT2_Q15_128); - y128[8] = mulhi_int16(y128[8],ONE_OVER_SQRT2_Q15_128); - y128[9] = mulhi_int16(y128[9],ONE_OVER_SQRT2_Q15_128); - y128[10] = mulhi_int16(y128[10],ONE_OVER_SQRT2_Q15_128); - y128[11] = mulhi_int16(y128[11],ONE_OVER_SQRT2_Q15_128); - y128[12] = mulhi_int16(y128[12],ONE_OVER_SQRT2_Q15_128); - y128[13] = mulhi_int16(y128[13],ONE_OVER_SQRT2_Q15_128); - y128[14] = mulhi_int16(y128[14],ONE_OVER_SQRT2_Q15_128); - y128[15] = mulhi_int16(y128[15],ONE_OVER_SQRT2_Q15_128); - y128[16] = mulhi_int16(y128[16],ONE_OVER_SQRT2_Q15_128); - y128[17] = mulhi_int16(y128[17],ONE_OVER_SQRT2_Q15_128); - y128[18] = mulhi_int16(y128[18],ONE_OVER_SQRT2_Q15_128); - y128[19] = mulhi_int16(y128[19],ONE_OVER_SQRT2_Q15_128); - y128[20] = mulhi_int16(y128[20],ONE_OVER_SQRT2_Q15_128); - y128[21] = mulhi_int16(y128[21],ONE_OVER_SQRT2_Q15_128); - y128[22] = mulhi_int16(y128[22],ONE_OVER_SQRT2_Q15_128); - y128[23] = mulhi_int16(y128[23],ONE_OVER_SQRT2_Q15_128); - y128[24] = mulhi_int16(y128[24],ONE_OVER_SQRT2_Q15_128); - y128[25] = mulhi_int16(y128[25],ONE_OVER_SQRT2_Q15_128); - y128[26] = mulhi_int16(y128[26],ONE_OVER_SQRT2_Q15_128); - y128[27] = mulhi_int16(y128[27],ONE_OVER_SQRT2_Q15_128); - y128[28] = mulhi_int16(y128[28],ONE_OVER_SQRT2_Q15_128); - y128[29] = mulhi_int16(y128[29],ONE_OVER_SQRT2_Q15_128); - y128[30] = mulhi_int16(y128[30],ONE_OVER_SQRT2_Q15_128); - y128[31] = mulhi_int16(y128[31],ONE_OVER_SQRT2_Q15_128); - + if (scale && *scale>0) { + if (*scale>1) { + uint32_t scale2=*scale-1; + y128[0] = mulhi_int16(shiftright_int16(y128[0],scale2),ONE_OVER_SQRT2_Q15_128); + y128[1] = mulhi_int16(shiftright_int16(y128[1],scale2),ONE_OVER_SQRT2_Q15_128); + y128[2] = mulhi_int16(shiftright_int16(y128[2],scale2),ONE_OVER_SQRT2_Q15_128); + y128[3] = mulhi_int16(shiftright_int16(y128[3],scale2),ONE_OVER_SQRT2_Q15_128); + y128[4] = 
mulhi_int16(shiftright_int16(y128[4],scale2),ONE_OVER_SQRT2_Q15_128); + y128[5] = mulhi_int16(shiftright_int16(y128[5],scale2),ONE_OVER_SQRT2_Q15_128); + y128[6] = mulhi_int16(shiftright_int16(y128[6],scale2),ONE_OVER_SQRT2_Q15_128); + y128[7] = mulhi_int16(shiftright_int16(y128[7],scale2),ONE_OVER_SQRT2_Q15_128); + y128[8] = mulhi_int16(shiftright_int16(y128[8],scale2),ONE_OVER_SQRT2_Q15_128); + y128[9] = mulhi_int16(shiftright_int16(y128[9],scale2),ONE_OVER_SQRT2_Q15_128); + y128[10] = mulhi_int16(shiftright_int16(y128[10],scale2),ONE_OVER_SQRT2_Q15_128); + y128[11] = mulhi_int16(shiftright_int16(y128[11],scale2),ONE_OVER_SQRT2_Q15_128); + y128[12] = mulhi_int16(shiftright_int16(y128[12],scale2),ONE_OVER_SQRT2_Q15_128); + y128[13] = mulhi_int16(shiftright_int16(y128[13],scale2),ONE_OVER_SQRT2_Q15_128); + y128[14] = mulhi_int16(shiftright_int16(y128[14],scale2),ONE_OVER_SQRT2_Q15_128); + y128[15] = mulhi_int16(shiftright_int16(y128[15],scale2),ONE_OVER_SQRT2_Q15_128); + y128[16] = mulhi_int16(shiftright_int16(y128[16],scale2),ONE_OVER_SQRT2_Q15_128); + y128[17] = mulhi_int16(shiftright_int16(y128[17],scale2),ONE_OVER_SQRT2_Q15_128); + y128[18] = mulhi_int16(shiftright_int16(y128[18],scale2),ONE_OVER_SQRT2_Q15_128); + y128[19] = mulhi_int16(shiftright_int16(y128[19],scale2),ONE_OVER_SQRT2_Q15_128); + y128[20] = mulhi_int16(shiftright_int16(y128[20],scale2),ONE_OVER_SQRT2_Q15_128); + y128[21] = mulhi_int16(shiftright_int16(y128[21],scale2),ONE_OVER_SQRT2_Q15_128); + y128[22] = mulhi_int16(shiftright_int16(y128[22],scale2),ONE_OVER_SQRT2_Q15_128); + y128[23] = mulhi_int16(shiftright_int16(y128[23],scale2),ONE_OVER_SQRT2_Q15_128); + y128[24] = mulhi_int16(shiftright_int16(y128[24],scale2),ONE_OVER_SQRT2_Q15_128); + y128[25] = mulhi_int16(shiftright_int16(y128[25],scale2),ONE_OVER_SQRT2_Q15_128); + y128[26] = mulhi_int16(shiftright_int16(y128[26],scale2),ONE_OVER_SQRT2_Q15_128); + y128[27] = mulhi_int16(shiftright_int16(y128[27],scale2),ONE_OVER_SQRT2_Q15_128); + 
y128[28] = mulhi_int16(shiftright_int16(y128[28],scale2),ONE_OVER_SQRT2_Q15_128); + y128[29] = mulhi_int16(shiftright_int16(y128[29],scale2),ONE_OVER_SQRT2_Q15_128); + y128[30] = mulhi_int16(shiftright_int16(y128[30],scale2),ONE_OVER_SQRT2_Q15_128); + y128[31] = mulhi_int16(shiftright_int16(y128[31],scale2),ONE_OVER_SQRT2_Q15_128); + } + else { + y128[0] = mulhi_int16(y128[0],ONE_OVER_SQRT2_Q15_128); + y128[1] = mulhi_int16(y128[1],ONE_OVER_SQRT2_Q15_128); + y128[2] = mulhi_int16(y128[2],ONE_OVER_SQRT2_Q15_128); + y128[3] = mulhi_int16(y128[3],ONE_OVER_SQRT2_Q15_128); + y128[4] = mulhi_int16(y128[4],ONE_OVER_SQRT2_Q15_128); + y128[5] = mulhi_int16(y128[5],ONE_OVER_SQRT2_Q15_128); + y128[6] = mulhi_int16(y128[6],ONE_OVER_SQRT2_Q15_128); + y128[7] = mulhi_int16(y128[7],ONE_OVER_SQRT2_Q15_128); + y128[8] = mulhi_int16(y128[8],ONE_OVER_SQRT2_Q15_128); + y128[9] = mulhi_int16(y128[9],ONE_OVER_SQRT2_Q15_128); + y128[10] = mulhi_int16(y128[10],ONE_OVER_SQRT2_Q15_128); + y128[11] = mulhi_int16(y128[11],ONE_OVER_SQRT2_Q15_128); + y128[12] = mulhi_int16(y128[12],ONE_OVER_SQRT2_Q15_128); + y128[13] = mulhi_int16(y128[13],ONE_OVER_SQRT2_Q15_128); + y128[14] = mulhi_int16(y128[14],ONE_OVER_SQRT2_Q15_128); + y128[15] = mulhi_int16(y128[15],ONE_OVER_SQRT2_Q15_128); + y128[16] = mulhi_int16(y128[16],ONE_OVER_SQRT2_Q15_128); + y128[17] = mulhi_int16(y128[17],ONE_OVER_SQRT2_Q15_128); + y128[18] = mulhi_int16(y128[18],ONE_OVER_SQRT2_Q15_128); + y128[19] = mulhi_int16(y128[19],ONE_OVER_SQRT2_Q15_128); + y128[20] = mulhi_int16(y128[20],ONE_OVER_SQRT2_Q15_128); + y128[21] = mulhi_int16(y128[21],ONE_OVER_SQRT2_Q15_128); + y128[22] = mulhi_int16(y128[22],ONE_OVER_SQRT2_Q15_128); + y128[23] = mulhi_int16(y128[23],ONE_OVER_SQRT2_Q15_128); + y128[24] = mulhi_int16(y128[24],ONE_OVER_SQRT2_Q15_128); + y128[25] = mulhi_int16(y128[25],ONE_OVER_SQRT2_Q15_128); + y128[26] = mulhi_int16(y128[26],ONE_OVER_SQRT2_Q15_128); + y128[27] = mulhi_int16(y128[27],ONE_OVER_SQRT2_Q15_128); + y128[28] = 
mulhi_int16(y128[28],ONE_OVER_SQRT2_Q15_128); + y128[29] = mulhi_int16(y128[29],ONE_OVER_SQRT2_Q15_128); + y128[30] = mulhi_int16(y128[30],ONE_OVER_SQRT2_Q15_128); + y128[31] = mulhi_int16(y128[31],ONE_OVER_SQRT2_Q15_128); + } } } @@ -1361,7 +1436,7 @@ int16_t tw256b[384] __attribute__((aligned(32))) = {0,32767,-805,32757,-1608,327 0,32767,-1608,32727,-3212,32609,-4808,32412,-6393,32137,-7962,31785,-9512,31356,-11039,30851,-12540,30272,-14010,29621,-15447,28897,-16846,28105,-18205,27244,-19520,26318,-20788,25329,-22005,24278,-23170,23169,-24279,22004,-25330,20787,-26319,19519,-27245,18204,-28106,16845,-28898,15446,-29622,14009,-30273,12539,-30852,11038,-31357,9511,-31786,7961,-32138,6392,-32413,4807,-32610,3211,-32728,1607,-32767,0,-32728,-1608,-32610,-3212,-32413,-4808,-32138,-6393,-31786,-7962,-31357,-9512,-30852,-11039,-30273,-12540,-29622,-14010,-28898,-15447,-28106,-16846,-27245,-18205,-26319,-19520,-25330,-20788,-24279,-22005,-23170,-23170,-22005,-24279,-20788,-25330,-19520,-26319,-18205,-27245,-16846,-28106,-15447,-28898,-14010,-29622,-12540,-30273,-11039,-30852,-9512,-31357,-7962,-31786,-6393,-32138,-4808,-32413,-3212,-32610,-1608,-32728, 0,32767,-2411,32678,-4808,32412,-7180,31970,-9512,31356,-11793,30571,-14010,29621,-16151,28510,-18205,27244,-20160,25831,-22005,24278,-23732,22594,-25330,20787,-26790,18867,-28106,16845,-29269,14732,-30273,12539,-31114,10278,-31786,7961,-32285,5601,-32610,3211,-32758,804,-32728,-1608,-32521,-4012,-32138,-6393,-31581,-8740,-30852,-11039,-29956,-13279,-28898,-15447,-27684,-17531,-26319,-19520,-24812,-21403,-23170,-23170,-21403,-24812,-19520,-26319,-17531,-27684,-15447,-28898,-13279,-29956,-11039,-30852,-8740,-31581,-6393,-32138,-4012,-32521,-1608,-32728,804,-32758,3211,-32610,5601,-32285,7961,-31786,10278,-31114,12539,-30273,14732,-29269,16845,-28106,18867,-26790,20787,-25330,22594,-23732,24278,-22005,25831,-20160,27244,-18205,28510,-16151,29621,-14010,30571,-11793,31356,-9512,31970,-7180,32412,-4808,32678,-2411 }; -void 
dft256(int16_t *x,int16_t *y,unsigned char scale) +void dft256(int16_t *x,int16_t *y,unsigned int *scale) { simd_q15_t xtmp[64],ytmp[64],*tw256a_128p=(simd_q15_t *)tw256a,*tw256b_128p=(simd_q15_t *)tw256b,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; @@ -1403,10 +1478,12 @@ void dft256(int16_t *x,int16_t *y,unsigned char scale) start_meas(&ts_d); #endif - dft64((int16_t*)(xtmp),(int16_t*)(ytmp),1); - dft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),1); - dft64((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),1); - dft64((int16_t*)(xtmp+48),(int16_t*)(ytmp+48),1); + unsigned int *scale64=NULL; + if (scale) scale64=scale+1; + dft64((int16_t*)(xtmp),(int16_t*)(ytmp),scale64); + dft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),scale64); + dft64((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),scale64); + dft64((int16_t*)(xtmp+48),(int16_t*)(ytmp+48),scale64); #ifdef D256STATS stop_meas(&ts_d); @@ -1445,25 +1522,26 @@ void dft256(int16_t *x,int16_t *y,unsigned char scale) #endif #endif - if (scale>0) { + if (scale && *scale>0) { + unsigned int scalec=*scale; for (i=0; i<4; i++) { - y128[0] = shiftright_int16(y128[0],1); - y128[1] = shiftright_int16(y128[1],1); - y128[2] = shiftright_int16(y128[2],1); - y128[3] = shiftright_int16(y128[3],1); - y128[4] = shiftright_int16(y128[4],1); - y128[5] = shiftright_int16(y128[5],1); - y128[6] = shiftright_int16(y128[6],1); - y128[7] = shiftright_int16(y128[7],1); - y128[8] = shiftright_int16(y128[8],1); - y128[9] = shiftright_int16(y128[9],1); - y128[10] = shiftright_int16(y128[10],1); - y128[11] = shiftright_int16(y128[11],1); - y128[12] = shiftright_int16(y128[12],1); - y128[13] = shiftright_int16(y128[13],1); - y128[14] = shiftright_int16(y128[14],1); - y128[15] = shiftright_int16(y128[15],1); + y128[0] = shiftright_int16(y128[0],scalec); + y128[1] = shiftright_int16(y128[1],scalec); + y128[2] = shiftright_int16(y128[2],scalec); + y128[3] = shiftright_int16(y128[3],scalec); + y128[4] = shiftright_int16(y128[4],scalec); + y128[5] = 
shiftright_int16(y128[5],scalec); + y128[6] = shiftright_int16(y128[6],scalec); + y128[7] = shiftright_int16(y128[7],scalec); + y128[8] = shiftright_int16(y128[8],scalec); + y128[9] = shiftright_int16(y128[9],scalec); + y128[10] = shiftright_int16(y128[10],scalec); + y128[11] = shiftright_int16(y128[11],scalec); + y128[12] = shiftright_int16(y128[12],scalec); + y128[13] = shiftright_int16(y128[13],scalec); + y128[14] = shiftright_int16(y128[14],scalec); + y128[15] = shiftright_int16(y128[15],scalec); y128+=16; } @@ -1473,7 +1551,7 @@ void dft256(int16_t *x,int16_t *y,unsigned char scale) -void idft256(int16_t *x,int16_t *y,unsigned char scale) +void idft256(int16_t *x,int16_t *y,unsigned int *scale) { simd_q15_t xtmp[64],ytmp[64],*tw256_128p=(simd_q15_t *)tw256,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; @@ -1485,10 +1563,12 @@ void idft256(int16_t *x,int16_t *y,unsigned char scale) } - idft64((int16_t*)(xtmp),(int16_t*)(ytmp),1); - idft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),1); - idft64((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),1); - idft64((int16_t*)(xtmp+48),(int16_t*)(ytmp+48),1); + unsigned int *scale64=NULL; + if (scale) scale64=scale+1; + idft64((int16_t*)(xtmp),(int16_t*)(ytmp),scale64); + idft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),scale64); + idft64((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),scale64); + idft64((int16_t*)(xtmp+48),(int16_t*)(ytmp+48),scale64); for (i=0; i<16; i++) { ibfly4(ytmpp,ytmpp+16,ytmpp+32,ytmpp+48, @@ -1499,25 +1579,26 @@ void idft256(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { + unsigned int scalec=*scale; for (i=0; i<4; i++) { - y128[0] = shiftright_int16(y128[0],1); - y128[1] = shiftright_int16(y128[1],1); - y128[2] = shiftright_int16(y128[2],1); - y128[3] = shiftright_int16(y128[3],1); - y128[4] = shiftright_int16(y128[4],1); - y128[5] = shiftright_int16(y128[5],1); - y128[6] = shiftright_int16(y128[6],1); - y128[7] = shiftright_int16(y128[7],1); - 
y128[8] = shiftright_int16(y128[8],1); - y128[9] = shiftright_int16(y128[9],1); - y128[10] = shiftright_int16(y128[10],1); - y128[11] = shiftright_int16(y128[11],1); - y128[12] = shiftright_int16(y128[12],1); - y128[13] = shiftright_int16(y128[13],1); - y128[14] = shiftright_int16(y128[14],1); - y128[15] = shiftright_int16(y128[15],1); + y128[0] = shiftright_int16(y128[0],scalec); + y128[1] = shiftright_int16(y128[1],scalec); + y128[2] = shiftright_int16(y128[2],scalec); + y128[3] = shiftright_int16(y128[3],scalec); + y128[4] = shiftright_int16(y128[4],scalec); + y128[5] = shiftright_int16(y128[5],scalec); + y128[6] = shiftright_int16(y128[6],scalec); + y128[7] = shiftright_int16(y128[7],scalec); + y128[8] = shiftright_int16(y128[8],scalec); + y128[9] = shiftright_int16(y128[9],scalec); + y128[10] = shiftright_int16(y128[10],scalec); + y128[11] = shiftright_int16(y128[11],scalec); + y128[12] = shiftright_int16(y128[12],scalec); + y128[13] = shiftright_int16(y128[13],scalec); + y128[14] = shiftright_int16(y128[14],scalec); + y128[15] = shiftright_int16(y128[15],scalec); y128+=16; } @@ -1542,7 +1623,7 @@ int16_t tw512c[512] __attribute__((aligned(32))) = { 
0,32767,403,32764,805,32757,1207,32744,1608,32727,2010,32705,2411,32678,2812,32646,3212,32609,3612,32567,4012,32520,4410,32468,4808,32412,5206,32350,5602,32284,5998,32213,6393,32137,6787,32056,7180,31970,7572,31880,7962,31785,8352,31684,8740,31580,9127,31470,9512,31356,9896,31236,10279,31113,10660,30984,11039,30851,11417,30713,11793,30571,12167,30424,12540,30272,12910,30116,13279,29955,13646,29790,14010,29621,14373,29446,14733,29268,15091,29085,15447,28897,15800,28706,16151,28510,16500,28309,16846,28105,17190,27896,17531,27683,17869,27466,18205,27244,18538,27019,18868,26789,19195,26556,19520,26318,19841,26077,20160,25831,20475,25582,20788,25329,21097,25072,21403,24811,21706,24546,22005,24278,22302,24006,22595,23731,22884,23452,23170,23169,23453,22883,23732,22594,24007,22301,24279,22004,24547,21705,24812,21402,25073,21096,25330,20787,25583,20474,25832,20159,26078,19840,26319,19519,26557,19194,26790,18867,27020,18537,27245,18204,27467,17868,27684,17530,27897,17189,28106,16845,28310,16499,28511,16150,28707,15799,28898,15446,29086,15090,29269,14732,29447,14372,29622,14009,29791,13645,29956,13278,30117,12909,30273,12539,30425,12166,30572,11792,30714,11416,30852,11038,30985,10659,31114,10278,31237,9895,31357,9511,31471,9126,31581,8739,31685,8351,31786,7961,31881,7571,31971,7179,32057,6786,32138,6392,32214,5997,32285,5601,32351,5205,32413,4807,32469,4409,32521,4011,32568,3611,32610,3211,32647,2811,32679,2410,32706,2009,32728,1607,32745,1206,32758,804,32765,402,32767,0,32765,-403,32758,-805,32745,-1207,32728,-1608,32706,-2010,32679,-2411,32647,-2812,32610,-3212,32568,-3612,32521,-4012,32469,-4410,32413,-4808,32351,-5206,32285,-5602,32214,-5998,32138,-6393,32057,-6787,31971,-7180,31881,-7572,31786,-7962,31685,-8352,31581,-8740,31471,-9127,31357,-9512,31237,-9896,31114,-10279,30985,-10660,30852,-11039,30714,-11417,30572,-11793,30425,-12167,30273,-12540,30117,-12910,29956,-13279,29791,-13646,29622,-14010,29447,-14373,29269,-14733,29086,-15091,28898,-15447,28707,-15800,28511,-1
6151,28310,-16500,28106,-16846,27897,-17190,27684,-17531,27467,-17869,27245,-18205,27020,-18538,26790,-18868,26557,-19195,26319,-19520,26078,-19841,25832,-20160,25583,-20475,25330,-20788,25073,-21097,24812,-21403,24547,-21706,24279,-22005,24007,-22302,23732,-22595,23453,-22884,23170,-23170,22884,-23453,22595,-23732,22302,-24007,22005,-24279,21706,-24547,21403,-24812,21097,-25073,20788,-25330,20475,-25583,20160,-25832,19841,-26078,19520,-26319,19195,-26557,18868,-26790,18538,-27020,18205,-27245,17869,-27467,17531,-27684,17190,-27897,16846,-28106,16500,-28310,16151,-28511,15800,-28707,15447,-28898,15091,-29086,14733,-29269,14373,-29447,14010,-29622,13646,-29791,13279,-29956,12910,-30117,12540,-30273,12167,-30425,11793,-30572,11417,-30714,11039,-30852,10660,-30985,10279,-31114,9896,-31237,9512,-31357,9127,-31471,8740,-31581,8352,-31685,7962,-31786,7572,-31881,7180,-31971,6787,-32057,6393,-32138,5998,-32214,5602,-32285,5206,-32351,4808,-32413,4410,-32469,4012,-32521,3612,-32568,3212,-32610,2812,-32647,2411,-32679,2010,-32706,1608,-32728,1207,-32745,805,-32758,403,-32765 }; -void dft512(int16_t *x,int16_t *y,unsigned char scale) +void dft512(int16_t *x,int16_t *y,unsigned int *scale) { simdshort_q15_t xtmp[256],*xtmpp,*x64 = (simdshort_q15_t *)x; @@ -1590,8 +1671,10 @@ void dft512(int16_t *x,int16_t *y,unsigned char scale) xtmpp+=32; } - dft256((int16_t*)(xtmp),(int16_t*)ytmp,1); - dft256((int16_t*)(xtmp+128),(int16_t*)(ytmp+64),1); + unsigned int *scale256=NULL; + if (scale) scale256=scale+1; + dft256((int16_t*)(xtmp),(int16_t*)ytmp,scale256); + dft256((int16_t*)(xtmp+128),(int16_t*)(ytmp+64),scale256); for (i=0; i<64; i+=8) { @@ -1633,32 +1716,56 @@ void dft512(int16_t *x,int16_t *y,unsigned char scale) ytmpp+=8; } - if (scale>0) { + if (scale && *scale>0) { y128p = y128; - for (i=0; i<8; i++) { - y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[2] = 
mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p+=16; + if (*scale>1) { + uint32_t scale2=*scale-1; + for (i=0; i<8; i++) { + y128p[0] = mulhi_int16(shiftright_int16(y128p[0],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(shiftright_int16(y128p[1],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(shiftright_int16(y128p[2],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(shiftright_int16(y128p[3],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(shiftright_int16(y128p[4],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(shiftright_int16(y128p[5],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(shiftright_int16(y128p[6],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(shiftright_int16(y128p[7],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(shiftright_int16(y128p[8],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(shiftright_int16(y128p[9],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(shiftright_int16(y128p[10],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(shiftright_int16(y128p[11],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[12] = 
mulhi_int16(shiftright_int16(y128p[12],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(shiftright_int16(y128p[13],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(shiftright_int16(y128p[14],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(shiftright_int16(y128p[15],scale2),ONE_OVER_SQRT2_Q15_128); + y128p+=16; + } + } + else { + for (i=0; i<8; i++) { + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); + y128p+=16; + } } } } -void idft512(int16_t *x,int16_t *y,unsigned char scale) +void idft512(int16_t *x,int16_t *y,unsigned int *scale) { simdshort_q15_t xtmp[256],*xtmpp,*x64 = (simdshort_q15_t *)x; @@ -1706,8 +1813,10 @@ void idft512(int16_t *x,int16_t *y,unsigned char scale) xtmpp+=32; } - idft256((int16_t*)(xtmp),(int16_t*)ytmp,1); - idft256((int16_t*)(xtmp+128),(int16_t*)(ytmp+64),1); + unsigned int *scale256=NULL; + if (scale) scale256=scale+1; + idft256((int16_t*)(xtmp),(int16_t*)ytmp,scale256); + idft256((int16_t*)(xtmp+128),(int16_t*)(ytmp+64),scale256); for (i=0; i<64; i++) { @@ -1719,34 +1828,58 @@ void idft512(int16_t *x,int16_t 
*y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { y128p = y128; - for (i=0; i<8; i++) { - y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p+=16; + if (*scale>1) { + uint32_t scale2=*scale-1; + for (i=0; i<8; i++) { + y128p[0] = mulhi_int16(shiftright_int16(y128p[0],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(shiftright_int16(y128p[1],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(shiftright_int16(y128p[2],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(shiftright_int16(y128p[3],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(shiftright_int16(y128p[4],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(shiftright_int16(y128p[5],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(shiftright_int16(y128p[6],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(shiftright_int16(y128p[7],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(shiftright_int16(y128p[8],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(shiftright_int16(y128p[9],scale2),ONE_OVER_SQRT2_Q15_128); + 
y128p[10] = mulhi_int16(shiftright_int16(y128p[10],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(shiftright_int16(y128p[11],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(shiftright_int16(y128p[12],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(shiftright_int16(y128p[13],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(shiftright_int16(y128p[14],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(shiftright_int16(y128p[15],scale2),ONE_OVER_SQRT2_Q15_128); + y128p+=16; + } + } + else { + for (i=0; i<8; i++) { + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); + y128p+=16; + } } } } int16_t tw1024[1536] __attribute__((aligned(32))); -void dft1024(int16_t *x,int16_t *y,unsigned char scale) +void dft1024(int16_t *x,int16_t *y,unsigned int *scale) { simd_q15_t xtmp[256],ytmp[256],*tw1024_128p=(simd_q15_t *)tw1024,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; @@ -1758,10 +1891,12 @@ void dft1024(int16_t *x,int16_t *y,unsigned char scale) } - dft256((int16_t*)(xtmp),(int16_t*)(ytmp),1); - 
dft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),1); - dft256((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),1); - dft256((int16_t*)(xtmp+192),(int16_t*)(ytmp+192),1); + unsigned int *scale256=NULL; + if (scale) scale256=scale+1; + dft256((int16_t*)(xtmp),(int16_t*)(ytmp),scale256); + dft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),scale256); + dft256((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),scale256); + dft256((int16_t*)(xtmp+192),(int16_t*)(ytmp+192),scale256); for (i=0; i<64; i++) { bfly4(ytmpp,ytmpp+64,ytmpp+128,ytmpp+192, @@ -1772,32 +1907,33 @@ void dft1024(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { + unsigned int scalec=*scale; for (i=0; i<16; i++) { - y128[0] = shiftright_int16(y128[0],1); - y128[1] = shiftright_int16(y128[1],1); - y128[2] = shiftright_int16(y128[2],1); - y128[3] = shiftright_int16(y128[3],1); - y128[4] = shiftright_int16(y128[4],1); - y128[5] = shiftright_int16(y128[5],1); - y128[6] = shiftright_int16(y128[6],1); - y128[7] = shiftright_int16(y128[7],1); - y128[8] = shiftright_int16(y128[8],1); - y128[9] = shiftright_int16(y128[9],1); - y128[10] = shiftright_int16(y128[10],1); - y128[11] = shiftright_int16(y128[11],1); - y128[12] = shiftright_int16(y128[12],1); - y128[13] = shiftright_int16(y128[13],1); - y128[14] = shiftright_int16(y128[14],1); - y128[15] = shiftright_int16(y128[15],1); + y128[0] = shiftright_int16(y128[0],scalec); + y128[1] = shiftright_int16(y128[1],scalec); + y128[2] = shiftright_int16(y128[2],scalec); + y128[3] = shiftright_int16(y128[3],scalec); + y128[4] = shiftright_int16(y128[4],scalec); + y128[5] = shiftright_int16(y128[5],scalec); + y128[6] = shiftright_int16(y128[6],scalec); + y128[7] = shiftright_int16(y128[7],scalec); + y128[8] = shiftright_int16(y128[8],scalec); + y128[9] = shiftright_int16(y128[9],scalec); + y128[10] = shiftright_int16(y128[10],scalec); + y128[11] = shiftright_int16(y128[11],scalec); + y128[12] = shiftright_int16(y128[12],scalec); + y128[13] = 
shiftright_int16(y128[13],scalec); + y128[14] = shiftright_int16(y128[14],scalec); + y128[15] = shiftright_int16(y128[15],scalec); y128+=16; } } } -void idft1024(int16_t *x,int16_t *y,unsigned char scale) +void idft1024(int16_t *x,int16_t *y,unsigned int *scale) { simd_q15_t xtmp[256],ytmp[256],*tw1024_128p=(simd_q15_t *)tw1024,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; @@ -1809,10 +1945,12 @@ void idft1024(int16_t *x,int16_t *y,unsigned char scale) } - idft256((int16_t*)(xtmp),(int16_t*)(ytmp),1); - idft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),1); - idft256((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),1); - idft256((int16_t*)(xtmp+192),(int16_t*)(ytmp+192),1); + unsigned int *scale256=NULL; + if (scale) scale256=scale+1; + idft256((int16_t*)(xtmp),(int16_t*)(ytmp),scale256); + idft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),scale256); + idft256((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),scale256); + idft256((int16_t*)(xtmp+192),(int16_t*)(ytmp+192),scale256); for (i=0; i<64; i++) { ibfly4(ytmpp,ytmpp+64,ytmpp+128,ytmpp+192, @@ -1823,25 +1961,26 @@ void idft1024(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { + unsigned int scalec=*scale; for (i=0; i<16; i++) { - y128[0] = shiftright_int16(y128[0],1); - y128[1] = shiftright_int16(y128[1],1); - y128[2] = shiftright_int16(y128[2],1); - y128[3] = shiftright_int16(y128[3],1); - y128[4] = shiftright_int16(y128[4],1); - y128[5] = shiftright_int16(y128[5],1); - y128[6] = shiftright_int16(y128[6],1); - y128[7] = shiftright_int16(y128[7],1); - y128[8] = shiftright_int16(y128[8],1); - y128[9] = shiftright_int16(y128[9],1); - y128[10] = shiftright_int16(y128[10],1); - y128[11] = shiftright_int16(y128[11],1); - y128[12] = shiftright_int16(y128[12],1); - y128[13] = shiftright_int16(y128[13],1); - y128[14] = shiftright_int16(y128[14],1); - y128[15] = shiftright_int16(y128[15],1); + y128[0] = shiftright_int16(y128[0],scalec); + y128[1] = 
shiftright_int16(y128[1],scalec); + y128[2] = shiftright_int16(y128[2],scalec); + y128[3] = shiftright_int16(y128[3],scalec); + y128[4] = shiftright_int16(y128[4],scalec); + y128[5] = shiftright_int16(y128[5],scalec); + y128[6] = shiftright_int16(y128[6],scalec); + y128[7] = shiftright_int16(y128[7],scalec); + y128[8] = shiftright_int16(y128[8],scalec); + y128[9] = shiftright_int16(y128[9],scalec); + y128[10] = shiftright_int16(y128[10],scalec); + y128[11] = shiftright_int16(y128[11],scalec); + y128[12] = shiftright_int16(y128[12],scalec); + y128[13] = shiftright_int16(y128[13],scalec); + y128[14] = shiftright_int16(y128[14],scalec); + y128[15] = shiftright_int16(y128[15],scalec); y128+=16; } @@ -1850,7 +1989,7 @@ void idft1024(int16_t *x,int16_t *y,unsigned char scale) int16_t tw2048[2048] __attribute__((aligned(32))); -void dft2048(int16_t *x,int16_t *y,unsigned char scale) +void dft2048(int16_t *x,int16_t *y,unsigned int *scale) { simdshort_q15_t xtmp[1024],*xtmpp,*x64 = (simdshort_q15_t *)x; @@ -1898,8 +2037,10 @@ void dft2048(int16_t *x,int16_t *y,unsigned char scale) xtmpp+=32; } - dft1024((int16_t*)(xtmp),(int16_t*)ytmp,1); - dft1024((int16_t*)(xtmp+512),(int16_t*)(ytmp+256),1); + unsigned int *scale1024=NULL; + if (scale) scale1024=scale+1; + dft1024((int16_t*)(xtmp),(int16_t*)ytmp,scale1024); + dft1024((int16_t*)(xtmp+512),(int16_t*)(ytmp+256),scale1024); for (i=0; i<256; i++) { @@ -1911,32 +2052,56 @@ void dft2048(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { y128p = y128; - for (i=0; i<32; i++) { - y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); - 
y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p+=16; + if (*scale>1) { + uint32_t scale2=*scale-1; + for (i=0; i<32; i++) { + y128p[0] = mulhi_int16(shiftright_int16(y128p[0],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(shiftright_int16(y128p[1],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(shiftright_int16(y128p[2],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(shiftright_int16(y128p[3],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(shiftright_int16(y128p[4],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(shiftright_int16(y128p[5],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(shiftright_int16(y128p[6],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(shiftright_int16(y128p[7],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(shiftright_int16(y128p[8],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(shiftright_int16(y128p[9],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(shiftright_int16(y128p[10],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(shiftright_int16(y128p[11],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(shiftright_int16(y128p[12],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(shiftright_int16(y128p[13],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(shiftright_int16(y128p[14],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(shiftright_int16(y128p[15],scale2),ONE_OVER_SQRT2_Q15_128); + y128p+=16; + } + 
} + else { + for (i=0; i<32; i++) { + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); + y128p+=16; + } } } } -void idft2048(int16_t *x,int16_t *y,unsigned char scale) +void idft2048(int16_t *x,int16_t *y,unsigned int *scale) { simdshort_q15_t xtmp[1024],*xtmpp,*x64 = (simdshort_q15_t *)x; @@ -1984,8 +2149,10 @@ void idft2048(int16_t *x,int16_t *y,unsigned char scale) xtmpp+=32; } - idft1024((int16_t*)(xtmp),(int16_t*)ytmp,1); - idft1024((int16_t*)(xtmp+512),(int16_t*)(ytmp+256),1); + unsigned int *scale1024=NULL; + if (scale) scale1024=scale+1; + idft1024((int16_t*)(xtmp),(int16_t*)ytmp,scale1024); + idft1024((int16_t*)(xtmp+512),(int16_t*)(ytmp+256),scale1024); for (i=0; i<256; i++) { @@ -1997,34 +2164,58 @@ void idft2048(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { y128p = y128; - for (i=0; i<32; i++) { - y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[3] = 
mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p+=16; + if (*scale>1) { + uint32_t scale2=*scale-1; + for (i=0; i<32; i++) { + y128p[0] = mulhi_int16(shiftright_int16(y128p[0],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(shiftright_int16(y128p[1],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(shiftright_int16(y128p[2],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(shiftright_int16(y128p[3],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(shiftright_int16(y128p[4],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(shiftright_int16(y128p[5],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(shiftright_int16(y128p[6],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(shiftright_int16(y128p[7],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(shiftright_int16(y128p[8],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(shiftright_int16(y128p[9],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(shiftright_int16(y128p[10],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(shiftright_int16(y128p[11],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(shiftright_int16(y128p[12],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[13] = 
mulhi_int16(shiftright_int16(y128p[13],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(shiftright_int16(y128p[14],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(shiftright_int16(y128p[15],scale2),ONE_OVER_SQRT2_Q15_128); + y128p+=16; + } + } + else { + for (i=0; i<32; i++) { + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); + y128p+=16; + } } } } int16_t tw4096[3*2*1024]; -void dft4096(int16_t *x,int16_t *y,unsigned char scale) +void dft4096(int16_t *x,int16_t *y,unsigned int *scale) { simd_q15_t xtmp[1024],ytmp[1024],*tw4096_128p=(simd_q15_t *)tw4096,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; @@ -2036,10 +2227,12 @@ void dft4096(int16_t *x,int16_t *y,unsigned char scale) } - dft1024((int16_t*)(xtmp),(int16_t*)(ytmp),1); - dft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),1); - dft1024((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),1); - dft1024((int16_t*)(xtmp+768),(int16_t*)(ytmp+768),1); + unsigned int *scale1024=NULL; + if (scale) scale1024=scale+1; + dft1024((int16_t*)(xtmp),(int16_t*)(ytmp),scale1024); + 
dft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),scale1024); + dft1024((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),scale1024); + dft1024((int16_t*)(xtmp+768),(int16_t*)(ytmp+768),scale1024); for (i=0; i<256; i++) { bfly4(ytmpp,ytmpp+256,ytmpp+512,ytmpp+768, @@ -2050,25 +2243,26 @@ void dft4096(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { + unsigned int scalec=*scale; for (i=0; i<64; i++) { - y128[0] = shiftright_int16(y128[0],1); - y128[1] = shiftright_int16(y128[1],1); - y128[2] = shiftright_int16(y128[2],1); - y128[3] = shiftright_int16(y128[3],1); - y128[4] = shiftright_int16(y128[4],1); - y128[5] = shiftright_int16(y128[5],1); - y128[6] = shiftright_int16(y128[6],1); - y128[7] = shiftright_int16(y128[7],1); - y128[8] = shiftright_int16(y128[8],1); - y128[9] = shiftright_int16(y128[9],1); - y128[10] = shiftright_int16(y128[10],1); - y128[11] = shiftright_int16(y128[11],1); - y128[12] = shiftright_int16(y128[12],1); - y128[13] = shiftright_int16(y128[13],1); - y128[14] = shiftright_int16(y128[14],1); - y128[15] = shiftright_int16(y128[15],1); + y128[0] = shiftright_int16(y128[0],scalec); + y128[1] = shiftright_int16(y128[1],scalec); + y128[2] = shiftright_int16(y128[2],scalec); + y128[3] = shiftright_int16(y128[3],scalec); + y128[4] = shiftright_int16(y128[4],scalec); + y128[5] = shiftright_int16(y128[5],scalec); + y128[6] = shiftright_int16(y128[6],scalec); + y128[7] = shiftright_int16(y128[7],scalec); + y128[8] = shiftright_int16(y128[8],scalec); + y128[9] = shiftright_int16(y128[9],scalec); + y128[10] = shiftright_int16(y128[10],scalec); + y128[11] = shiftright_int16(y128[11],scalec); + y128[12] = shiftright_int16(y128[12],scalec); + y128[13] = shiftright_int16(y128[13],scalec); + y128[14] = shiftright_int16(y128[14],scalec); + y128[15] = shiftright_int16(y128[15],scalec); y128+=16; } @@ -2077,7 +2271,7 @@ void dft4096(int16_t *x,int16_t *y,unsigned char scale) -void idft4096(int16_t *x,int16_t *y,unsigned 
char scale) +void idft4096(int16_t *x,int16_t *y,unsigned int *scale) { simd_q15_t xtmp[1024],ytmp[1024],*tw4096_128p=(simd_q15_t *)tw4096,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; @@ -2089,10 +2283,12 @@ void idft4096(int16_t *x,int16_t *y,unsigned char scale) } - idft1024((int16_t*)(xtmp),(int16_t*)(ytmp),1); - idft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),1); - idft1024((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),1); - idft1024((int16_t*)(xtmp+768),(int16_t*)(ytmp+768),1); + unsigned int *scale1024=NULL; + if (scale) scale1024=scale+1; + idft1024((int16_t*)(xtmp),(int16_t*)(ytmp),scale1024); + idft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),scale1024); + idft1024((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),scale1024); + idft1024((int16_t*)(xtmp+768),(int16_t*)(ytmp+768),scale1024); for (i=0; i<256; i++) { ibfly4(ytmpp,ytmpp+256,ytmpp+512,ytmpp+768, @@ -2103,25 +2299,26 @@ void idft4096(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { + unsigned int scalec=*scale; for (i=0; i<64; i++) { - y128[0] = shiftright_int16(y128[0],scale); - y128[1] = shiftright_int16(y128[1],scale); - y128[2] = shiftright_int16(y128[2],scale); - y128[3] = shiftright_int16(y128[3],scale); - y128[4] = shiftright_int16(y128[4],scale); - y128[5] = shiftright_int16(y128[5],scale); - y128[6] = shiftright_int16(y128[6],scale); - y128[7] = shiftright_int16(y128[7],scale); - y128[8] = shiftright_int16(y128[8],scale); - y128[9] = shiftright_int16(y128[9],scale); - y128[10] = shiftright_int16(y128[10],scale); - y128[11] = shiftright_int16(y128[11],scale); - y128[12] = shiftright_int16(y128[12],scale); - y128[13] = shiftright_int16(y128[13],scale); - y128[14] = shiftright_int16(y128[14],scale); - y128[15] = shiftright_int16(y128[15],scale); + y128[0] = shiftright_int16(y128[0],scalec); + y128[1] = shiftright_int16(y128[1],scalec); + y128[2] = shiftright_int16(y128[2],scalec); + y128[3] = shiftright_int16(y128[3],scalec); 
+ y128[4] = shiftright_int16(y128[4],scalec); + y128[5] = shiftright_int16(y128[5],scalec); + y128[6] = shiftright_int16(y128[6],scalec); + y128[7] = shiftright_int16(y128[7],scalec); + y128[8] = shiftright_int16(y128[8],scalec); + y128[9] = shiftright_int16(y128[9],scalec); + y128[10] = shiftright_int16(y128[10],scalec); + y128[11] = shiftright_int16(y128[11],scalec); + y128[12] = shiftright_int16(y128[12],scalec); + y128[13] = shiftright_int16(y128[13],scalec); + y128[14] = shiftright_int16(y128[14],scalec); + y128[15] = shiftright_int16(y128[15],scalec); y128+=16; } @@ -2130,7 +2327,7 @@ void idft4096(int16_t *x,int16_t *y,unsigned char scale) int16_t tw8192[2*4096] __attribute__((aligned(32))); -void dft8192(int16_t *x,int16_t *y,unsigned char scale) +void dft8192(int16_t *x,int16_t *y,unsigned int *scale) { simdshort_q15_t xtmp[4096],*xtmpp,*x64 = (simdshort_q15_t *)x; @@ -2178,8 +2375,10 @@ void dft8192(int16_t *x,int16_t *y,unsigned char scale) xtmpp+=32; } - dft4096((int16_t*)(xtmp),(int16_t*)ytmp,1); - dft4096((int16_t*)(xtmp+2048),(int16_t*)(ytmp+1024),1); + unsigned int *scale4096=NULL; + if (scale) scale4096=scale+1; + dft4096((int16_t*)(xtmp),(int16_t*)ytmp,scale4096); + dft4096((int16_t*)(xtmp+2048),(int16_t*)(ytmp+1024),scale4096); for (i=0; i<1024; i++) { @@ -2191,32 +2390,56 @@ void dft8192(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { y128p = y128; - for (i=0; i<128; i++) { - y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); - 
y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p+=16; + if (*scale>1) { + uint32_t scale2=*scale-1; + for (i=0; i<128; i++) { + y128p[0] = mulhi_int16(shiftright_int16(y128p[0],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(shiftright_int16(y128p[1],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(shiftright_int16(y128p[2],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(shiftright_int16(y128p[3],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(shiftright_int16(y128p[4],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(shiftright_int16(y128p[5],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(shiftright_int16(y128p[6],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(shiftright_int16(y128p[7],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(shiftright_int16(y128p[8],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(shiftright_int16(y128p[9],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(shiftright_int16(y128p[10],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(shiftright_int16(y128p[11],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(shiftright_int16(y128p[12],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(shiftright_int16(y128p[13],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(shiftright_int16(y128p[14],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(shiftright_int16(y128p[15],scale2),ONE_OVER_SQRT2_Q15_128); + y128p+=16; + } + } + else { + for (i=0; i<128; i++) { + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = 
mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); + y128p+=16; + } } } } -void idft8192(int16_t *x,int16_t *y,unsigned char scale) +void idft8192(int16_t *x,int16_t *y,unsigned int *scale) { simdshort_q15_t xtmp[4096],*xtmpp,*x64 = (simdshort_q15_t *)x; @@ -2264,8 +2487,10 @@ void idft8192(int16_t *x,int16_t *y,unsigned char scale) xtmpp+=32; } - idft4096((int16_t*)(xtmp),(int16_t*)ytmp,1); - idft4096((int16_t*)(xtmp+2048),(int16_t*)(ytmp+1024),1); + unsigned int *scale4096=NULL; + if (scale) scale4096=scale+1; + idft4096((int16_t*)(xtmp),(int16_t*)ytmp,scale4096); + idft4096((int16_t*)(xtmp+2048),(int16_t*)(ytmp+1024),scale4096); for (i=0; i<1024; i++) { @@ -2277,34 +2502,58 @@ void idft8192(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { y128p = y128; - for (i=0; i<128; i++) { - y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[5] = 
mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p+=16; + if (*scale>1) { + uint32_t scale2=*scale-1; + for (i=0; i<128; i++) { + y128p[0] = mulhi_int16(shiftright_int16(y128p[0],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(shiftright_int16(y128p[1],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(shiftright_int16(y128p[2],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(shiftright_int16(y128p[3],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(shiftright_int16(y128p[4],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(shiftright_int16(y128p[5],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(shiftright_int16(y128p[6],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(shiftright_int16(y128p[7],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(shiftright_int16(y128p[8],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(shiftright_int16(y128p[9],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(shiftright_int16(y128p[10],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(shiftright_int16(y128p[11],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(shiftright_int16(y128p[12],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(shiftright_int16(y128p[13],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[14] = 
mulhi_int16(shiftright_int16(y128p[14],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(shiftright_int16(y128p[15],scale2),ONE_OVER_SQRT2_Q15_128); + y128p+=16; + } + } + else { + for (i=0; i<128; i++) { + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); + y128p+=16; + } } } } int16_t tw16384[3*2*4096]; -void dft16384(int16_t *x,int16_t *y,unsigned char scale) +void dft16384(int16_t *x,int16_t *y,unsigned int *scale) { simd_q15_t xtmp[4096],ytmp[4096],*tw16384_128p=(simd_q15_t *)tw16384,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; @@ -2316,10 +2565,12 @@ void dft16384(int16_t *x,int16_t *y,unsigned char scale) } - dft4096((int16_t*)(xtmp),(int16_t*)(ytmp),1); - dft4096((int16_t*)(xtmp+1024),(int16_t*)(ytmp+1024),1); - dft4096((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),1); - dft4096((int16_t*)(xtmp+3072),(int16_t*)(ytmp+3072),1); + unsigned int *scale4096=NULL; + if (scale) scale4096=scale+1; + dft4096((int16_t*)(xtmp),(int16_t*)(ytmp),scale4096); + dft4096((int16_t*)(xtmp+1024),(int16_t*)(ytmp+1024),scale4096); + 
dft4096((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),scale4096); + dft4096((int16_t*)(xtmp+3072),(int16_t*)(ytmp+3072),scale4096); for (i=0; i<1024; i++) { bfly4(ytmpp,ytmpp+1024,ytmpp+2048,ytmpp+3072, @@ -2330,39 +2581,35 @@ void dft16384(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { + unsigned int scalec=*scale; for (i=0; i<256; i++) { - y128[0] = shiftright_int16(y128[0],1); - y128[1] = shiftright_int16(y128[1],1); - y128[2] = shiftright_int16(y128[2],1); - y128[3] = shiftright_int16(y128[3],1); - y128[4] = shiftright_int16(y128[4],1); - y128[5] = shiftright_int16(y128[5],1); - y128[6] = shiftright_int16(y128[6],1); - y128[7] = shiftright_int16(y128[7],1); - y128[8] = shiftright_int16(y128[8],1); - y128[9] = shiftright_int16(y128[9],1); - y128[10] = shiftright_int16(y128[10],1); - y128[11] = shiftright_int16(y128[11],1); - y128[12] = shiftright_int16(y128[12],1); - y128[13] = shiftright_int16(y128[13],1); - y128[14] = shiftright_int16(y128[14],1); - y128[15] = shiftright_int16(y128[15],1); + y128[0] = shiftright_int16(y128[0],scalec); + y128[1] = shiftright_int16(y128[1],scalec); + y128[2] = shiftright_int16(y128[2],scalec); + y128[3] = shiftright_int16(y128[3],scalec); + y128[4] = shiftright_int16(y128[4],scalec); + y128[5] = shiftright_int16(y128[5],scalec); + y128[6] = shiftright_int16(y128[6],scalec); + y128[7] = shiftright_int16(y128[7],scalec); + y128[8] = shiftright_int16(y128[8],scalec); + y128[9] = shiftright_int16(y128[9],scalec); + y128[10] = shiftright_int16(y128[10],scalec); + y128[11] = shiftright_int16(y128[11],scalec); + y128[12] = shiftright_int16(y128[12],scalec); + y128[13] = shiftright_int16(y128[13],scalec); + y128[14] = shiftright_int16(y128[14],scalec); + y128[15] = shiftright_int16(y128[15],scalec); y128+=16; } - } - - - - } -void idft16384(int16_t *x,int16_t *y,unsigned char scale) +void idft16384(int16_t *x,int16_t *y,unsigned int *scale) { simd_q15_t 
xtmp[4096],ytmp[4096],*tw16384_128p=(simd_q15_t *)tw16384,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; @@ -2374,10 +2621,12 @@ void idft16384(int16_t *x,int16_t *y,unsigned char scale) } - idft4096((int16_t*)(xtmp),(int16_t*)(ytmp),1); - idft4096((int16_t*)(xtmp+1024),(int16_t*)(ytmp+1024),1); - idft4096((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),1); - idft4096((int16_t*)(xtmp+3072),(int16_t*)(ytmp+3072),1); + unsigned int *scale4096=NULL; + if (scale) scale4096=scale+1; + idft4096((int16_t*)(xtmp),(int16_t*)(ytmp),scale4096); + idft4096((int16_t*)(xtmp+1024),(int16_t*)(ytmp+1024),scale4096); + idft4096((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),scale4096); + idft4096((int16_t*)(xtmp+3072),(int16_t*)(ytmp+3072),scale4096); for (i=0; i<1024; i++) { ibfly4(ytmpp,ytmpp+1024,ytmpp+2048,ytmpp+3072, @@ -2388,25 +2637,26 @@ void idft16384(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { + unsigned int scalec=*scale; for (i=0; i<256; i++) { - y128[0] = shiftright_int16(y128[0],scale); - y128[1] = shiftright_int16(y128[1],scale); - y128[2] = shiftright_int16(y128[2],scale); - y128[3] = shiftright_int16(y128[3],scale); - y128[4] = shiftright_int16(y128[4],scale); - y128[5] = shiftright_int16(y128[5],scale); - y128[6] = shiftright_int16(y128[6],scale); - y128[7] = shiftright_int16(y128[7],scale); - y128[8] = shiftright_int16(y128[8],scale); - y128[9] = shiftright_int16(y128[9],scale); - y128[10] = shiftright_int16(y128[10],scale); - y128[11] = shiftright_int16(y128[11],scale); - y128[12] = shiftright_int16(y128[12],scale); - y128[13] = shiftright_int16(y128[13],scale); - y128[14] = shiftright_int16(y128[14],scale); - y128[15] = shiftright_int16(y128[15],scale); + y128[0] = shiftright_int16(y128[0],scalec); + y128[1] = shiftright_int16(y128[1],scalec); + y128[2] = shiftright_int16(y128[2],scalec); + y128[3] = shiftright_int16(y128[3],scalec); + y128[4] = shiftright_int16(y128[4],scalec); + y128[5] = 
shiftright_int16(y128[5],scalec); + y128[6] = shiftright_int16(y128[6],scalec); + y128[7] = shiftright_int16(y128[7],scalec); + y128[8] = shiftright_int16(y128[8],scalec); + y128[9] = shiftright_int16(y128[9],scalec); + y128[10] = shiftright_int16(y128[10],scalec); + y128[11] = shiftright_int16(y128[11],scalec); + y128[12] = shiftright_int16(y128[12],scalec); + y128[13] = shiftright_int16(y128[13],scalec); + y128[14] = shiftright_int16(y128[14],scalec); + y128[15] = shiftright_int16(y128[15],scalec); y128+=16; } @@ -2416,7 +2666,7 @@ void idft16384(int16_t *x,int16_t *y,unsigned char scale) int16_t tw32768[2*16384] __attribute__((aligned(32))); -void dft32768(int16_t *x,int16_t *y,unsigned char scale) +void dft32768(int16_t *x,int16_t *y,unsigned int *scale) { simdshort_q15_t xtmp[16384],*xtmpp,*x64 = (simdshort_q15_t *)x; @@ -2464,8 +2714,10 @@ void dft32768(int16_t *x,int16_t *y,unsigned char scale) xtmpp+=32; } - dft16384((int16_t*)(xtmp),(int16_t*)ytmp,1); - dft16384((int16_t*)(xtmp+8192),(int16_t*)(ytmp+4096),1); + unsigned int *scale16384=NULL; + if (scale) scale16384=scale+1; + dft16384((int16_t*)(xtmp),(int16_t*)ytmp,scale16384); + dft16384((int16_t*)(xtmp+8192),(int16_t*)(ytmp+4096),scale16384); for (i=0; i<4096; i++) { @@ -2477,32 +2729,55 @@ void dft32768(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { y128p = y128; - - for (i=0; i<512; i++) { - y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[9] = 
mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p+=16; + if (*scale>1) { + uint32_t scale2=*scale-1; + for (i=0; i<512; i++) { + y128p[0] = mulhi_int16(shiftright_int16(y128p[0],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(shiftright_int16(y128p[1],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(shiftright_int16(y128p[2],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(shiftright_int16(y128p[3],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(shiftright_int16(y128p[4],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(shiftright_int16(y128p[5],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(shiftright_int16(y128p[6],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(shiftright_int16(y128p[7],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(shiftright_int16(y128p[8],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(shiftright_int16(y128p[9],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(shiftright_int16(y128p[10],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(shiftright_int16(y128p[11],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(shiftright_int16(y128p[12],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(shiftright_int16(y128p[13],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(shiftright_int16(y128p[14],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(shiftright_int16(y128p[15],scale2),ONE_OVER_SQRT2_Q15_128); + y128p+=16; + } + } + else { + for (i=0; i<512; i++) { + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = 
mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); + y128p+=16; + } } } } -void idft32768(int16_t *x,int16_t *y,unsigned char scale) +void idft32768(int16_t *x,int16_t *y,unsigned int *scale) { simdshort_q15_t xtmp[16384],*xtmpp,*x64 = (simdshort_q15_t *)x; @@ -2550,8 +2825,10 @@ void idft32768(int16_t *x,int16_t *y,unsigned char scale) xtmpp+=32; } - idft16384((int16_t*)(xtmp),(int16_t*)ytmp,1); - idft16384((int16_t*)(xtmp+8192),(int16_t*)(ytmp+4096),1); + unsigned int *scale16384=NULL; + if (scale) scale16384=scale+1; + idft16384((int16_t*)(xtmp),(int16_t*)ytmp,scale16384); + idft16384((int16_t*)(xtmp+8192),(int16_t*)(ytmp+4096),scale16384); for (i=0; i<4096; i++) { @@ -2563,27 +2840,50 @@ void idft32768(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { y128p = y128; - - for (i=0; i<512; i++) { - y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[5] = 
mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p+=16; + if (*scale>1) { + uint32_t scale2=*scale-1; + for (i=0; i<512; i++) { + y128p[0] = mulhi_int16(shiftright_int16(y128p[0],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(shiftright_int16(y128p[1],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(shiftright_int16(y128p[2],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(shiftright_int16(y128p[3],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(shiftright_int16(y128p[4],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(shiftright_int16(y128p[5],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(shiftright_int16(y128p[6],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(shiftright_int16(y128p[7],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(shiftright_int16(y128p[8],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(shiftright_int16(y128p[9],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(shiftright_int16(y128p[10],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(shiftright_int16(y128p[11],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(shiftright_int16(y128p[12],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(shiftright_int16(y128p[13],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[14] = 
mulhi_int16(shiftright_int16(y128p[14],scale2),ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(shiftright_int16(y128p[15],scale2),ONE_OVER_SQRT2_Q15_128); + y128p+=16; + } + } + else { + for (i=0; i<512; i++) { + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); + y128p+=16; + } } } } @@ -2591,7 +2891,7 @@ void idft32768(int16_t *x,int16_t *y,unsigned char scale) int16_t twa768[512],twb768[512]; // 256 x 3 -void idft768(int16_t *input, int16_t *output, unsigned char scale) +void idft768(int16_t *input, int16_t *output, unsigned int *scale) { int i,i2,j; uint32_t tmp[3][256]__attribute__((aligned(32))); @@ -2605,9 +2905,11 @@ void idft768(int16_t *input, int16_t *output, unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - idft256((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft256((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft256((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale256=NULL; + if (scale) scale256=scale+1; + idft256((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale256); + idft256((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale256); + 
idft256((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale256); for (i=0,i2=0; i<512; i+=8,i2+=4) { ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), @@ -2616,7 +2918,7 @@ void idft768(int16_t *input, int16_t *output, unsigned char scale) } - if (scale==1) { + if (scale && *scale>0) { /* same gate as dft768 and the other radix-3 transforms */ for (i=0; i<12; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -2639,7 +2941,7 @@ void idft768(int16_t *input, int16_t *output, unsigned char scale) } } -void dft768(int16_t *input, int16_t *output, unsigned char scale) +void dft768(int16_t *input, int16_t *output, unsigned int *scale) { int i,i2,j; uint32_t tmp[3][256] __attribute__((aligned(32))); @@ -2653,9 +2955,11 @@ void dft768(int16_t *input, int16_t *output, unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - dft256((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - dft256((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - dft256((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale256=NULL; + if (scale) scale256=scale+1; + dft256((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale256); + dft256((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale256); + dft256((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale256); /* for (i=1; i<512; i++) { @@ -2676,7 +2980,7 @@ void dft768(int16_t *input, int16_t *output, unsigned char scale) (simd_q15_t*)(twa768+i),(simd_q15_t*)(twb768+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<12; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -2701,7 +3005,7 @@ void dft768(int16_t *input, int16_t *output, unsigned char scale) int16_t twa1536[1024],twb1536[1024]; // 512 x 3 -void idft1536(int16_t *input, int16_t *output, unsigned char scale) +void idft1536(int16_t *input, int16_t *output, unsigned int *scale) { int i,i2,j; uint32_t tmp[3][512 ]__attribute__((aligned(32))); @@ -2715,9 +3019,11 @@ void 
idft1536(int16_t *input, int16_t *output, unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - idft512((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft512((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft512((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale512=NULL; + if (scale) scale512=scale+1; + idft512((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale512); + idft512((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale512); + idft512((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale512); for (i=0,i2=0; i<1024; i+=8,i2+=4) { ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), @@ -2726,7 +3032,7 @@ void idft1536(int16_t *input, int16_t *output, unsigned char scale) } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<24; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -2749,7 +3055,7 @@ void idft1536(int16_t *input, int16_t *output, unsigned char scale) } } -void dft1536(int16_t *input, int16_t *output, unsigned char scale) +void dft1536(int16_t *input, int16_t *output, unsigned int *scale) { int i,i2,j; uint32_t tmp[3][512] __attribute__((aligned(32))); @@ -2763,9 +3069,11 @@ void dft1536(int16_t *input, int16_t *output, unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - dft512((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - dft512((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - dft512((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale512=NULL; + if (scale) scale512=scale+1; + dft512((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale512); + dft512((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale512); + dft512((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale512); /* for (i=1; i<512; i++) { @@ -2786,7 +3094,7 @@ void dft1536(int16_t *input, int16_t *output, unsigned char scale) (simd_q15_t*)(twa1536+i),(simd_q15_t*)(twb1536+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<24; i++) { y128p[0] = 
mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -2812,7 +3120,7 @@ void dft1536(int16_t *input, int16_t *output, unsigned char scale) int16_t twa3072[2048] __attribute__((aligned(32))); int16_t twb3072[2048] __attribute__((aligned(32))); // 1024 x 3 -void dft3072(int16_t *input, int16_t *output,unsigned char scale) +void dft3072(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][1024] __attribute__((aligned(32))); @@ -2826,9 +3134,11 @@ void dft3072(int16_t *input, int16_t *output,unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - dft1024((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - dft1024((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - dft1024((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale1024=NULL; + if (scale) scale1024=scale+1; + dft1024((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale1024); + dft1024((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale1024); + dft1024((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale1024); for (i=0,i2=0; i<2048; i+=8,i2+=4) { bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]), @@ -2836,7 +3146,7 @@ void dft3072(int16_t *input, int16_t *output,unsigned char scale) (simd_q15_t*)(twa3072+i),(simd_q15_t*)(twb3072+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<48; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -2859,7 +3169,7 @@ void dft3072(int16_t *input, int16_t *output,unsigned char scale) } } -void idft3072(int16_t *input, int16_t *output,unsigned char scale) +void idft3072(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][1024]__attribute__((aligned(32))); @@ -2872,9 +3182,11 @@ void idft3072(int16_t *input, int16_t *output,unsigned char scale) tmp[1][i] = ((uint32_t *)input)[j++]; tmp[2][i] = ((uint32_t *)input)[j++]; } - 
idft1024((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft1024((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft1024((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale1024=NULL; + if (scale) scale1024=scale+1; + idft1024((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale1024); + idft1024((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale1024); + idft1024((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale1024); for (i=0,i2=0; i<2048; i+=8,i2+=4) { ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), @@ -2883,7 +3195,7 @@ void idft3072(int16_t *input, int16_t *output,unsigned char scale) } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<48; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -2910,7 +3222,7 @@ void idft3072(int16_t *input, int16_t *output,unsigned char scale) int16_t twa6144[4096] __attribute__((aligned(32))); int16_t twb6144[4096] __attribute__((aligned(32))); -void idft6144(int16_t *input, int16_t *output,unsigned char scale) +void idft6144(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][2048] __attribute__((aligned(32))); @@ -2924,9 +3236,11 @@ void idft6144(int16_t *input, int16_t *output,unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - idft2048((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft2048((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft2048((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale2048=NULL; + if (scale) scale2048=scale+1; + idft2048((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale2048); + idft2048((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale2048); + idft2048((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale2048); #ifndef MR_MAIN if (LOG_DUMPFLAG(DEBUG_DFT)) { LOG_M("idft6144in.m","in",input,6144,1,1); @@ -2942,7 +3256,7 @@ void idft6144(int16_t *input, int16_t *output,unsigned char scale) } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<96; i++) { 
y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -2966,7 +3280,7 @@ void idft6144(int16_t *input, int16_t *output,unsigned char scale) } -void dft6144(int16_t *input, int16_t *output,unsigned char scale) +void dft6144(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][2048] __attribute__((aligned(32))); @@ -2980,9 +3294,11 @@ void dft6144(int16_t *input, int16_t *output,unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - dft2048((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - dft2048((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - dft2048((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale2048=NULL; + if (scale) scale2048=scale+1; + dft2048((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale2048); + dft2048((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale2048); + dft2048((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale2048); /* for (i=1; i<2048; i++) { @@ -3003,7 +3319,7 @@ void dft6144(int16_t *input, int16_t *output,unsigned char scale) (simd_q15_t*)(twa6144+i),(simd_q15_t*)(twb6144+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<96; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3029,7 +3345,7 @@ void dft6144(int16_t *input, int16_t *output,unsigned char scale) int16_t twa12288[8192] __attribute__((aligned(32))); int16_t twb12288[8192] __attribute__((aligned(32))); // 4096 x 3 -void dft12288(int16_t *input, int16_t *output,unsigned char scale) +void dft12288(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][4096] __attribute__((aligned(32))); @@ -3043,9 +3359,11 @@ void dft12288(int16_t *input, int16_t *output,unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - dft4096((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale); - dft4096((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale); - 
dft4096((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale); + unsigned int *scale4096=NULL; + if (scale) scale4096=scale+1; + dft4096((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale4096); + dft4096((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale4096); + dft4096((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale4096); /* for (i=1; i<4096; i++) { tmpo[0][i] = tmpo[0][i<<1]; @@ -3065,7 +3383,7 @@ void dft12288(int16_t *input, int16_t *output,unsigned char scale) (simd_q15_t*)(twa12288+i),(simd_q15_t*)(twb12288+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<192; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3088,7 +3406,7 @@ void dft12288(int16_t *input, int16_t *output,unsigned char scale) } } -void idft12288(int16_t *input, int16_t *output,unsigned char scale) +void idft12288(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][4096] __attribute__((aligned(32))); @@ -3104,9 +3422,11 @@ void idft12288(int16_t *input, int16_t *output,unsigned char scale) - idft4096((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale); - idft4096((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale); - idft4096((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale); + unsigned int *scale4096=NULL; + if (scale) scale4096=scale+1; + idft4096((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale4096); + idft4096((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale4096); + idft4096((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale4096); #ifndef MR_MAIN if (LOG_DUMPFLAG(DEBUG_DFT)) { LOG_M("idft12288in.m","in",input,12288,1,1); @@ -3121,7 +3441,7 @@ void idft12288(int16_t *input, int16_t *output,unsigned char scale) (simd_q15_t*)(twa12288+i),(simd_q15_t*)(twb12288+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<192; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3154,7 +3474,7 @@ void idft12288(int16_t *input, int16_t *output,unsigned 
char scale) int16_t twa18432[12288] __attribute__((aligned(32))); int16_t twb18432[12288] __attribute__((aligned(32))); // 6144 x 3 -void dft18432(int16_t *input, int16_t *output,unsigned char scale) { +void dft18432(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][6144] __attribute__((aligned(32))); @@ -3168,16 +3488,18 @@ void dft18432(int16_t *input, int16_t *output,unsigned char scale) { tmp[2][i] = ((uint32_t *)input)[j++]; } - dft6144((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale); - dft6144((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale); - dft6144((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale); + unsigned int *scale6144=NULL; + if (scale) scale6144=scale+1; + dft6144((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale6144); + dft6144((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale6144); + dft6144((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale6144); for (i=0,i2=0; i<12288; i+=8,i2+=4) { bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]), (simd_q15_t*)(output+i),(simd_q15_t*)(output+12288+i),(simd_q15_t*)(output+24576+i), (simd_q15_t*)(twa18432+i),(simd_q15_t*)(twb18432+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<288; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3200,7 +3522,7 @@ void dft18432(int16_t *input, int16_t *output,unsigned char scale) { } } -void idft18432(int16_t *input, int16_t *output,unsigned char scale) { +void idft18432(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][6144] __attribute__((aligned(32))); @@ -3214,16 +3536,18 @@ void idft18432(int16_t *input, int16_t *output,unsigned char scale) { tmp[2][i] = ((uint32_t *)input)[j++]; } - idft6144((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale); - idft6144((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale); - idft6144((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale); + unsigned int *scale6144=NULL; + if (scale) 
scale6144=scale+1; + idft6144((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale6144); + idft6144((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale6144); + idft6144((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale6144); for (i=0,i2=0; i<12288; i+=8,i2+=4) { ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]), (simd_q15_t*)(output+i),(simd_q15_t*)(output+12288+i),(simd_q15_t*)(output+24576+i), (simd_q15_t*)(twa18432+i),(simd_q15_t*)(twb18432+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<288; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3250,7 +3574,7 @@ void idft18432(int16_t *input, int16_t *output,unsigned char scale) { int16_t twa24576[16384] __attribute__((aligned(32))); int16_t twb24576[16384] __attribute__((aligned(32))); // 8192 x 3 -void dft24576(int16_t *input, int16_t *output,unsigned char scale) +void dft24576(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][8192] __attribute__((aligned(32))); @@ -3264,9 +3588,11 @@ void dft24576(int16_t *input, int16_t *output,unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - dft8192((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - dft8192((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - dft8192((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale8192=NULL; + if (scale) scale8192=scale+1; + dft8192((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale8192); + dft8192((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale8192); + dft8192((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale8192); /* for (i=1; i<8192; i++) { tmpo[0][i] = tmpo[0][i<<1]; @@ -3287,7 +3613,7 @@ void dft24576(int16_t *input, int16_t *output,unsigned char scale) } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<384; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3315,7 +3641,7 @@ void dft24576(int16_t *input, int16_t 
*output,unsigned char scale) #endif } -void idft24576(int16_t *input, int16_t *output,unsigned char scale) +void idft24576(int16_t *input, int16_t *output,unsigned int *scale) { int i,i2,j; uint32_t tmp[3][8192] __attribute__((aligned(32))); @@ -3329,9 +3655,11 @@ void idft24576(int16_t *input, int16_t *output,unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - idft8192((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft8192((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft8192((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale8192=NULL; + if (scale) scale8192=scale+1; + idft8192((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale8192); + idft8192((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale8192); + idft8192((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale8192); #ifndef MR_MAIN if (LOG_DUMPFLAG(DEBUG_DFT)) { LOG_M("idft24576in.m","in",input,24576,1,1); @@ -3345,7 +3673,7 @@ void idft24576(int16_t *input, int16_t *output,unsigned char scale) (simd_q15_t*)(output+i),(simd_q15_t*)(output+16384+i),(simd_q15_t*)(output+32768+i), (simd_q15_t*)(twa24576+i),(simd_q15_t*)(twb24576+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<384; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3377,7 +3705,7 @@ int16_t twa36864[24576] __attribute__((aligned(32))); int16_t twb36864[24576] __attribute__((aligned(32))); // 12288 x 3 -void dft36864(int16_t *input, int16_t *output,uint8_t scale) { +void dft36864(int16_t *input, int16_t *output,uint32_t *scale) { int i,i2,j; uint32_t tmp[3][12288] __attribute__((aligned(32))); @@ -3391,9 +3719,11 @@ void dft36864(int16_t *input, int16_t *output,uint8_t scale) { tmp[2][i] = ((uint32_t *)input)[j++]; } - dft12288((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - dft12288((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - dft12288((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale12288=NULL; + if (scale) scale12288=scale+1; + 
dft12288((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale12288); + dft12288((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale12288); + dft12288((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale12288); #ifndef MR_MAIN if (LOG_DUMPFLAG(DEBUG_DFT)) { LOG_M("dft36864out0.m","o0",tmpo[0],12288,1,1); @@ -3407,7 +3737,7 @@ void dft36864(int16_t *input, int16_t *output,uint8_t scale) { (simd_q15_t*)(twa36864+i),(simd_q15_t*)(twb36864+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<576; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3435,7 +3765,7 @@ void dft36864(int16_t *input, int16_t *output,uint8_t scale) { #endif } -void idft36864(int16_t *input, int16_t *output,uint8_t scale) { +void idft36864(int16_t *input, int16_t *output,uint32_t *scale) { int i,i2,j; uint32_t tmp[3][12288] __attribute__((aligned(32))); @@ -3449,16 +3779,18 @@ void idft36864(int16_t *input, int16_t *output,uint8_t scale) { tmp[2][i] = ((uint32_t *)input)[j++]; } - idft12288((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft12288((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft12288((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale12288=NULL; + if (scale) scale12288=scale+1; + idft12288((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale12288); + idft12288((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale12288); + idft12288((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale12288); for (i=0,i2=0; i<24576; i+=8,i2+=4) { ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), (simd_q15_t*)(output+i),(simd_q15_t*)(output+24576+i),(simd_q15_t*)(output+49152+i), (simd_q15_t*)(twa36864+i),(simd_q15_t*)(twb36864+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<576; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3485,7 +3817,7 @@ int16_t twa49152[32768] __attribute__((aligned(32))); int16_t 
twb49152[32768] __attribute__((aligned(32))); // 16384 x 3 -void dft49152(int16_t *input, int16_t *output,uint8_t scale) { +void dft49152(int16_t *input, int16_t *output,uint32_t *scale) { int i,i2,j; uint32_t tmp[3][16384] __attribute__((aligned(32))); @@ -3499,16 +3831,18 @@ void dft49152(int16_t *input, int16_t *output,uint8_t scale) { tmp[2][i] = ((uint32_t *)input)[j++]; } - dft16384((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - dft16384((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - dft16384((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale16384=NULL; + if (scale) scale16384=scale+1; + dft16384((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale16384); + dft16384((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale16384); + dft16384((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale16384); for (i=0,i2=0; i<32768; i+=8,i2+=4) { bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), (simd_q15_t*)(output+i),(simd_q15_t*)(output+32768+i),(simd_q15_t*)(output+65536+i), (simd_q15_t*)(twa49152+i),(simd_q15_t*)(twb49152+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<768; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3531,7 +3865,7 @@ void dft49152(int16_t *input, int16_t *output,uint8_t scale) { } } -void idft49152(int16_t *input, int16_t *output,uint8_t scale) { +void idft49152(int16_t *input, int16_t *output,uint32_t *scale) { int i,i2,j; uint32_t tmp[3][16384] __attribute__((aligned(32))); @@ -3545,16 +3879,18 @@ void idft49152(int16_t *input, int16_t *output,uint8_t scale) { tmp[2][i] = ((uint32_t *)input)[j++]; } - idft16384((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft16384((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft16384((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale16384=NULL; + if (scale) scale16384=scale+1; + idft16384((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale16384); + 
idft16384((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale16384); + idft16384((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale16384); for (i=0,i2=0; i<32768; i+=8,i2+=4) { ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), (simd_q15_t*)(output+i),(simd_q15_t*)(output+32768+i),(simd_q15_t*)(output+65536+i), (simd_q15_t*)(twa49152+i),(simd_q15_t*)(twb49152+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<768; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3579,7 +3915,7 @@ void idft49152(int16_t *input, int16_t *output,uint8_t scale) { int16_t tw65536[3*2*16384] __attribute__((aligned(32))); -void idft65536(int16_t *x,int16_t *y,unsigned char scale) +void idft65536(int16_t *x,int16_t *y,unsigned int *scale) { simd_q15_t xtmp[16384],ytmp[16384],*tw65536_128p=(simd_q15_t *)tw65536,*x128=(simd_q15_t *)x,*y128p=(simd_q15_t *)y; @@ -3591,10 +3927,12 @@ void idft65536(int16_t *x,int16_t *y,unsigned char scale) } - idft16384((int16_t*)(xtmp),(int16_t*)(ytmp),1); - idft16384((int16_t*)(xtmp+4096),(int16_t*)(ytmp+4096),1); - idft16384((int16_t*)(xtmp+8192),(int16_t*)(ytmp+8192),1); - idft16384((int16_t*)(xtmp+12299),(int16_t*)(ytmp+12288),1); + unsigned int *scale16384=NULL; + if (scale) scale16384=scale+1; + idft16384((int16_t*)(xtmp),(int16_t*)(ytmp),scale16384); + idft16384((int16_t*)(xtmp+4096),(int16_t*)(ytmp+4096),scale16384); + idft16384((int16_t*)(xtmp+8192),(int16_t*)(ytmp+8192),scale16384); + idft16384((int16_t*)(xtmp+12288),(int16_t*)(ytmp+12288),scale16384); /* 4th quarter starts at 3*4096 in xtmp[16384], matching ytmp+12288 */ for (i=0; i<4096; i++) { ibfly4(ytmpp,ytmpp+4096,ytmpp+8192,ytmpp+12288, @@ -3605,25 +3943,26 @@ void idft65536(int16_t *x,int16_t *y,unsigned char scale) ytmpp++; } - if (scale>0) { + if (scale && *scale>0) { + unsigned int scalec=*scale; for (i=0; i<1024; i++) { - y128p[0] = shiftright_int16(y128p[0],scale); - y128p[1] = shiftright_int16(y128p[1],scale); - y128p[2] = 
shiftright_int16(y128p[2],scale); - y128p[3] = shiftright_int16(y128p[3],scale); - y128p[4] = shiftright_int16(y128p[4],scale); - y128p[5] = shiftright_int16(y128p[5],scale); - y128p[6] = shiftright_int16(y128p[6],scale); - y128p[7] = shiftright_int16(y128p[7],scale); - y128p[8] = shiftright_int16(y128p[8],scale); - y128p[9] = shiftright_int16(y128p[9],scale); - y128p[10] = shiftright_int16(y128p[10],scale); - y128p[11] = shiftright_int16(y128p[11],scale); - y128p[12] = shiftright_int16(y128p[12],scale); - y128p[13] = shiftright_int16(y128p[13],scale); - y128p[14] = shiftright_int16(y128p[14],scale); - y128p[15] = shiftright_int16(y128p[15],scale); + y128p[0] = shiftright_int16(y128p[0],scalec); + y128p[1] = shiftright_int16(y128p[1],scalec); + y128p[2] = shiftright_int16(y128p[2],scalec); + y128p[3] = shiftright_int16(y128p[3],scalec); + y128p[4] = shiftright_int16(y128p[4],scalec); + y128p[5] = shiftright_int16(y128p[5],scalec); + y128p[6] = shiftright_int16(y128p[6],scalec); + y128p[7] = shiftright_int16(y128p[7],scalec); + y128p[8] = shiftright_int16(y128p[8],scalec); + y128p[9] = shiftright_int16(y128p[9],scalec); + y128p[10] = shiftright_int16(y128p[10],scalec); + y128p[11] = shiftright_int16(y128p[11],scalec); + y128p[12] = shiftright_int16(y128p[12],scalec); + y128p[13] = shiftright_int16(y128p[13],scalec); + y128p[14] = shiftright_int16(y128p[14],scalec); + y128p[15] = shiftright_int16(y128p[15],scalec); y128p+=16; } @@ -3635,7 +3974,7 @@ void idft65536(int16_t *x,int16_t *y,unsigned char scale) int16_t twa98304[65536] __attribute__((aligned(32))); int16_t twb98304[65536] __attribute__((aligned(32))); // 32768 x 3 -void dft98304(int16_t *input, int16_t *output,uint8_t scale) { +void dft98304(int16_t *input, int16_t *output,uint32_t *scale) { int i,i2,j; uint32_t tmp[3][32768] __attribute__((aligned(32))); @@ -3649,16 +3988,18 @@ void dft98304(int16_t *input, int16_t *output,uint8_t scale) { tmp[2][i] = ((uint32_t *)input)[j++]; } - 
dft32768((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - dft32768((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - dft32768((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale32768=NULL; + if (scale) scale32768=scale+1; + dft32768((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale32768); + dft32768((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale32768); + dft32768((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale32768); for (i=0,i2=0; i<65536; i+=8,i2+=4) { bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), (simd_q15_t*)(output+i),(simd_q15_t*)(output+65536+i),(simd_q15_t*)(output+131072+i), (simd_q15_t*)(twa98304+i),(simd_q15_t*)(twb98304+i)); } - if (scale==1) { + if (scale && *scale>0) { for (i=0; i<1536; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3681,7 +4022,7 @@ void dft98304(int16_t *input, int16_t *output,uint8_t scale) { } } -void idft98304(int16_t *input, int16_t *output,uint8_t scale) { +void idft98304(int16_t *input, int16_t *output,uint32_t *scale) { int i,i2,j; uint32_t tmp[3][32768] __attribute__((aligned(32))); @@ -3695,16 +4036,18 @@ void idft98304(int16_t *input, int16_t *output,uint8_t scale) { tmp[2][i] = ((uint32_t *)input)[j++]; } - idft32768((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft32768((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft32768((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + unsigned int *scale32768=NULL; + if (scale) scale32768=scale+1; + idft32768((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale32768); + idft32768((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale32768); + idft32768((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale32768); for (i=0,i2=0; i<65536; i+=8,i2+=4) { ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), (simd_q15_t*)(output+i),(simd_q15_t*)(output+65536+i),(simd_q15_t*)(output+131072+i), (simd_q15_t*)(twa98304+i),(simd_q15_t*)(twb98304+i)); } - if (scale==1) { + 
if (scale && *scale>0) { for (i=0; i<1536; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -3877,7 +4220,7 @@ static inline void dft12f(simd_q15_t *x0, -void dft12(int16_t *x,int16_t *y ,unsigned char scale_flag) +void dft12(int16_t *x,int16_t *y ,unsigned int *scale_flag) { simd_q15_t *x128 = (simd_q15_t *)x,*y128 = (simd_q15_t *)y; @@ -3910,7 +4253,7 @@ void dft12(int16_t *x,int16_t *y ,unsigned char scale_flag) static int16_t tw24[88]__attribute__((aligned(32))); -void dft24(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft24(int16_t *x,int16_t *y,unsigned int *scale_flag) { simd_q15_t *x128=(simd_q15_t *)x; @@ -3990,7 +4333,7 @@ void dft24(int16_t *x,int16_t *y,unsigned char scale_flag) // msg("dft24e\n"); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[1]); for (i=0; i<24; i++) { @@ -4006,7 +4349,7 @@ void dft24(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t twa36[88]__attribute__((aligned(32))); static int16_t twb36[88]__attribute__((aligned(32))); -void dft36(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft36(int16_t *x,int16_t *y,unsigned int *scale_flag) { simd_q15_t *x128=(simd_q15_t *)x; @@ -4113,7 +4456,7 @@ void dft36(int16_t *x,int16_t *y,unsigned char scale_flag) twb128+k); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[2]); for (i=0; i<36; i++) { @@ -4126,7 +4469,7 @@ static int16_t twa48[88]__attribute__((aligned(32))); static int16_t twb48[88]__attribute__((aligned(32))); static int16_t twc48[88]__attribute__((aligned(32))); -void dft48(int16_t *x, int16_t *y,unsigned char scale_flag) +void dft48(int16_t *x, int16_t *y,unsigned int *scale_flag) { simd_q15_t *x128=(simd_q15_t *)x; @@ -4270,7 +4613,7 @@ void dft48(int16_t *x, int16_t *y,unsigned char scale_flag) } - if (scale_flag == 1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[3]); for (i=0; i<48; i++) { @@ 
-4284,7 +4627,7 @@ static int16_t twb60[88]__attribute__((aligned(32))); static int16_t twc60[88]__attribute__((aligned(32))); static int16_t twd60[88]__attribute__((aligned(32))); -void dft60(int16_t *x,int16_t *y,unsigned char scale) +void dft60(int16_t *x,int16_t *y,unsigned int *scale) { simd_q15_t *x128=(simd_q15_t *)x; @@ -4450,7 +4793,7 @@ void dft60(int16_t *x,int16_t *y,unsigned char scale) twd128+k); } - if (scale == 1) { + if (scale) { norm128 = set1_int16(dft_norm_table[4]); for (i=0; i<60; i++) { @@ -4462,7 +4805,7 @@ void dft60(int16_t *x,int16_t *y,unsigned char scale) static int16_t tw72[280]__attribute__((aligned(32))); -void dft72(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft72(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -4478,8 +4821,8 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+36] = x128[j+1]; // odd inputs } - dft36((int16_t *)x2128,(int16_t *)ytmp128,1); - dft36((int16_t *)(x2128+36),(int16_t *)(ytmp128+36),1); + dft36((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft36((int16_t *)(x2128+36),(int16_t *)(ytmp128+36),scale_flag); bfly2_tw1(ytmp128,ytmp128+36,y128,y128+36); @@ -4491,7 +4834,7 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag) tw128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[5]); for (i=0; i<72; i++) { @@ -4502,7 +4845,7 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t tw96[376]__attribute__((aligned(32))); -void dft96(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft96(int16_t *x,int16_t *y,unsigned int *scale_flag) { @@ -4533,7 +4876,7 @@ void dft96(int16_t *x,int16_t *y,unsigned char scale_flag) tw128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[6]); for (i=0; i<96; i++) { @@ -4545,7 +4888,7 @@ void dft96(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t twa108[280]__attribute__((aligned(32))); static int16_t 
twb108[280]__attribute__((aligned(32))); -void dft108(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft108(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -4580,7 +4923,7 @@ void dft108(int16_t *x,int16_t *y,unsigned char scale_flag) } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[7]); for (i=0; i<108; i++) { @@ -4590,7 +4933,7 @@ void dft108(int16_t *x,int16_t *y,unsigned char scale_flag) } static int16_t tw120[472]__attribute__((aligned(32))); -void dft120(int16_t *x,int16_t *y, unsigned char scale_flag) +void dft120(int16_t *x,int16_t *y, unsigned int *scale_flag) { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -4618,7 +4961,7 @@ void dft120(int16_t *x,int16_t *y, unsigned char scale_flag) tw128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[8]); for (i=0; i<120; i++) { @@ -4630,7 +4973,7 @@ void dft120(int16_t *x,int16_t *y, unsigned char scale_flag) static int16_t twa144[376]__attribute__((aligned(32))); static int16_t twb144[376]__attribute__((aligned(32))); -void dft144(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft144(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -4648,9 +4991,9 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+96] = x128[j+2]; } - dft48((int16_t *)x2128,(int16_t *)ytmp128,1); - dft48((int16_t *)(x2128+48),(int16_t *)(ytmp128+48),1); - dft48((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),1); + dft48((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft48((int16_t *)(x2128+48),(int16_t *)(ytmp128+48),scale_flag); + dft48((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),scale_flag); bfly3_tw1(ytmp128,ytmp128+48,ytmp128+96,y128,y128+48,y128+96); @@ -4665,7 +5008,7 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag) twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[9]); for (i=0; i<144; 
i++) { @@ -4677,7 +5020,7 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t twa180[472]__attribute__((aligned(32))); static int16_t twb180[472]__attribute__((aligned(32))); -void dft180(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft180(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -4696,9 +5039,9 @@ void dft180(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+120] = x128[j+2]; } - dft60((int16_t *)x2128,(int16_t *)ytmp128,1); - dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),1); - dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1); + dft60((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),scale_flag); + dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag); bfly3_tw1(ytmp128,ytmp128+60,ytmp128+120,y128,y128+60,y128+120); @@ -4713,7 +5056,7 @@ void dft180(int16_t *x,int16_t *y,unsigned char scale_flag) twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[10]); for (i=0; i<180; i++) { @@ -4726,7 +5069,7 @@ static int16_t twa192[376]__attribute__((aligned(32))); static int16_t twb192[376]__attribute__((aligned(32))); static int16_t twc192[376]__attribute__((aligned(32))); -void dft192(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft192(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -4747,10 +5090,10 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+144] = x128[j+3]; } - dft48((int16_t *)x2128,(int16_t *)ytmp128,1); - dft48((int16_t *)(x2128+48),(int16_t *)(ytmp128+48),1); - dft48((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),1); - dft48((int16_t *)(x2128+144),(int16_t *)(ytmp128+144),1); + dft48((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft48((int16_t *)(x2128+48),(int16_t *)(ytmp128+48),scale_flag); + dft48((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),scale_flag); + dft48((int16_t *)(x2128+144),(int16_t *)(ytmp128+144),scale_flag); 
bfly4_tw1(ytmp128,ytmp128+48,ytmp128+96,ytmp128+144,y128,y128+48,y128+96,y128+144); @@ -4768,7 +5111,7 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag) twc128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[11]); for (i=0; i<192; i++) { @@ -4780,7 +5123,7 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t twa216[568]__attribute__((aligned(32))); static int16_t twb216[568]__attribute__((aligned(32))); -void dft216(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft216(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -4799,9 +5142,9 @@ void dft216(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+144] = x128[j+2]; } - dft72((int16_t *)x2128,(int16_t *)ytmp128,1); - dft72((int16_t *)(x2128+72),(int16_t *)(ytmp128+72),1); - dft72((int16_t *)(x2128+144),(int16_t *)(ytmp128+144),1); + dft72((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft72((int16_t *)(x2128+72),(int16_t *)(ytmp128+72),scale_flag); + dft72((int16_t *)(x2128+144),(int16_t *)(ytmp128+144),scale_flag); bfly3_tw1(ytmp128,ytmp128+72,ytmp128+144,y128,y128+72,y128+144); @@ -4816,7 +5159,7 @@ void dft216(int16_t *x,int16_t *y,unsigned char scale_flag) twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[12]); for (i=0; i<216; i++) { @@ -4829,7 +5172,7 @@ static int16_t twa240[472]__attribute__((aligned(32))); static int16_t twb240[472]__attribute__((aligned(32))); static int16_t twc240[472]__attribute__((aligned(32))); -void dft240(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft240(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -4850,10 +5193,10 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+180] = x128[j+3]; } - dft60((int16_t *)x2128,(int16_t *)ytmp128,1); - dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),1); - dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1); - dft60((int16_t *)(x2128+180),(int16_t 
*)(ytmp128+180),1); + dft60((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),scale_flag); + dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag); + dft60((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),scale_flag); bfly4_tw1(ytmp128,ytmp128+60,ytmp128+120,ytmp128+180,y128,y128+60,y128+120,y128+180); @@ -4871,7 +5214,7 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag) twc128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[13]); for (i=0; i<240; i++) { @@ -4883,7 +5226,7 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t twa288[760]__attribute__((aligned(32))); static int16_t twb288[760]__attribute__((aligned(32))); -void dft288(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft288(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -4902,9 +5245,9 @@ void dft288(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+192] = x128[j+2]; } - dft96((int16_t *)x2128,(int16_t *)ytmp128,1); - dft96((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),1); - dft96((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),1); + dft96((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft96((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),scale_flag); + dft96((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),scale_flag); bfly3_tw1(ytmp128,ytmp128+96,ytmp128+192,y128,y128+96,y128+192); @@ -4919,7 +5262,7 @@ void dft288(int16_t *x,int16_t *y,unsigned char scale_flag) twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<288; i++) { @@ -4933,7 +5276,7 @@ static int16_t twb300[472]__attribute__((aligned(32))); static int16_t twc300[472]__attribute__((aligned(32))); static int16_t twd300[472]__attribute__((aligned(32))); -void dft300(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft300(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -4956,11 +5299,11 @@ void 
dft300(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+240] = x128[j+4]; } - dft60((int16_t *)x2128,(int16_t *)ytmp128,1); - dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),1); - dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1); - dft60((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),1); - dft60((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),1); + dft60((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),scale_flag); + dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag); + dft60((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),scale_flag); + dft60((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),scale_flag); bfly5_tw1(ytmp128,ytmp128+60,ytmp128+120,ytmp128+180,ytmp128+240,y128,y128+60,y128+120,y128+180,y128+240); @@ -4981,7 +5324,7 @@ void dft300(int16_t *x,int16_t *y,unsigned char scale_flag) twd128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[15]); for (i=0; i<300; i++) { @@ -4993,7 +5336,7 @@ void dft300(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t twa324[107*2*4]; static int16_t twb324[107*2*4]; -void dft324(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 3 +void dft324(int16_t *x,int16_t *y,unsigned int *scale_flag) // 108 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5011,9 +5354,9 @@ void dft324(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 3 x2128[i+216] = x128[j+2]; } - dft108((int16_t *)x2128,(int16_t *)ytmp128,1); - dft108((int16_t *)(x2128+108),(int16_t *)(ytmp128+108),1); - dft108((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),1); + dft108((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft108((int16_t *)(x2128+108),(int16_t *)(ytmp128+108),scale_flag); + dft108((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),scale_flag); bfly3_tw1(ytmp128,ytmp128+108,ytmp128+216,y128,y128+108,y128+216); @@ -5028,7 +5371,7 @@ void dft324(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 3 
twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<324; i++) { @@ -5040,7 +5383,7 @@ void dft324(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 3 static int16_t twa360[119*2*4]; static int16_t twb360[119*2*4]; -void dft360(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 3 +void dft360(int16_t *x,int16_t *y,unsigned int *scale_flag) // 120 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5058,9 +5401,9 @@ void dft360(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 3 x2128[i+240] = x128[j+2]; } - dft120((int16_t *)x2128,(int16_t *)ytmp128,1); - dft120((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1); - dft120((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),1); + dft120((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft120((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag); + dft120((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),scale_flag); bfly3_tw1(ytmp128,ytmp128+120,ytmp128+240,y128,y128+120,y128+240); @@ -5075,7 +5418,7 @@ void dft360(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<360; i++) { @@ -5088,7 +5431,7 @@ static int16_t twa384[95*2*4]; static int16_t twb384[95*2*4]; static int16_t twc384[95*2*4]; -void dft384(int16_t *x,int16_t *y,unsigned char scale_flag) // 96 x 4 +void dft384(int16_t *x,int16_t *y,unsigned int *scale_flag) // 96 x 4 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5108,10 +5451,10 @@ void dft384(int16_t *x,int16_t *y,unsigned char scale_flag) // 96 x 4 x2128[i+288] = x128[j+3]; } - dft96((int16_t *)x2128,(int16_t *)ytmp128,1); - dft96((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),1); - dft96((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),1); - dft96((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),1); + dft96((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft96((int16_t *)(x2128+96),(int16_t 
*)(ytmp128+96),scale_flag); + dft96((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),scale_flag); + dft96((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),scale_flag); bfly4_tw1(ytmp128,ytmp128+96,ytmp128+192,ytmp128+288,y128,y128+96,y128+192,y128+288); @@ -5129,7 +5472,7 @@ void dft384(int16_t *x,int16_t *y,unsigned char scale_flag) // 96 x 4 twc128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<384; i++) { @@ -5142,7 +5485,7 @@ static int16_t twa432[107*2*4]; static int16_t twb432[107*2*4]; static int16_t twc432[107*2*4]; -void dft432(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 4 +void dft432(int16_t *x,int16_t *y,unsigned int *scale_flag) // 108 x 4 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5161,10 +5504,10 @@ void dft432(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 4 x2128[i+324] = x128[j+3]; } - dft108((int16_t *)x2128,(int16_t *)ytmp128,1); - dft108((int16_t *)(x2128+108),(int16_t *)(ytmp128+108),1); - dft108((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),1); - dft108((int16_t *)(x2128+324),(int16_t *)(ytmp128+324),1); + dft108((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft108((int16_t *)(x2128+108),(int16_t *)(ytmp128+108),scale_flag); + dft108((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),scale_flag); + dft108((int16_t *)(x2128+324),(int16_t *)(ytmp128+324),scale_flag); bfly4_tw1(ytmp128,ytmp128+108,ytmp128+216,ytmp128+324,y128,y128+108,y128+216,y128+324); @@ -5182,7 +5525,7 @@ void dft432(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 4 twc128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<432; i++) { @@ -5194,7 +5537,7 @@ static int16_t twa480[119*2*4]; static int16_t twb480[119*2*4]; static int16_t twc480[119*2*4]; -void dft480(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 4 +void dft480(int16_t *x,int16_t *y,unsigned int *scale_flag) // 120 x 4 { int i,j; 
simd_q15_t *x128=(simd_q15_t *)x; @@ -5214,10 +5557,10 @@ void dft480(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 4 x2128[i+360] = x128[j+3]; } - dft120((int16_t *)x2128,(int16_t *)ytmp128,1); - dft120((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1); - dft120((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),1); - dft120((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),1); + dft120((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft120((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag); + dft120((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),scale_flag); + dft120((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),scale_flag); bfly4_tw1(ytmp128,ytmp128+120,ytmp128+240,ytmp128+360,y128,y128+120,y128+240,y128+360); @@ -5235,7 +5578,7 @@ void dft480(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 4 twc128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<480; i++) { @@ -5248,7 +5591,7 @@ void dft480(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 4 static int16_t twa540[179*2*4]; static int16_t twb540[179*2*4]; -void dft540(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 3 +void dft540(int16_t *x,int16_t *y,unsigned int *scale_flag) // 180 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5266,9 +5609,9 @@ void dft540(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 3 x2128[i+360] = x128[j+2]; } - dft180((int16_t *)x2128,(int16_t *)ytmp128,1); - dft180((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),1); - dft180((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),1); + dft180((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft180((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),scale_flag); + dft180((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),scale_flag); bfly3_tw1(ytmp128,ytmp128+180,ytmp128+360,y128,y128+180,y128+360); @@ -5283,7 +5626,7 @@ void dft540(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 3 twb128+j); } - if (scale_flag==1) 
{ + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<540; i++) { @@ -5295,7 +5638,7 @@ void dft540(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 3 static int16_t twa576[191*2*4]; static int16_t twb576[191*2*4]; -void dft576(int16_t *x,int16_t *y,unsigned char scale_flag) // 192 x 3 +void dft576(int16_t *x,int16_t *y,unsigned int *scale_flag) // 192 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5314,9 +5657,9 @@ void dft576(int16_t *x,int16_t *y,unsigned char scale_flag) // 192 x 3 } - dft192((int16_t *)x2128,(int16_t *)ytmp128,1); - dft192((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),1); - dft192((int16_t *)(x2128+384),(int16_t *)(ytmp128+384),1); + dft192((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft192((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),scale_flag); + dft192((int16_t *)(x2128+384),(int16_t *)(ytmp128+384),scale_flag); bfly3_tw1(ytmp128,ytmp128+192,ytmp128+384,y128,y128+192,y128+384); @@ -5331,7 +5674,7 @@ void dft576(int16_t *x,int16_t *y,unsigned char scale_flag) // 192 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<576; i++) { @@ -5343,7 +5686,7 @@ void dft576(int16_t *x,int16_t *y,unsigned char scale_flag) // 192 x 3 static int16_t twa600[299*2*4]; -void dft600(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 2 +void dft600(int16_t *x,int16_t *y,unsigned int *scale_flag) // 300 x 2 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5358,8 +5701,8 @@ void dft600(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 2 x2128[i+300] = x128[j+1]; } - dft300((int16_t *)x2128,(int16_t *)ytmp128,1); - dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),1); + dft300((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),scale_flag); bfly2_tw1(ytmp128,ytmp128+300,y128,y128+300); @@ -5372,7 +5715,7 @@ void dft600(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 2 
tw128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(ONE_OVER_SQRT2_Q15); for (i=0; i<600; i++) { @@ -5385,7 +5728,7 @@ void dft600(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 2 static int16_t twa648[215*2*4]; static int16_t twb648[215*2*4]; -void dft648(int16_t *x,int16_t *y,unsigned char scale_flag) // 216 x 3 +void dft648(int16_t *x,int16_t *y,unsigned int *scale_flag) // 216 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5403,9 +5746,9 @@ void dft648(int16_t *x,int16_t *y,unsigned char scale_flag) // 216 x 3 x2128[i+432] = x128[j+2]; } - dft216((int16_t *)x2128,(int16_t *)ytmp128,1); - dft216((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),1); - dft216((int16_t *)(x2128+432),(int16_t *)(ytmp128+432),1); + dft216((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft216((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),scale_flag); + dft216((int16_t *)(x2128+432),(int16_t *)(ytmp128+432),scale_flag); bfly3_tw1(ytmp128,ytmp128+216,ytmp128+432,y128,y128+216,y128+432); @@ -5420,7 +5763,7 @@ void dft648(int16_t *x,int16_t *y,unsigned char scale_flag) // 216 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<648; i++) { @@ -5439,7 +5782,7 @@ static int16_t twb720[179*2*4]; static int16_t twc720[179*2*4]; -void dft720(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 4 +void dft720(int16_t *x,int16_t *y,unsigned int *scale_flag) // 180 x 4 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5459,10 +5802,10 @@ void dft720(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 4 x2128[i+540] = x128[j+3]; } - dft180((int16_t *)x2128,(int16_t *)ytmp128,1); - dft180((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),1); - dft180((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),1); - dft180((int16_t *)(x2128+540),(int16_t *)(ytmp128+540),1); + dft180((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft180((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),scale_flag); + 
dft180((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),scale_flag); + dft180((int16_t *)(x2128+540),(int16_t *)(ytmp128+540),scale_flag); bfly4_tw1(ytmp128,ytmp128+180,ytmp128+360,ytmp128+540,y128,y128+180,y128+360,y128+540); @@ -5480,7 +5823,7 @@ void dft720(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 4 twc128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<720; i++) { @@ -5497,7 +5840,7 @@ static int16_t twa768p[191*2*4]; static int16_t twb768p[191*2*4]; static int16_t twc768p[191*2*4]; -void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4; +void dft768p(int16_t *x,int16_t *y,unsigned int *scale_flag) { // 192x 4; int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5517,10 +5860,10 @@ void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4; x2128[i+576] = x128[j+3]; } - dft192((int16_t *)x2128,(int16_t *)ytmp128,1); - dft192((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),1); - dft192((int16_t *)(x2128+384),(int16_t *)(ytmp128+384),1); - dft192((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),1); + dft192((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft192((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),scale_flag); + dft192((int16_t *)(x2128+384),(int16_t *)(ytmp128+384),scale_flag); + dft192((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),scale_flag); bfly4_tw1(ytmp128,ytmp128+192,ytmp128+384,ytmp128+576,y128,y128+192,y128+384,y128+576); @@ -5538,7 +5881,7 @@ void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4; twc128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<768; i++) { @@ -5556,7 +5899,7 @@ void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4; static int16_t twa384i[256]; static int16_t twb384i[256]; // 128 x 3 -void idft384(int16_t *input, int16_t *output, unsigned char scale) +void idft384(int16_t *input, int16_t *output, unsigned int 
*scale) { int i,i2,j; uint32_t tmp[3][128]__attribute__((aligned(32))); @@ -5570,9 +5913,9 @@ void idft384(int16_t *input, int16_t *output, unsigned char scale) tmp[2][i] = ((uint32_t *)input)[j++]; } - idft128((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1); - idft128((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1); - idft128((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1); + idft128((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale); + idft128((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale); + idft128((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale); for (i=0,i2=0; i<256; i+=8,i2+=4) { ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]), @@ -5581,7 +5924,7 @@ void idft384(int16_t *input, int16_t *output, unsigned char scale) } - if (scale==1) { + if (scale) { for (i=0; i<6; i++) { y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128); y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128); @@ -5606,7 +5949,7 @@ void idft384(int16_t *input, int16_t *output, unsigned char scale) static int16_t twa864[287*2*4]; static int16_t twb864[287*2*4]; -void dft864(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 3 +void dft864(int16_t *x,int16_t *y,unsigned int *scale_flag) // 288 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5624,9 +5967,9 @@ void dft864(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 3 x2128[i+576] = x128[j+2]; } - dft288((int16_t *)x2128,(int16_t *)ytmp128,1); - dft288((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),1); - dft288((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),1); + dft288((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft288((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),scale_flag); + dft288((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),scale_flag); bfly3_tw1(ytmp128,ytmp128+288,ytmp128+576,y128,y128+288,y128+576); @@ -5641,7 +5984,7 @@ void dft864(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = 
set1_int16(dft_norm_table[14]); for (i=0; i<864; i++) { @@ -5657,7 +6000,7 @@ void dft864(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 3 static int16_t twa900[299*2*4]; static int16_t twb900[299*2*4]; -void dft900(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 3 +void dft900(int16_t *x,int16_t *y,unsigned int *scale_flag) // 300 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5675,9 +6018,9 @@ void dft900(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 3 x2128[i+600] = x128[j+2]; } - dft300((int16_t *)x2128,(int16_t *)ytmp128,1); - dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),1); - dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1); + dft300((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),scale_flag); + dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag); bfly3_tw1(ytmp128,ytmp128+300,ytmp128+600,y128,y128+300,y128+600); @@ -5692,7 +6035,7 @@ void dft900(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<900; i++) { @@ -5711,7 +6054,7 @@ static int16_t twb960[239*2*4]; static int16_t twc960[239*2*4]; -void dft960(int16_t *x,int16_t *y,unsigned char scale_flag) // 240 x 4 +void dft960(int16_t *x,int16_t *y,unsigned int *scale_flag) // 240 x 4 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5731,10 +6074,10 @@ void dft960(int16_t *x,int16_t *y,unsigned char scale_flag) // 240 x 4 x2128[i+720] = x128[j+3]; } - dft240((int16_t *)x2128,(int16_t *)ytmp128,1); - dft240((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),1); - dft240((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),1); - dft240((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),1); + dft240((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft240((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),scale_flag); + dft240((int16_t *)(x2128+480),(int16_t 
*)(ytmp128+480),scale_flag); + dft240((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),scale_flag); bfly4_tw1(ytmp128,ytmp128+240,ytmp128+480,ytmp128+720,y128,y128+240,y128+480,y128+720); @@ -5752,7 +6095,7 @@ void dft960(int16_t *x,int16_t *y,unsigned char scale_flag) // 240 x 4 twc128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<960; i++) { @@ -5769,7 +6112,7 @@ void dft960(int16_t *x,int16_t *y,unsigned char scale_flag) // 240 x 4 static int16_t twa972[323*2*4]; static int16_t twb972[323*2*4]; -void dft972(int16_t *x,int16_t *y,unsigned char scale_flag) // 324 x 3 +void dft972(int16_t *x,int16_t *y,unsigned int *scale_flag) // 324 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5787,9 +6130,9 @@ void dft972(int16_t *x,int16_t *y,unsigned char scale_flag) // 324 x 3 x2128[i+648] = x128[j+2]; } - dft324((int16_t *)x2128,(int16_t *)ytmp128,1); - dft324((int16_t *)(x2128+324),(int16_t *)(ytmp128+324),1); - dft324((int16_t *)(x2128+648),(int16_t *)(ytmp128+648),1); + dft324((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft324((int16_t *)(x2128+324),(int16_t *)(ytmp128+324),scale_flag); + dft324((int16_t *)(x2128+648),(int16_t *)(ytmp128+648),scale_flag); bfly3_tw1(ytmp128,ytmp128+324,ytmp128+648,y128,y128+324,y128+648); @@ -5804,7 +6147,7 @@ void dft972(int16_t *x,int16_t *y,unsigned char scale_flag) // 324 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<972; i++) { @@ -5820,7 +6163,7 @@ void dft972(int16_t *x,int16_t *y,unsigned char scale_flag) // 324 x 3 static int16_t twa1080[359*2*4]; static int16_t twb1080[359*2*4]; -void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag) // 360 x 3 +void dft1080(int16_t *x,int16_t *y,unsigned int *scale_flag) // 360 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -5838,9 +6181,9 @@ void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag) // 360 x 3 x2128[i+720] = x128[j+2]; 
} - dft360((int16_t *)x2128,(int16_t *)ytmp128,1); - dft360((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),1); - dft360((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),1); + dft360((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft360((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),scale_flag); + dft360((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),scale_flag); bfly3_tw1(ytmp128,ytmp128+360,ytmp128+720,y128,y128+360,y128+720); @@ -5855,7 +6198,7 @@ void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag) // 360 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<1080; i++) { @@ -5872,7 +6215,7 @@ static int16_t twa1152[287*2*4]; static int16_t twb1152[287*2*4]; static int16_t twc1152[287*2*4]; -void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 4 +void dft1152(int16_t *x,int16_t *y,unsigned int *scale_flag) // 288 x 4 { int i,j; @@ -5893,10 +6236,10 @@ void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 4 x2128[i+864] = x128[j+3]; } - dft288((int16_t *)x2128,(int16_t *)ytmp128,1); - dft288((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),1); - dft288((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),1); - dft288((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),1); + dft288((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft288((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),scale_flag); + dft288((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),scale_flag); + dft288((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),scale_flag); bfly4_tw1(ytmp128,ytmp128+288,ytmp128+576,ytmp128+864,y128,y128+288,y128+576,y128+864); @@ -5914,7 +6257,7 @@ void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 4 twc128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<1152; i++) { @@ -5930,7 +6273,7 @@ int16_t twa1200[4784]; int16_t twb1200[4784]; int16_t twc1200[4784]; -void dft1200(int16_t *x,int16_t 
*y,unsigned char scale_flag) +void dft1200(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -5951,10 +6294,10 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+900] = x128[j+3]; } - dft300((int16_t *)x2128,(int16_t *)ytmp128,1); - dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),1); - dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1); - dft300((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),1); + dft300((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),scale_flag); + dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag); + dft300((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),scale_flag); bfly4_tw1(ytmp128,ytmp128+300,ytmp128+600,ytmp128+900,y128,y128+300,y128+600,y128+900); @@ -5972,7 +6315,7 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag) twc128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<1200; i++) { y128[i] = mulhi_int16(y128[i],norm128); @@ -5988,7 +6331,7 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t twa1296[431*2*4]; static int16_t twb1296[431*2*4]; -void dft1296(int16_t *x,int16_t *y,unsigned char scale_flag) //432 * 3 +void dft1296(int16_t *x,int16_t *y,unsigned int *scale_flag) //432 * 3 { int i,j; @@ -6007,9 +6350,9 @@ void dft1296(int16_t *x,int16_t *y,unsigned char scale_flag) //432 * 3 x2128[i+864] = x128[j+2]; } - dft432((int16_t *)x2128,(int16_t *)ytmp128,1); - dft432((int16_t *)(x2128+432),(int16_t *)(ytmp128+432),1); - dft432((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),1); + dft432((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft432((int16_t *)(x2128+432),(int16_t *)(ytmp128+432),scale_flag); + dft432((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),scale_flag); bfly3_tw1(ytmp128,ytmp128+432,ytmp128+864,y128,y128+432,y128+864); @@ -6024,7 +6367,7 @@ void dft1296(int16_t *x,int16_t *y,unsigned char 
scale_flag) //432 * 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<1296; i++) { @@ -6041,7 +6384,7 @@ void dft1296(int16_t *x,int16_t *y,unsigned char scale_flag) //432 * 3 static int16_t twa1440[479*2*4]; static int16_t twb1440[479*2*4]; -void dft1440(int16_t *x,int16_t *y,unsigned char scale_flag) // 480 x 3 +void dft1440(int16_t *x,int16_t *y,unsigned int *scale_flag) // 480 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6059,9 +6402,9 @@ void dft1440(int16_t *x,int16_t *y,unsigned char scale_flag) // 480 x 3 x2128[i+960] = x128[j+2]; } - dft480((int16_t *)x2128,(int16_t *)ytmp128,1); - dft480((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),1); - dft480((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),1); + dft480((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft480((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),scale_flag); + dft480((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),scale_flag); bfly3_tw1(ytmp128,ytmp128+480,ytmp128+960,y128,y128+480,y128+960); @@ -6076,7 +6419,7 @@ void dft1440(int16_t *x,int16_t *y,unsigned char scale_flag) // 480 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<1440; i++) { @@ -6094,7 +6437,7 @@ static int16_t twb1500[2392]__attribute__((aligned(32))); static int16_t twc1500[2392]__attribute__((aligned(32))); static int16_t twd1500[2392]__attribute__((aligned(32))); -void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag) +void dft1500(int16_t *x,int16_t *y,unsigned int *scale_flag) { int i,j; @@ -6117,11 +6460,11 @@ void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag) x2128[i+1200] = x128[j+4]; } - dft300((int16_t *)x2128,(int16_t *)ytmp128,1); - dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),1); - dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1); - dft300((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),1); - dft300((int16_t *)(x2128+1200),(int16_t 
*)(ytmp128+1200),1); + dft300((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),scale_flag); + dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag); + dft300((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),scale_flag); + dft300((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),scale_flag); bfly5_tw1(ytmp128,ytmp128+300,ytmp128+600,ytmp128+900,ytmp128+1200,y128,y128+300,y128+600,y128+900,y128+1200); @@ -6142,7 +6485,7 @@ void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag) twd128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[15]); for (i=0; i<1500; i++) { @@ -6158,7 +6501,7 @@ void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag) static int16_t twa1620[539*2*4]; static int16_t twb1620[539*2*4]; -void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag) // 540 x 3 +void dft1620(int16_t *x,int16_t *y,unsigned int *scale_flag) // 540 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6176,9 +6519,9 @@ void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag) // 540 x 3 x2128[i+1080] = x128[j+2]; } - dft540((int16_t *)x2128,(int16_t *)ytmp128,1); - dft540((int16_t *)(x2128+540),(int16_t *)(ytmp128+540),1); - dft540((int16_t *)(x2128+1080),(int16_t *)(ytmp128+1080),1); + dft540((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft540((int16_t *)(x2128+540),(int16_t *)(ytmp128+540),scale_flag); + dft540((int16_t *)(x2128+1080),(int16_t *)(ytmp128+1080),scale_flag); bfly3_tw1(ytmp128,ytmp128+540,ytmp128+1080,y128,y128+540,y128+1080); @@ -6193,7 +6536,7 @@ void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag) // 540 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<1620; i++) { @@ -6209,7 +6552,7 @@ void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag) // 540 x 3 static int16_t twa1728[575*2*4]; static int16_t twb1728[575*2*4]; -void dft1728(int16_t *x,int16_t 
*y,unsigned char scale_flag) // 576 x 3 +void dft1728(int16_t *x,int16_t *y,unsigned int *scale_flag) // 576 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6227,9 +6570,9 @@ void dft1728(int16_t *x,int16_t *y,unsigned char scale_flag) // 576 x 3 x2128[i+1152] = x128[j+2]; } - dft576((int16_t *)x2128,(int16_t *)ytmp128,1); - dft576((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),1); - dft576((int16_t *)(x2128+1152),(int16_t *)(ytmp128+1152),1); + dft576((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft576((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),scale_flag); + dft576((int16_t *)(x2128+1152),(int16_t *)(ytmp128+1152),scale_flag); bfly3_tw1(ytmp128,ytmp128+576,ytmp128+1152,y128,y128+576,y128+1152); @@ -6244,7 +6587,7 @@ void dft1728(int16_t *x,int16_t *y,unsigned char scale_flag) // 576 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<1728; i++) { @@ -6260,7 +6603,7 @@ void dft1728(int16_t *x,int16_t *y,unsigned char scale_flag) // 576 x 3 static int16_t twa1800[599*2*4]; static int16_t twb1800[599*2*4]; -void dft1800(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 x 3 +void dft1800(int16_t *x,int16_t *y,unsigned int *scale_flag) // 600 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6278,9 +6621,9 @@ void dft1800(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 x 3 x2128[i+1200] = x128[j+2]; } - dft600((int16_t *)x2128,(int16_t *)ytmp128,1); - dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1); - dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),1); + dft600((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag); + dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),scale_flag); bfly3_tw1(ytmp128,ytmp128+600,ytmp128+1200,y128,y128+600,y128+1200); @@ -6295,7 +6638,7 @@ void dft1800(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 x 3 twb128+j); } - if (scale_flag==1) { + if 
(scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<1800; i++) { @@ -6312,7 +6655,7 @@ static int16_t twa1920[479*2*4]; static int16_t twb1920[479*2*4]; static int16_t twc1920[479*2*4]; -void dft1920(int16_t *x,int16_t *y,unsigned char scale_flag) // 480 x 4 +void dft1920(int16_t *x,int16_t *y,unsigned int *scale_flag) // 480 x 4 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6332,10 +6675,10 @@ void dft1920(int16_t *x,int16_t *y,unsigned char scale_flag) // 480 x 4 x2128[i+1440] = x128[j+3]; } - dft480((int16_t *)x2128,(int16_t *)ytmp128,1); - dft480((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),1); - dft480((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),1); - dft480((int16_t *)(x2128+1440),(int16_t *)(ytmp128+1440),1); + dft480((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft480((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),scale_flag); + dft480((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),scale_flag); + dft480((int16_t *)(x2128+1440),(int16_t *)(ytmp128+1440),scale_flag); bfly4_tw1(ytmp128,ytmp128+480,ytmp128+960,ytmp128+1440,y128,y128+480,y128+960,y128+1440); @@ -6353,7 +6696,7 @@ void dft1920(int16_t *x,int16_t *y,unsigned char scale_flag) // 480 x 4 twc128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[13]); for (i=0; i<1920; i++) { y128[i] = mulhi_int16(y128[i],norm128); @@ -6368,7 +6711,7 @@ void dft1920(int16_t *x,int16_t *y,unsigned char scale_flag) // 480 x 4 static int16_t twa1944[647*2*4]; static int16_t twb1944[647*2*4]; -void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag) // 648 x 3 +void dft1944(int16_t *x,int16_t *y,unsigned int *scale_flag) // 648 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6386,9 +6729,9 @@ void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag) // 648 x 3 x2128[i+1296] = x128[j+2]; } - dft648((int16_t *)x2128,(int16_t *)ytmp128,1); - dft648((int16_t *)(x2128+648),(int16_t *)(ytmp128+648),1); - dft648((int16_t *)(x2128+1296),(int16_t 
*)(ytmp128+1296),1); + dft648((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft648((int16_t *)(x2128+648),(int16_t *)(ytmp128+648),scale_flag); + dft648((int16_t *)(x2128+1296),(int16_t *)(ytmp128+1296),scale_flag); bfly3_tw1(ytmp128,ytmp128+648,ytmp128+1296,y128,y128+648,y128+1296); @@ -6403,7 +6746,7 @@ void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag) // 648 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<1944; i++) { @@ -6419,7 +6762,7 @@ void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag) // 648 x 3 static int16_t twa2160[719*2*4]; static int16_t twb2160[719*2*4]; -void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag) // 720 x 3 +void dft2160(int16_t *x,int16_t *y,unsigned int *scale_flag) // 720 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6437,9 +6780,9 @@ void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag) // 720 x 3 x2128[i+1440] = x128[j+2]; } - dft720((int16_t *)x2128,(int16_t *)ytmp128,1); - dft720((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),1); - dft720((int16_t *)(x2128+1440),(int16_t *)(ytmp128+1440),1); + dft720((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft720((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),scale_flag); + dft720((int16_t *)(x2128+1440),(int16_t *)(ytmp128+1440),scale_flag); bfly3_tw1(ytmp128,ytmp128+720,ytmp128+1440,y128,y128+720,y128+1440); @@ -6454,7 +6797,7 @@ void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag) // 720 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<2160; i++) { @@ -6470,7 +6813,7 @@ void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag) // 720 x 3 static int16_t twa2304[767*2*4]; static int16_t twb2304[767*2*4]; -void dft2304(int16_t *x,int16_t *y,unsigned char scale_flag) // 768 x 3 +void dft2304(int16_t *x,int16_t *y,unsigned int *scale_flag) // 768 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ 
-6488,9 +6831,9 @@ void dft2304(int16_t *x,int16_t *y,unsigned char scale_flag) // 768 x 3 x2128[i+1536] = x128[j+2]; } - dft768((int16_t *)x2128,(int16_t *)ytmp128,1); - dft768((int16_t *)(x2128+768),(int16_t *)(ytmp128+768),1); - dft768((int16_t *)(x2128+1536),(int16_t *)(ytmp128+1536),1); + dft768((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft768((int16_t *)(x2128+768),(int16_t *)(ytmp128+768),scale_flag); + dft768((int16_t *)(x2128+1536),(int16_t *)(ytmp128+1536),scale_flag); bfly3_tw1(ytmp128,ytmp128+768,ytmp128+1536,y128,y128+768,y128+1536); @@ -6505,7 +6848,7 @@ void dft2304(int16_t *x,int16_t *y,unsigned char scale_flag) // 768 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<2304; i++) { @@ -6522,7 +6865,7 @@ static int16_t twa2400[599*2*4]; static int16_t twb2400[599*2*4]; static int16_t twc2400[599*2*4]; -void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 x 4 +void dft2400(int16_t *x,int16_t *y,unsigned int *scale_flag) // 600 x 4 { int i,j; @@ -6543,10 +6886,10 @@ void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 x 4 x2128[i+1800] = x128[j+3]; } - dft600((int16_t *)x2128,(int16_t *)ytmp128,1); - dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1); - dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),1); - dft600((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),1); + dft600((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag); + dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),scale_flag); + dft600((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),scale_flag); bfly4_tw1(ytmp128,ytmp128+600,ytmp128+1200,ytmp128+1800,y128,y128+600,y128+1200,y128+1800); @@ -6564,7 +6907,7 @@ void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 x 4 twc128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[13]); for (i=0; i<2400; i++) { 
y128[i] = mulhi_int16(y128[i],norm128); @@ -6579,7 +6922,7 @@ void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 x 4 static int16_t twa2592[863*2*4]; static int16_t twb2592[863*2*4]; -void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag) // 864 x 3 +void dft2592(int16_t *x,int16_t *y,unsigned int *scale_flag) // 864 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6597,9 +6940,9 @@ void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag) // 864 x 3 x2128[i+1728] = x128[j+2]; } - dft864((int16_t *)x2128,(int16_t *)ytmp128,1); - dft864((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),1); - dft864((int16_t *)(x2128+1728),(int16_t *)(ytmp128+1728),1); + dft864((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft864((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),scale_flag); + dft864((int16_t *)(x2128+1728),(int16_t *)(ytmp128+1728),scale_flag); bfly3_tw1(ytmp128,ytmp128+864,ytmp128+1728,y128,y128+864,y128+1728); @@ -6614,7 +6957,7 @@ void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag) // 864 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<2592; i++) { @@ -6630,7 +6973,7 @@ void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag) // 864 x 3 static int16_t twa2700[899*2*4]; static int16_t twb2700[899*2*4]; -void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag) // 900 x 3 +void dft2700(int16_t *x,int16_t *y,unsigned int *scale_flag) // 900 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6648,9 +6991,9 @@ void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag) // 900 x 3 x2128[i+1800] = x128[j+2]; } - dft900((int16_t *)x2128,(int16_t *)ytmp128,1); - dft900((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),1); - dft900((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),1); + dft900((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft900((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),scale_flag); + dft900((int16_t *)(x2128+1800),(int16_t 
*)(ytmp128+1800),scale_flag); bfly3_tw1(ytmp128,ytmp128+900,ytmp128+1800,y128,y128+900,y128+1800); @@ -6665,7 +7008,7 @@ void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag) // 900 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<2700; i++) { @@ -6681,7 +7024,7 @@ void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag) // 900 x 3 static int16_t twa2880[959*2*4]; static int16_t twb2880[959*2*4]; -void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag) // 960 x 3 +void dft2880(int16_t *x,int16_t *y,unsigned int *scale_flag) // 960 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6699,9 +7042,9 @@ void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag) // 960 x 3 x2128[i+1920] = x128[j+2]; } - dft960((int16_t *)x2128,(int16_t *)ytmp128,1); - dft960((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),1); - dft960((int16_t *)(x2128+1920),(int16_t *)(ytmp128+1920),1); + dft960((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft960((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),scale_flag); + dft960((int16_t *)(x2128+1920),(int16_t *)(ytmp128+1920),scale_flag); bfly3_tw1(ytmp128,ytmp128+960,ytmp128+1920,y128,y128+960,y128+1920); @@ -6716,7 +7059,7 @@ void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag) // 960 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<2880; i++) { @@ -6732,7 +7075,7 @@ void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag) // 960 x 3 static int16_t twa2916[971*2*4]; static int16_t twb2916[971*2*4]; -void dft2916(int16_t *x,int16_t *y,unsigned char scale_flag) // 972 x 3 +void dft2916(int16_t *x,int16_t *y,unsigned int *scale_flag) // 972 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6750,9 +7093,9 @@ void dft2916(int16_t *x,int16_t *y,unsigned char scale_flag) // 972 x 3 x2128[i+1944] = x128[j+2]; } - dft972((int16_t *)x2128,(int16_t *)ytmp128,1); - dft972((int16_t 
*)(x2128+972),(int16_t *)(ytmp128+972),1); - dft972((int16_t *)(x2128+1944),(int16_t *)(ytmp128+1944),1); + dft972((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft972((int16_t *)(x2128+972),(int16_t *)(ytmp128+972),scale_flag); + dft972((int16_t *)(x2128+1944),(int16_t *)(ytmp128+1944),scale_flag); bfly3_tw1(ytmp128,ytmp128+972,ytmp128+1944,y128,y128+972,y128+1944); @@ -6767,7 +7110,7 @@ void dft2916(int16_t *x,int16_t *y,unsigned char scale_flag) // 972 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<2916; i++) { @@ -6785,7 +7128,7 @@ static int16_t twb3000[599*8]__attribute__((aligned(32))); static int16_t twc3000[599*8]__attribute__((aligned(32))); static int16_t twd3000[599*8]__attribute__((aligned(32))); -void dft3000(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 * 5 +void dft3000(int16_t *x,int16_t *y,unsigned int *scale_flag) // 600 * 5 { int i,j; @@ -6808,11 +7151,11 @@ void dft3000(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 * 5 x2128[i+2400] = x128[j+4]; } - dft600((int16_t *)x2128,(int16_t *)ytmp128,1); - dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1); - dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),1); - dft600((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),1); - dft600((int16_t *)(x2128+2400),(int16_t *)(ytmp128+2400),1); + dft600((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag); + dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),scale_flag); + dft600((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),scale_flag); + dft600((int16_t *)(x2128+2400),(int16_t *)(ytmp128+2400),scale_flag); bfly5_tw1(ytmp128,ytmp128+600,ytmp128+1200,ytmp128+1800,ytmp128+2400,y128,y128+600,y128+1200,y128+1800,y128+2400); @@ -6833,7 +7176,7 @@ void dft3000(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 * 5 twd128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = 
set1_int16(dft_norm_table[15]); for (i=0; i<3000; i++) { @@ -6849,7 +7192,7 @@ void dft3000(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 * 5 static int16_t twa3240[1079*2*4]; static int16_t twb3240[1079*2*4]; -void dft3240(int16_t *x,int16_t *y,unsigned char scale_flag) // 1080 x 3 +void dft3240(int16_t *x,int16_t *y,unsigned int *scale_flag) // 1080 x 3 { int i,j; simd_q15_t *x128=(simd_q15_t *)x; @@ -6867,9 +7210,9 @@ void dft3240(int16_t *x,int16_t *y,unsigned char scale_flag) // 1080 x 3 x2128[i+2160] = x128[j+2]; } - dft1080((int16_t *)x2128,(int16_t *)ytmp128,1); - dft1080((int16_t *)(x2128+1080),(int16_t *)(ytmp128+1080),1); - dft1080((int16_t *)(x2128+2160),(int16_t *)(ytmp128+2160),1); + dft1080((int16_t *)x2128,(int16_t *)ytmp128,scale_flag); + dft1080((int16_t *)(x2128+1080),(int16_t *)(ytmp128+1080),scale_flag); + dft1080((int16_t *)(x2128+2160),(int16_t *)(ytmp128+2160),scale_flag); bfly3_tw1(ytmp128,ytmp128+1080,ytmp128+2160,y128,y128+1080,y128+2160); @@ -6884,7 +7227,7 @@ void dft3240(int16_t *x,int16_t *y,unsigned char scale_flag) // 1080 x 3 twb128+j); } - if (scale_flag==1) { + if (scale_flag) { norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<3240; i++) { @@ -7094,7 +7437,7 @@ int dfts_autoinit(void) #ifndef MR_MAIN -void dft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsigned char scale_flag) +void dft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsigned int *scale_flag) { AssertFatal((sizeidx >= 0 && sizeidx<DFT_SIZE_IDXTABLESIZE),"Invalid dft size index %i\n",sizeidx); int algn=0xF; @@ -7111,7 +7454,7 @@ void dft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsign dft_ftab[sizeidx].func(input,output,scale_flag); }; -void idft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsigned char scale_flag) +void idft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsigned int *scale_flag) { AssertFatal((sizeidx>=0 && 
sizeidx<DFT_SIZE_IDXTABLESIZE),"Invalid idft size index %i\n",sizeidx); int algn=0xF; @@ -7133,9 +7476,23 @@ void idft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsig #ifdef MR_MAIN #include <string.h> #include <stdio.h> -/* -#define LOG_M write_output -int write_output(const char *fname,const char *vname,void *data,int length,int dec,char format) +#include "../../../common/config/config_paramdesc.h" + +struct configmodule_interface_s *uniqCfg = NULL; +extern int bitrev4096[4096],bitrev2048[2048],bitrev1024[1024],bitrev512[512],bitrev256[256],bitrev128[128]; +void init_bitrev(); +void radix2(cd_t *x, int N); +void normalize(cd_t *x,cd_t *y,int *bitrev, int N); + +void exit_function(const char *file, const char *function, const int line, const char *s, const int assert) { +exit(-1); +} +int config_get(paramdef_t *params,int numparams, char *prefix) { +return(0); +} + +//#define LOG_M write_output +int write_file_matlab(const char *fname,const char *vname,void *data,int length,int dec,unsigned int format,int dummy) { FILE *fp=NULL; @@ -7289,24 +7646,43 @@ int write_output(const char *fname,const char *vname,void *data,int length,int d return 0; } -*/ -#include "common/config/config_paramdesc.h" -void exit_function(const char *file, const char *function, const int line, const char *s, const int assert) { return; } -int oai_exit=0; -int config_get(paramdef_t *params,int numparams, char *prefix) { return;} -int config_check_unknown_cmdlineopt(char *prefix) { return; } +double compute_error(int16_t *x, int16_t *y, int N, int *bitrev, int idft) { + + int i; + cd_t xcd[N],ycd[N]; + + double error=0; + + for (i=0;i<N;i++) { + xcd[i].r = (double)(((int16_t *)x)[i<<1]); + xcd[i].i = (double)(((int16_t *)x)[1+(i<<1)]); + if (idft==1) xcd[i].i=-xcd[i].i; + } + + double input_lev=0; + for (i=0;i<N;i++) input_lev += pow(xcd[i].r,2.0) + pow(xcd[i].i,2.0); + input_lev/=N; + radix2(xcd,N); + normalize(xcd,ycd,bitrev,N); + if (idft==0) for (i=0;i<N;i++) error 
+= pow((ycd[i].r - (double)((int16_t*)y)[i<<1]),2.0) + pow(ycd[i].i-(double)((int16_t*)y)[1+(i<<1)],2.0); + else for (i=0;i<N;i++) error += pow((ycd[i].r - (double)((int16_t*)y)[i<<1]),2.0) + pow(ycd[i].i+(double)((int16_t*)y)[1+(i<<1)],2.0); + return(input_lev/(error/N)); +} int main(int argc, char**argv) { time_stats_t ts; - simd_q15_t x[32768],y[32768],tw0,tw1,tw2,tw3; + simde__m256i x[16384],x2[16384],y[16384],tw0,tw1,tw2,tw3; int i; - simd_q15_t *x128=(simd_q15_t*)x,*y128=(simd_q15_t*)y; + + double sqnr; dfts_autoinit(); + init_bitrev(); + set_taus_seed(0); cpu_meas_enabled = 1; /* @@ -7347,8 +7723,8 @@ int main(int argc, char**argv) ((int16_t *)&tw3)[7] = 0; */ for (i = 0; i < 300; i++) { - x[i] = (int16x8_t)vdupq_n_s32(taus()); - x[i] = vshrq_n_s16(x[i], 4); + x[i] = simde_mm256_set1_epi32(taus()); + x[i] = simde_mm256_srai_epi16(x[i], 4); } /* bfly2_tw1(x,x+1,y,y+1); @@ -7491,18 +7867,6 @@ int main(int argc, char**argv) ((int16_t*)x)[6+(i<<1)] = 0; ((int16_t*)x)[7+(i<<1)] = -1024; } - /* - for (i=0; i<2048; i+=2) { - ((int16_t*)x)[i<<1] = 1024; - ((int16_t*)x)[1+(i<<1)] = 0; - ((int16_t*)x)[2+(i<<1)] = -1024; - ((int16_t*)x)[3+(i<<1)] = 0; - } - - for (i=0;i<2048*2;i++) { - ((int16_t*)x)[i] = i/2;//(int16_t)((taus()&0xffff))>>5; - } - */ memset((void*)&x[0],0,64*sizeof(int32_t)); for (i=2;i<36;i++) { if ((taus() & 1)==0) @@ -7516,7 +7880,8 @@ int main(int argc, char**argv) else ((int16_t*)x)[i] = -364; } - idft64((int16_t *)x,(int16_t *)y,1); + uint32_t scale64 = 3; + idft64((int16_t *)x,(int16_t *)y,&scale64); printf("64-point\n"); @@ -7533,14 +7898,14 @@ int main(int argc, char**argv) - idft64((int16_t *)x,(int16_t *)y,1); - idft64((int16_t *)x,(int16_t *)y,1); - idft64((int16_t *)x,(int16_t *)y,1); + idft64((int16_t *)x,(int16_t *)y,&scale64); + idft64((int16_t *)x,(int16_t *)y,&scale64); + idft64((int16_t *)x,(int16_t *)y,&scale64); reset_meas(&ts); for (i=0; i<10000000; i++) { start_meas(&ts); - idft64((int16_t *)x,(int16_t *)y,1); + idft64((int16_t 
*)x,(int16_t *)y,&scale64); stop_meas(&ts); } @@ -7585,12 +7950,16 @@ int main(int argc, char**argv) } reset_meas(&ts); + uint32_t scale128_tx[2] = {4,0}; for (i=0; i<10000; i++) { start_meas(&ts); - idft128((int16_t *)x,(int16_t *)y,1); + idft128((int16_t *)x,(int16_t *)y,scale128_tx); stop_meas(&ts); } + sqnr = compute_error((int16_t*)x,(int16_t*)y,128,bitrev128,1); + + printf("128 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr)); printf("\n\n128-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); LOG_M("y128.m","y128",y,128,1,1); LOG_M("x128.m","x128",x,128,1,1); @@ -7626,10 +7995,11 @@ int main(int argc, char**argv) ((int16_t*)x)[i] = -364; } reset_meas(&ts); + uint32_t scale256_tx[3]={4,0}; for (i=0; i<10000; i++) { start_meas(&ts); - idft256((int16_t *)x,(int16_t *)y,1); + idft256((int16_t *)x,(int16_t *)y,scale256_tx); stop_meas(&ts); } @@ -7637,6 +8007,9 @@ int main(int argc, char**argv) LOG_M("y256.m","y256",y,256,1,1); LOG_M("x256.m","x256",x,256,1,1); + sqnr = compute_error((int16_t*)x,(int16_t*)y,256,bitrev256,1); + + printf("256 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr)); memset((void*)&x[0],0,512*sizeof(int32_t)); for (i=2;i<302;i++) { if ((taus() & 1)==0) @@ -7652,19 +8025,21 @@ int main(int argc, char**argv) } reset_meas(&ts); + uint32_t scale512_tx[4]={4,1,0}; + for (i=0; i<10000; i++) { start_meas(&ts); - idft512((int16_t *)x,(int16_t *)y,1); + idft512((int16_t *)x,(int16_t *)y,scale512_tx); stop_meas(&ts); } printf("\n\n512-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); LOG_M("y512.m","y512",y,512,1,1); LOG_M("x512.m","x512",x,512,1,1); - dft512((int16_t*)y,(int16_t*)x,1); - LOG_M("y512_dft.m","y512",y,512,1,1); - LOG_M("x512_dft.m","x512",x,512,1,1); + sqnr = compute_error((int16_t*)x,(int16_t*)y,512,bitrev512,1); + printf("512 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr)); + memset((void*)x,0,1024*sizeof(int32_t)); /* printf("X: "); for (i=0;i<64;i++) @@ -7691,9 +8066,10 @@ int main(int argc, char**argv) } 
reset_meas(&ts); + uint32_t scale1024_tx[4]={4,1,0}; for (i=0; i<10000; i++) { start_meas(&ts); - idft1024((int16_t *)x,(int16_t *)y,1); + idft1024((int16_t *)x,(int16_t *)y,scale1024_tx); stop_meas(&ts); } @@ -7701,6 +8077,9 @@ int main(int argc, char**argv) LOG_M("y1024.m","y1024",y,1024,1,1); LOG_M("x1024.m","x1024",x,1024,1,1); + sqnr = compute_error((int16_t*)x,(int16_t*)y,1024,bitrev1024,1); + + printf("1024 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr)); memset((void*)x,0,1536*sizeof(int32_t)); for (i=2;i<1202;i++) { @@ -7717,15 +8096,16 @@ int main(int argc, char**argv) } reset_meas(&ts); + uint32_t scale1536[4]={1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - idft1536((int16_t *)x,(int16_t *)y,1); + idft1536((int16_t *)x,(int16_t *)y,scale1536); stop_meas(&ts); } printf("\n\n1536-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); - write_output("y1536.m","y1536",y,1536,1,1); - write_output("x1536.m","x1536",x,1536,1,1); + LOG_M("y1536.m","y1536",y,1536,1,1); + LOG_M("x1536.m","x1536",x,1536,1,1); memset((void*)x,0,2048*sizeof(int32_t)); @@ -7743,9 +8123,10 @@ int main(int argc, char**argv) } reset_meas(&ts); + uint32_t scale2048_tx[4]={3,2,1,0}; for (i=0; i<10000; i++) { start_meas(&ts); - dft2048((int16_t *)x,(int16_t *)y,1); + idft2048((int16_t *)x,(int16_t *)y,scale2048_tx); stop_meas(&ts); } @@ -7753,6 +8134,9 @@ int main(int argc, char**argv) LOG_M("y2048.m","y2048",y,2048,1,1); LOG_M("x2048.m","x2048",x,2048,1,1); + sqnr = compute_error((int16_t*)x,(int16_t*)y,2048,bitrev2048,1); + + printf("2048 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr)); // NR 80Mhz, 217 PRB, 3/4 sampling memset((void*)x, 0, 3072*sizeof(int32_t)); for (i=2;i<2506;i++) { @@ -7770,15 +8154,16 @@ int main(int argc, char**argv) reset_meas(&ts); + uint32_t scale3072[4]={1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - idft3072((int16_t *)x,(int16_t *)y,1); + idft3072((int16_t *)x,(int16_t *)y,scale3072); stop_meas(&ts); } printf("\n\n3072-point(%f 
cycles)\n",(double)ts.diff/(double)ts.trials); - write_output("y3072.m","y3072",y,3072,1,1); - write_output("x3072.m","x3072",x,3072,1,1); + LOG_M("y3072.m","y3072",y,3072,1,1); + LOG_M("x3072.m","x3072",x,3072,1,1); memset((void*)x,0,4096*sizeof(int32_t)); @@ -7796,9 +8181,10 @@ int main(int argc, char**argv) } reset_meas(&ts); + uint32_t scale4096_tx[4]={3,2,1,0}; for (i=0; i<10000; i++) { start_meas(&ts); - idft4096((int16_t *)x,(int16_t *)y,1); + idft4096((int16_t *)x,(int16_t *)y,scale4096_tx); stop_meas(&ts); } @@ -7806,9 +8192,29 @@ int main(int argc, char**argv) LOG_M("y4096.m","y4096",y,4096,1,1); LOG_M("x4096.m","x4096",x,4096,1,1); - dft4096((int16_t *)y,(int16_t *)x,1); - LOG_M("x4096_2.m","x4096_2",x,4096,1,1); + sqnr = compute_error((int16_t*)x,(int16_t*)y,4096,bitrev4096,1); + + printf("4096 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr)); + + float sqrt2 = 0.70711; + float sqrt170 = 0.076696; + + for (i=0;i<2400;i++) { + uint32_t n=taus(); + ((int16_t*)x)[i] = (short)((1-2*(n&1))*(8-(1-2*((n>>1)&1))*(4-(1-2*((n>>2)&1))*(2-(1-2*((n>>3)&1))))))*512*sqrt170*sqrt2; + } + for (i=2*(4096-1200);i<8192;i++) { + uint32_t n=taus(); + ((int16_t*)x)[i] = (short)((1-2*(n&1))*(8-(1-2*((n>>1)&1))*(4-(1-2*((n>>2)&1))*(2-(1-2*((n>>3)&1))))))*512*sqrt170*sqrt2; + } + + uint32_t scale4096_tx256qam[4]={3,2,1,0}; + idft4096((int16_t *)x,(int16_t *)y,scale4096_tx256qam); + LOG_M("y4096_256qam.m","y4096_256qam",y,4096,1,1); + LOG_M("x4096_256qam.m","x4096_256qam",x,4096,1,1); + sqnr = compute_error((int16_t*)x,(int16_t*)y,4096,bitrev4096,1); + printf("4096 point IDFT SQNR (256QAM) : %f dB\n",10*log10(sqnr)); // NR 160Mhz, 434 PRB, 3/4 sampling memset((void*)x, 0, 6144*sizeof(int32_t)); for (i=2;i<5010;i++) { @@ -7826,15 +8232,16 @@ int main(int argc, char**argv) reset_meas(&ts); + uint32_t scale6144[5]={1,1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - idft6144((int16_t *)x,(int16_t *)y,1); + idft6144((int16_t *)x,(int16_t *)y,scale6144); stop_meas(&ts); } 
printf("\n\n6144-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); - write_output("y6144.m","y6144",y,6144,1,1); - write_output("x6144.m","x6144",x,6144,1,1); + LOG_M("y6144.m","y6144",y,6144,1,1); + LOG_M("x6144.m","x6144",x,6144,1,1); memset((void*)x,0,8192*sizeof(int32_t)); for (i=2;i<4802;i++) { @@ -7850,9 +8257,10 @@ int main(int argc, char**argv) ((int16_t*)x)[i] = -364; } reset_meas(&ts); + uint32_t scale8192[5]={1,1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - idft8192((int16_t *)x,(int16_t *)y,1); + idft8192((int16_t *)x,(int16_t *)y,scale8192); stop_meas(&ts); } @@ -7874,9 +8282,10 @@ int main(int argc, char**argv) ((int16_t*)x)[i] = -364; } reset_meas(&ts); + uint32_t scale16384[5]={1,1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - dft16384((int16_t *)x,(int16_t *)y,1); + dft16384((int16_t *)x,(int16_t *)y,scale16384); stop_meas(&ts); } @@ -7884,82 +8293,6 @@ int main(int argc, char**argv) LOG_M("y16384.m","y16384",y,16384,1,1); LOG_M("x16384.m","x16384",x,16384,1,1); - memset((void*)x,0,1536*sizeof(int32_t)); - for (i=2;i<1202;i++) { - if ((taus() & 1)==0) - ((int16_t*)x)[i] = 364; - else - ((int16_t*)x)[i] = -364; - } - for (i=2*(1536-600);i<3072;i++) { - if ((taus() & 1)==0) - ((int16_t*)x)[i] = 364; - else - ((int16_t*)x)[i] = -364; - } - reset_meas(&ts); - for (i=0; i<10000; i++) { - start_meas(&ts); - idft1536((int16_t *)x,(int16_t *)y,1); - stop_meas(&ts); - } - - printf("\n\n1536-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); - LOG_M("y1536.m","y1536",y,1536,1,1); - LOG_M("x1536.m","x1536",x,1536,1,1); - - printf("\n\n1536-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); - LOG_M("y8192.m","y8192",y,8192,1,1); - LOG_M("x8192.m","x8192",x,8192,1,1); - - memset((void*)x,0,3072*sizeof(int32_t)); - for (i=2;i<1202;i++) { - if ((taus() & 1)==0) - ((int16_t*)x)[i] = 364; - else - ((int16_t*)x)[i] = -364; - } - for (i=2*(3072-600);i<3072;i++) { - if ((taus() & 1)==0) - ((int16_t*)x)[i] = 364; - else - ((int16_t*)x)[i] 
= -364; - } - reset_meas(&ts); - for (i=0; i<10000; i++) { - start_meas(&ts); - idft3072((int16_t *)x,(int16_t *)y,1); - stop_meas(&ts); - } - - printf("\n\n3072-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); - LOG_M("y3072.m","y3072",y,3072,1,1); - LOG_M("x3072.m","x3072",x,3072,1,1); - - memset((void*)x,0,6144*sizeof(int32_t)); - for (i=2;i<4802;i++) { - if ((taus() & 1)==0) - ((int16_t*)x)[i] = 364; - else - ((int16_t*)x)[i] = -364; - } - for (i=2*(6144-2400);i<12288;i++) { - if ((taus() & 1)==0) - ((int16_t*)x)[i] = 364; - else - ((int16_t*)x)[i] = -364; - } - reset_meas(&ts); - for (i=0; i<10000; i++) { - start_meas(&ts); - idft6144((int16_t *)x,(int16_t *)y,1); - stop_meas(&ts); - } - - printf("\n\n6144-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); - LOG_M("y6144.m","y6144",y,6144,1,1); - LOG_M("x6144.m","x6144",x,6144,1,1); - memset((void*)x,0,12288*sizeof(int32_t)); for (i=2;i<9602;i++) { if ((taus() & 1)==0) @@ -7974,9 +8307,10 @@ int main(int argc, char**argv) ((int16_t*)x)[i] = -364; } reset_meas(&ts); + uint32_t scale12288[5]={1,1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - idft12288((int16_t *)x,(int16_t *)y,1); + idft12288((int16_t *)x,(int16_t *)y,scale12288); stop_meas(&ts); } @@ -7998,9 +8332,11 @@ int main(int argc, char**argv) ((int16_t*)x)[i] = -364; } reset_meas(&ts); + + uint32_t scale18432[6]={1,1,1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - idft18432((int16_t *)x,(int16_t *)y,1); + idft18432((int16_t *)x,(int16_t *)y,scale18432); stop_meas(&ts); } @@ -8022,9 +8358,11 @@ int main(int argc, char**argv) ((int16_t*)x)[i] = -364; } reset_meas(&ts); + + uint32_t scale24576[6]={1,1,1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - idft24576((int16_t *)x,(int16_t *)y,1); + idft24576((int16_t *)x,(int16_t *)y,scale24576); stop_meas(&ts); } @@ -8047,9 +8385,10 @@ int main(int argc, char**argv) ((int16_t*)x)[i] = -364; } reset_meas(&ts); + uint32_t scale36864[6] = {1,1,1,1,1,3}; for (i=0; i<10000; i++) { 
start_meas(&ts); - dft36864((int16_t *)x,(int16_t *)y,1); + dft36864((int16_t *)x,(int16_t *)y,scale36864); stop_meas(&ts); } @@ -8072,50 +8411,138 @@ int main(int argc, char**argv) ((int16_t*)x)[i] = -364; } reset_meas(&ts); + uint32_t scale49152[6]={1,1,1,1,1,3}; for (i=0; i<10000; i++) { start_meas(&ts); - idft49152((int16_t *)x,(int16_t *)y,1); + idft49152((int16_t *)x,(int16_t *)y,scale49152); stop_meas(&ts); } printf("\n\n49152-point(%f cycles)\n",(double)ts.diff/(double)ts.trials); LOG_M("y49152.m","y49152",y,49152,1,1); LOG_M("x49152.m","x49152",x,49152,1,1); - /* - int dftsizes[33]={24,36,48,60,72,96,108,120,144,180,192,216,240,288,300,324,360,384,432,480,540,576,600,648,720,768,864,900,960,972,1080,1152,1200}; - void (*dft)(int16_t *x,int16_t *y,uint8_t scale)[33] = {dft24,dft36,dft48,dft60,dft72,dft96,dft108,dft120,dft144,dft180,dft192,dft216,dft240,dft288,dft300,dft324,dft360,dft384,dft432,dft480,dft540,dft576,dft600,dft648,dft720,dft768,dft864,dft900,dft960,dft972,dft1080,dft1152,dft1200}; - for (int n=0;n<33;n++) { - // 4xN-point DFT - memset((void*)x,0,dftsizes[n]*8*sizeof(int16_t)); - for (i=0;i<dftsizes[n]*8;i+=8) { - if ((taus() & 1)==0) - ((int16_t*)x)[i] = 364; - else - ((int16_t*)x)[i] = -364; - if ((taus() & 1)==0) - ((int16_t*)x)[i+1] = 364; - else - ((int16_t*)x)[i+1] = -364; - } - - reset_meas(&ts); - for (i=0; i<10000; i++) { - start_meas(&ts); - (dft[n])((int16_t *)x,(int16_t *)y,1); - stop_meas(&ts); - } - - printf("\n\n4x%d-point(%f cycles)\n",dftsizes[n],(double)ts.diff/(double)ts.trials); - char ystr[5],xstr[5],ystr2[5],xstr2[5]; - sprintf(ystr,"y%d.m",dftsizes[n]); - sprintf(xstr,"x%d.m",dftsizes[n]); - sprintf(ystr2,"y%d",dftsizes[n]); - sprintf(xstr2,"x%d",dftsizes[n]); - LOG_M(ystr,ystr2,y,dftsizes[n]*4,1,1); - LOG_M(xstr,xstr2,x,dftsizes[n]*4,1,1); - } - */ + memset((void*)x,0,128*sizeof(int32_t)); + for (i=0;i<128;i++) { + ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/128)); + ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * 
sin(2*M_PI*3*i/128)); + } +#ifdef USE_DFT16_SHIFT + uint32_t scale128_rx[3]={2,0}; +#else + uint32_t scale128_rx[3]={2,2}; +#endif + dft128((int16_t*)x,(int16_t*)y,scale128_rx); + LOG_M("x128_exp.m","x128_exp",x,128,1,1); + LOG_M("y128_exp.m","y128_exp",y,128,1,1); + sqnr = compute_error((int16_t*)x,(int16_t*)y,128,bitrev128,0); + + printf("128 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr)); + + memset((void*)x,0,256*sizeof(int32_t)); + for (i=0;i<256;i++) { + ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/256)); + ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/256)); + } +#ifdef USE_DFT16_SHIFT + uint32_t scale256_rx[3]={2,0}; +#else + uint32_t scale256_rx[3]={2,2}; +#endif + dft256((int16_t*)x,(int16_t*)y,scale256_rx); + LOG_M("x256_exp.m","x256_exp",x,256,1,1); + LOG_M("y256_exp.m","y256_exp",y,256,1,1); + sqnr = compute_error((int16_t*)x,(int16_t*)y,256,bitrev256,0); + + printf("256 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr)); + + + memset((void*)x,0,512*sizeof(int32_t)); + for (i=0;i<512;i++) { + ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/512)); + ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/512)); + } +#ifdef USE_DFT16_SHIFT + uint32_t scale512_rx[3]={1,2,0}; +#else + uint32_t scale512_rx[3]={1,2,2}; +#endif + dft512((int16_t*)x,(int16_t*)y,scale512_rx); + LOG_M("x512_exp.m","x512_exp",x,512,1,1); + LOG_M("y512_exp.m","y512_exp",y,512,1,1); + + sqnr = compute_error((int16_t*)x,(int16_t*)y,512,bitrev512,0); + + printf("512 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr)); + memset((void*)x,0,1024*sizeof(int32_t)); +#ifdef USE_DFT16_SHIFT + uint32_t scale1024_rx[3]={1,2,0}; +#else + uint32_t scale1024_rx[3]={1,2,2}; +#endif + for (i=0;i<1024;i++) { + ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/1024)); + ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/1024)); + } + dft1024((int16_t*)x,(int16_t*)y,scale1024_rx); + LOG_M("x1024_exp.m","x1024_exp",x,1024,1,1); + 
LOG_M("y1024_exp.m","y1024_exp",y,1024,1,1); + + sqnr = compute_error((int16_t*)x,(int16_t*)y,1024,bitrev1024,0); + + printf("1024 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr)); + memset((void*)x,0,1536*sizeof(int32_t)); + for (i=0;i<1536;i++) { + ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/1536)); + ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/1536)); + } + dft1536((int16_t*)x,(int16_t*)y,scale1536); + LOG_M("x1536_exp.m","x1536_exp",x,1536,1,1); + LOG_M("y1536_exp.m","y1536_exp",y,1536,1,1); + + memset((void*)x,0,2048*sizeof(int32_t)); + for (i=0;i<2048;i++) { + ((int16_t*)x)[i<<1] = (int16_t)(384 * cos(2*M_PI*3*i/2048)); + ((int16_t*)x)[1+(i<<1)] = (int16_t)(384 * sin(2*M_PI*3*i/2048)); + } +#ifdef USE_DFT16_SHIFT + uint32_t scale2048_rx[4]={1,0,3,0}; +#else + uint32_t scale2048_rx[4]={1,0,3,2}; +#endif + + dft2048((int16_t*)x,(int16_t*)y,scale2048_rx); + LOG_M("x2048_exp.m","x2048_exp",x,2048,1,1); + LOG_M("y2048_exp.m","y2048_exp",y,2048,1,1); + + sqnr = compute_error((int16_t*)x,(int16_t*)y,2048,bitrev2048,0); + + printf("2048 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr)); + memset((void*)x,0,3072*sizeof(int32_t)); + for (i=0;i<3072;i++) { + ((int16_t*)x)[i<<1] = (int16_t)(200 * cos(2*M_PI*3*i/3072)); + ((int16_t*)x)[1+(i<<1)] = (int16_t)(200 * sin(2*M_PI*3*i/3072)); + } + dft3072((int16_t*)x,(int16_t*)y,scale3072); + LOG_M("x3072_exp.m","x3072_exp",x,3072,1,1); + LOG_M("y3072_exp.m","y3072_exp",y,3072,1,1); + + memset((void*)x,0,4096*sizeof(int32_t)); + for (i=0;i<4096;i++) { + ((int16_t*)x)[i<<1] = (int16_t)(384 * cos(2*M_PI*331*i/4096)); + ((int16_t*)x)[1+(i<<1)] = (int16_t)(384 * sin(2*M_PI*331*i/4096)); + } +#ifndef USE_DFT16_SHIFT + uint32_t scale4096_rx[4]={0,0,3,3}; +#else + uint32_t scale4096_rx[4]={0,0,3,1}; +#endif + dft4096((int16_t*)x,(int16_t*)y,scale4096_rx); + LOG_M("x4096_exp.m","x4096_exp",x,4096,1,1); + LOG_M("y4096_exp.m","y4096_exp",y,4096,1,1); + + sqnr = 
compute_error((int16_t*)x,(int16_t*)y,4096,bitrev4096,0); + printf("4096 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr)); return(0); } diff --git a/openair1/PHY/TOOLS/tests/test_dft.c b/openair1/PHY/TOOLS/tests/test_dft.c index 8b4c344c6c153e9ea2f2edc6ddb223d8aafc701d..19f2e734f66b539a4b3e0cce6bc07ede101e1e41 100644 --- a/openair1/PHY/TOOLS/tests/test_dft.c +++ b/openair1/PHY/TOOLS/tests/test_dft.c @@ -15,6 +15,7 @@ SZ_DEF(1024) \ SZ_DEF(1536) \ SZ_DEF(2048) \ + SZ_DEF(3072) \ SZ_DEF(4096) \ SZ_DEF(6144) \ SZ_DEF(8192) \ @@ -44,83 +45,148 @@ bool error(c16_t v16, cd_t vd, double percent) return false; } -void math_dft(cd_t *in, cd_t *out, int len) +void math_dft(c16_t *in, cd_t *out, int len,int dir,int norm) { for (int k = 0; k < len; k++) { cd_t tmp = {0}; // wrote this way to help gcc to generate SIMD double phi[len], sint[len], cost[len]; for (int n = 0; n < len; n++) - phi[n] = -2 * M_PI * ((double)k / len) * n; + if (dir ==0) phi[n] = -2 * M_PI * ((double)k / len) * n; + else phi[n] = 2* M_PI * ((double)k/len)*n; for (int n = 0; n < len; n++) sint[n] = sin(phi[n]); for (int n = 0; n < len; n++) cost[n] = cos(phi[n]); for (int n = 0; n < len; n++) { cd_t coeff = {.r = cost[n], .i = sint[n]}; - cd_t component = cdMul(coeff, in[n]); + cd_t in16q = {.r = (double)in[n].r, .i = (double)in[n].i}; + cd_t component = cdMul(coeff, in16q); tmp.r += component.r; tmp.i += component.i; } - out[k].r = tmp.r / sqrt(len); - out[k].i = tmp.i / sqrt(len); + out[k].r = tmp.r / ((norm==0) ? 1.0 : sqrt(len)); + out[k].i = tmp.i / ((norm==0) ? 
1.0 : sqrt(len)); } } +void fill_qam(int n, cd_t *x, int mod) { + int size; + if (mod < 0 || mod >1) { + printf("Illegal modulation %d\n",mod); + exit(-1); + } + double sqrt170 = 1.0/sqrt(170); + memset((void*)&x[0],0,n*sizeof(cd_t)); + switch (n) { + case 128: size=72; break; + case 256: size=180; break; + case 512: size=300; break; + case 768: size=612; break; + case 1024: size=612; break; + case 1536: size=900; break; + case 2048: size=1596; break; + case 3072: size=2556; break; + case 4096: size=3276; break; + default: printf("Illegal FFT length %d\n",n); exit(-1);; + } + for (int i=0;i<size/2;i++) { + if (mod==0) { + int rv=taus()&1; + x[i].r = (1/sqrt(2.0)) * ((rv<<1) - 1); + rv=taus()&1; + x[i].i = (1/sqrt(2.0)) * ((rv<<1) - 1); + } + else { + int rvi=taus()&15; + int rvq=taus()&15; + x[i].r = ((1-2*(rvi&1))*(8-(1-2*((rvi>>1)&1))*(4-(1-2*((rvi>>2)&1))*(2-(1-2*((rvi>>3)&1))))))*sqrt170; + x[i].i = ((1-2*(rvq&1))*(8-(1-2*((rvq>>1)&1))*(4-(1-2*((rvq>>2)&1))*(2-(1-2*((rvq>>3)&1))))))*sqrt170; + } + } + for (int i=n-(size/2);i<n;i++) { + if (mod==0) { + int rv=taus()&1; + x[i].r = (1/sqrt(2.0)) * ((rv<<1) - 1); + rv=taus()&1; + x[i].i = (1/sqrt(2.0)) * ((rv<<1) - 1); + } + else { + int rvi=taus()&15; + int rvq=taus()&15; + x[i].r = ((1-2*(rvi&1))*(8-(1-2*((rvi>>1)&1))*(4-(1-2*((rvi>>2)&1))*(2-(1-2*((rvi>>3)&1))))))*sqrt170; + x[i].i = ((1-2*(rvq&1))*(8-(1-2*((rvq>>1)&1))*(4-(1-2*((rvq>>2)&1))*(2-(1-2*((rvq>>3)&1))))))*sqrt170; + } + } +} + + int main(void) { int ret = 0; load_dftslib(); - c16_t *d16 = malloc16(12 * dftFtab[sizeofArray(dftFtab) - 1].size * sizeof(*d16)); + c16_t *d16 = malloc16(12 * dftFtab[sizeofArray(dftFtab) - 1].size * sizeof(*d16)); + c16_t *d16_2 = malloc16(12 * sizeof(*d16_2)); c16_t *o16 = malloc16(12 * dftFtab[sizeofArray(dftFtab) - 1].size * sizeof(*d16)); + set_taus_seed(0); for (int sz = 0; sz < sizeofArray(dftFtab); sz++) { const int n = dftFtab[sz].size; cd_t data[n]; - double coeffs[] = {0.25, 0.5, 1, 1.5, 2, 2.5, 3}; + double 
coeffs[] = {30,40,50,60,70}; + printf("Testing size %d\n",n); cd_t out[n]; for (int i = 0; i < n; i++) { data[i].r = gaussZiggurat(0, 1.0); // gaussZiggurat not used paramters, to fix data[i].i = gaussZiggurat(0, 1.0); } - math_dft(data, out, n); double evm[sizeofArray(coeffs)] = {0}; + double sqnr[sizeofArray(coeffs)] = {0}; double samples[sizeofArray(coeffs)] = {0}; + double samples_out[sizeofArray(coeffs)] = {0}; for (int coeff = 0; coeff < sizeofArray(coeffs); coeff++) { - double expand = coeffs[coeff] * SHRT_MAX / sqrt(n); + double expand = pow(10.0,.05*coeffs[coeff])/sqrt(2); if (n == 12) { - for (int i = 0; i < n; i++) + for (int i = 0; i < n; i++) { for (int j = 0; j < 4; j++) { d16[i * 4 + j].r = expand * data[i].r; d16[i * 4 + j].i = expand * data[i].i; } + d16_2[i].r = d16[i * 4 ].r; + d16_2[i].i = d16[i * 4 ].i; + } } else { for (int i = 0; i < n; i++) { d16[i].r = expand * data[i].r; d16[i].i = expand * data[i].i; } } - dft(get_dft(n), (int16_t *)d16, (int16_t *)o16, 1); + if (n==12) math_dft(d16_2,out,n,0,0); + else math_dft(d16, out, n,0,1); + dft(get_dft(n), (int16_t *)d16, (int16_t *)o16,get_dft_scaling(n,(int32_t)(coeffs[coeff]))); if (n == 12) { for (int i = 0; i < n; i++) { - cd_t error = {.r = o16[i * 4].r / (expand * sqrt(n)) - out[i].r, .i = o16[i * 4].i / (expand * sqrt(n)) - out[i].i}; + cd_t error = {.r = o16[i * 4].r - out[i].r, .i = o16[i * 4].i - out[i].i}; + sqnr[coeff] += squaredMod(error); evm[coeff] += sqrt(squaredMod(error)) / sqrt(squaredMod(out[i])); - samples[coeff] += sqrt(squaredMod(d16[i])); + samples_out[coeff] += (squaredMod(out[i])/n); + samples[coeff] += squaredMod(d16_2[i]); } } else { for (int i = 0; i < n; i++) { - cd_t error = {.r = o16[i].r / expand - out[i].r, .i = o16[i].i / expand - out[i].i}; + cd_t error = {.r = o16[i].r - out[i].r , .i = o16[i].i - out[i].i}; evm[coeff] += sqrt(squaredMod(error)) / sqrt(squaredMod(out[i])); - samples[coeff] += sqrt(squaredMod(d16[i])); - /* - if (error(o16[i], out[i], 5)) - 
printf("Error in dft %d at %d, (%d, %d) != %f, %f)\n", n, i, o16[i].r, o16[i].i, gslout[i].r, gslout[i].i); - */ + double error_dB = 10*log10(squaredMod(error)); + if (coeffs[coeff] == 50 && n==4096 && error_dB >= 10) printf("error in DFT pos %d : in %f dB %f dB \n",i,coeffs[coeff],error_dB); + sqnr[coeff] += squaredMod(error); + samples[coeff] += squaredMod(d16[i]); + samples_out[coeff] += squaredMod(out[i]); } } + sqnr[coeff] = samples_out[coeff] / sqnr[coeff]; } - printf("done DFT size %d (evm (%%), avg samples amplitude) = ", n); + printf("done DFT size %d (evm (%%), SQNRdB, avg in samples amplitude, avg out samples amplitude) = ", n); for (int coeff = 0; coeff < sizeofArray(coeffs); coeff++) - printf("(%.2f, %.0f) ", (evm[coeff] / n) * 100, samples[coeff] / n); + printf("input_lev %f (%.2f, %f, %.1f, %.1f) ", coeffs[coeff],(evm[coeff] / n) * 100, 10*log10(sqnr[coeff]),10*log10(samples[coeff] / n), 10*log10(samples_out[coeff] / n)); printf("\n"); int i; for (i = 0; i < sizeofArray(coeffs); i++) @@ -132,7 +198,56 @@ int main(void) } fflush(stdout); } + + // TX test: modulate all used sizss with QPSK and 256QAM. Compute IDFT using + // QAM levels and IDFT scaling used in gNB transmit chain. Use double precision DFT + // to bring back to frequency-domain. 
Compute EVM and SQNR compared to + // transmitted waveform + for (int sz = 0; sz < sizeofArray(dftFtab); sz++) { + const int n = dftFtab[sz].size; + cd_t data[n]; + if (n > 4096) break; + if (n < 128) continue; + printf("Testing IDFT size %d\n",n); + cd_t out[n]; + for (int mod=0;mod<2;mod++) { + fill_qam(n,data,mod); + int16_t amp=512; + for (int i = 0; i < n; i++) { + d16[i].r = (int16_t)(amp*data[i].r); + d16[i].i = (int16_t)(amp*data[i].i); + } + idft(get_idft(n), (int16_t *)d16, (int16_t *)o16,get_idft_scaling(n,0)); + math_dft(o16, out, n,0,1); + double evm = 0; + double sqnr = 0; + double samples = 0; + double samples_out = 0; + int nz=0; + for (int i = 0; i < n; i++) { + if (data[i].r != 0) { + cd_t error = {.r = (double)d16[i].r - out[i].r, .i =(double) d16[i].i - out[i].i}; + evm += sqrt(squaredMod(error)) / sqrt(squaredMod(out[i])); + sqnr += squaredMod(error); + samples += sqrt(squaredMod(d16[i])); + samples_out += squaredMod(out[i]); + nz++; + } + } + sqnr = samples_out / sqnr; + printf("done IDFT size %d nz %d mod %s (evm (%%), SQNRdB, avg samples amplitude) = ", n,nz, mod==0?"QPSK":"256QAM"); + printf("(%.2f, %f, %.1f) ", (evm / nz) * 100, 10*log10(sqnr),10*log10(samples_out/ nz)); + printf("\n"); + if (evm / nz > 0.01){ + printf("IDFT size: %d/ mod %s, minimum error is more than 1%%, setting the test as failed\n", n, mod==0?"QPSK":"256QAM"); + ret = 1; + break; + } + } + fflush(stdout); + } free(d16); free(o16); + free(d16_2); return ret; } diff --git a/openair1/PHY/TOOLS/tools_defs.h b/openair1/PHY/TOOLS/tools_defs.h index d7e738a9f44fc5cc39f8c25013187d8f61425c33..0b2e3d56d643b47fab969a4353d9d11709857f68 100644 --- a/openair1/PHY/TOOLS/tools_defs.h +++ b/openair1/PHY/TOOLS/tools_defs.h @@ -591,8 +591,59 @@ void init_fft(uint16_t size, SZ_DEF(65536) \ SZ_DEF(98304) -typedef void(*dftfunc_t)(uint8_t sizeidx,int16_t *sigF,int16_t *sig,unsigned char scale_flag); -typedef void (*idftfunc_t)(uint8_t sizeidx, int16_t *sigF, int16_t *sig, unsigned char 
scale_flag); +extern uint32_t DFT_SCALING_64[5][2]; +extern uint32_t DFT_SCALING_128[5][3]; +extern uint32_t DFT_SCALING_256[5][3]; +extern uint32_t DFT_SCALING_512[7][4]; +extern int32_t DFT_SCALING_512_THRES[7]; +extern uint32_t DFT_SCALING_768[5][4]; +extern uint32_t DFT_SCALING_1024[5][4]; +extern int32_t DFT_SCALING_1024_THRES[5]; +extern uint32_t DFT_SCALING_1536[5][5]; +extern uint32_t DFT_SCALING_2048[10][5]; +extern int32_t DFT_SCALING_2048_THRES[10]; +extern uint32_t DFT_SCALING_3072[5][5]; +extern uint32_t DFT_SCALING_4096[8][5]; +extern int32_t DFT_SCALING_4096_THRES[8]; +extern uint32_t DFT_SCALING_6144[5][6]; +extern uint32_t DFT_SCALING_8192[5][6]; +extern uint32_t DFT_SCALING_9216[5][6]; +extern uint32_t DFT_SCALING_12288[5][6]; +extern uint32_t DFT_SCALING_16384[5][6]; +extern uint32_t DFT_SCALING_18432[5][7]; +extern uint32_t DFT_SCALING_24576[5][7]; +extern uint32_t DFT_SCALING_32768[5][7]; +extern uint32_t DFT_SCALING_36864[5][7]; +extern uint32_t DFT_SCALING_49152[5][7]; +extern uint32_t DFT_SCALING_65536[5][7]; +extern uint32_t DFT_SCALING_73728[5][8]; +extern uint32_t DFT_SCALING_98304[5][8]; + +extern uint32_t IDFT_SCALING_128[2][2]; +extern uint32_t IDFT_SCALING_256[2][2]; +extern uint32_t IDFT_SCALING_512[2][3]; +extern uint32_t IDFT_SCALING_768[2][3]; +extern uint32_t IDFT_SCALING_1024[2][3]; +extern uint32_t IDFT_SCALING_1536[2][4]; +extern uint32_t IDFT_SCALING_2048[2][4]; +extern uint32_t IDFT_SCALING_3072[2][4]; +extern uint32_t IDFT_SCALING_4096[2][4]; +extern uint32_t IDFT_SCALING_6144[2][5]; +extern uint32_t IDFT_SCALING_8192[2][5]; +extern uint32_t IDFT_SCALING_9216[2][5]; +extern uint32_t IDFT_SCALING_12288[2][5]; +extern uint32_t IDFT_SCALING_16384[2][5]; +extern uint32_t IDFT_SCALING_18432[2][6]; +extern uint32_t IDFT_SCALING_24576[2][6]; +extern uint32_t IDFT_SCALING_32768[2][6]; +extern uint32_t IDFT_SCALING_36864[2][6]; +extern uint32_t IDFT_SCALING_49152[2][6]; +extern uint32_t IDFT_SCALING_65536[2][6]; +extern uint32_t 
IDFT_SCALING_73728[2][7]; +extern uint32_t IDFT_SCALING_98304[2][7]; + +typedef void(*dftfunc_t)(uint8_t sizeidx,int16_t *sigF,int16_t *sig,unsigned int *scale); +typedef void (*idftfunc_t)(uint8_t sizeidx, int16_t *sigF, int16_t *sig, unsigned int *scale); extern dftfunc_t dft; extern idftfunc_t idft; int load_dftslib(void); @@ -629,6 +680,146 @@ static inline dft_size_idx_t get_dft(int size) return DFT_SIZE_IDXTABLESIZE; } +/******************************************************************* +* +* NAME : get_dft_scaling +* +* PARAMETERS : size of ofdm symbol, input level levdB compared against the per-size *_THRES tables +* +* RETURN : pointer to default scaling schedule +* +* DESCRIPTION : return pointer to the scaling schedule for a DFT of the given length; sizes with a *_THRES table use the first row whose threshold exceeds levdB, clamped to the last row +* +*********************************************************************/ +static inline +uint32_t *get_dft_scaling(int ofdm_symbol_size,int32_t levdB) +{ + size_t i=0; + switch (ofdm_symbol_size) { + case 64: + return DFT_SCALING_64[0]; + case 128: + return DFT_SCALING_128[0]; + case 256: + return DFT_SCALING_256[0]; + case 512: + while (i+1<sizeof(DFT_SCALING_512_THRES)/sizeof(DFT_SCALING_512_THRES[0])) { /* i+1: never index past the last row */ + if (levdB < DFT_SCALING_512_THRES[i]) break; + i++; + } + return DFT_SCALING_512[i]; + case 768: + return DFT_SCALING_768[0]; + case 1024: + while (i+1<sizeof(DFT_SCALING_1024_THRES)/sizeof(DFT_SCALING_1024_THRES[0])) { /* i+1: never index past the last row */ + if (levdB < DFT_SCALING_1024_THRES[i]) break; + i++; + } + return DFT_SCALING_1024[i]; + case 1536: + return DFT_SCALING_1536[0]; + case 2048: + while (i+1<sizeof(DFT_SCALING_2048_THRES)/sizeof(DFT_SCALING_2048_THRES[0])) { /* i+1: never index past the last row */ + if (levdB < DFT_SCALING_2048_THRES[i]) break; + i++; + } + return DFT_SCALING_2048[i]; + case 3072: + return DFT_SCALING_3072[0]; + case 4096: + while (i+1<sizeof(DFT_SCALING_4096_THRES)/sizeof(DFT_SCALING_4096_THRES[0])) { /* bound by the 4096 table, not the longer 2048 one */ + if (levdB < DFT_SCALING_4096_THRES[i]) break; + i++; + } + return DFT_SCALING_4096[i]; + case 6144: + return DFT_SCALING_6144[0]; + case 8192: + return DFT_SCALING_8192[0]; + case 9216: + return
DFT_SCALING_9216[0]; + case 12288: + return DFT_SCALING_12288[0]; + case 18432: + return DFT_SCALING_18432[0]; + case 24576: + return DFT_SCALING_24576[0]; + case 36864: + return DFT_SCALING_36864[0]; + case 49152: + return DFT_SCALING_49152[0]; + case 73728: + return DFT_SCALING_73728[0]; + case 98304: + return DFT_SCALING_98304[0]; + default: + return (uint32_t*)1; + break; + } + return NULL; +} + +/******************************************************************* +* +* NAME : get_idft_scaling +* +* PARAMETERS : size of ofdm symbol +* +* RETURN : pointer to default scaling schedule +* +* DESCRIPTION : return point to the default (best) scaling schedule for IDFT of a given length +* +*********************************************************************/ +static inline +uint32_t *get_idft_scaling(int ofdm_symbol_size,unsigned int lev_ind) +{ + AssertFatal(lev_ind < 2, "Illegal lev_ind %u\n",lev_ind); + switch (ofdm_symbol_size) { + case 128: + return IDFT_SCALING_128[lev_ind]; + case 256: + return IDFT_SCALING_256[lev_ind]; + case 512: + return IDFT_SCALING_512[lev_ind]; + case 768: + return IDFT_SCALING_768[lev_ind]; + case 1024: + return IDFT_SCALING_1024[lev_ind]; + case 1536: + return IDFT_SCALING_1536[lev_ind]; + case 2048: + return IDFT_SCALING_2048[lev_ind]; + case 3072: + return IDFT_SCALING_3072[lev_ind]; + case 4096: + return IDFT_SCALING_4096[lev_ind]; + case 6144: + return IDFT_SCALING_6144[lev_ind]; + case 8192: + return IDFT_SCALING_8192[lev_ind]; + case 9216: + return IDFT_SCALING_9216[lev_ind]; + case 12288: + return IDFT_SCALING_12288[lev_ind]; + case 18432: + return IDFT_SCALING_18432[lev_ind]; + case 24576: + return IDFT_SCALING_24576[lev_ind]; + case 36864: + return IDFT_SCALING_36864[lev_ind]; + case 49152: + return IDFT_SCALING_49152[lev_ind]; + case 73728: + return IDFT_SCALING_73728[lev_ind]; + case 98304: + return IDFT_SCALING_98304[lev_ind]; + default: + printf("function get_idft_scaling : unsupported ofdm symbol size \n"); + assert(0); 
+ break; + } + return NULL; +} #define SZ_iENUM(Sz) IDFT_##Sz, typedef enum idft_size_idx { FOREACH_IDFTSZ(SZ_iENUM) @@ -636,14 +827,14 @@ typedef enum idft_size_idx { } idft_size_idx_t; #ifdef OAIDFTS_MAIN -typedef void (*adftfunc_t)(int16_t *sigF, int16_t *sig, unsigned char scale_flag); -typedef void (*aidftfunc_t)(int16_t *sigF, int16_t *sig, unsigned char scale_flag); +typedef void (*adftfunc_t)(int16_t *sigF, int16_t *sig, unsigned int *scale); +typedef void (*aidftfunc_t)(int16_t *sigF, int16_t *sig, unsigned int *scale); -#define SZ_FUNC(Sz) void dft##Sz(int16_t *x, int16_t *y, uint8_t scale_flag); +#define SZ_FUNC(Sz) void dft##Sz(int16_t *x, int16_t *y, unsigned int *scale); FOREACH_DFTSZ(SZ_FUNC) -#define SZ_iFUNC(Sz) void idft##Sz(int16_t *x, int16_t *y, uint8_t scale_flag); +#define SZ_iFUNC(Sz) void idft##Sz(int16_t *x, int16_t *y, unsigned int *scale); FOREACH_IDFTSZ(SZ_iFUNC) #define SZ_PTR(Sz) {dft ## Sz,Sz}, diff --git a/openair1/PHY/defs_RU.h b/openair1/PHY/defs_RU.h index 1b0182bffde8d921cf9a03a5c5de95f8fc8d50b7..43fabc055e158642433ffe0d1e96ed47f6049b07 100644 --- a/openair1/PHY/defs_RU.h +++ b/openair1/PHY/defs_RU.h @@ -666,6 +666,7 @@ typedef struct RU_t_s { /// structure for analyzing high-level RT measurements rt_ru_profiling_t rt_ru_profiling; void* scopeData; + int32_t dft_in_levdB; } RU_t; diff --git a/openair1/PHY/defs_UE.h b/openair1/PHY/defs_UE.h index 5a090752d5f982b7a1e5d79d763a74557a5c294c..ac093056e4f682710fa3d650feae6c73699eb596 100644 --- a/openair1/PHY/defs_UE.h +++ b/openair1/PHY/defs_UE.h @@ -840,6 +840,7 @@ typedef struct { openair0_device rfdevice; void *scopeData; + int dft_in_levdB; } PHY_VARS_UE; /* this structure is used to pass both UE phy vars and diff --git a/openair1/PHY/defs_nr_UE.h b/openair1/PHY/defs_nr_UE.h index dd035a73e3b5111609e0f4ddc35467c62697e43f..8e1332c126734c5cc612f179976ff372086bc114 100644 --- a/openair1/PHY/defs_nr_UE.h +++ b/openair1/PHY/defs_nr_UE.h @@ -552,6 +552,7 @@ typedef struct 
PHY_VARS_NR_UE_s { Actor_t dl_actors[NUM_DL_ACTORS]; Actor_t ul_actor; ntn_config_message_t* ntn_config_message; + int32_t dft_in_levdB; } PHY_VARS_NR_UE; typedef struct { diff --git a/openair1/PHY/nr_phy_common/src/nr_phy_common.c b/openair1/PHY/nr_phy_common/src/nr_phy_common.c index 9aa45e20b385f1092739671d25f5f643315ce7da..3493c9872c42d5d6ac1001c930e5fc5867905542 100644 --- a/openair1/PHY/nr_phy_common/src/nr_phy_common.c +++ b/openair1/PHY/nr_phy_common/src/nr_phy_common.c @@ -358,7 +358,8 @@ void nr_256qam_llr(int32_t *rxdataF_comp, int32_t *ch_mag, int32_t *ch_mag2, int void freq2time(uint16_t ofdm_symbol_size, int16_t *freq_signal, int16_t *time_signal) { const idft_size_idx_t idft_size = get_idft(ofdm_symbol_size); - idft(idft_size, freq_signal, time_signal, 1); + uint32_t *scaling_sched = get_idft_scaling(ofdm_symbol_size,1); + idft(idft_size, freq_signal, time_signal, scaling_sched); } void nr_est_delay(int ofdm_symbol_size, const c16_t *ls_est, c16_t *ch_estimates_time, delay_t *delay) diff --git a/openair1/SCHED_NR/nr_ru_procedures.c b/openair1/SCHED_NR/nr_ru_procedures.c index a0b3a69ba8c24becbbc24e8a6aa5fd8d5abb3c9c..ab809cbc399bf52c0c15f09819483075fdfed569 100644 --- a/openair1/SCHED_NR/nr_ru_procedures.c +++ b/openair1/SCHED_NR/nr_ru_procedures.c @@ -363,7 +363,8 @@ void nr_fep(void* arg) &ru->common.rxdataF[aid][offset], l, tti_rx, - ru->N_TA_offset); + ru->N_TA_offset, + ru->dft_in_levdB); VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_PROCEDURES_RU_FEPRX+aid, 0); // Task completed in // diff --git a/openair1/SCHED_NR_UE/phy_procedures_nr_ue.c b/openair1/SCHED_NR_UE/phy_procedures_nr_ue.c index 809c534acc30975af0c897c3443d926dadb3dd9f..e01433ea998929abbcfd9a9e9991565e4f0f7a3c 100644 --- a/openair1/SCHED_NR_UE/phy_procedures_nr_ue.c +++ b/openair1/SCHED_NR_UE/phy_procedures_nr_ue.c @@ -877,6 +877,7 @@ int pbch_pdcch_processing(PHY_VARS_NR_UE *ue, const UE_nr_rxtx_proc_t *proc, nr_ __attribute__ ((aligned(32))) struct 
complex16 dl_ch_estimates_time[fp->nb_antennas_rx][fp->ofdm_symbol_size]; for (int i=1; i<4; i++) { + if (i==1) ue->dft_in_levdB=-1; // trigger recalculation of DFT scaling nr_slot_fep(ue, fp, proc->nr_slot_rx, diff --git a/openair1/SCHED_UE/phy_procedures_lte_ue.c b/openair1/SCHED_UE/phy_procedures_lte_ue.c index 1f165380d4a7edcc2ddde19654df44fee548c2e9..6118bb35662cc8b09cb60371f88f50411797c560 100644 --- a/openair1/SCHED_UE/phy_procedures_lte_ue.c +++ b/openair1/SCHED_UE/phy_procedures_lte_ue.c @@ -4452,6 +4452,7 @@ int phy_procedures_UE_RX(PHY_VARS_UE *ue, } VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_UE_SLOT_FEP, VCD_FUNCTION_IN); + if (l==0) ue->dft_in_levdB = -1; //trigger dft scaling adjustment slot_fep(ue, l, (subframe_rx<<1), diff --git a/openair1/SIMULATION/LTE_PHY/dlsim.c b/openair1/SIMULATION/LTE_PHY/dlsim.c index 40d35c4a00891e0148d917f16830ec4042bc0be5..6ae5c5fa92ee15e2a42d7696d3dc4b19d6d0b162 100644 --- a/openair1/SIMULATION/LTE_PHY/dlsim.c +++ b/openair1/SIMULATION/LTE_PHY/dlsim.c @@ -256,6 +256,12 @@ void DL_channel(RU_t *ru,PHY_VARS_UE *UE,uint subframe,int awgn_flag,double SNR, (short) (r_im[aa][i] + (iqim*r_re[aa][i]) + sqrt(sigma2/2)*gaussdouble(0.0,1.0)); } } + + int sigenergy=0; + for (aa=0;aa<UE->frame_parms.nb_antennas_rx; aa++) { + sigenergy+=signal_energy((int32_t*)(UE->common_vars.rxdata[aa]+subframe*UE->frame_parms.samples_per_tti),UE->frame_parms.samples_per_tti); + } + UE->dft_in_levdB = dB_fixed(sigenergy); } uint16_t diff --git a/openair1/SIMULATION/LTE_PHY/ulsim.c b/openair1/SIMULATION/LTE_PHY/ulsim.c index 14e3c696965093522c24b7709050f445b6157827..cada8e5a89572baad3a34e67e2668768699fc9cf 100644 --- a/openair1/SIMULATION/LTE_PHY/ulsim.c +++ b/openair1/SIMULATION/LTE_PHY/ulsim.c @@ -1142,6 +1142,11 @@ int main(int argc, char **argv) { &ru->common.rxdata[0][(eNB->frame_parms.samples_per_tti<<1) -eNB->frame_parms.ofdm_symbol_size], OFDM_SYMBOL_SIZE_COMPLEX_SAMPLES/2)) - 
1)+10*log10(eNB->frame_parms.N_RB_UL/nb_rb); + int sigenergy=0; + for (aa=0;aa<eNB->frame_parms.nb_antennas_rx; aa++) { + sigenergy+=signal_energy((int32_t*)(ru->common.rxdata[aa]+subframe*eNB->frame_parms.samples_per_tti),eNB->frame_parms.samples_per_tti); + } + ru->dft_in_levdB = dB_fixed(sigenergy); if (n_frames<=10) { printf("SNRmeas %f\n",SNRmeas); LOG_M("rxsig0UL.m","rxs0", &ru->common.rxdata[0][eNB->frame_parms.samples_per_tti*subframe],eNB->frame_parms.samples_per_tti,1,1); diff --git a/openair1/SIMULATION/NR_PHY/dlsim.c b/openair1/SIMULATION/NR_PHY/dlsim.c index 95a5b9ea647ea467bd9fc54e946cbe1a30b71a69..779e9d1a48f390b490d488c48a5f4d2cd29ba301 100644 --- a/openair1/SIMULATION/NR_PHY/dlsim.c +++ b/openair1/SIMULATION/NR_PHY/dlsim.c @@ -351,6 +351,7 @@ int main(int argc, char **argv) if ((uniqCfg = load_configmodule(argc, argv, CONFIG_ENABLECMDLINEONLY)) == 0) { exit_fun("[NR_DLSIM] Error, configuration module init failed\n"); } + int tx_amp=36; randominit(0); @@ -358,7 +359,7 @@ int main(int argc, char **argv) FILE *scg_fd=NULL; - while ((c = getopt(argc, argv, "--:O:f:hA:p:f:g:i:n:s:S:t:v:x:y:z:o:M:N:F:GR:d:PI:L:a:b:e:m:w:T:U:q:X:Y:Z:")) != -1) { + while ((c = getopt(argc, argv, "--:O:f:hA:p:f:g:i:n:s:S:t:v:x:y:z:o:M:N:F:GR:d:PI:L:a:b:e:m:w:T:U:q:X:Y:Z:cQ:")) != -1) { /* ignore long options starting with '--', option '-O' and their arguments that are handled by configmodule */ /* with this opstring getopt returns 1 for non-option arguments, refer to 'man 3 getopt' */ @@ -551,7 +552,9 @@ int main(int argc, char **argv) case 'o': delay = atoi(optarg); break; - + case 'Q': + tx_amp = atoi(optarg); + break; default: case 'h': printf("%s -h(elp) -p(extended_prefix) -N cell_id -f output_filename -F input_filename -g channel_model -n n_frames -s snr0 -S snr1 -x transmission_mode -y TXant -z RXant -i Intefrence0 -j Interference1 -A interpolation_file -C(alibration offset dB) -N CellId\n", @@ -634,6 +637,7 @@ int main(int argc, char **argv) gNB = RC.gNB[0]; 
gNB->ofdm_offset_divisor = UINT_MAX; gNB->phase_comp = true; // we need to perform phase compensation, otherwise everything will fail + gNB->TX_AMP = (int16_t)(32767.0 / pow(10.0, .05 * (double)(tx_amp))); /* 32767 divided by 10^(tx_amp/20), i.e. tx_amp acts as an amplitude back-off in dB from 32767 */ frame_parms = &gNB->frame_parms; //to be initialized I suppose (maybe not necessary for PBCH) frame_parms->nb_antennas_tx = n_tx; frame_parms->nb_antennas_rx = n_rx; @@ -1147,6 +1151,11 @@ int main(int argc, char **argv) UE->frame_parms.nb_antennas_rx); dl_config.sfn = frame; dl_config.slot = slot; + int sigenergy=0; /* accumulates each RX antenna's signal_energy() divided by the antenna count */ + for (int aarx=0;aarx<UE->frame_parms.nb_antennas_rx;aarx++) { + sigenergy += signal_energy((int32_t*)(UE->common_vars.rxdata[aarx]+slot_offset),slot_length)/UE->frame_parms.nb_antennas_rx; /* slot_length samples starting at slot_offset */ + } + UE->dft_in_levdB=dB_fixed(sigenergy); /* set before the UE procedures that follow; presumably the DFT input level in dB - confirm against dB_fixed() */ ue_dci_configuration(UE_mac, &dl_config, frame, slot); nr_ue_scheduled_response(&scheduled_response); diff --git a/openair1/SIMULATION/NR_PHY/pbchsim.c b/openair1/SIMULATION/NR_PHY/pbchsim.c index b6d92b9b8109d7d2a3bb6376b027d78cfb94b37f..b771e01e0cdb2c867ca06213f42b65a91a7f44dc 100644 --- a/openair1/SIMULATION/NR_PHY/pbchsim.c +++ b/openair1/SIMULATION/NR_PHY/pbchsim.c @@ -674,6 +674,17 @@ int main(int argc, char **argv) UE->common_vars.rxdata[aa][i].i = (short)(r_im[aa][i] + sqrt(sigma2 / 2) * gaussdouble(0.0, 1.0)); } } + int sigenergy=0; /* accumulates each RX antenna's signal_energy() divided by the antenna count */ + + int start_symbol = nr_get_ssb_start_symbol(&UE->frame_parms,0); + int slot = start_symbol/14; /* NOTE(review): 14 is hard-coded, presumably the number of symbols per slot - confirm */ + + int off = UE->frame_parms.get_samples_slot_timestamp(slot, &UE->frame_parms, 0); /* sample offset at which the energy measurement starts */ + int slot_length = UE->frame_parms.get_samples_slot_timestamp(slot+1,&UE->frame_parms,0) - off; /* difference between the offsets of slot+1 and slot */ + for (int aarx=0;aarx<UE->frame_parms.nb_antennas_rx;aarx++) { + sigenergy += signal_energy((int32_t*)(UE->common_vars.rxdata[aarx]+off),slot_length)/UE->frame_parms.nb_antennas_rx; + } + UE->dft_in_levdB=dB_fixed(sigenergy); /* presumably the DFT input level in dB - confirm against dB_fixed() */ if (n_trials==1) { LOG_M("rxsig0.m", "rxs0", UE->common_vars.rxdata[0], frame_parms->samples_per_frame, 1, 1); diff --git a/openair1/SIMULATION/NR_PHY/prachsim.c 
b/openair1/SIMULATION/NR_PHY/prachsim.c index 5eab1c69748086522f58f0e6bd3b7df2d186900c..0eed5181fc9425bc94ed17bff0f97976010cdabf 100644 --- a/openair1/SIMULATION/NR_PHY/prachsim.c +++ b/openair1/SIMULATION/NR_PHY/prachsim.c @@ -761,6 +761,12 @@ int main(int argc, char **argv){ } } + int sigenergy=0; /* signal_energy() of samples_per_subframe samples from rx_prach_start, summed over the RX antennas */ + + for (int aarx = 0; aarx < frame_parms->nb_antennas_rx ; aarx++) { + sigenergy += signal_energy((int32_t *)ru->common.rxdata[aarx]+rx_prach_start,frame_parms->samples_per_subframe); + } + ru->dft_in_levdB=dB_fixed(sigenergy); /* must come after the accumulation loop: computing it before the loop always yields dB_fixed(0) and the measured energy is never used by nr_slot_fep_ul() below */ for (l = 0; l < frame_parms->symbols_per_slot; l++) { for (aa = 0; aa < frame_parms->nb_antennas_rx; aa++) { nr_slot_fep_ul(frame_parms, @@ -768,7 +774,8 @@ int main(int argc, char **argv){ (int32_t *)ru->common.rxdataF[aa], l, slot, - ru->N_TA_offset); + ru->N_TA_offset, + ru->dft_in_levdB); } } diff --git a/openair1/SIMULATION/NR_PHY/ulsim.c b/openair1/SIMULATION/NR_PHY/ulsim.c index ffc6690b42461213b13857cdeafbf651d6176e27..a72b5c5e3eedebc0f039b37127e2345e966a5e7b 100644 --- a/openair1/SIMULATION/NR_PHY/ulsim.c +++ b/openair1/SIMULATION/NR_PHY/ulsim.c @@ -1250,8 +1250,12 @@ int main(int argc, char *argv[]) multipath_channel(UE2gNB, s_re, s_im, r_re, r_im, slot_length, 0, (n_trials == 1) ? 
1 : 0); add_noise(rxdata, (const double **) r_re, (const double **) r_im, sigma, slot_length, slot_offset, ts, delay, pdu_bit_map, PUSCH_PDU_BITMAP_PUSCH_PTRS, frame_parms->nb_antennas_rx); - } /*End input_fd */ + int sigenergy=0; /* accumulates each RX antenna's signal_energy() divided by n_rx */ + for (int aarx=0;aarx<n_rx;aarx++) { + sigenergy += signal_energy((int32_t*)(rxdata[aarx]+slot_offset),slot_length)/n_rx; /* slot_length samples starting at slot_offset */ + } + //---------------------------------------------------------- //------------------- gNB phy procedures ------------------- @@ -1267,7 +1271,8 @@ int main(int argc, char *argv[]) (int32_t *)gNB->common_vars.rxdataF[0][aa], symbol, slot, - 0); + 0, + dB_fixed(sigenergy)+9); /* NOTE(review): the +9 offset added to the measured level is not explained here - confirm its origin */ } int offset = (slot & 3) * gNB->frame_parms.symbols_per_slot * gNB->frame_parms.ofdm_symbol_size; for (int aa = 0; aa < gNB->frame_parms.nb_antennas_rx; aa++) {