diff --git a/executables/lte-ru.c b/executables/lte-ru.c
index 326e55b2887ac3b5f55843b8dba5c9b50c535a06..7b291aa611f67313c83f67ec9e1b958cb560bc58 100644
--- a/executables/lte-ru.c
+++ b/executables/lte-ru.c
@@ -1815,6 +1815,12 @@ static void *ru_thread( void *param ) {
 
         // do RX front-end processing (frequency-shift, dft) if needed
         if (ru->feprx) ru->feprx(ru, proc->tti_rx);
+        if (ru->dft_in_levdB==-1) {
+              int sigenergy=0;
+              for (int aa=0;aa<ru->nb_rx;aa++)
+                 sigenergy += signal_energy(ru->common.rxdata[aa]+proc->tti_rx*ru->frame_parms->samples_per_tti,2048);
+              ru->dft_in_levdB = dB_fixed(sigenergy)+30;
+        }
 
         // wakeup all eNB processes waiting for this RU
         AssertFatal((ret=pthread_mutex_lock(&proc->mutex_eNBs))==0,"mutex_lock returns %d\n",ret);
diff --git a/executables/nr-ru.c b/executables/nr-ru.c
index 90d7e235dec71a2ffd77f78aafca1ad07e05e9fa..f7bb294f16c5bbc82e2f4074e6d11fbac5861782 100644
--- a/executables/nr-ru.c
+++ b/executables/nr-ru.c
@@ -1320,6 +1320,12 @@ void *ru_thread(void *param)
         // set the tti that was generated to busy
         rx_tti_busy[proc->tti_rx % RU_RX_SLOT_DEPTH] = true;
         ru->feprx(ru,proc->tti_rx);
+        if (ru->dft_in_levdB==-1) {
+          int sigenergy=0;
+          for (int aa=0;aa<ru->nb_rx;aa++)
+            sigenergy += signal_energy(ru->common.rxdata[aa]+fp->get_samples_slot_timestamp(proc->tti_rx,fp,0),2048);
+          ru->dft_in_levdB = dB_fixed(sigenergy)+40;
+        }
         LOG_D(NR_PHY, "Setting %d.%d (%d) to busy\n", proc->frame_rx, proc->tti_rx, proc->tti_rx % RU_RX_SLOT_DEPTH);
         clock_gettime(CLOCK_MONOTONIC,&ru->rt_ru_profiling.return_RU_feprx[rt_prof_idx]);
         //LOG_M("rxdata.m","rxs",ru->common.rxdata[0],1228800,1,1);
diff --git a/openair1/PHY/INIT/lte_init_ru.c b/openair1/PHY/INIT/lte_init_ru.c
index 967d352769a9cf703c0fa0912f3c078233830d81..108eee38b21fd569b1a508d1f891400387d811e4 100644
--- a/openair1/PHY/INIT/lte_init_ru.c
+++ b/openair1/PHY/INIT/lte_init_ru.c
@@ -181,6 +181,7 @@ int phy_init_RU(RU_t *ru) {
   } // !=IF5
 
   ru->common.sync_corr = (uint32_t *)malloc16_clear( LTE_NUMBER_OF_SUBFRAMES_PER_FRAME*sizeof(uint32_t)*fp->samples_per_tti );
+  ru->dft_in_levdB = -1;
   return(0);
 }
 
diff --git a/openair1/PHY/INIT/nr_init_ru.c b/openair1/PHY/INIT/nr_init_ru.c
index 674c70cdc39eae21dcef00eda29f0196b3b3a6d3..075d2e20877663e8b8008d5f75abad662acca107 100644
--- a/openair1/PHY/INIT/nr_init_ru.c
+++ b/openair1/PHY/INIT/nr_init_ru.c
@@ -131,6 +131,7 @@ int nr_phy_init_RU(RU_t *ru)
 
   init_prach_ru_list(ru);
 
+  ru->dft_in_levdB = -1;
   return(0);
 }
 
diff --git a/openair1/PHY/LTE_ESTIMATION/lte_adjust_sync_eNB.c b/openair1/PHY/LTE_ESTIMATION/lte_adjust_sync_eNB.c
index 137035d0d71b850cf7ca5aaa4196dabd78247bb2..beac833e4f9ce274e7d3da6ce2cb19bb03d66ccc 100644
--- a/openair1/PHY/LTE_ESTIMATION/lte_adjust_sync_eNB.c
+++ b/openair1/PHY/LTE_ESTIMATION/lte_adjust_sync_eNB.c
@@ -69,7 +69,7 @@ int lte_est_timing_advance(LTE_DL_FRAME_PARMS *frame_parms,
         break;
       }
        if (len)
-      dft(get_dft(len), (int16_t *)lte_eNB_srs->srs_ch_estimates[aa], (int16_t *)lte_eNB_srs->srs_ch_estimates_time[aa], 1);
+      dft(get_dft(len), (int16_t *)lte_eNB_srs->srs_ch_estimates[aa], (int16_t *)lte_eNB_srs->srs_ch_estimates_time[aa], get_dft_scaling(len,0));
 #ifdef DEBUG_PHY
       sprintf(fname,"srs_ch_estimates_time_%d%d.m",ind,aa);
       sprintf(vname,"srs_time_%d%d",ind,aa);
diff --git a/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c b/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c
index fe995885cb42e35d54a581ade0910068d9302862..d2ce4513bd677b7df30f653942ce44a16f37552a 100644
--- a/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c
+++ b/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c
@@ -671,7 +671,7 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
           idft(get_idft(s),
                (int16_t *)&dl_ch_estimates[(p << 1) + aarx][8],
                (int16_t *)vars->dl_ch_estimates_time[eNB_offset][(p << 1) + aarx],
-               1);
+               get_idft_scaling(s,1));
         }
       }
   }
diff --git a/openair1/PHY/LTE_ESTIMATION/lte_dl_mbsfn_channel_estimation.c b/openair1/PHY/LTE_ESTIMATION/lte_dl_mbsfn_channel_estimation.c
index 90606c63c4659f88bab3b9482bce31390667a9ad..1698e7bf26323a1d229b19cbc75971060d1315c2 100644
--- a/openair1/PHY/LTE_ESTIMATION/lte_dl_mbsfn_channel_estimation.c
+++ b/openair1/PHY/LTE_ESTIMATION/lte_dl_mbsfn_channel_estimation.c
@@ -749,7 +749,7 @@ int lte_dl_mbsfn_channel_estimation(PHY_VARS_UE *ue,
            (int16_t *)&tmp[8],
            (int16_t *)ue->common_vars.common_vars_rx_data_per_thread[ue->current_thread_id[subframe]]
                .dl_ch_estimates_time[eNB_offset][aa],
-           1);
+           get_idft_scaling(len,1));
     }
   }
   return(0);
@@ -918,7 +918,7 @@ int lte_dl_mbsfn_khz_1dot25_channel_estimation(PHY_VARS_UE *ue,
         idft(get_idft(len),
              (int16_t *)&tmp[8],
              (int16_t *)ue->common_vars.common_vars_rx_data_per_thread[ue->current_thread_id[subframe]].dl_ch_estimates_time[0][aa],
-             1);
+             get_idft_scaling(len,1));
     }
   }
   return(0);
diff --git a/openair1/PHY/LTE_ESTIMATION/lte_sync_time.c b/openair1/PHY/LTE_ESTIMATION/lte_sync_time.c
index 4a1980af8c3cf3d06296458d369bbe1f8f58037c..97a63704480a1218d125cfc2966ad36225776603 100644
--- a/openair1/PHY/LTE_ESTIMATION/lte_sync_time.c
+++ b/openair1/PHY/LTE_ESTIMATION/lte_sync_time.c
@@ -58,7 +58,7 @@ static void doIdft(int size, short *in, short *out) {
       LOG_E(PHY, "Unknown N_RB_DL %d\n", size);
       return;
   }
-  idft(get_idft(len), in, out, 1);
+  idft(get_idft(len), in, out, get_idft_scaling(len,0));
 }
 
 static void copyPrimary( c16_t *out, struct complex16 *in, int ofdmSize) {
@@ -205,7 +205,7 @@ int ru_sync_time_init(RU_t *ru) { // LTE_UE_COMMON *common_vars
       return -1;
   }
   idft(get_idft(len), (int16_t *)&dmrsp[0][3 * ru->frame_parms->ofdm_symbol_size], ru->dmrssync,
-       1); /// complex output
+       get_idft_scaling(len,0)); /// complex output
   return(0);
 }
 
diff --git a/openair1/PHY/LTE_ESTIMATION/lte_sync_timefreq.c b/openair1/PHY/LTE_ESTIMATION/lte_sync_timefreq.c
index bfda00607f8beef42dbbc65164a5b3684ceb5611..34ce9453c60fdeb9e2cfe91f6496e0dc447e70f7 100644
--- a/openair1/PHY/LTE_ESTIMATION/lte_sync_timefreq.c
+++ b/openair1/PHY/LTE_ESTIMATION/lte_sync_timefreq.c
@@ -79,7 +79,7 @@ void lte_sync_timefreq(PHY_VARS_UE *ue,int band,unsigned int DL_freq)
 
       //compute frequency-domain representation of 6144-sample chunk
       dft(DFT_6144,(int16_t *)rxp,
-              sp,1);
+              sp,get_dft_scaling(6144,0));
 
 
       /*
@@ -274,7 +274,7 @@ void lte_sync_timefreq(PHY_VARS_UE *ue,int band,unsigned int DL_freq)
         }
 
         // ifft, accumulate energy over two half-frames
-        idft(IDFT_256,(int16_t*)autocorr0,(int16_t*)tmp_t,1);
+        idft(IDFT_256,(int16_t*)autocorr0,(int16_t*)tmp_t,get_idft_scaling(256,1));
         /*
               if (i==12288) {
           sprintf(fname,"corr256F_%d.m",abs(f));
@@ -292,12 +292,12 @@ void lte_sync_timefreq(PHY_VARS_UE *ue,int band,unsigned int DL_freq)
         for (re=0; re<(256/4); re++)
           autocorr0_t[re] = simde_mm_add_epi32(autocorr0_t[re], simde_mm_madd_epi16(tmp_t[re], tmp_t[re]));
 
-        idft(IDFT_256,(int16_t*)autocorr1,(int16_t*)tmp_t,1);
+        idft(IDFT_256,(int16_t*)autocorr1,(int16_t*)tmp_t,get_idft_scaling(256,1));
 
         for (re=0; re<(256/4); re++)
           autocorr1_t[re] = simde_mm_add_epi32(autocorr1_t[re], simde_mm_madd_epi16(tmp_t[re], tmp_t[re]));
 
-        idft(IDFT_256,(int16_t*)autocorr2,(int16_t*)tmp_t,1);
+        idft(IDFT_256,(int16_t*)autocorr2,(int16_t*)tmp_t,get_idft_scaling(256,1));
 
         for (re=0; re<(256/4); re++)
           autocorr2_t[re] = simde_mm_add_epi32(autocorr2_t[re], simde_mm_madd_epi16(tmp_t[re], tmp_t[re]));
diff --git a/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c b/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c
index a2c64945344f7e6e30cc23e84ff4e3c6cbd1fa5e..1812ef8d814a8b10ec634e526c2269127ed223de 100644
--- a/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c
+++ b/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c
@@ -240,7 +240,7 @@ int32_t lte_ul_channel_estimation(LTE_DL_FRAME_PARMS *frame_parms,
           LOG_E(PHY, "Unknown N_RB_DL %d\n", frame_parms->N_RB_DL);
           return -1;
       }
-      idft(get_idft(len), (int16_t *)temp_in_ifft_0, (int16_t *)ul_ch_estimates_time[aa], 1);
+      idft(get_idft(len), (int16_t *)temp_in_ifft_0, (int16_t *)ul_ch_estimates_time[aa], get_idft_scaling(len,1));
 #if T_TRACER
 
       if (aa == 0)
@@ -511,7 +511,7 @@ int32_t lte_ul_channel_estimation_RRU(LTE_DL_FRAME_PARMS *frame_parms,
           LOG_E(PHY, "Unknown N_RB_DL %d\n", frame_parms->N_RB_DL);
           return -1;
       }
-      idft(get_idft(len), (int16_t *)temp_in_ifft_0, (int16_t *)ul_ch_estimates_time[aa], 1);
+      idft(get_idft(len), (int16_t *)temp_in_ifft_0, (int16_t *)ul_ch_estimates_time[aa], get_idft_scaling(len,1));
 #if T_TRACER
 
       if (aa == 0)
diff --git a/openair1/PHY/LTE_TRANSPORT/prach.c b/openair1/PHY/LTE_TRANSPORT/prach.c
index 92350194ca6512e2227f377a419677bcf0288d69..d19b8875447f19b6aaae4c222ea730f15bab0a53 100644
--- a/openair1/PHY/LTE_TRANSPORT/prach.c
+++ b/openair1/PHY/LTE_TRANSPORT/prach.c
@@ -320,9 +320,9 @@ void rx_prach0(PHY_VARS_eNB *eNB,
           break;
       }
 
-      dft(get_dft(fft_size), prach2, rxsigF[aa], 1);
+      dft(get_dft(fft_size), prach2, rxsigF[aa], get_dft_scaling(fft_size,ru->dft_in_levdB));
       if (prach_fmt > 1 && prach_fmt != 4)
-          dft(get_dft(fft_size), prach2 + 2 * fft_size, rxsigF[aa] + 2 * fft_size, 1);
+          dft(get_dft(fft_size), prach2 + 2 * fft_size, rxsigF[aa] + 2 * fft_size, get_dft_scaling(fft_size,ru->dft_in_levdB));
 
       k = (12*n_ra_prb) - 6*fp->N_RB_UL;
 
@@ -529,13 +529,13 @@ void rx_prach0(PHY_VARS_eNB *eNB,
         // Now do IFFT of size 1024 (N_ZC=839) or 256 (N_ZC=139)
         if (N_ZC == 839) {
           log2_ifft_size = 10;
-          idft(IDFT_1024,(int16_t*)prachF,prach_ifft_tmp,1);
+          idft(IDFT_1024,(int16_t*)prachF,prach_ifft_tmp,get_idft_scaling(1024,1));
 
           // compute energy and accumulate over receive antennas and repetitions for BR
           for (i=0; i<2048; i++)
             prach_ifft[i] += (prach_ifft_tmp[i<<1]*prach_ifft_tmp[i<<1] + prach_ifft_tmp[1+(i<<1)]*prach_ifft_tmp[1+(i<<1)])>>9;
         } else {
-          idft(IDFT_256,(int16_t*)prachF,prach_ifft_tmp,1);
+          idft(IDFT_256,(int16_t*)prachF,prach_ifft_tmp,get_idft_scaling(256,1)); // scaling schedule must match the 256-point IDFT
           log2_ifft_size = 8;
 
           // compute energy and accumulate over receive antennas and repetitions for BR
diff --git a/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c b/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c
index 25068537054b769c1dbd0ba25e870fb8944606cf..4604f752eb84d48edb5bc280ee0d071b76ed5b47 100644
--- a/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c
+++ b/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c
@@ -130,9 +130,9 @@ void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) {
   dft_size_idx_t dftsize = get_dft(Msc_PUSCH);
   switch (Msc_PUSCH) {
     case 12:
-      dft(dftsize, (int16_t *)idft_in0, (int16_t *)idft_out0, 0);
-      dft(dftsize, (int16_t *)idft_in1, (int16_t *)idft_out1, 0);
-      dft(dftsize, (int16_t *)idft_in2, (int16_t *)idft_out2, 0);
+      dft(dftsize, (int16_t *)idft_in0, (int16_t *)idft_out0, (uint32_t*)0);
+      dft(dftsize, (int16_t *)idft_in1, (int16_t *)idft_out1, (uint32_t*)0);
+      dft(dftsize, (int16_t *)idft_in2, (int16_t *)idft_out2, (uint32_t*)0);
       norm128 = simde_mm_set1_epi16(9459);
 
       for (i=0; i<12; i++) {
@@ -144,9 +144,9 @@ void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) {
       break;
 
     default:
-      dft(dftsize, idft_in0, idft_out0, 1);
-      dft(dftsize, idft_in1, idft_out1, 1);
-      dft(dftsize, idft_in2, idft_out2, 1);
+      dft(dftsize, idft_in0, idft_out0, (uint32_t*)1);
+      dft(dftsize, idft_in1, idft_out1, (uint32_t*)1);
+      dft(dftsize, idft_in2, idft_out2, (uint32_t*)1);
   }
 
   for (i=0,ip=0; i<Msc_PUSCH; i++,ip+=4) {
diff --git a/openair1/PHY/LTE_UE_TRANSPORT/prach_ue.c b/openair1/PHY/LTE_UE_TRANSPORT/prach_ue.c
index 9fe33c189d3c71fa305a36f4b53622082cadeb3c..5c30ac0fd764bb744136f11236b439736c3bc903 100644
--- a/openair1/PHY/LTE_UE_TRANSPORT/prach_ue.c
+++ b/openair1/PHY/LTE_UE_TRANSPORT/prach_ue.c
@@ -356,12 +356,12 @@ int32_t generate_prach( PHY_VARS_UE *ue, uint8_t eNB_id, uint8_t subframe, uint1
       break;
   }
   if (prach_fmt == 4) {
-    idft(get_idft(len), prachF, prach2, 1);
+    idft(get_idft(len), prachF, prach2, get_idft_scaling(len,0));
     // TODO: account for repeated format in dft output
     memmove(prach, prach + 2 * len, Ncp << 2);
     prach_len = len + Ncp;
   } else {
-    idft(get_idft(len), prachF, prach2, 1);
+    idft(get_idft(len), prachF, prach2, get_idft_scaling(len,0));
     memmove(prach, prach + 2 * len, Ncp << 2);
     prach_len = len + Ncp;
     if (prach_fmt > 1) {
diff --git a/openair1/PHY/LTE_UE_TRANSPORT/ulsch_modulation.c b/openair1/PHY/LTE_UE_TRANSPORT/ulsch_modulation.c
index 9c4ccf067992bd0d8396ade60e8b3a5f4459bf80..df230f4d37348f6b2a5a20d874972cbf487b22ea 100644
--- a/openair1/PHY/LTE_UE_TRANSPORT/ulsch_modulation.c
+++ b/openair1/PHY/LTE_UE_TRANSPORT/ulsch_modulation.c
@@ -100,9 +100,9 @@ void dft_lte(int32_t *z,struct complex16 *input, int32_t Msc_PUSCH, uint8_t Nsym
 
   switch (Msc_PUSCH) {
   case 12:
-    dft(dftsize, (int16_t *)dft_in0, (int16_t *)dft_out0, 0);
-    dft(dftsize, (int16_t *)dft_in1, (int16_t *)dft_out1, 0);
-    dft(dftsize, (int16_t *)dft_in2, (int16_t *)dft_out2, 0);
+    dft(dftsize, (int16_t *)dft_in0, (int16_t *)dft_out0, (uint32_t*)0);
+    dft(dftsize, (int16_t *)dft_in1, (int16_t *)dft_out1, (uint32_t*)0);
+    dft(dftsize, (int16_t *)dft_in2, (int16_t *)dft_out2, (uint32_t*)0);
     norm128 = simde_mm_set1_epi16(9459);
     for (i = 0; i < 12; i++) {
       ((simde__m128i *)dft_out0)[i] = simde_mm_slli_epi16(simde_mm_mulhi_epi16(((simde__m128i *)dft_out0)[i], norm128), 1);
@@ -113,9 +113,9 @@ void dft_lte(int32_t *z,struct complex16 *input, int32_t Msc_PUSCH, uint8_t Nsym
     break;
 
   default:
-    dft(dftsize, (int16_t *)dft_in0, (int16_t *)dft_out0, 1);
-    dft(dftsize, (int16_t *)dft_in1, (int16_t *)dft_out1, 1);
-    dft(dftsize, (int16_t *)dft_in2, (int16_t *)dft_out2, 1);
+    dft(dftsize, (int16_t *)dft_in0, (int16_t *)dft_out0, (uint32_t*)1);
+    dft(dftsize, (int16_t *)dft_in1, (int16_t *)dft_out1, (uint32_t*)1);
+    dft(dftsize, (int16_t *)dft_in2, (int16_t *)dft_out2, (uint32_t*)1);
     break;
   }
 
diff --git a/openair1/PHY/MODULATION/nr_modulation.c b/openair1/PHY/MODULATION/nr_modulation.c
index e8b215acccaa9dd1333fdc700c83af1e4e71fb23..f53453275e0c28fb61121a9cbff7916a89c1c4b8 100644
--- a/openair1/PHY/MODULATION/nr_modulation.c
+++ b/openair1/PHY/MODULATION/nr_modulation.c
@@ -344,7 +344,8 @@ void nr_dft(c16_t *z, c16_t *d, uint32_t Msc_PUSCH)
   dft_size_idx_t dftsize = get_dft(Msc_PUSCH);
   switch (Msc_PUSCH) {
     case 12:
-      dft(dftsize, (int16_t *)dft_in0, (int16_t *)dft_out0, 0);
+      dft(DFT_12,(int16_t *)dft_in0, (int16_t *)dft_out0, (uint32_t*)0);
+
       norm128 = simde_mm_set1_epi16(9459);
       for (i = 0; i < 12; i++) {
         ((simde__m128i *)dft_out0)[i] = simde_mm_slli_epi16(simde_mm_mulhi_epi16(((simde__m128i *)dft_out0)[i], norm128), 1);
@@ -352,7 +353,7 @@ void nr_dft(c16_t *z, c16_t *d, uint32_t Msc_PUSCH)
 
       break;
     default:
-      dft(dftsize, (int16_t *)dft_in0, (int16_t *)dft_out0, 1);
+      dft(dftsize, (int16_t *)dft_in0, (int16_t *)dft_out0, (uint32_t*)1);
       break;
   }
 
diff --git a/openair1/PHY/MODULATION/nr_modulation.h b/openair1/PHY/MODULATION/nr_modulation.h
index f066a967b5b8d47178fc4eea23050c7f8a2b14c8..46fbacccfb1b1c222c3d8ec1679f9ce7d82a349c 100644
--- a/openair1/PHY/MODULATION/nr_modulation.h
+++ b/openair1/PHY/MODULATION/nr_modulation.h
@@ -78,6 +78,7 @@ void nr_ue_layer_mapping(const c16_t *mod_symbs, const int n_layers, const int n
 \param symbol symbol within slot (0..12/14)
 \param Ns Slot number (0..19)
 \param sample_offset offset within rxdata (points to beginning of subframe)
+\param levdB Input level in dB, used to select the DFT scaling schedule in OFDM demodulation
 */
 
 int nr_slot_fep_ul(NR_DL_FRAME_PARMS *frame_parms,
@@ -85,7 +86,8 @@ int nr_slot_fep_ul(NR_DL_FRAME_PARMS *frame_parms,
                    int32_t *rxdataF,
                    unsigned char symbol,
                    unsigned char Ns,
-                   int sample_offset);
+                   int sample_offset,
+                   uint32_t levdB);
 
 /*!
 \brief This function implements the dft transform precoding in PUSCH
diff --git a/openair1/PHY/MODULATION/ofdm_mod.c b/openair1/PHY/MODULATION/ofdm_mod.c
index e25f7cd7f2457542bded79101f3bc0185e45311a..36c6756b7f88c9475c56e013077b54005eb418bd 100644
--- a/openair1/PHY/MODULATION/ofdm_mod.c
+++ b/openair1/PHY/MODULATION/ofdm_mod.c
@@ -137,6 +137,7 @@ void PHY_ofdm_mod(const int *input, /// pointer to complex input
     return;
 
   idft_size_idx_t idft_size = get_idft(fftsize);
+  uint32_t *scaling_sched = get_idft_scaling(fftsize,0);
 
 #ifdef DEBUG_OFDM_MOD
   printf("[PHY] OFDM mod (size %d,prefix %d) Symbols %d, input %p, output %p\n",
@@ -169,11 +170,11 @@ void PHY_ofdm_mod(const int *input, /// pointer to complex input
         // Current idft implementation uses AVX-256: Check if buffer is already aligned to 256 bits (32 bytes)
         if ((uintptr_t)output_ptr % 32 == 0) {
           // output ptr is aligned, do ifft inplace
-          idft(idft_size, (int16_t *)&input[i * fftsize], (int16_t *)output_ptr, 1);
+          idft(idft_size, (int16_t *)&input[i * fftsize], (int16_t *)output_ptr, scaling_sched);
         } else {
           // output ptr is not aligned, needs an extra memcpy
           c16_t temp[fftsize] __attribute__((aligned(IDFT_OUTPUT_BUFFER_ALIGNMENT)));
-          idft(idft_size, (int16_t *)&input[i * fftsize], (int16_t *)temp, 1);
+          idft(idft_size, (int16_t *)&input[i * fftsize], (int16_t *)temp, scaling_sched);
           memcpy((void *)output_ptr, (void *)temp, sizeof(temp));
         }
         // perform cyclic prefix insertion
@@ -184,7 +185,7 @@ void PHY_ofdm_mod(const int *input, /// pointer to complex input
       case CYCLIC_SUFFIX: {
         // Use alignment of 64 bytes
         c16_t temp[fftsize] __attribute__((aligned(IDFT_OUTPUT_BUFFER_ALIGNMENT)));
-        idft(idft_size, (int16_t *)&input[i * fftsize], (int16_t *)temp, 1);
+        idft(idft_size, (int16_t *)&input[i * fftsize], (int16_t *)temp, scaling_sched);
         int *output_ptr = &output[(i * fftsize) + (i * nb_prefix_samples)];
         memcpy(output_ptr, temp, sizeof(temp));
         memcpy(&output_ptr[fftsize], temp, nb_prefix_samples * sizeof(c16_t));
@@ -197,7 +198,7 @@ void PHY_ofdm_mod(const int *input, /// pointer to complex input
 
       case NONE: {
         c16_t temp[fftsize] __attribute__((aligned(IDFT_OUTPUT_BUFFER_ALIGNMENT)));
-        idft(idft_size, (int16_t *)&input[i * fftsize], (int16_t *)temp, 1);
+        idft(idft_size, (int16_t *)&input[i * fftsize], (int16_t *)temp, scaling_sched);
         int *output_ptr = &output[i * fftsize];
         memcpy(output_ptr, temp, sizeof(temp));
         break;
diff --git a/openair1/PHY/MODULATION/slot_fep.c b/openair1/PHY/MODULATION/slot_fep.c
index 8488545355228e53b36690b5f045ea03c6321608..37ae8e4f7f2f58858075d796dc5ad343c19262b8 100644
--- a/openair1/PHY/MODULATION/slot_fep.c
+++ b/openair1/PHY/MODULATION/slot_fep.c
@@ -74,6 +74,7 @@ int slot_fep(PHY_VARS_UE *ue,
     return(-1);
   }
 
+  uint32_t sigenergy_avg=0;
   for (aa=0; aa<frame_parms->nb_antennas_rx; aa++) {
     memset(&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],0,frame_parms->ofdm_symbol_size*sizeof(int));
     rx_offset = sample_offset + slot_offset + nb_prefix_samples0 + subframe_offset - SOFFSET;
@@ -85,17 +86,24 @@ int slot_fep(PHY_VARS_UE *ue,
         memcpy((short *)&common_vars->rxdata[aa][frame_length_samples],
                (short *)&common_vars->rxdata[aa][0],
                frame_parms->ofdm_symbol_size*sizeof(int));
-
+      uint32_t sigenergy=0;
+      int dft_in_levdB;
+      if (ue->dft_in_levdB < 0) { // no stored input level yet: measure this symbol (modulo wrap, length in samples not bytes)
+        sigenergy=signal_energy((int32_t*)&common_vars->rxdata[aa][rx_offset % frame_length_samples],frame_parms->ofdm_symbol_size);
+        dft_in_levdB = dB_fixed(sigenergy);
+        sigenergy_avg += (sigenergy/frame_parms->nb_antennas_rx);
+      }
+      else dft_in_levdB = ue->dft_in_levdB;
       if ((rx_offset&7)!=0) {  // if input to dft is not 256-bit aligned, issue for size 6,15 and 25 PRBs
         memcpy((void *)tmp_dft_in,
                (void *)&common_vars->rxdata[aa][rx_offset % frame_length_samples],
                frame_parms->ofdm_symbol_size*sizeof(int));
         dft(dftsizeidx,(int16_t *)tmp_dft_in,
-            (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1);
+            (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],get_dft_scaling(s,dft_in_levdB));
       } else { // use dft input from RX buffer directly
         start_UE_TIMING(ue->rx_dft_stats);
         dft(dftsizeidx,(int16_t *)&common_vars->rxdata[aa][(rx_offset) % frame_length_samples],
-            (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1);
+            (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],get_dft_scaling(s,dft_in_levdB));
         stop_UE_TIMING(ue->rx_dft_stats);
       }
     } else {
@@ -120,14 +128,15 @@ int slot_fep(PHY_VARS_UE *ue,
                (void *)&common_vars->rxdata[aa][(rx_offset) % frame_length_samples],
                frame_parms->ofdm_symbol_size*sizeof(int));
         dft(dftsizeidx,(int16_t *)tmp_dft_in,
-            (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1);
+            (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],get_dft_scaling(s,ue->dft_in_levdB));
       } else { // use dft input from RX buffer directly
         dft(dftsizeidx,(int16_t *)&common_vars->rxdata[aa][(rx_offset) % frame_length_samples],
-            (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1);
+            (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],get_dft_scaling(s,ue->dft_in_levdB));
       }
 
       stop_UE_TIMING(ue->rx_dft_stats);
     }
+    if (ue->dft_in_levdB < 0) ue->dft_in_levdB = dB_fixed(sigenergy_avg)+20;
 
 
 #ifdef DEBUG_FEP
@@ -249,11 +258,11 @@ int front_end_fft(PHY_VARS_UE *ue,
                (void *)&common_vars->rxdata[aa][rx_offset % frame_length_samples],
                frame_parms->ofdm_symbol_size*sizeof(int));
         dft(dftsizeidx,(int16_t *)tmp_dft_in,
-            (int16_t *)&common_vars->common_vars_rx_data_per_thread[threadId].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1);
+            (int16_t *)&common_vars->common_vars_rx_data_per_thread[threadId].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],get_dft_scaling(s,ue->dft_in_levdB));
       } else { // use dft input from RX buffer directly
         start_meas(&ue->rx_dft_stats);
         dft(dftsizeidx,(int16_t *)&common_vars->rxdata[aa][(rx_offset) % frame_length_samples],
-            (int16_t *)&common_vars->common_vars_rx_data_per_thread[threadId].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1);
+            (int16_t *)&common_vars->common_vars_rx_data_per_thread[threadId].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],get_dft_scaling(s,ue->dft_in_levdB));
         stop_meas(&ue->rx_dft_stats);
       }
     } else {
@@ -279,10 +288,10 @@ int front_end_fft(PHY_VARS_UE *ue,
                (void *)&common_vars->rxdata[aa][(rx_offset) % frame_length_samples],
                frame_parms->ofdm_symbol_size*sizeof(int));
         dft(dftsizeidx,(int16_t *)tmp_dft_in,
-            (int16_t *)&common_vars->common_vars_rx_data_per_thread[threadId].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1);
+            (int16_t *)&common_vars->common_vars_rx_data_per_thread[threadId].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],get_dft_scaling(s,ue->dft_in_levdB));
       } else { // use dft input from RX buffer directly
         dft(dftsizeidx,(int16_t *)&common_vars->rxdata[aa][(rx_offset) % frame_length_samples],
-            (int16_t *)&common_vars->common_vars_rx_data_per_thread[threadId].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1);
+            (int16_t *)&common_vars->common_vars_rx_data_per_thread[threadId].rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],get_dft_scaling(s,ue->dft_in_levdB));
       }
 
       stop_meas(&ue->rx_dft_stats);
diff --git a/openair1/PHY/MODULATION/slot_fep_mbsfn.c b/openair1/PHY/MODULATION/slot_fep_mbsfn.c
index fa3421959387a2ba9b8db8527450a38ad31dffa5..ca22b3b0c75fb46e1db6eeac4ffc763d05385ec3 100644
--- a/openair1/PHY/MODULATION/slot_fep_mbsfn.c
+++ b/openair1/PHY/MODULATION/slot_fep_mbsfn.c
@@ -84,7 +84,7 @@ int slot_fep_mbsfn(PHY_VARS_UE *ue,
                                               nb_prefix_samples0 +
                                               subframe_offset -
                                               SOFFSET) % frame_length_samples],
-          (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[subframe]].rxdataF[aa][frame_parms->ofdm_symbol_size*l],1);
+          (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[subframe]].rxdataF[aa][frame_parms->ofdm_symbol_size*l],get_dft_scaling(s,ue->dft_in_levdB));
       stop_UE_TIMING(ue->rx_dft_stats);
     } else {
       if ((sample_offset +
@@ -102,7 +102,7 @@ int slot_fep_mbsfn(PHY_VARS_UE *ue,
                                               (frame_parms->ofdm_symbol_size+nb_prefix_samples)*(l-1) +
                                               subframe_offset-
                                               SOFFSET) % frame_length_samples],
-          (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[subframe]].rxdataF[aa][frame_parms->ofdm_symbol_size*l],1);
+          (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[subframe]].rxdataF[aa][frame_parms->ofdm_symbol_size*l],get_dft_scaling(s,ue->dft_in_levdB));
       stop_UE_TIMING(ue->rx_dft_stats);
     }
   }
@@ -223,7 +223,7 @@ int slot_fep_mbsfn_khz_1dot25(PHY_VARS_UE *ue,
                                             nb_prefix_samples +
                                             subframe_offset -
                                             SOFFSET) % frame_length_samples],
-        (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[subframe]].rxdataF[aa][0],1);
+        (int16_t *)&common_vars->common_vars_rx_data_per_thread[ue->current_thread_id[subframe]].rxdataF[aa][0],get_dft_scaling(ofdm_symbol_size,ue->dft_in_levdB));
     stop_UE_TIMING(ue->rx_dft_stats);
   }
 
diff --git a/openair1/PHY/MODULATION/slot_fep_nr.c b/openair1/PHY/MODULATION/slot_fep_nr.c
index bd61d100f837e6d60f5800ca97da077cb2b287a3..269fcf39c32570fbd020b4703fe8b3cefff56e40 100644
--- a/openair1/PHY/MODULATION/slot_fep_nr.c
+++ b/openair1/PHY/MODULATION/slot_fep_nr.c
@@ -86,6 +86,12 @@ int nr_slot_fep(PHY_VARS_NR_UE *ue,
   Ns, symbol, nb_prefix_samples, nb_prefix_samples0, rx_offset, dB_fixed(signal_energy((int32_t *)&common_vars->rxdata[0][rx_offset],frame_parms->ofdm_symbol_size)));
 #endif
 
+  uint32_t *scaling_sched=NULL;
+
+  if (ue && ue->dft_in_levdB >=0)  
+    scaling_sched = get_dft_scaling(frame_parms->ofdm_symbol_size,ue->dft_in_levdB);
+  
+  uint32_t sigenergy_avg=0;
   for (unsigned char aa=0; aa<frame_parms->nb_antennas_rx; aa++) {
     int16_t *rxdata_ptr = (int16_t *)&rxdata[aa][rx_offset];
 
@@ -110,18 +116,25 @@ int nr_slot_fep(PHY_VARS_NR_UE *ue,
 
     if (ue)
       start_meas_nr_ue_phy(ue, RX_DFT_STATS);
-
+    else
+      scaling_sched = get_dft_scaling(frame_parms->ofdm_symbol_size,dB_fixed(signal_energy((int32_t*)rxdata_ptr,frame_parms->ofdm_symbol_size))); // no UE context: measure every call
+
+    if (ue && ue->dft_in_levdB < 0) { // this means dft scaling level needs to be recomputed
+      uint32_t sigenergy= signal_energy((int32_t*)rxdata_ptr,frame_parms->ofdm_symbol_size); // length is the symbol size in samples, not the dft_size_idx_t index
+      scaling_sched = get_dft_scaling(frame_parms->ofdm_symbol_size,dB_fixed(sigenergy));
+      sigenergy_avg += sigenergy/frame_parms->nb_antennas_rx;
+    }
     dft(dftsize,
         rxdata_ptr,
         (int16_t *)&rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],
-        1);
+        scaling_sched);
 
     if (ue)
       stop_meas_nr_ue_phy(ue, RX_DFT_STATS);
 
     apply_nr_rotation_RX(frame_parms, rxdataF[aa], frame_parms->symbol_rotation[linktype], slot, N_RB, 0, symbol, 1);
   }
-
+  if (ue && ue->dft_in_levdB < 0) ue->dft_in_levdB = dB_fixed(sigenergy_avg) + 20;
 #ifdef DEBUG_FEP
   printf("slot_fep: done\n");
 #endif
@@ -134,12 +147,14 @@ int nr_slot_fep_ul(NR_DL_FRAME_PARMS *frame_parms,
                    int32_t *rxdataF,
                    unsigned char symbol,
                    unsigned char Ns,
-                   int sample_offset)
+                   int sample_offset,
+                   uint32_t levdB)
 {
   unsigned int nb_prefix_samples  = frame_parms->nb_prefix_samples;
   unsigned int nb_prefix_samples0 = frame_parms->nb_prefix_samples0;
 
   dft_size_idx_t dftsize = get_dft(frame_parms->ofdm_symbol_size);
+  uint32_t *scaling_sched = get_dft_scaling(frame_parms->ofdm_symbol_size,levdB);
   // This is for misalignment issues
   int32_t tmp_dft_in[8192] __attribute__ ((aligned (32)));
 
@@ -183,7 +198,7 @@ int nr_slot_fep_ul(NR_DL_FRAME_PARMS *frame_parms,
   dft(dftsize,
       rxdata_ptr,
       (int16_t *)&rxdataF[symbol * frame_parms->ofdm_symbol_size],
-      1);
+      scaling_sched);
 
   return 0;
 }
diff --git a/openair1/PHY/MODULATION/slot_fep_ul.c b/openair1/PHY/MODULATION/slot_fep_ul.c
index 51c215fe4656266a8b47239b5c7ebcf395f54fb3..a6fa11864b3297704df4f9ad86fa40a68d0ce738 100644
--- a/openair1/PHY/MODULATION/slot_fep_ul.c
+++ b/openair1/PHY/MODULATION/slot_fep_ul.c
@@ -82,7 +82,7 @@ int slot_fep_ul(RU_t *ru,
 #endif
       dft( dftsize,(int16_t *)&common->rxdata_7_5kHz[aa][rx_offset],
            (int16_t *)&common->rxdataF[aa][fp->ofdm_symbol_size*symbol],
-           1
+           get_dft_scaling(s,ru->dft_in_levdB) 
          );
     } else {
       
@@ -94,13 +94,13 @@ int slot_fep_ul(RU_t *ru,
 	       fp->ofdm_symbol_size*sizeof(int));
         dft( dftsize,(short *) tmp_dft_in,
              (short*)  &common->rxdataF[aa][fp->ofdm_symbol_size*symbol],
-             1
+             get_dft_scaling(s,ru->dft_in_levdB) 
            );
       }
       else{
       dft( dftsize,(short *)&common->rxdata_7_5kHz[aa][rx_offset],
            (short*)&common->rxdataF[aa][fp->ofdm_symbol_size*symbol],
-           1
+           get_dft_scaling(s,ru->dft_in_levdB) 
          );
       }
     }
diff --git a/openair1/PHY/NR_ESTIMATION/nr_measurements_gNB.c b/openair1/PHY/NR_ESTIMATION/nr_measurements_gNB.c
index 42b0b0b5155a4ffb7d0fdca407faa5d5f02ba129..5d78835e8e431068a0f63e40fe204a6e1806f6d8 100644
--- a/openair1/PHY/NR_ESTIMATION/nr_measurements_gNB.c
+++ b/openair1/PHY/NR_ESTIMATION/nr_measurements_gNB.c
@@ -102,7 +102,7 @@ void dump_nr_I0_stats(FILE *fd,PHY_VARS_gNB *gNB) {
      if (i%25 == 24) fprintf(fd,"\n");
     }
     fprintf(fd,"\n");
-    fprintf(fd,"max_IO = %d (%d), min_I0 = %d (%d), avg_I0 = %d dB",max_I0,amax,min_I0,amin,gNB->measurements.n0_subband_power_avg_dB);
+    fprintf(fd,"max_IO = %d (%d), min_I0 = %d (%d), avg_I0 = %d dB, dft_in_levdB %d dB",max_I0,amax,min_I0,amin,gNB->measurements.n0_subband_power_avg_dB,gNB->RU_list[0]->dft_in_levdB);
     if (gNB->frame_parms.nb_antennas_rx>1) {
        fprintf(fd,"(");
        for (int aarx=0;aarx<gNB->frame_parms.nb_antennas_rx;aarx++)
diff --git a/openair1/PHY/NR_TRANSPORT/nr_prach.c b/openair1/PHY/NR_TRANSPORT/nr_prach.c
index 3f41712f2cf734aac57bb718c69c18714177e53e..2b1dfcf7257b264c21386c1f75b7e2cb0e6dff59 100644
--- a/openair1/PHY/NR_TRANSPORT/nr_prach.c
+++ b/openair1/PHY/NR_TRANSPORT/nr_prach.c
@@ -381,6 +381,7 @@ void rx_nr_prach_ru(RU_t *ru, int prachFormat, int numRA, int prachStartSymbol,
   }
 
   const dft_size_idx_t dftsize = get_dft(dftlen);
+  uint32_t *scaling_sched = get_dft_scaling(dftlen,ru->dft_in_levdB);
 
   // Do forward transform
   if (LOG_DEBUGFLAG(PRACH)) {
@@ -416,7 +417,7 @@ void rx_nr_prach_ru(RU_t *ru, int prachFormat, int numRA, int prachStartSymbol,
     // do DFT
     int16_t *prach2 = prach[aa] + (2*Ncp); // times 2 for complex samples
     for (int i = 0; i < reps; i++)
-      dft(dftsize, prach2 + 2*dftlen*i, rxsigF[aa] + 2*dftlen*i, 1);
+      dft(dftsize, prach2 + 2*dftlen*i, rxsigF[aa] + 2*dftlen*i, scaling_sched);
 
     //LOG_M("ru_rxsigF_tmp.m","rxsFtmp", rxsigF[aa], dftlen*2*reps, 1, 1);
 
@@ -641,12 +642,12 @@ void rx_nr_prach(PHY_VARS_gNB *gNB,
 	
         // Now do IFFT of size 1024 (N_ZC=839) or 256 (N_ZC=139)
         if (N_ZC == 839) {
-          idft(IDFT_1024, prachF, prach_ifft_tmp, 1);
+          idft(IDFT_1024, prachF, prach_ifft_tmp, IDFT_SCALING_1024[0]);
           // compute energy and accumulate over receive antennas
           for (int i = 0; i < 1024; i++)
             prach_ifft[i] += (int32_t)prach_ifft_tmp[i<<1]*(int32_t)prach_ifft_tmp[i<<1] + (int32_t)prach_ifft_tmp[1+(i<<1)]*(int32_t)prach_ifft_tmp[1+(i<<1)];
         } else {
-          idft(IDFT_256, prachF, prach_ifft_tmp, 1);
+          idft(IDFT_256, prachF, prach_ifft_tmp, IDFT_SCALING_256[0]);
           log2_ifft_size = 8;
           // compute energy and accumulate over receive antennas and repetitions for BR
           for (int i = 0; i < 256; i++)
diff --git a/openair1/PHY/NR_TRANSPORT/nr_ulsch_demodulation.c b/openair1/PHY/NR_TRANSPORT/nr_ulsch_demodulation.c
index 396c8d89f4f3c9101dc5f645b7c6602ee80b52c6..1017191c49e2f5bba8b1b223834e706cbc4c25f2 100644
--- a/openair1/PHY/NR_TRANSPORT/nr_ulsch_demodulation.c
+++ b/openair1/PHY/NR_TRANSPORT/nr_ulsch_demodulation.c
@@ -35,7 +35,7 @@ void nr_idft(int32_t *z, uint32_t Msc_PUSCH)
   dft_size_idx_t dftsize = get_dft(Msc_PUSCH);
   switch (Msc_PUSCH) {
     case 12:
-      dft(dftsize, (int16_t *)idft_in0, (int16_t *)idft_out0, 0);
+      dft(dftsize, (int16_t *)idft_in0, (int16_t *)idft_out0, (uint32_t*)0);
 
       norm128 = simde_mm_set1_epi16(9459);
 
@@ -45,7 +45,7 @@ void nr_idft(int32_t *z, uint32_t Msc_PUSCH)
 
       break;
     default:
-      dft(dftsize, idft_in0, idft_out0, 1);
+      dft(dftsize, idft_in0, idft_out0, (uint32_t*)1);
       break;
   }
 
diff --git a/openair1/PHY/NR_UE_TRANSPORT/nr_prach.c b/openair1/PHY/NR_UE_TRANSPORT/nr_prach.c
index e2928b2df465322e67b269a95b65b1633b076a54..e7364be44cd368e49f63b676512380646b94d267 100644
--- a/openair1/PHY/NR_UE_TRANSPORT/nr_prach.c
+++ b/openair1/PHY/NR_UE_TRANSPORT/nr_prach.c
@@ -428,7 +428,8 @@ int32_t generate_nr_prach(PHY_VARS_NR_UE *ue, uint8_t gNB_id, int frame, uint8_t
   // This is after cyclic prefix
     c16_t *prach2 = prach + Ncp;
     const idft_size_idx_t idft_size = get_idft(dftlen);
-    idft(idft_size, prachF, (int16_t *)prach, 1);
+    uint32_t *scaling_sched = get_idft_scaling(dftlen,0);
+    idft(idft_size, prachF, (int16_t *)prach, scaling_sched);
     memmove(prach2, prach, (dftlen << 2));
 
     if (prach_sequence_length == 0) {
diff --git a/openair1/PHY/NR_UE_TRANSPORT/pss_nr.c b/openair1/PHY/NR_UE_TRANSPORT/pss_nr.c
index 180ca999184d69c4e03d9a83cfb1f7bbaaf4c19e..51859b8e3005249401c487d1627cd26b731eaa58 100644
--- a/openair1/PHY/NR_UE_TRANSPORT/pss_nr.c
+++ b/openair1/PHY/NR_UE_TRANSPORT/pss_nr.c
@@ -162,7 +162,7 @@ void generate_pss_nr_time(const NR_DL_FRAME_PARMS *fp, const int N_ID_2, int ssb
   idft((int16_t)get_idft(fp->ofdm_symbol_size),
        (int16_t *)synchroF_tmp, /* complex input but legacy type is wrong*/
        (int16_t *)pssTime, /* complex output */
-       1); /* scaling factor */
+       get_idft_scaling(fp->ofdm_symbol_size,1)); /* scaling factor */
 
 #ifdef DBG_PSS_NR
 
@@ -201,7 +201,7 @@ void generate_pss_nr_time(const NR_DL_FRAME_PARMS *fp, const int N_ID_2, int ssb
     dft((int16_t)get_dft(length),
     	synchro_tmp,           /* complex input */
         synchroF_tmp,          /* complex output */
-        1);                 /* scaling factor */
+        get_dft_scaling(length,0)); /* scaling factor */
 
     if ((N_ID_2 == 0) && (length == 256)) {
       LOG_M("pss_f_0.m","pss_f_0",synchroF_tmp,length,1,1);
@@ -731,7 +731,7 @@ void sl_generate_pss_ifft_samples(sl_nr_ue_phy_params_t *sl_ue_params, SL_NR_UE_
     idft((int16_t)get_idft(sl_fp->ofdm_symbol_size),
          (int16_t *)&pss_F[0], /* complex input */
          (int16_t *)&pss_T[0], /* complex output */
-         1); /* scaling factor */
+         get_idft_scaling(sl_fp->ofdm_symbol_size,1)); /* scaling factor */
   }
 
 #ifdef SL_DUMP_PSBCH_TX_SAMPLES
diff --git a/openair1/PHY/NR_UE_TRANSPORT/sss_nr.c b/openair1/PHY/NR_UE_TRANSPORT/sss_nr.c
index 0a13462d8c37de53dcd82489327ee8e97bf0f37f..286ae1d2ac8545768139adec28a5f97149da4425 100644
--- a/openair1/PHY/NR_UE_TRANSPORT/sss_nr.c
+++ b/openair1/PHY/NR_UE_TRANSPORT/sss_nr.c
@@ -169,7 +169,7 @@ void insert_sss_nr(int16_t *sss_time,
   idft(IDFT_2048,
        (int16_t *)synchroF_tmp, /* complex input */
        (int16_t *)synchro_tmp, /* complex output */
-       1); /* scaling factor */
+       IDFT_SCALING_2048[1]); /* scaling factor */
 
   /* then get final sss in time */
   memcpy(sss_time, synchro_tmp, ofdm_symbol_size * sizeof(c16_t));
diff --git a/openair1/PHY/TOOLS/Makefile b/openair1/PHY/TOOLS/Makefile
index a881bd40707c6fd82af1f083cfaa3c98baf41aee..14d24112c5631a21bd02a3dc9c71e9e12e164cef 100644
--- a/openair1/PHY/TOOLS/Makefile
+++ b/openair1/PHY/TOOLS/Makefile
@@ -1,8 +1,11 @@
 oai_dfts_sse4: oai_dfts.c
-	gcc -O3 -std=gnu99 -msse4.1 -o oai_dfts_sse4 oai_dfts.c time_meas.c  ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR_HOME -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR_TARGETS/COMMON -I$$OPENAIR_HOME/radio/COMMON -I$$OPENAIR2_DIR -I$$OPENAIR2_DIR/COMMON -I$$OPENAIR_HOME/common/utils -I$$OPENAIR_HOME/common/utils/T -I$$OPENAIR_HOME/common/utils/msc -I$$OPENAIR_HOME/nfapi/open-nFAPI/nfapi/public_inc -DMR_MAIN -DNB_ANTENNAS_RX=1 -lm -lpthread # -DD256STATS #-DD64STATS
+	gcc -O3 -std=gnu99 -msse4.1 -o oai_dfts_sse4 oai_dfts.c time_meas.c  ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR_HOME -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR_TARGETS/COMMON -I$$OPENAIR_HOME/radio/COMMON -I$$OPENAIR2_DIR -I$$OPENAIR2_DIR/COMMON -I$$OPENAIR_HOME/common/utils -I$$OPENAIR_HOME/common/utils/T -I$$OPENAIR_HOME/common/utils/msc -I$$OPENAIR_HOME/nfapi/open-nFAPI/nfapi/public_inc -DMR_MAIN -DNB_ANTENNAS_RX=1 -DNB_ANTENNAS_TX=1 -lm -lpthread # -DD256STATS #-DD64STATS
 
-oai_dfts_avx2: oai_dfts.c
-	gcc -O2 -std=gnu99 -mavx2 -g -ggdb -o oai_dfts_avx2 oai_dfts.c time_meas.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR_HOME -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR_TARGETS/COMMON -I$$OPENAIR_HOME/radio/COMMON -I$$OPENAIR2_DIR -I$$OPENAIR2_DIR/COMMON -I$$OPENAIR_HOME/common/utils -I$$OPENAIR_HOME/common/utils/T -I$$OPENAIR_HOME/common/utils/msc -I$$OPENAIR_HOME/nfapi/open-nFAPI/nfapi/public_inc -DMR_MAIN -DNB_ANTENNAS_RX=1 -lm -lpthread # -DD256STATS #-DD64STATS
+oai_dfts_avx2: oai_dfts.c fft_double.c
+	gcc -O2 -std=gnu99 -mavx2 -g -ggdb -o oai_dfts_avx2 fft_double.c oai_dfts.c ../../../common/utils/time_meas.c ../../SIMULATION/TOOLS/taus.c ../../SIMULATION/TOOLS/rangen_double.c -I$$OPENAIR_HOME -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR_TARGETS/COMMON -I$$OPENAIR_HOME/radio/COMMON -I$$OPENAIR2_DIR -I$$OPENAIR2_DIR/COMMON -I$$OPENAIR_HOME/common/utils -I$$OPENAIR_HOME/common/utils/T -I$$OPENAIR_HOME/common/utils/msc -I$$OPENAIR_HOME/nfapi/open-nFAPI/nfapi/public_inc -DMR_MAIN -DNB_ANTENNAS_RX=1 -DNB_ANTENNAS_TX=1 -DMAX_NUM_CCs=1 -lm -lpthread # -DD256STATS #-DD64STATS
+# AVX-512 build of the DFT test bench; writes its own binary so it no longer overwrites oai_dfts_avx2
+oai_dfts_avx512: oai_dfts.c fft_double.c
+	gcc -O2 -std=gnu99 -mavx512bw -march=skylake-avx512 -mtune=skylake-avx512 -g -ggdb -o oai_dfts_avx512 fft_double.c oai_dfts.c ../../../common/utils/time_meas.c ../../SIMULATION/TOOLS/taus.c ../../SIMULATION/TOOLS/rangen_double.c -I$$OPENAIR_HOME -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR_TARGETS/COMMON -I$$OPENAIR_HOME/radio/COMMON -I$$OPENAIR2_DIR -I$$OPENAIR2_DIR/COMMON -I$$OPENAIR_HOME/common/utils -I$$OPENAIR_HOME/common/utils/T -I$$OPENAIR_HOME/common/utils/msc -I$$OPENAIR_HOME/nfapi/open-nFAPI/nfapi/public_inc -DMR_MAIN -DNB_ANTENNAS_RX=1 -DNB_ANTENNAS_TX=1 -DMAX_NUM_CCs=1 -lm -lpthread # -DD256STATS #-DD64STATS
 
 oai_dfts_avx2.s: oai_dfts.c
 	gcc -O2 -std=gnu99 -mavx2 -S oai_dfts.c time_meas.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR_HOME -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR_TARGETS/COMMON -I$$OPENAIR_HOME/radio/COMMON -I$$OPENAIR2_DIR -I$$OPENAIR2_DIR/COMMON -I$$OPENAIR_HOME/common/utils -I$$OPENAIR_HOME/common/utils/T -I$$OPENAIR_HOME/common/utils/msc -I$$OPENAIR_HOME/nfapi/open-nFAPI/nfapi/public_inc -DMR_MAIN -DNB_ANTENNAS_RX=1 -lm -lpthread # -DD256STATS #-DD64STATS
@@ -15,5 +18,5 @@ oai_dfts_sse4.s: oai_dfts.c
 dft_cycles_avx2: oai_dfts_avx2
 	./oai_dfts_avx2 | grep -E cycles
 
-oai_dfts_aarch64: oai_dfts_neon.c
-	 gcc -O2 -std=gnu99 -gdwarf-2 -lgcc -lrt -g -ggdb -o oai_dfts_neon oai_dfts_neon.c ../../../common/utils/time_meas.c ../../SIMULATION/TOOLS/taus.c $$OPENAIR_HOME/common/utils/LOG/log.c  ../../SIMULATION/TOOLS/rangen_double.c -I$$OPENAIR_HOME -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR_TARGETS/COMMON -I$$OPENAIR_HOME/radio/COMMON -I$$OPENAIR2_DIR -I$$OPENAIR2_DIR/COMMON -I$$OPENAIR_HOME/common/utils -I$$OPENAIR_HOME/common/utils/T -I$$OPENAIR_HOME/common/utils/msc -I$$OPENAIR_HOME/nfapi/open-nFAPI/nfapi/public_inc -DMR_MAIN -DNB_ANTENNAS_RX=1 -DNB_ANTENNAS_TX=1 -DMAX_NUM_CCs=1 -lm -lpthread
+oai_dfts_aarch64: oai_dfts_neon.c fft_double.c
+	 gcc -O2 -std=gnu99 -gdwarf-2 -lgcc -lrt -g -ggdb -o oai_dfts_neon fft_double.c oai_dfts_neon.c  ../../../common/utils/time_meas.c ../../SIMULATION/TOOLS/taus.c ../../SIMULATION/TOOLS/rangen_double.c -I$$OPENAIR_HOME -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR_TARGETS/COMMON -I$$OPENAIR_HOME/radio/COMMON -I$$OPENAIR2_DIR -I$$OPENAIR2_DIR/COMMON -I$$OPENAIR_HOME/common/utils -I$$OPENAIR_HOME/common/utils/T -I$$OPENAIR_HOME/common/utils/msc -I$$OPENAIR_HOME/nfapi/open-nFAPI/nfapi/public_inc -DMR_MAIN -DNB_ANTENNAS_RX=1 -DNB_ANTENNAS_TX=1 -DMAX_NUM_CCs=1 -lm -lpthread
diff --git a/openair1/PHY/TOOLS/dfts_load.c b/openair1/PHY/TOOLS/dfts_load.c
index 2e4ddb595fe5796f8d484d3baec978ea5089f9d9..a8a455ddd69098ac7deadc3fe3515579acc84de2 100644
--- a/openair1/PHY/TOOLS/dfts_load.c
+++ b/openair1/PHY/TOOLS/dfts_load.c
@@ -40,6 +40,56 @@
 #include "common/config/config_userapi.h" 
 #include "common/utils/load_module_shlib.h" 
 
+uint32_t DFT_SCALING_64[5][2]   = {{3,0},{2,1},{1,2},{1,2},{1,2}}; // DFT_SCALING_<size>[row][i]: presumably per-stage shift schedules, one row selected by get_dft_scaling() - TODO confirm
+uint32_t DFT_SCALING_128[5][3]  = {{4,0,0},{3,1,0},{2,2,0},{1,3,0},{0,4,0}};
+uint32_t DFT_SCALING_256[5][3]  = {{4,0,0},{3,1,0},{2,2,0},{1,3,0},{0,4,0}};
+int32_t DFT_SCALING_512_THRES[7] = {53,57,59,63,65,69,100}; // *_THRES arrays have one entry per row of the matching table; presumably input-level thresholds in dB - TODO confirm
+uint32_t DFT_SCALING_512[7][4]  = {{5,0,0,0},{4,1,0,0},{4,0,1,0},{3,1,1,0},{3,0,1,1},{2,1,1,1},{2,0,1,2}};
+uint32_t DFT_SCALING_768[][4]   = {{1,2,2,0},{1,2,2,0},{1,2,2,0},{1,2,2,0},{1,2,2,0}};
+int32_t DFT_SCALING_1024_THRES[5] = {49,55,63,69,100};
+uint32_t DFT_SCALING_1024[5][4] = {{5,0,0,0},{4,1,0,0},{3,1,1,0},{2,1,1,1},{1,1,1,2}};
+uint32_t DFT_SCALING_1536[5][5] = {{1,1,2,2,0},{1,1,2,2,0},{1,1,2,2,0},{1,1,2,2,0},{1,1,2,2,0}};
+int32_t DFT_SCALING_2048_THRES[10] = {47,49,53,55,59,61,65,67,69,100};
+uint32_t DFT_SCALING_2048[10][5] = {{1,1,1,1,2},{1,1,1,1,2},{1,1,1,1,2},{1,1,1,1,2},{1,1,1,1,2},{1,1,1,1,2},{1,1,1,1,2},{1,1,1,1,2},{1,1,1,1,2},{1,1,1,1,2}} ; // all 10 rows are identical; commented-out alternative schedule: {{6,0,0,0,0},{5,1,0,0,0},{5,0,1,0,0},{4,1,1,0,0},{4,0,1,1,0},{3,1,1,1,0},{3,0,1,1,1},{2,1,1,1,1},{2,0,2,0,2},{1,0,1,0,4}}
+uint32_t DFT_SCALING_3072[5][5] = {{1,4,1,0,0},{1,0,3,2,0},{1,0,3,2,0},{1,0,3,2,0},{1,0,3,2,0}};
+int32_t DFT_SCALING_4096_THRES[8] = {43,49,57,61,63,65,69,100};
+uint32_t DFT_SCALING_4096[8][5] = {{6,0,0,0,0},{5,1,0,0,0},{4,1,1,0,0},{3,1,1,1,0},{2,2,0,1,1},{2,1,1,1,1},{1,1,2,1,1},{0,0,3,0,3}};
+uint32_t DFT_SCALING_6144[5][6] = {{1,1,0,3,2,0},{1,1,0,3,2,0},{1,1,0,3,2,0},{1,1,0,3,2,0},{1,1,0,3,2,0}};
+uint32_t DFT_SCALING_8192[5][6] = {{1,0,0,3,3,0},{1,0,0,3,3,0},{1,0,0,3,3,0},{1,0,0,3,3,0},{1,0,0,3,3,0}};
+uint32_t DFT_SCALING_9216[5][6] = {{1,1,0,3,2,1},{1,1,0,3,2,0},{1,1,0,3,2,0},{1,1,0,3,2,0},{1,1,0,3,2,0}}; // NOTE(review): row 0 ends in 1 while every other row ends in 0 - confirm this is intended
+uint32_t DFT_SCALING_12288[5][6] = {{1,0,0,3,3,0},{1,0,0,3,3,0},{1,0,0,3,3,0},{1,0,0,3,3,0},{1,0,0,3,3,0}};
+uint32_t DFT_SCALING_16384[5][6] = {{0,0,1,3,3,0},{0,0,1,3,3,0},{0,0,1,3,3,0},{0,0,1,3,3,0},{0,0,1,3,3,0}};
+uint32_t DFT_SCALING_18432[5][7] = {{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0}};
+uint32_t DFT_SCALING_24576[5][7] = {{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0}};
+uint32_t DFT_SCALING_32768[5][7] = {{1,0,0,1,3,3,0},{1,0,0,1,3,3,0},{1,0,0,1,3,3,0},{1,0,0,1,3,3,0},{1,0,0,1,3,3,0}};
+uint32_t DFT_SCALING_36864[5][7] = {{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0},{1,1,0,0,3,3,0}};
+uint32_t DFT_SCALING_49152[5][7] = {{1,0,0,1,3,3,0},{1,0,0,1,3,3,0},{1,0,0,1,3,3,0},{1,0,0,1,3,3,0},{1,0,0,1,3,3,0}};
+uint32_t DFT_SCALING_65536[5][7] = {{0,0,0,2,3,3,0},{0,0,0,2,3,3,0},{0,0,0,2,3,3,0},{0,0,0,2,3,3,0},{0,0,0,2,3,3,0}};
+uint32_t DFT_SCALING_73728[5][8] = {{1,1,1,0,0,3,3,0},{1,1,1,0,0,3,3,0},{1,1,1,0,0,3,3,0},{1,1,1,0,0,3,3,0},{1,1,1,0,0,3,3,0}};
+uint32_t DFT_SCALING_98304[5][8] = {{1,1,0,0,1,3,3,0},{1,1,0,0,1,3,3,0},{1,1,0,0,1,3,3,0},{1,1,0,0,1,3,3,0},{1,1,0,0,1,3,3,0}};
+// IDFT_SCALING_<size>[row][i]: two schedules per size, used as the scale argument of idft(); row presumably picked by get_idft_scaling() - TODO confirm
+uint32_t IDFT_SCALING_128[2][2] = {{2,2},{1,3}};
+uint32_t IDFT_SCALING_256[2][2] = {{2,2},{1,3}};
+uint32_t IDFT_SCALING_512[2][3] = {{1,2,2},{1,1,3}};
+uint32_t IDFT_SCALING_768[2][3] = {{1,2,2},{1,1,3}};
+uint32_t IDFT_SCALING_1024[2][3] = {{4,1,0},{1,1,3}};
+uint32_t IDFT_SCALING_1536[2][4] = {{1,1,1,3},{1,1,1,3}};
+uint32_t IDFT_SCALING_2048[2][4] = {{3,2,1,0},{1,1,1,3}};
+uint32_t IDFT_SCALING_3072[2][4] = {{1,1,1,3},{1,1,1,3}};
+uint32_t IDFT_SCALING_4096[2][4] = {{3,2,1,0},{1,1,1,3}};
+uint32_t IDFT_SCALING_6144[2][5] = {{1,1,0,3,2},{1,1,1,1,3}};
+uint32_t IDFT_SCALING_8192[2][5] = {{1,0,0,3,3},{1,1,1,1,3}};
+uint32_t IDFT_SCALING_9216[2][5] = {{1,0,0,3,3},{1,1,1,1,3}};
+uint32_t IDFT_SCALING_12288[2][5] = {{1,0,0,3,3},{1,1,1,1,3}};
+uint32_t IDFT_SCALING_16384[2][5] = {{0,0,1,3,3},{1,1,1,1,3}};
+uint32_t IDFT_SCALING_18432[2][6] = {{1,1,0,0,3,3},{1,1,1,1,1,3}};
+uint32_t IDFT_SCALING_24576[2][6] = {{1,1,0,0,3,3},{1,1,1,1,1,3}};
+uint32_t IDFT_SCALING_32768[2][6] = {{1,0,0,1,3,3},{1,1,1,1,1,3}};
+uint32_t IDFT_SCALING_36864[2][6] = {{1,1,0,0,3,3},{1,1,1,1,1,3}};
+uint32_t IDFT_SCALING_49152[2][6] = {{1,0,0,1,3,3},{1,1,1,1,1,3}};
+uint32_t IDFT_SCALING_65536[2][6] = {{0,0,0,2,3,3},{1,1,1,1,1,3}};
+uint32_t IDFT_SCALING_73728[2][7] = {{1,1,1,0,0,3,3},{1,1,1,1,1,1,3}};
+uint32_t IDFT_SCALING_98304[2][7] = {{1,1,0,0,1,3,3},{1,1,1,1,1,1,3}};
 
 /* function description array, to be used when loading the dfts/idfts lib */
 static loader_shlibfunc_t shlib_fdesc[2];
diff --git a/openair1/PHY/TOOLS/fft_double.c b/openair1/PHY/TOOLS/fft_double.c
new file mode 100644
index 0000000000000000000000000000000000000000..4109d0681748f491f76c211e83f0c7b94b9f236f
--- /dev/null
+++ b/openair1/PHY/TOOLS/fft_double.c
@@ -0,0 +1,89 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "tools_defs.h"
+
+/* Twiddle factor W = exp(-j*2*pi*stuff/N): W->r gets the cosine, W->i minus the sine. */
+void twiddle(cd_t *W, int N, double stuff) {
+  const double phase = stuff*2.0*M_PI/(double)N;
+  W->r = cos(phase);
+  W->i = -sin(phase);
+}
+
+
+int bitrev64[64] = {0,32,16,48,8,40,24,56,4,36,20,52,12,44,28,60,2,34,18,50,10,42,26,58,
+6,38,22,54,14,46,30,62,1,33,17,49,9,41,25,57,5,37,21,53,13,45,29,61,
+3,35,19,51,11,43,27,59,7,39,23,55,15,47,31,63};
+int bitrev128[128];
+int bitrev256[256];
+int bitrev512[512];
+int bitrev1024[1024];
+int bitrev2048[2048];
+int bitrev4096[4096];
+
+/* Derive a 2*half-entry bit-reversal table from the half-entry one: the first
+ * half holds the doubled source indices, the second half the same plus one. */
+static void bitrev_extend(int *dst, const int *src, int half)
+{
+  for (int k = 0; k < half; k++) {
+    dst[k] = 2 * src[k];
+    dst[k + half] = 1 + dst[k];
+  }
+}
+
+/* Fill bitrev128 ... bitrev4096. Each table is built from the next smaller
+ * one, starting from the constant bitrev64, so the calls must stay in
+ * ascending size order. */
+void init_bitrev() {
+  bitrev_extend(bitrev128, bitrev64, 64);
+  bitrev_extend(bitrev256, bitrev128, 128);
+  bitrev_extend(bitrev512, bitrev256, 256);
+  bitrev_extend(bitrev1024, bitrev512, 512);
+  bitrev_extend(bitrev2048, bitrev1024, 1024);
+  bitrev_extend(bitrev4096, bitrev2048, 2048);
+}
+
+/** RADIX-2 FFT ALGORITHM */
+/* Double precision, in-place, recursive decimation-in-frequency FFT of the
+ * N samples in x (N expected to be a power of two); outputs end up in
+ * bit-reversed order. N < 2 returns at once: N/2 would be 0 and the
+ * "N2!=1" test below would otherwise recurse on length 0 forever. */
+void radix2(cd_t *x, int N)
+{
+  if (N < 2)
+    return;
+
+  const int N2 = N/2;
+  cd_t W, bfly[2];
+  /** Do 2 Point DFT */
+  for (int n2=0; n2<N2; n2++)
+    {
+      /** Radix 2 butterfly: sum and difference of the two half-length inputs */
+      bfly[0].r = (x[n2].r + x[N2 + n2].r);
+      bfly[0].i = (x[n2].i + x[N2 + n2].i);
+      bfly[1].r = (x[n2].r - x[N2 + n2].r);
+      bfly[1].i = (x[n2].i - x[N2 + n2].i);
+      /* the sum is stored as is, the difference is rotated by W = exp(-j*2*pi*n2/N) */
+      twiddle(&W, N, (double)n2);
+      x[n2].r = bfly[0].r;
+      x[n2].i = bfly[0].i;
+      x[n2 + N2].r = bfly[1].r*W.r - bfly[1].i*W.i;
+      x[n2 + N2].i = bfly[1].i*W.r + bfly[1].r*W.i;
+    }
+
+  /** Don't recurse if we're down to one butterfly */
+  if (N2!=1) {
+    radix2(&x[0], N2);
+    radix2(&x[N2], N2);
+  }
+}
+
+/* y[i] = x[bitrev[i]] / sqrt(N) for i in [0,N): reorders x through the index
+ * table bitrev and scales by 1/sqrt(N). x and y must be distinct buffers. */
+void normalize(cd_t *x,cd_t *y, int *bitrev, int N) {
+  const double norm = sqrt((double)N);
+  for (int k = 0; k < N; k++) {
+    y[k].r = x[bitrev[k]].r / norm;
+    y[k].i = x[bitrev[k]].i / norm;
+  }
+}
diff --git a/openair1/PHY/TOOLS/oai_dfts.c b/openair1/PHY/TOOLS/oai_dfts.c
index 36bae5bd3ddea5c9781fd6070f746650226fdfd9..ced79bef7d7f9e46c8bd48d131f0c3c6ef38d063 100644
--- a/openair1/PHY/TOOLS/oai_dfts.c
+++ b/openair1/PHY/TOOLS/oai_dfts.c
@@ -58,7 +58,6 @@
 
 #define print_ints(s,x) printf("%s %d %d %d %d\n",s,(x)[0],(x)[1],(x)[2],(x)[3])
 
-
 const static int16_t conjugatedft[32] __attribute__((aligned(32))) = {-1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1};
 
 
@@ -1038,45 +1037,82 @@ __attribute__((always_inline)) static inline void bfly5_tw1(simde__m128i *x0,
   *(y4) = simde_mm_adds_epi16(*(x0), *(y4));
 }
 
-// performs 4x4 transpose of input x (complex interleaved) using 128bit SIMD intrinsics
+// performs 8x4 transpose of input x (complex interleaved) using 256bit SIMD intrinsics
 // i.e. x = [x0r x0i x1r x1i ... x15r x15i], y = [x0r x0i x4r x4i x8r x8i x12r x12i x1r x1i x5r x5i x9r x9i x13r x13i x2r x2i ... x15r x15i]
 __attribute__((always_inline)) static inline void transpose16_ooff_simd256(simde__m256i *x, simde__m256i *y, int off)
 {
-  register simde__m256i ytmp0, ytmp1, ytmp2, ytmp3, ytmp4, ytmp5, ytmp6, ytmp7;
+  // x[0] = [x0 x1 x2 x3 x4 x5 x6 x7]
+  // x[1] = [x8 x9 x10 x11 x12 x13 x14 x15]
+  // x[2] = [x16 x17 x18 x19 x20 x21 x22 x23]
+  // x[3] = [x24 x25 x26 x27 x28 x29 x30 x31]
+  // y[0] = [x0 x4 x8 x12 x16 x20 x24 x28]
+  // y[off] = [x1 x5 x9 x13 x17 x21 x25 x29]
+  // y[2*off] = [x2 x6 x10 x14 x18 x22 x26 x30]
+  // y[3*off] = [x3 x7 x11 x15 x19 x23 x27 x31]
   simde__m256i *y2 = y;
+#ifndef __AVX512VBMI__
+  register simde__m256i ytmp0, ytmp1, ytmp2, ytmp3, ytmp4, ytmp5, ytmp6, ytmp7;
   simde__m256i const perm_mask = simde_mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0);
 
-  ytmp0 = simde_mm256_permutevar8x32_epi32(x[0],perm_mask);  // x00 x10 x01 x11 x02 x12 x03 x13
-  ytmp1 = simde_mm256_permutevar8x32_epi32(x[1],perm_mask);  // x20 x30 x21 x31 x22 x32 x23 x33
-  ytmp2 = simde_mm256_permutevar8x32_epi32(x[2],perm_mask);  // x40 x50 x41 x51 x42 x52 x43 x53
-  ytmp3 = simde_mm256_permutevar8x32_epi32(x[3],perm_mask);  // x60 x70 x61 x71 x62 x72 x63 x73
-  ytmp4 = simde_mm256_unpacklo_epi64(ytmp0,ytmp1);           // x00 x10 x20 x30 x01 x11 x21 x31
-  ytmp5 = simde_mm256_unpackhi_epi64(ytmp0,ytmp1);           // x02 x12 x22 x32 x03 x13 x23 x33
-  ytmp6 = simde_mm256_unpacklo_epi64(ytmp2,ytmp3);           // x40 x50 x60 x70 x41 x51 x61 x71
-  ytmp7 = simde_mm256_unpackhi_epi64(ytmp2,ytmp3);           // x42 x52 x62 x72 x43 x53 x63 x73
+  ytmp0 = simde_mm256_permutevar8x32_epi32(x[0],perm_mask); // x0 x4  x2  x6  x1 x5  x3  x7 
+  ytmp1 = simde_mm256_permutevar8x32_epi32(x[1],perm_mask); // x8 x12 x10 x14 x9 x13 x11 x15
+  ytmp2 = simde_mm256_permutevar8x32_epi32(x[2],perm_mask); // x16 x20 x18 x22 x17 x21 x19 x23 
+
+  ytmp3 = simde_mm256_permutevar8x32_epi32(x[3],perm_mask); // x24 x28 x26 x30 x25 x29 x27 x31
+                                                            
+  ytmp4 = simde_mm256_unpacklo_epi64(ytmp0,ytmp1);          // x0  x4  x8  x12 x1  x5  x9  x13 
+  ytmp5 = simde_mm256_unpackhi_epi64(ytmp0,ytmp1);          // x2  x6  x10 x14 x3  x7  x11 x15
+  ytmp6 = simde_mm256_unpacklo_epi64(ytmp2,ytmp3);          // x16 x20 x24 x28 x17 x21 x25 x29 
+  ytmp7 = simde_mm256_unpackhi_epi64(ytmp2,ytmp3);          // x18 x22 x26 x30 x19 x23 x27 x31 
 
-  *y2    = simde_mm256_insertf128_si256(ytmp4,simde_mm256_extracti128_si256(ytmp6,0),1);  //x00 x10 x20 x30 x40 x50 x60 x70
+  *y2    = simde_mm256_insertf128_si256(ytmp4,simde_mm256_extracti128_si256(ytmp6,0),1);  // x0 x4 x8 x12 x16 x20 x24 x28
   y2+=off;  
-  *y2    = simde_mm256_insertf128_si256(ytmp6,simde_mm256_extracti128_si256(ytmp4,1),0);  //x01 x11 x21 x31 x41 x51 x61 x71
+  *y2    = simde_mm256_insertf128_si256(ytmp6,simde_mm256_extracti128_si256(ytmp4,1),0);  // x1 x5 x9 x13 x17 x21 x25 x29
   y2+=off;  
-  *y2    = simde_mm256_insertf128_si256(ytmp5,simde_mm256_extracti128_si256(ytmp7,0),1);  //x00 x10 x20 x30 x40 x50 x60 x70
+  *y2    = simde_mm256_insertf128_si256(ytmp5,simde_mm256_extracti128_si256(ytmp7,0),1);  // x2 x6 x10 x14 x18 x22 x26 x30
   y2+=off;  
-  *y2    = simde_mm256_insertf128_si256(ytmp7,simde_mm256_extracti128_si256(ytmp5,1),0);  //x01 x11 x21 x31 x41 x51 x61 x71
+  *y2    = simde_mm256_insertf128_si256(ytmp7,simde_mm256_extracti128_si256(ytmp5,1),0);  // x3 x7 x11 x15 x19 x23 x27 x31
+#else
+  register simde__m256i ytmp0, ytmp1, ytmp2, ytmp3;
+  simde__m256i const perm_mask1 = simde_mm256_set_epi32(13, 9, 5, 1, 12, 8, 4, 0);
+  simde__m256i const perm_mask2 = simde_mm256_set_epi32(15, 11, 7, 3, 14, 10, 6, 2);
+
+  simde__m256i const perm_mask3 = simde_mm256_set_epi64x(5, 4, 1, 0);
+  simde__m256i const perm_mask4 = simde_mm256_set_epi64x(7, 6, 3, 2);
+  ytmp0 = _mm256_permutex2var_epi32(x[0],perm_mask1,x[1]); // x0 x4  x8  x12  x1 x5  x9  x13 
+  ytmp1 = _mm256_permutex2var_epi32(x[2],perm_mask1,x[3]); // x16 x20 x24 x28 x17 x21 x25 x29 
+  ytmp2 = _mm256_permutex2var_epi32(x[0],perm_mask2,x[1]); // x2 x6  x10  x14  x3 x7  x11  x15 
+  ytmp3 = _mm256_permutex2var_epi32(x[2],perm_mask2,x[3]); // x18 x22 x26 x30 x19 x23 x27 x31
+  *y2 = _mm256_permutex2var_epi64(ytmp0,perm_mask3,ytmp1);
+  y2+=off;
+  *y2 = _mm256_permutex2var_epi64(ytmp0,perm_mask4,ytmp1);
+  y2+=off;
+  *y2 = _mm256_permutex2var_epi64(ytmp2,perm_mask3,ytmp3);
+  y2+=off;
+  *y2 = _mm256_permutex2var_epi64(ytmp2,perm_mask4,ytmp3);
+#endif
 }
 
 __attribute__((always_inline)) static inline void transpose4_ooff_simd256(simde__m256i *x, simde__m256i *y, int off)
 {
-  simde__m256i const perm_mask = simde_mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
-  simde__m256i perm_tmp0, perm_tmp1;
-
   // x[0] = [x0 x1 x2 x3 x4 x5 x6 x7]
   // x[1] = [x8 x9 x10 x11 x12 x13 x14]
   // y[0] = [x0 x2 x4 x6 x8 x10 x12 x14]
   // y[off] = [x1 x3 x5 x7 x9 x11 x13 x15]
+#ifndef __AVX512VBMI__
+  simde__m256i const perm_mask = simde_mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
+  simde__m256i perm_tmp0, perm_tmp1;
+
   perm_tmp0 = simde_mm256_permutevar8x32_epi32(x[0],perm_mask);
   perm_tmp1 = simde_mm256_permutevar8x32_epi32(x[1],perm_mask);
   y[0]   = simde_mm256_insertf128_si256(perm_tmp0,simde_mm256_extracti128_si256(perm_tmp1,0),1);
   y[off] = simde_mm256_insertf128_si256(perm_tmp1,simde_mm256_extracti128_si256(perm_tmp0,1),0);
+#else
+  __m256i const perm_mask1 = _mm256_set_epi32(14,12,10,8,6,4,2,0);
+  __m256i const perm_mask2 = _mm256_set_epi32(15,13,11,9,7,5,3,1);
+  y[0]   = _mm256_permutex2var_epi32(x[0],perm_mask1,x[1]);
+  y[off] = _mm256_permutex2var_epi32(x[0],perm_mask2,x[1]);
+#endif
 }
 
 // 16-point optimized DFT kernel
@@ -1186,8 +1222,10 @@ static inline void dft16(int16_t *x,int16_t *y) __attribute__((always_inline)
 }
 #endif
 
+//#define USE_DFT16_SHIFT
+
 // Does two 16-point DFTS (x[0 .. 15] is 128 LSBs of input vector, x[16..31] is in 128 MSBs)
-__attribute__((always_inline)) static inline void dft16_simd256(int16_t *x, int16_t *y)
+__attribute__((always_inline)) static inline void dft16_simd256(int16_t *x, int16_t *y,int scale)
 {
   simde__m256i *tw16a_256 = (simde__m256i *)tw16arep, *tw16b_256 = (simde__m256i *)tw16brep, *x256 = (simde__m256i *)x,
                *y256 = (simde__m256i *)y;
@@ -1226,7 +1264,10 @@ __attribute__((always_inline)) static inline void dft16_simd256(int16_t *x, int1
                                                             0,
                                                             3,
                                                             2);
-
+#ifdef __AVX512VBMI__
+  const __m256i outputshufa = _mm256_set_epi64x(5,4,1,0);
+  const __m256i outputshufb = _mm256_set_epi64x(7,6,3,2);
+#endif
   // First stage : 4 Radix-4 butterflies without input twiddles
 
   x02t    = simde_mm256_adds_epi16(x256[0],x256[2]);
@@ -1247,14 +1288,18 @@ __attribute__((always_inline)) static inline void dft16_simd256(int16_t *x, int1
   print_shorts256("xtmp2",(int16_t*)&xtmp2);
   print_shorts256("xtmp3",(int16_t*)&xtmp3);*/
 
-  ytmp0   = simde_mm256_unpacklo_epi32(xtmp0,xtmp1);  
-  ytmp1   = simde_mm256_unpackhi_epi32(xtmp0,xtmp1);
-  ytmp2   = simde_mm256_unpacklo_epi32(xtmp2,xtmp3);
-  ytmp3   = simde_mm256_unpackhi_epi32(xtmp2,xtmp3);
-  xtmp0   = simde_mm256_unpacklo_epi64(ytmp0,ytmp2);
-  xtmp1   = simde_mm256_unpackhi_epi64(ytmp0,ytmp2);
-  xtmp2   = simde_mm256_unpacklo_epi64(ytmp1,ytmp3);
-  xtmp3   = simde_mm256_unpackhi_epi64(ytmp1,ytmp3);
+  // x0  x1  x2  x3  x4  x5  x6  x7
+  // x8  x9  x10 x11 x12 x13 x14 x15
+  // x16 x17 x18 x19 x20 x21 x22 x23
+  // x24 x25 x26 x27 x28 x29 x30 x31
+  ytmp0   = simde_mm256_unpacklo_epi32(xtmp0,xtmp1); // x0 x8 x1 x9  x4 x12 x5 x13  
+  ytmp1   = simde_mm256_unpackhi_epi32(xtmp0,xtmp1); // x2 x10 x3 x11 x6 x14 x7 x15
+  ytmp2   = simde_mm256_unpacklo_epi32(xtmp2,xtmp3); // x16 x24 x17 x25 x20 x28 x21 x29
+  ytmp3   = simde_mm256_unpackhi_epi32(xtmp2,xtmp3); // x18 x26 x19 x27 x22 x30 x23 x31
+  xtmp0   = simde_mm256_unpacklo_epi64(ytmp0,ytmp2); // x0 x8 x16 x24 x4 x12 x20 x28
+  xtmp1   = simde_mm256_unpackhi_epi64(ytmp0,ytmp2); // x1 x9 x17 x25 x5 x13 x21 x29
+  xtmp2   = simde_mm256_unpacklo_epi64(ytmp1,ytmp3); // x2 x10 x18 x26 x6 x14 x22 x30
+  xtmp3   = simde_mm256_unpackhi_epi64(ytmp1,ytmp3); // x3 x11 x19 x27 x7 x15 x23 x31
 
   // Second stage : 4 Radix-4 butterflies with input twiddles
   xtmp1 = packed_cmult2_256(xtmp1,tw16a_256[0],tw16b_256[0]);
@@ -1268,27 +1313,32 @@ __attribute__((always_inline)) static inline void dft16_simd256(int16_t *x, int1
 
   x02t    = simde_mm256_adds_epi16(xtmp0,xtmp2);
   x13t    = simde_mm256_adds_epi16(xtmp1,xtmp3);
-  ytmp0 = simde_mm256_srai_epi16(simde_mm256_adds_epi16(x02t, x13t), 2);
-  ytmp2 = simde_mm256_srai_epi16(simde_mm256_subs_epi16(x02t, x13t), 2);
+  ytmp0   = simde_mm256_srai_epi16(simde_mm256_adds_epi16(x02t, x13t), scale);
+  ytmp2   = simde_mm256_srai_epi16(simde_mm256_subs_epi16(x02t, x13t), scale);
   x1_flip = simde_mm256_sign_epi16(xtmp1, *(simde__m256i *)conjugatedft);
   x1_flip = simde_mm256_shuffle_epi8(x1_flip,complex_shuffle);
   x3_flip = simde_mm256_sign_epi16(xtmp3, *(simde__m256i *)conjugatedft);
   x3_flip = simde_mm256_shuffle_epi8(x3_flip,complex_shuffle);
   x02t    = simde_mm256_subs_epi16(xtmp0,xtmp2);
   x13t    = simde_mm256_subs_epi16(x1_flip,x3_flip);
-  ytmp1 = simde_mm256_srai_epi16(simde_mm256_adds_epi16(x02t, x13t), 2); // x0 + x1f - x2 - x3f
-  ytmp3 = simde_mm256_srai_epi16(simde_mm256_subs_epi16(x02t, x13t), 2); // x0 - x1f - x2 + x3f
+  ytmp1 = simde_mm256_srai_epi16(simde_mm256_adds_epi16(x02t, x13t), scale); // x0 + x1f - x2 - x3f
+  ytmp3 = simde_mm256_srai_epi16(simde_mm256_subs_epi16(x02t, x13t), scale); // x0 - x1f - x2 + x3f
 
   // [y0  y1  y2  y3  y16 y17 y18 y19]
   // [y4  y5  y6  y7  y20 y21 y22 y23]
   // [y8  y9  y10 y11 y24 y25 y26 y27]
   // [y12 y13 y14 y15 y28 y29 y30 y31]
-
+#ifndef __AVX512VBMI__ 
   y256[0] = simde_mm256_insertf128_si256(ytmp0,simde_mm256_extracti128_si256(ytmp1,0),1);
   y256[1] = simde_mm256_insertf128_si256(ytmp2,simde_mm256_extracti128_si256(ytmp3,0),1);
   y256[2] = simde_mm256_insertf128_si256(ytmp1,simde_mm256_extracti128_si256(ytmp0,1),0);
   y256[3] = simde_mm256_insertf128_si256(ytmp3,simde_mm256_extracti128_si256(ytmp2,1),0);
-
+#else
+  y256[0] = _mm256_permutex2var_epi64(ytmp0,outputshufa,ytmp1);
+  y256[1] = _mm256_permutex2var_epi64(ytmp2,outputshufa,ytmp3);
+  y256[2] = _mm256_permutex2var_epi64(ytmp0,outputshufb,ytmp1);
+  y256[3] = _mm256_permutex2var_epi64(ytmp2,outputshufb,ytmp3);
+#endif
   // [y0  y1  y2  y3  y4  y5  y6  y7]
   // [y8  y9  y10 y11 y12 y13 y14 y15]
   // [y16 y17 y18 y19 y20 y21 y22 y23]
@@ -1401,6 +1451,10 @@ __attribute__((always_inline)) static inline void idft16_simd256(int16_t *x, int
                                                             3,
                                                             2);
 
+#ifdef __AVX512VBMI__
+  const __m256i outputshufa = _mm256_set_epi64x(5,4,1,0);
+  const __m256i outputshufb = _mm256_set_epi64x(7,6,3,2);
+#endif
   // First stage : 4 Radix-4 butterflies without input twiddles
 
   x02t    = simde_mm256_adds_epi16(x256[0],x256[2]);
@@ -1448,11 +1502,17 @@ __attribute__((always_inline)) static inline void idft16_simd256(int16_t *x, int
   // [y8  y9  y10 y11 y24 y25 y26 y27]
   // [y12 y13 y14 y15 y28 y29 y30 y31]
 
+#ifndef __AVX512VBMI__
   y256[0] = simde_mm256_insertf128_si256(ytmp0,simde_mm256_extracti128_si256(ytmp1,0),1);
   y256[1] = simde_mm256_insertf128_si256(ytmp2,simde_mm256_extracti128_si256(ytmp3,0),1);
   y256[2] = simde_mm256_insertf128_si256(ytmp1,simde_mm256_extracti128_si256(ytmp0,1),0);
   y256[3] = simde_mm256_insertf128_si256(ytmp3,simde_mm256_extracti128_si256(ytmp2,1),0);
-
+#else
+  y256[0] = _mm256_permutex2var_epi64(ytmp0,outputshufa,ytmp1);
+  y256[1] = _mm256_permutex2var_epi64(ytmp2,outputshufa,ytmp3);
+  y256[2] = _mm256_permutex2var_epi64(ytmp0,outputshufb,ytmp1);
+  y256[3] = _mm256_permutex2var_epi64(ytmp2,outputshufb,ytmp3);
+#endif
 }
 // 64-point optimized DFT
 
@@ -1520,14 +1580,13 @@ const static int16_t tw64c[96] __attribute__((aligned(32))) = {
 #define set1_int16_simd256(a) simde_mm256_set1_epi16(a);
 #define mulhi_int16_simd256(a,b) simde_mm256_mulhrs_epi16(a,b); //simde_mm256_slli_epi16(simde_mm256_mulhi_epi16(a,b),1);
 
-void dft64(int16_t *x,int16_t *y,unsigned char scale)
+void dft64(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd256_q15_t xtmp[16],ytmp[16],*tw64a_256=(simd256_q15_t *)tw64a,*tw64b_256=(simd256_q15_t *)tw64b,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y;
-  simd256_q15_t xintl0,xintl1,xintl2,xintl3,xintl4,xintl5,xintl6,xintl7;
-  simd256_q15_t const perm_mask = simde_mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0);
-
 
+  int scale16=0;
+  if (scale) scale16 = scale[1];
 #ifdef D64STATS
   time_stats_t ts_t,ts_d,ts_b;
 
@@ -1541,53 +1600,38 @@ void dft64(int16_t *x,int16_t *y,unsigned char scale)
   stop_meas(&ts_t);
   start_meas(&ts_d);
 #endif
-  /*  
-  print_shorts256("x2560",(int16_t*)x256);
-  print_shorts256("x2561",(int16_t*)(x256+1));
-  print_shorts256("x2562",(int16_t*)(x256+2));
-  print_shorts256("x2563",(int16_t*)(x256+3));
-  print_shorts256("x2564",(int16_t*)(x256+4));
-  print_shorts256("x2565",(int16_t*)(x256+5));
-  print_shorts256("x2566",(int16_t*)(x256+6));
-  print_shorts256("x2567",(int16_t*)(x256+7));
-  */
-  xintl0 = simde_mm256_permutevar8x32_epi32(x256[0],perm_mask);  // x0  x4  x1  x5  x2  x6  x3  x7
-  xintl1 = simde_mm256_permutevar8x32_epi32(x256[1],perm_mask);  // x8  x12 x9  x13 x10 x14 x11 x15
-  xintl2 = simde_mm256_permutevar8x32_epi32(x256[2],perm_mask);  // x16 x20 x17 x21 x18 x22 x19 x23
-  xintl3 = simde_mm256_permutevar8x32_epi32(x256[3],perm_mask);  // x24 x28 x25 x29 x26 x30 x27 x31
-  xintl4 = simde_mm256_permutevar8x32_epi32(x256[4],perm_mask);  // x32 x28 x25 x29 x26 x30 x27 x31
-  xintl5 = simde_mm256_permutevar8x32_epi32(x256[5],perm_mask);  // x40 x28 x25 x29 x26 x30 x27 x31
-  xintl6 = simde_mm256_permutevar8x32_epi32(x256[6],perm_mask);  // x48 x28 x25 x29 x26 x30 x27 x31
-  xintl7 = simde_mm256_permutevar8x32_epi32(x256[7],perm_mask);  // x56 x28 x25 x29 x26 x30 x27 x31
-  /*
-  print_shorts256("xintl0",(int16_t*)&xintl0);
-  print_shorts256("xintl1",(int16_t*)&xintl1);
-  print_shorts256("xintl2",(int16_t*)&xintl2);
-  print_shorts256("xintl3",(int16_t*)&xintl3);
-  print_shorts256("xintl4",(int16_t*)&xintl4);
-  print_shorts256("xintl5",(int16_t*)&xintl5);
-  print_shorts256("xintl6",(int16_t*)&xintl6);
-  print_shorts256("xintl7",(int16_t*)&xintl7);
-  */
+#ifndef __AVX512VBMI__
+  simd256_q15_t xintl0,xintl1,xintl2,xintl3,xintl4,xintl5,xintl6,xintl7;
+  simd256_q15_t const perm_mask = simde_mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0);
+  xintl0 = simde_mm256_permutevar8x32_epi32(x256[0],perm_mask);  // x0  x4  x2  x6  x1  x5  x3  x7
+  xintl1 = simde_mm256_permutevar8x32_epi32(x256[1],perm_mask);  // x8  x12 x10  x14 x9 x13 x11 x15
+  xintl2 = simde_mm256_permutevar8x32_epi32(x256[2],perm_mask);  // x16 x20 x18 x22 x17 x21 x19 x23
+  xintl3 = simde_mm256_permutevar8x32_epi32(x256[3],perm_mask);  // x24 x28 x26 x30 x25 x29 x27 x31
+  xintl4 = simde_mm256_permutevar8x32_epi32(x256[4],perm_mask);  // x32 x36 x34 x38 x33 x37 x35 x39
+  xintl5 = simde_mm256_permutevar8x32_epi32(x256[5],perm_mask);  
+  xintl6 = simde_mm256_permutevar8x32_epi32(x256[6],perm_mask);  
+  xintl7 = simde_mm256_permutevar8x32_epi32(x256[7],perm_mask);  
   xtmp[0] = simde_mm256_unpacklo_epi64(xintl0,xintl1);        // x0  x4  x8  x12 x1  x5  x9  x13
   xtmp[4] = simde_mm256_unpackhi_epi64(xintl0,xintl1);        // x2  x6  x10 x14 x3  x7  x11 x15
-  xtmp[1] = simde_mm256_unpacklo_epi64(xintl2,xintl3);        // x16 x20 x24 x28 x17 x21 x25 x29
-  xtmp[5] = simde_mm256_unpackhi_epi64(xintl2,xintl3);        // x18 x22 x26 x30 x19 x23 x27 x31
-  xtmp[2] = simde_mm256_unpacklo_epi64(xintl4,xintl5);        // x32 x36 x40 x44 x33 x37 x41 x45
-  xtmp[6] = simde_mm256_unpackhi_epi64(xintl4,xintl5);        // x34 x38 x42 x46 x35 x39 x43 x47
-  xtmp[3] = simde_mm256_unpacklo_epi64(xintl6,xintl7);        // x48 x52 x56 x60 x49 x53 x57 x61
-  xtmp[7] = simde_mm256_unpackhi_epi64(xintl6,xintl7);        // x50 x54 x58 x62 x51 x55 x59 x63
-  /*
-  print_shorts256("xtmp0",(int16_t*)xtmp);
-  print_shorts256("xtmp1",(int16_t*)(xtmp+1));
-  print_shorts256("xtmp2",(int16_t*)(xtmp+2));
-  print_shorts256("xtmp3",(int16_t*)(xtmp+3));
-  print_shorts256("xtmp4",(int16_t*)(xtmp+4));
-  print_shorts256("xtmp5",(int16_t*)(xtmp+5));
-  print_shorts256("xtmp6",(int16_t*)(xtmp+6));
-  print_shorts256("xtmp7",(int16_t*)(xtmp+7));
-  */
-  dft16_simd256((int16_t*)(xtmp),(int16_t*)ytmp);
+  xtmp[1] = simde_mm256_unpacklo_epi64(xintl2,xintl3);        
+  xtmp[5] = simde_mm256_unpackhi_epi64(xintl2,xintl3);        
+  xtmp[2] = simde_mm256_unpacklo_epi64(xintl4,xintl5);        
+  xtmp[6] = simde_mm256_unpackhi_epi64(xintl4,xintl5);        
+  xtmp[3] = simde_mm256_unpacklo_epi64(xintl6,xintl7);        
+  xtmp[7] = simde_mm256_unpackhi_epi64(xintl6,xintl7);        
+#else
+  __m256i const perm_mask1 = _mm256_set_epi32(13, 9, 5, 1, 12, 8, 4, 0);
+  __m256i const perm_mask2 = _mm256_set_epi32(15, 11, 7, 3, 14, 10, 6, 2);
+  xtmp[0] = _mm256_permutex2var_epi32(x256[0],perm_mask1,x256[1]); // x0  x4  x8  x12 x1  x5  x9  x13
+  xtmp[1] = _mm256_permutex2var_epi32(x256[2],perm_mask1,x256[3]); // x16 x20 x24 x28 x17 x21 x25 x29
+  xtmp[2] = _mm256_permutex2var_epi32(x256[4],perm_mask1,x256[5]); // x32 x36 x40 x44 x33 x37 x41 x45
+  xtmp[3] = _mm256_permutex2var_epi32(x256[6],perm_mask1,x256[7]); // x48 x52 x56 x60 x49 x53 x57 x61
+  xtmp[4] = _mm256_permutex2var_epi32(x256[0],perm_mask2,x256[1]); // x2  x6  x10 x14 x3  x7  x11 x15
+  xtmp[5] = _mm256_permutex2var_epi32(x256[2],perm_mask2,x256[3]); // x18 x22 x26 x30 x19 x23 x27 x31
+  xtmp[6] = _mm256_permutex2var_epi32(x256[4],perm_mask2,x256[5]); // x34 x38 x42 x46 x35 x39 x43 x47
+  xtmp[7] = _mm256_permutex2var_epi32(x256[6],perm_mask2,x256[7]); // x50 x54 x58 x62 x51 x55 x59 x63
+#endif
+  dft16_simd256((int16_t*)(xtmp),(int16_t*)ytmp,scale16);
   // [y0  y1  y2  y3  y4  y5  y6  y7]
   // [y8  y9  y10 y11 y12 y13 y14 y15]
   // [y16 y17 y18 y19 y20 y21 y22 y23]
@@ -1598,7 +1642,7 @@ void dft64(int16_t *x,int16_t *y,unsigned char scale)
   print_shorts256("ytmp2",(int16_t*)(ytmp+2));
   print_shorts256("ytmp3",(int16_t*)(ytmp+3));
   */
-  dft16_simd256((int16_t*)(xtmp+4),(int16_t*)(ytmp+4));
+  dft16_simd256((int16_t*)(xtmp+4),(int16_t*)(ytmp+4),scale16);
   // [y32 y33 y34 y35 y36 y37 y38 y39]
   // [y40 y41 y42 y43 y44 y45 y46 y47]
   // [y48 y49 y50 y51 y52 y53 y54 y55]
@@ -1649,25 +1693,24 @@ void dft64(int16_t *x,int16_t *y,unsigned char scale)
 #endif
 
 
-  if (scale>0) {
-    y256[0] = shiftright_int16_simd256(y256[0], 1);
-    y256[1] = shiftright_int16_simd256(y256[1], 1);
-    y256[2] = shiftright_int16_simd256(y256[2], 1);
-    y256[3] = shiftright_int16_simd256(y256[3], 1);
-    y256[4] = shiftright_int16_simd256(y256[4], 1);
-    y256[5] = shiftright_int16_simd256(y256[5], 1);
-    y256[6] = shiftright_int16_simd256(y256[6], 1);
-    y256[7] = shiftright_int16_simd256(y256[7], 1);
+  if (scale && *scale>0) {
+    unsigned int scalec=*scale;
+    y256[0]  = shiftright_int16_simd256(y256[0],scalec);
+    y256[1]  = shiftright_int16_simd256(y256[1],scalec);
+    y256[2]  = shiftright_int16_simd256(y256[2],scalec);
+    y256[3]  = shiftright_int16_simd256(y256[3],scalec);
+    y256[4]  = shiftright_int16_simd256(y256[4],scalec);
+    y256[5]  = shiftright_int16_simd256(y256[5],scalec);
+    y256[6]  = shiftright_int16_simd256(y256[6],scalec);
+    y256[7]  = shiftright_int16_simd256(y256[7],scalec);
   }
 
 }
 
-void idft64(int16_t *x,int16_t *y,unsigned char scale)
+void idft64(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd256_q15_t xtmp[16],ytmp[16],*tw64a_256=(simd256_q15_t *)tw64,*tw64b_256=(simd256_q15_t *)tw64c,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y;
-  register simd256_q15_t xintl0,xintl1,xintl2,xintl3,xintl4,xintl5,xintl6,xintl7;
-  simd256_q15_t const perm_mask = simde_mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0);
 
 
 #ifdef D64STATS
@@ -1684,24 +1727,37 @@ void idft64(int16_t *x,int16_t *y,unsigned char scale)
   start_meas(&ts_d);
 #endif
 
-  xintl0 = simde_mm256_permutevar8x32_epi32(x256[0],perm_mask);  // x0  x4  x1  x5  x2  x6  x3  x7
-  xintl1 = simde_mm256_permutevar8x32_epi32(x256[1],perm_mask);  // x8  x12 x9  x13 x10 x14 x11 x15
-  xintl2 = simde_mm256_permutevar8x32_epi32(x256[2],perm_mask);  // x16 x20 x17 x21 x18 x22 x19 x23
-  xintl3 = simde_mm256_permutevar8x32_epi32(x256[3],perm_mask);  // x24 x28 x25 x29 x26 x30 x27 x31
-  xintl4 = simde_mm256_permutevar8x32_epi32(x256[4],perm_mask);  // x24 x28 x25 x29 x26 x30 x27 x31
-  xintl5 = simde_mm256_permutevar8x32_epi32(x256[5],perm_mask);  // x24 x28 x25 x29 x26 x30 x27 x31
-  xintl6 = simde_mm256_permutevar8x32_epi32(x256[6],perm_mask);  // x24 x28 x25 x29 x26 x30 x27 x31
-  xintl7 = simde_mm256_permutevar8x32_epi32(x256[7],perm_mask);  // x24 x28 x25 x29 x26 x30 x27 x31
-
+#ifndef __AVX512VBMI__
+  simd256_q15_t xintl0,xintl1,xintl2,xintl3,xintl4,xintl5,xintl6,xintl7;
+  simd256_q15_t const perm_mask = simde_mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0);
+  xintl0 = simde_mm256_permutevar8x32_epi32(x256[0],perm_mask);  // x0  x4  x2  x6  x1  x5  x3  x7
+  xintl1 = simde_mm256_permutevar8x32_epi32(x256[1],perm_mask);  // x8  x12 x10  x14 x9 x13 x11 x15
+  xintl2 = simde_mm256_permutevar8x32_epi32(x256[2],perm_mask);  // x16 x20 x18 x22 x17 x21 x19 x23
+  xintl3 = simde_mm256_permutevar8x32_epi32(x256[3],perm_mask);  // x24 x28 x26 x30 x25 x29 x27 x31
+  xintl4 = simde_mm256_permutevar8x32_epi32(x256[4],perm_mask);  // x32 x36 x34 x38 x33 x37 x35 x39
+  xintl5 = simde_mm256_permutevar8x32_epi32(x256[5],perm_mask);  
+  xintl6 = simde_mm256_permutevar8x32_epi32(x256[6],perm_mask);  
+  xintl7 = simde_mm256_permutevar8x32_epi32(x256[7],perm_mask);  
   xtmp[0] = simde_mm256_unpacklo_epi64(xintl0,xintl1);        // x0  x4  x8  x12 x1  x5  x9  x13
   xtmp[4] = simde_mm256_unpackhi_epi64(xintl0,xintl1);        // x2  x6  x10 x14 x3  x7  x11 x15
-  xtmp[1] = simde_mm256_unpacklo_epi64(xintl2,xintl3);        // x16 x20 x24 x28 x17 x21 x25 x29
-  xtmp[5] = simde_mm256_unpackhi_epi64(xintl2,xintl3);        // x18 x22 x26 x30 x19 x23 x27 x31
-  xtmp[2] = simde_mm256_unpacklo_epi64(xintl4,xintl5);        // x32 x36 x40 x44 x33 x37 x41 x45
-  xtmp[6] = simde_mm256_unpackhi_epi64(xintl4,xintl5);        // x34 x38 x42 x46 x35 x39 x43 x47
-  xtmp[3] = simde_mm256_unpacklo_epi64(xintl6,xintl7);        // x48 x52 x56 x60 x49 x53 x57 x61
-  xtmp[7] = simde_mm256_unpackhi_epi64(xintl6,xintl7);        // x50 x54 x58 x62 x51 x55 x59 x63
-
+  xtmp[1] = simde_mm256_unpacklo_epi64(xintl2,xintl3);        
+  xtmp[5] = simde_mm256_unpackhi_epi64(xintl2,xintl3);        
+  xtmp[2] = simde_mm256_unpacklo_epi64(xintl4,xintl5);        
+  xtmp[6] = simde_mm256_unpackhi_epi64(xintl4,xintl5);        
+  xtmp[3] = simde_mm256_unpacklo_epi64(xintl6,xintl7);        
+  xtmp[7] = simde_mm256_unpackhi_epi64(xintl6,xintl7);        
+#else
+  __m256i const perm_mask1 = _mm256_set_epi32(13, 9, 5, 1, 12, 8, 4, 0);
+  __m256i const perm_mask2 = _mm256_set_epi32(15, 11, 7, 3, 14, 10, 6, 2);
+  xtmp[0] = _mm256_permutex2var_epi32(x256[0],perm_mask1,x256[1]); // x0  x4  x8  x12 x1  x5  x9  x13
+  xtmp[1] = _mm256_permutex2var_epi32(x256[2],perm_mask1,x256[3]); // x16 x20 x24 x28 x17 x21 x25 x29
+  xtmp[2] = _mm256_permutex2var_epi32(x256[4],perm_mask1,x256[5]); // x32 x36 x40 x44 x33 x37 x41 x45
+  xtmp[3] = _mm256_permutex2var_epi32(x256[6],perm_mask1,x256[7]); // x48 x52 x56 x60 x49 x53 x57 x61
+  xtmp[4] = _mm256_permutex2var_epi32(x256[0],perm_mask2,x256[1]); // x2  x6  x10 x14 x3  x7  x11 x15
+  xtmp[5] = _mm256_permutex2var_epi32(x256[2],perm_mask2,x256[3]); // x18 x22 x26 x30 x19 x23 x27 x31
+  xtmp[6] = _mm256_permutex2var_epi32(x256[4],perm_mask2,x256[5]); // x34 x38 x42 x46 x35 x39 x43 x47
+  xtmp[7] = _mm256_permutex2var_epi32(x256[6],perm_mask2,x256[7]); // x50 x54 x58 x62 x51 x55 x59 x63
+#endif
 
   idft16_simd256((int16_t*)(xtmp),(int16_t*)ytmp);
   // [y0  y1  y2  y3  y16 y17 y18 y19]
@@ -1746,15 +1802,16 @@ void idft64(int16_t *x,int16_t *y,unsigned char scale)
 #endif
 
 
-  if (scale>0) {
-    y256[0]  = shiftright_int16_simd256(y256[0],3);
-    y256[1]  = shiftright_int16_simd256(y256[1],3);
-    y256[2]  = shiftright_int16_simd256(y256[2],3);
-    y256[3]  = shiftright_int16_simd256(y256[3],3);
-    y256[4]  = shiftright_int16_simd256(y256[4],3);
-    y256[5]  = shiftright_int16_simd256(y256[5],3);
-    y256[6]  = shiftright_int16_simd256(y256[6],3);
-    y256[7]  = shiftright_int16_simd256(y256[7],3);
+  if (scale && *scale>0) {
+    unsigned int scalec = *scale;
+    y256[0]  = shiftright_int16_simd256(y256[0],scalec);
+    y256[1]  = shiftright_int16_simd256(y256[1],scalec);
+    y256[2]  = shiftright_int16_simd256(y256[2],scalec);
+    y256[3]  = shiftright_int16_simd256(y256[3],scalec);
+    y256[4]  = shiftright_int16_simd256(y256[4],scalec);
+    y256[5]  = shiftright_int16_simd256(y256[5],scalec);
+    y256[6]  = shiftright_int16_simd256(y256[6],scalec);
+    y256[7]  = shiftright_int16_simd256(y256[7],scalec);
   }
 
 }
@@ -1765,7 +1822,7 @@ static const int16_t tw128a[128] __attribute__((aligned(32))) = { 32767,0,32727,
 
 static const int16_t tw128b[128] __attribute__((aligned(32))) = {0,32767,-1608,32727,-3212,32609,-4808,32412,-6393,32137,-7962,31785,-9512,31356,-11039,30851,-12540,30272,-14010,29621,-15447,28897,-16846,28105,-18205,27244,-19520,26318,-20788,25329,-22005,24278,-23170,23169,-24279,22004,-25330,20787,-26319,19519,-27245,18204,-28106,16845,-28898,15446,-29622,14009,-30273,12539,-30852,11038,-31357,9511,-31786,7961,-32138,6392,-32413,4807,-32610,3211,-32728,1607,-32767,0,-32728,-1608,-32610,-3212,-32413,-4808,-32138,-6393,-31786,-7962,-31357,-9512,-30852,-11039,-30273,-12540,-29622,-14010,-28898,-15447,-28106,-16846,-27245,-18205,-26319,-19520,-25330,-20788,-24279,-22005,-23170,-23170,-22005,-24279,-20788,-25330,-19520,-26319,-18205,-27245,-16846,-28106,-15447,-28898,-14010,-29622,-12540,-30273,-11039,-30852,-9512,-31357,-7962,-31786,-6393,-32138,-4808,-32413,-3212,-32610,-1608,-32728};
 
-void dft128(int16_t *x,int16_t *y,unsigned char scale)
+void dft128(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd256_q15_t xtmp[16],*x256 = (simd256_q15_t *)x;
@@ -1789,8 +1846,10 @@ void dft128(int16_t *x,int16_t *y,unsigned char scale)
      LOG_M("dft128inb_256.m","dftinb",xtmp+8,64,1,1);
   }
 #endif
-  dft64((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  dft64((int16_t*)(xtmp+8),(int16_t*)(ytmp+8),1);
+  unsigned int *scale64=NULL;
+  if (scale) scale64=scale+1;
+  dft64((int16_t*)(xtmp),(int16_t*)ytmp,scale64);
+  dft64((int16_t*)(xtmp+8),(int16_t*)(ytmp+8),scale64);
 #ifndef MR_MAIN
   if (LOG_DUMPFLAG(DEBUG_DFT)) {  
     LOG_M("dft128outa_256.m","dftouta",ytmp,64,1,1);
@@ -1808,25 +1867,45 @@ void dft128(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
-
-    y256[0] = mulhi_int16_simd256(y256[0],ONE_OVER_SQRT2_Q15_256);
-    y256[1] = mulhi_int16_simd256(y256[1],ONE_OVER_SQRT2_Q15_256);
-    y256[2] = mulhi_int16_simd256(y256[2],ONE_OVER_SQRT2_Q15_256);
-    y256[3] = mulhi_int16_simd256(y256[3],ONE_OVER_SQRT2_Q15_256);
-    y256[4] = mulhi_int16_simd256(y256[4],ONE_OVER_SQRT2_Q15_256);
-    y256[5] = mulhi_int16_simd256(y256[5],ONE_OVER_SQRT2_Q15_256);
-    y256[6] = mulhi_int16_simd256(y256[6],ONE_OVER_SQRT2_Q15_256);
-    y256[7] = mulhi_int16_simd256(y256[7],ONE_OVER_SQRT2_Q15_256);
-    y256[8] = mulhi_int16_simd256(y256[8],ONE_OVER_SQRT2_Q15_256);
-    y256[9] = mulhi_int16_simd256(y256[9],ONE_OVER_SQRT2_Q15_256);
-    y256[10] = mulhi_int16_simd256(y256[10],ONE_OVER_SQRT2_Q15_256);
-    y256[11] = mulhi_int16_simd256(y256[11],ONE_OVER_SQRT2_Q15_256);
-    y256[12] = mulhi_int16_simd256(y256[12],ONE_OVER_SQRT2_Q15_256);
-    y256[13] = mulhi_int16_simd256(y256[13],ONE_OVER_SQRT2_Q15_256);
-    y256[14] = mulhi_int16_simd256(y256[14],ONE_OVER_SQRT2_Q15_256);
-    y256[15] = mulhi_int16_simd256(y256[15],ONE_OVER_SQRT2_Q15_256);
-
+  if (scale && *scale>0) {
+
+    if (*scale>1) {
+      uint32_t scale2=*scale-1;
+      y256[0]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[0],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[1]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[1],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[2]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[2],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[3]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[3],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[4]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[4],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[5]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[5],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[6]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[6],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[7]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[7],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[8]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[8],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[9]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[9],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[10]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[10],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[11]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[11],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[12]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[12],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[13]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[13],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[14]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[14],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[15]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[15],scale2),ONE_OVER_SQRT2_Q15_256);
+    } 
+    else {
+      y256[0] = mulhi_int16_simd256(y256[0],ONE_OVER_SQRT2_Q15_256);
+      y256[1] = mulhi_int16_simd256(y256[1],ONE_OVER_SQRT2_Q15_256);
+      y256[2] = mulhi_int16_simd256(y256[2],ONE_OVER_SQRT2_Q15_256);
+      y256[3] = mulhi_int16_simd256(y256[3],ONE_OVER_SQRT2_Q15_256);
+      y256[4] = mulhi_int16_simd256(y256[4],ONE_OVER_SQRT2_Q15_256);
+      y256[5] = mulhi_int16_simd256(y256[5],ONE_OVER_SQRT2_Q15_256);
+      y256[6] = mulhi_int16_simd256(y256[6],ONE_OVER_SQRT2_Q15_256);
+      y256[7] = mulhi_int16_simd256(y256[7],ONE_OVER_SQRT2_Q15_256);
+      y256[8] = mulhi_int16_simd256(y256[8],ONE_OVER_SQRT2_Q15_256);
+      y256[9] = mulhi_int16_simd256(y256[9],ONE_OVER_SQRT2_Q15_256);
+      y256[10] = mulhi_int16_simd256(y256[10],ONE_OVER_SQRT2_Q15_256);
+      y256[11] = mulhi_int16_simd256(y256[11],ONE_OVER_SQRT2_Q15_256);
+      y256[12] = mulhi_int16_simd256(y256[12],ONE_OVER_SQRT2_Q15_256);
+      y256[13] = mulhi_int16_simd256(y256[13],ONE_OVER_SQRT2_Q15_256);
+      y256[14] = mulhi_int16_simd256(y256[14],ONE_OVER_SQRT2_Q15_256);
+      y256[15] = mulhi_int16_simd256(y256[15],ONE_OVER_SQRT2_Q15_256);
+    }
   }
 #ifndef MR_MAIN
   if (LOG_DUMPFLAG(DEBUG_DFT)) {  
@@ -1836,7 +1915,7 @@ void dft128(int16_t *x,int16_t *y,unsigned char scale)
 #endif
 }
 
-void idft128(int16_t *x,int16_t *y,unsigned char scale)
+void idft128(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd256_q15_t xtmp[16],*x256 = (simd256_q15_t *)x;
@@ -1856,8 +1935,10 @@ void idft128(int16_t *x,int16_t *y,unsigned char scale)
   transpose4_ooff_simd256(x256+12,xtmp+6,8);
   transpose4_ooff_simd256(x256+14,xtmp+7,8);
 
-  idft64((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  idft64((int16_t*)(xtmp+8),(int16_t*)(ytmp+8),1);
+  unsigned int *scale64=NULL;
+  if (scale) scale64=scale+1;
+  idft64((int16_t*)(xtmp),(int16_t*)ytmp,scale64);
+  idft64((int16_t*)(xtmp+8),(int16_t*)(ytmp+8),scale64);
 
 
   for (i=0; i<8; i++) {
@@ -1869,25 +1950,45 @@ void idft128(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
-
-    y256[0] = mulhi_int16_simd256(y256[0],ONE_OVER_SQRT2_Q15_256);
-    y256[1] = mulhi_int16_simd256(y256[1],ONE_OVER_SQRT2_Q15_256);
-    y256[2] = mulhi_int16_simd256(y256[2],ONE_OVER_SQRT2_Q15_256);
-    y256[3] = mulhi_int16_simd256(y256[3],ONE_OVER_SQRT2_Q15_256);
-    y256[4] = mulhi_int16_simd256(y256[4],ONE_OVER_SQRT2_Q15_256);
-    y256[5] = mulhi_int16_simd256(y256[5],ONE_OVER_SQRT2_Q15_256);
-    y256[6] = mulhi_int16_simd256(y256[6],ONE_OVER_SQRT2_Q15_256);
-    y256[7] = mulhi_int16_simd256(y256[7],ONE_OVER_SQRT2_Q15_256);
-    y256[8] = mulhi_int16_simd256(y256[8],ONE_OVER_SQRT2_Q15_256);
-    y256[9] = mulhi_int16_simd256(y256[9],ONE_OVER_SQRT2_Q15_256);
-    y256[10] = mulhi_int16_simd256(y256[10],ONE_OVER_SQRT2_Q15_256);
-    y256[11] = mulhi_int16_simd256(y256[11],ONE_OVER_SQRT2_Q15_256);
-    y256[12] = mulhi_int16_simd256(y256[12],ONE_OVER_SQRT2_Q15_256);
-    y256[13] = mulhi_int16_simd256(y256[13],ONE_OVER_SQRT2_Q15_256);
-    y256[14] = mulhi_int16_simd256(y256[14],ONE_OVER_SQRT2_Q15_256);
-    y256[15] = mulhi_int16_simd256(y256[15],ONE_OVER_SQRT2_Q15_256);
-
+  if (scale && *scale>0) {
+
+    if (*scale>1) {
+      uint32_t scale2=*scale-1;
+      y256[0]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[0],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[1]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[1],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[2]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[2],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[3]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[3],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[4]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[4],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[5]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[5],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[6]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[6],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[7]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[7],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[8]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[8],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[9]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[9],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[10]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[10],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[11]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[11],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[12]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[12],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[13]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[13],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[14]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[14],scale2),ONE_OVER_SQRT2_Q15_256);
+      y256[15]  = mulhi_int16_simd256(shiftright_int16_simd256(y256[15],scale2),ONE_OVER_SQRT2_Q15_256);
+    } 
+    else {
+      y256[0] = mulhi_int16_simd256(y256[0],ONE_OVER_SQRT2_Q15_256);
+      y256[1] = mulhi_int16_simd256(y256[1],ONE_OVER_SQRT2_Q15_256);
+      y256[2] = mulhi_int16_simd256(y256[2],ONE_OVER_SQRT2_Q15_256);
+      y256[3] = mulhi_int16_simd256(y256[3],ONE_OVER_SQRT2_Q15_256);
+      y256[4] = mulhi_int16_simd256(y256[4],ONE_OVER_SQRT2_Q15_256);
+      y256[5] = mulhi_int16_simd256(y256[5],ONE_OVER_SQRT2_Q15_256);
+      y256[6] = mulhi_int16_simd256(y256[6],ONE_OVER_SQRT2_Q15_256);
+      y256[7] = mulhi_int16_simd256(y256[7],ONE_OVER_SQRT2_Q15_256);
+      y256[8] = mulhi_int16_simd256(y256[8],ONE_OVER_SQRT2_Q15_256);
+      y256[9] = mulhi_int16_simd256(y256[9],ONE_OVER_SQRT2_Q15_256);
+      y256[10] = mulhi_int16_simd256(y256[10],ONE_OVER_SQRT2_Q15_256);
+      y256[11] = mulhi_int16_simd256(y256[11],ONE_OVER_SQRT2_Q15_256);
+      y256[12] = mulhi_int16_simd256(y256[12],ONE_OVER_SQRT2_Q15_256);
+      y256[13] = mulhi_int16_simd256(y256[13],ONE_OVER_SQRT2_Q15_256);
+      y256[14] = mulhi_int16_simd256(y256[14],ONE_OVER_SQRT2_Q15_256);
+      y256[15] = mulhi_int16_simd256(y256[15],ONE_OVER_SQRT2_Q15_256);
+    }
   }
 
 }
@@ -1906,7 +2007,7 @@ static const int16_t tw256b[384] __attribute__((aligned(32))) = {0,32767,-805,32
                                                     0,32767,-1608,32727,-3212,32609,-4808,32412,-6393,32137,-7962,31785,-9512,31356,-11039,30851,-12540,30272,-14010,29621,-15447,28897,-16846,28105,-18205,27244,-19520,26318,-20788,25329,-22005,24278,-23170,23169,-24279,22004,-25330,20787,-26319,19519,-27245,18204,-28106,16845,-28898,15446,-29622,14009,-30273,12539,-30852,11038,-31357,9511,-31786,7961,-32138,6392,-32413,4807,-32610,3211,-32728,1607,-32767,0,-32728,-1608,-32610,-3212,-32413,-4808,-32138,-6393,-31786,-7962,-31357,-9512,-30852,-11039,-30273,-12540,-29622,-14010,-28898,-15447,-28106,-16846,-27245,-18205,-26319,-19520,-25330,-20788,-24279,-22005,-23170,-23170,-22005,-24279,-20788,-25330,-19520,-26319,-18205,-27245,-16846,-28106,-15447,-28898,-14010,-29622,-12540,-30273,-11039,-30852,-9512,-31357,-7962,-31786,-6393,-32138,-4808,-32413,-3212,-32610,-1608,-32728,
                                                     0,32767,-2411,32678,-4808,32412,-7180,31970,-9512,31356,-11793,30571,-14010,29621,-16151,28510,-18205,27244,-20160,25831,-22005,24278,-23732,22594,-25330,20787,-26790,18867,-28106,16845,-29269,14732,-30273,12539,-31114,10278,-31786,7961,-32285,5601,-32610,3211,-32758,804,-32728,-1608,-32521,-4012,-32138,-6393,-31581,-8740,-30852,-11039,-29956,-13279,-28898,-15447,-27684,-17531,-26319,-19520,-24812,-21403,-23170,-23170,-21403,-24812,-19520,-26319,-17531,-27684,-15447,-28898,-13279,-29956,-11039,-30852,-8740,-31581,-6393,-32138,-4012,-32521,-1608,-32728,804,-32758,3211,-32610,5601,-32285,7961,-31786,10278,-31114,12539,-30273,14732,-29269,16845,-28106,18867,-26790,20787,-25330,22594,-23732,24278,-22005,25831,-20160,27244,-18205,28510,-16151,29621,-14010,30571,-11793,31356,-9512,31970,-7180,32412,-4808,32678,-2411
                                                    };
-void dft256(int16_t *x,int16_t *y,unsigned char scale)
+void dft256(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd256_q15_t xtmp[32],ytmp[32],*tw256a_256p=(simd256_q15_t *)tw256a,*tw256b_256p=(simd256_q15_t *)tw256b,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y;
@@ -1928,11 +2029,13 @@ void dft256(int16_t *x,int16_t *y,unsigned char scale)
     print_shorts256(vname,(int16_t*)(xtmp+i));
   }
   exit(-1);*/
-
-  dft64((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  dft64((int16_t*)(xtmp+8),(int16_t*)(ytmp+8),1);
-  dft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),1);
-  dft64((int16_t*)(xtmp+24),(int16_t*)(ytmp+24),1);
+ 
+  unsigned int *scale64=NULL;
+  if (scale) scale64=scale+1;
+  dft64((int16_t*)(xtmp),(int16_t*)(ytmp),scale64);
+  dft64((int16_t*)(xtmp+8),(int16_t*)(ytmp+8),scale64);
+  dft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),scale64);
+  dft64((int16_t*)(xtmp+24),(int16_t*)(ytmp+24),scale64);
 
 
   bfly4_16_256(ytmpp,ytmpp+8,ytmpp+16,ytmpp+24,
@@ -1968,25 +2071,25 @@ void dft256(int16_t *x,int16_t *y,unsigned char scale)
 	       tw256a_256p+7,tw256a_256p+15,tw256a_256p+23,
 	       tw256b_256p+7,tw256b_256p+15,tw256b_256p+23);
 
-  if (scale>0) {
-
+  if (scale && *scale>0) {
+    unsigned int scalec=*scale;
     for (i=0; i<2; i++) {
-      y256[0]  = shiftright_int16_simd256(y256[0],1);
-      y256[1]  = shiftright_int16_simd256(y256[1],1);
-      y256[2]  = shiftright_int16_simd256(y256[2],1);
-      y256[3]  = shiftright_int16_simd256(y256[3],1);
-      y256[4]  = shiftright_int16_simd256(y256[4],1);
-      y256[5]  = shiftright_int16_simd256(y256[5],1);
-      y256[6]  = shiftright_int16_simd256(y256[6],1);
-      y256[7]  = shiftright_int16_simd256(y256[7],1);
-      y256[8]  = shiftright_int16_simd256(y256[8],1);
-      y256[9]  = shiftright_int16_simd256(y256[9],1);
-      y256[10] = shiftright_int16_simd256(y256[10],1);
-      y256[11] = shiftright_int16_simd256(y256[11],1);
-      y256[12] = shiftright_int16_simd256(y256[12],1);
-      y256[13] = shiftright_int16_simd256(y256[13],1);
-      y256[14] = shiftright_int16_simd256(y256[14],1);
-      y256[15] = shiftright_int16_simd256(y256[15],1);
+      y256[0]  = shiftright_int16_simd256(y256[0],scalec);
+      y256[1]  = shiftright_int16_simd256(y256[1],scalec);
+      y256[2]  = shiftright_int16_simd256(y256[2],scalec);
+      y256[3]  = shiftright_int16_simd256(y256[3],scalec);
+      y256[4]  = shiftright_int16_simd256(y256[4],scalec);
+      y256[5]  = shiftright_int16_simd256(y256[5],scalec);
+      y256[6]  = shiftright_int16_simd256(y256[6],scalec);
+      y256[7]  = shiftright_int16_simd256(y256[7],scalec);
+      y256[8]  = shiftright_int16_simd256(y256[8],scalec);
+      y256[9]  = shiftright_int16_simd256(y256[9],scalec);
+      y256[10] = shiftright_int16_simd256(y256[10],scalec);
+      y256[11] = shiftright_int16_simd256(y256[11],scalec);
+      y256[12] = shiftright_int16_simd256(y256[12],scalec);
+      y256[13] = shiftright_int16_simd256(y256[13],scalec);
+      y256[14] = shiftright_int16_simd256(y256[14],scalec);
+      y256[15] = shiftright_int16_simd256(y256[15],scalec);
 
       y256+=16;
     }
@@ -1995,7 +2098,7 @@ void dft256(int16_t *x,int16_t *y,unsigned char scale)
 
 }
 
-void idft256(int16_t *x,int16_t *y,unsigned char scale)
+void idft256(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd256_q15_t xtmp[32],ytmp[32],*tw256_256p=(simd256_q15_t *)tw256,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y;
@@ -2011,10 +2114,12 @@ void idft256(int16_t *x,int16_t *y,unsigned char scale)
   transpose16_ooff_simd256(x256+24,xtmp+6,8);
   transpose16_ooff_simd256(x256+28,xtmp+7,8);
   
-  idft64((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  idft64((int16_t*)(xtmp+8),(int16_t*)(ytmp+8),1);
-  idft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),1);
-  idft64((int16_t*)(xtmp+24),(int16_t*)(ytmp+24),1);
+  unsigned int *scale64=NULL;
+  if (scale) scale64=scale+1;
+  idft64((int16_t*)(xtmp),(int16_t*)(ytmp),scale64);
+  idft64((int16_t*)(xtmp+8),(int16_t*)(ytmp+8),scale64);
+  idft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),scale64);
+  idft64((int16_t*)(xtmp+24),(int16_t*)(ytmp+24),scale64);
   
   
   ibfly4_256(ytmpp,ytmpp+8,ytmpp+16,ytmpp+24,
@@ -2050,25 +2155,25 @@ void idft256(int16_t *x,int16_t *y,unsigned char scale)
 	     tw256_256p+7,tw256_256p+15,tw256_256p+23);
 
   
-  if (scale>0) {
-
+  if (scale && *scale>0) {
+    unsigned int scalec=*scale;
     for (i=0; i<2; i++) {
-      y256[0]  = shiftright_int16_simd256(y256[0],1);
-      y256[1]  = shiftright_int16_simd256(y256[1],1);
-      y256[2]  = shiftright_int16_simd256(y256[2],1);
-      y256[3]  = shiftright_int16_simd256(y256[3],1);
-      y256[4]  = shiftright_int16_simd256(y256[4],1);
-      y256[5]  = shiftright_int16_simd256(y256[5],1);
-      y256[6]  = shiftright_int16_simd256(y256[6],1);
-      y256[7]  = shiftright_int16_simd256(y256[7],1);
-      y256[8]  = shiftright_int16_simd256(y256[8],1);
-      y256[9]  = shiftright_int16_simd256(y256[9],1);
-      y256[10] = shiftright_int16_simd256(y256[10],1);
-      y256[11] = shiftright_int16_simd256(y256[11],1);
-      y256[12] = shiftright_int16_simd256(y256[12],1);
-      y256[13] = shiftright_int16_simd256(y256[13],1);
-      y256[14] = shiftright_int16_simd256(y256[14],1);
-      y256[15] = shiftright_int16_simd256(y256[15],1);
+      y256[0]  = shiftright_int16_simd256(y256[0],scalec);
+      y256[1]  = shiftright_int16_simd256(y256[1],scalec);
+      y256[2]  = shiftright_int16_simd256(y256[2],scalec);
+      y256[3]  = shiftright_int16_simd256(y256[3],scalec);
+      y256[4]  = shiftright_int16_simd256(y256[4],scalec);
+      y256[5]  = shiftright_int16_simd256(y256[5],scalec);
+      y256[6]  = shiftright_int16_simd256(y256[6],scalec);
+      y256[7]  = shiftright_int16_simd256(y256[7],scalec);
+      y256[8]  = shiftright_int16_simd256(y256[8],scalec);
+      y256[9]  = shiftright_int16_simd256(y256[9],scalec);
+      y256[10] = shiftright_int16_simd256(y256[10],scalec);
+      y256[11] = shiftright_int16_simd256(y256[11],scalec);
+      y256[12] = shiftright_int16_simd256(y256[12],scalec);
+      y256[13] = shiftright_int16_simd256(y256[13],scalec);
+      y256[14] = shiftright_int16_simd256(y256[14],scalec);
+      y256[15] = shiftright_int16_simd256(y256[15],scalec);
 
       y256+=16;
     }
@@ -2081,7 +2186,7 @@ static const int16_t tw512[512] __attribute__((aligned(32))) = {
   32767,0,32764,-403,32757,-805,32744,-1207,32727,-1608,32705,-2010,32678,-2411,32646,-2812,32609,-3212,32567,-3612,32520,-4012,32468,-4410,32412,-4808,32350,-5206,32284,-5602,32213,-5998,32137,-6393,32056,-6787,31970,-7180,31880,-7572,31785,-7962,31684,-8352,31580,-8740,31470,-9127,31356,-9512,31236,-9896,31113,-10279,30984,-10660,30851,-11039,30713,-11417,30571,-11793,30424,-12167,30272,-12540,30116,-12910,29955,-13279,29790,-13646,29621,-14010,29446,-14373,29268,-14733,29085,-15091,28897,-15447,28706,-15800,28510,-16151,28309,-16500,28105,-16846,27896,-17190,27683,-17531,27466,-17869,27244,-18205,27019,-18538,26789,-18868,26556,-19195,26318,-19520,26077,-19841,25831,-20160,25582,-20475,25329,-20788,25072,-21097,24811,-21403,24546,-21706,24278,-22005,24006,-22302,23731,-22595,23452,-22884,23169,-23170,22883,-23453,22594,-23732,22301,-24007,22004,-24279,21705,-24547,21402,-24812,21096,-25073,20787,-25330,20474,-25583,20159,-25832,19840,-26078,19519,-26319,19194,-26557,18867,-26790,18537,-27020,18204,-27245,17868,-27467,17530,-27684,17189,-27897,16845,-28106,16499,-28310,16150,-28511,15799,-28707,15446,-28898,15090,-29086,14732,-29269,14372,-29447,14009,-29622,13645,-29791,13278,-29956,12909,-30117,12539,-30273,12166,-30425,11792,-30572,11416,-30714,11038,-30852,10659,-30985,10278,-31114,9895,-31237,9511,-31357,9126,-31471,8739,-31581,8351,-31685,7961,-31786,7571,-31881,7179,-31971,6786,-32057,6392,-32138,5997,-32214,5601,-32285,5205,-32351,4807,-32413,4409,-32469,4011,-32521,3611,-32568,3211,-32610,2811,-32647,2410,-32679,2009,-32706,1607,-32728,1206,-32745,804,-32758,402,-32765,0,-32767,-403,-32765,-805,-32758,-1207,-32745,-1608,-32728,-2010,-32706,-2411,-32679,-2812,-32647,-3212,-32610,-3612,-32568,-4012,-32521,-4410,-32469,-4808,-32413,-5206,-32351,-5602,-32285,-5998,-32214,-6393,-32138,-6787,-32057,-7180,-31971,-7572,-31881,-7962,-31786,-8352,-31685,-8740,-31581,-9127,-31471,-9512,-31357,-9896,-31237,-10279,-31114,-10660,-30985,-11039,-30852,-11417,-30714,-117
93,-30572,-12167,-30425,-12540,-30273,-12910,-30117,-13279,-29956,-13646,-29791,-14010,-29622,-14373,-29447,-14733,-29269,-15091,-29086,-15447,-28898,-15800,-28707,-16151,-28511,-16500,-28310,-16846,-28106,-17190,-27897,-17531,-27684,-17869,-27467,-18205,-27245,-18538,-27020,-18868,-26790,-19195,-26557,-19520,-26319,-19841,-26078,-20160,-25832,-20475,-25583,-20788,-25330,-21097,-25073,-21403,-24812,-21706,-24547,-22005,-24279,-22302,-24007,-22595,-23732,-22884,-23453,-23170,-23170,-23453,-22884,-23732,-22595,-24007,-22302,-24279,-22005,-24547,-21706,-24812,-21403,-25073,-21097,-25330,-20788,-25583,-20475,-25832,-20160,-26078,-19841,-26319,-19520,-26557,-19195,-26790,-18868,-27020,-18538,-27245,-18205,-27467,-17869,-27684,-17531,-27897,-17190,-28106,-16846,-28310,-16500,-28511,-16151,-28707,-15800,-28898,-15447,-29086,-15091,-29269,-14733,-29447,-14373,-29622,-14010,-29791,-13646,-29956,-13279,-30117,-12910,-30273,-12540,-30425,-12167,-30572,-11793,-30714,-11417,-30852,-11039,-30985,-10660,-31114,-10279,-31237,-9896,-31357,-9512,-31471,-9127,-31581,-8740,-31685,-8352,-31786,-7962,-31881,-7572,-31971,-7180,-32057,-6787,-32138,-6393,-32214,-5998,-32285,-5602,-32351,-5206,-32413,-4808,-32469,-4410,-32521,-4012,-32568,-3612,-32610,-3212,-32647,-2812,-32679,-2411,-32706,-2010,-32728,-1608,-32745,-1207,-32758,-805,-32765,-403
 };
 
-void dft512(int16_t *x,int16_t *y,unsigned char scale)
+void dft512(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd256_q15_t xtmp[64],*x256 = (simd256_q15_t *)x;
@@ -2124,9 +2229,12 @@ void dft512(int16_t *x,int16_t *y,unsigned char scale)
   transpose4_ooff_simd256(x256+58,xtmp+29,32);
   transpose4_ooff_simd256(x256+60,xtmp+30,32);
   transpose4_ooff_simd256(x256+62,xtmp+31,32);
-
-  dft256((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  dft256((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),1);
+ 
+   
+  unsigned int *scale256=NULL;
+  if (scale) scale256=scale+1;
+  dft256((int16_t*)(xtmp),(int16_t*)ytmp,scale256);
+  dft256((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),scale256);
 
 
   for (i=0; i<32; i++) {
@@ -2138,32 +2246,56 @@ void dft512(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
-
-    for (i=0;i<4;i++) {
-      y256[0] = mulhi_int16_simd256(y256[0],ONE_OVER_SQRT2_Q15_256);
-      y256[1] = mulhi_int16_simd256(y256[1],ONE_OVER_SQRT2_Q15_256);
-      y256[2] = mulhi_int16_simd256(y256[2],ONE_OVER_SQRT2_Q15_256);
-      y256[3] = mulhi_int16_simd256(y256[3],ONE_OVER_SQRT2_Q15_256);
-      y256[4] = mulhi_int16_simd256(y256[4],ONE_OVER_SQRT2_Q15_256);
-      y256[5] = mulhi_int16_simd256(y256[5],ONE_OVER_SQRT2_Q15_256);
-      y256[6] = mulhi_int16_simd256(y256[6],ONE_OVER_SQRT2_Q15_256);
-      y256[7] = mulhi_int16_simd256(y256[7],ONE_OVER_SQRT2_Q15_256);
-      y256[8] = mulhi_int16_simd256(y256[8],ONE_OVER_SQRT2_Q15_256);
-      y256[9] = mulhi_int16_simd256(y256[9],ONE_OVER_SQRT2_Q15_256);
-      y256[10] = mulhi_int16_simd256(y256[10],ONE_OVER_SQRT2_Q15_256);
-      y256[11] = mulhi_int16_simd256(y256[11],ONE_OVER_SQRT2_Q15_256);
-      y256[12] = mulhi_int16_simd256(y256[12],ONE_OVER_SQRT2_Q15_256);
-      y256[13] = mulhi_int16_simd256(y256[13],ONE_OVER_SQRT2_Q15_256);
-      y256[14] = mulhi_int16_simd256(y256[14],ONE_OVER_SQRT2_Q15_256);
-      y256[15] = mulhi_int16_simd256(y256[15],ONE_OVER_SQRT2_Q15_256);
-      y256+=16;
+  if (scale && *scale>0) {
+    y256p = y256;
+    if (*scale>1) {
+      uint32_t scale2=*scale-1;
+      for (i=0; i<4; i++) {
+        y256p[0]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[0],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[1]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[1],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[2]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[2],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[3]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[3],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[4]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[4],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[5]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[5],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[6]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[6],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[7]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[7],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[8]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[8],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[9]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[9],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[10]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[10],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[11]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[11],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[12]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[12],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[13]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[13],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[14]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[14],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[15]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[15],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p+=16;
+      }
+    }
+    else {
+      for (i=0;i<4;i++) {
+        y256p[0] = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_256);
+        y256p[1] = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_256);
+        y256p[2] = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_256);
+        y256p[3] = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_256);
+        y256p[4] = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_256);
+        y256p[5] = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_256);
+        y256p[6] = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_256);
+        y256p[7] = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_256);
+        y256p[8] = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_256);
+        y256p[9] = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_256);
+        y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_256);
+        y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_256);
+        y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_256);
+        y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_256);
+        y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_256);
+        y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_256);
+        y256p+=16;
+      }
     }
   }
 
 }
 
-void idft512(int16_t *x,int16_t *y,unsigned char scale)
+void idft512(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd256_q15_t xtmp[64],*x256 = (simd256_q15_t *)x;
@@ -2207,8 +2339,10 @@ void idft512(int16_t *x,int16_t *y,unsigned char scale)
   transpose4_ooff_simd256(x256+60,xtmp+30,32);
   transpose4_ooff_simd256(x256+62,xtmp+31,32);
 
-  idft256((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  idft256((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),1);
+  unsigned int *scale256=NULL;
+  if (scale) scale256=scale+1;
+  idft256((int16_t*)(xtmp),(int16_t*)ytmp,scale256);
+  idft256((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),scale256);
 
 
   for (i=0; i<32; i++) {
@@ -2220,34 +2354,57 @@ void idft512(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
-
-    for (i=0;i<4;i++) {
-      y256[0] = mulhi_int16_simd256(y256[0],ONE_OVER_SQRT2_Q15_256);
-      y256[1] = mulhi_int16_simd256(y256[1],ONE_OVER_SQRT2_Q15_256);
-      y256[2] = mulhi_int16_simd256(y256[2],ONE_OVER_SQRT2_Q15_256);
-      y256[3] = mulhi_int16_simd256(y256[3],ONE_OVER_SQRT2_Q15_256);
-      y256[4] = mulhi_int16_simd256(y256[4],ONE_OVER_SQRT2_Q15_256);
-      y256[5] = mulhi_int16_simd256(y256[5],ONE_OVER_SQRT2_Q15_256);
-      y256[6] = mulhi_int16_simd256(y256[6],ONE_OVER_SQRT2_Q15_256);
-      y256[7] = mulhi_int16_simd256(y256[7],ONE_OVER_SQRT2_Q15_256);
-      y256[8] = mulhi_int16_simd256(y256[8],ONE_OVER_SQRT2_Q15_256);
-      y256[9] = mulhi_int16_simd256(y256[9],ONE_OVER_SQRT2_Q15_256);
-      y256[10] = mulhi_int16_simd256(y256[10],ONE_OVER_SQRT2_Q15_256);
-      y256[11] = mulhi_int16_simd256(y256[11],ONE_OVER_SQRT2_Q15_256);
-      y256[12] = mulhi_int16_simd256(y256[12],ONE_OVER_SQRT2_Q15_256);
-      y256[13] = mulhi_int16_simd256(y256[13],ONE_OVER_SQRT2_Q15_256);
-      y256[14] = mulhi_int16_simd256(y256[14],ONE_OVER_SQRT2_Q15_256);
-      y256[15] = mulhi_int16_simd256(y256[15],ONE_OVER_SQRT2_Q15_256);
-      y256+=16;
+  if (scale && *scale>0) {
+    y256p = y256;
+    if (*scale>1) {
+      uint32_t scale2=*scale-1;
+      for (i=0; i<4; i++) {
+        y256p[0]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[0],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[1]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[1],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[2]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[2],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[3]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[3],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[4]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[4],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[5]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[5],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[6]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[6],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[7]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[7],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[8]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[8],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[9]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[9],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[10]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[10],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[11]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[11],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[12]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[12],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[13]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[13],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[14]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[14],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p[15]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[15],scale2),ONE_OVER_SQRT2_Q15_256);
+        y256p+=16;
+      }
+    }
+    else {
+      for (i=0; i<4; i++) {
+        y256p[0]  = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_256);
+        y256p[1]  = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_256);
+        y256p[2]  = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_256);
+        y256p[3]  = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_256);
+        y256p[4]  = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_256);
+        y256p[5]  = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_256);
+        y256p[6]  = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_256);
+        y256p[7]  = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_256);
+        y256p[8]  = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_256);
+        y256p[9]  = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_256);
+        y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_256);
+        y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_256);
+        y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_256);
+        y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_256);
+        y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_256);
+        y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_256);
+        y256p+=16;
+      }
     }
   }
-
 }
 
 int16_t tw1024[1536] __attribute__((aligned(32)));
 
-void dft1024(int16_t *x,int16_t *y,unsigned char scale)
+void dft1024(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd256_q15_t xtmp[128],ytmp[128],*tw1024_256p=(simd256_q15_t *)tw1024,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y;
@@ -2259,10 +2416,12 @@ void dft1024(int16_t *x,int16_t *y,unsigned char scale)
   }
 
 
-  dft256((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  dft256((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),1);
-  dft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),1);
-  dft256((int16_t*)(xtmp+96),(int16_t*)(ytmp+96),1);
+  unsigned int *scale256=NULL;
+  if (scale) scale256=scale+1;
+  dft256((int16_t*)(xtmp),(int16_t*)(ytmp),scale256);
+  dft256((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),scale256);
+  dft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),scale256);
+  dft256((int16_t*)(xtmp+96),(int16_t*)(ytmp+96),scale256);
 
   for (i=0; i<32; i++) {
     bfly4_256(ytmpp,ytmpp+32,ytmpp+64,ytmpp+96,
@@ -2273,25 +2432,25 @@ void dft1024(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
-
+  if (scale && *scale>0) {
+    unsigned int scalec=*scale;
     for (i=0; i<8; i++) {
-      y256[0]  = shiftright_int16_simd256(y256[0],1);
-      y256[1]  = shiftright_int16_simd256(y256[1],1);
-      y256[2]  = shiftright_int16_simd256(y256[2],1);
-      y256[3]  = shiftright_int16_simd256(y256[3],1);
-      y256[4]  = shiftright_int16_simd256(y256[4],1);
-      y256[5]  = shiftright_int16_simd256(y256[5],1);
-      y256[6]  = shiftright_int16_simd256(y256[6],1);
-      y256[7]  = shiftright_int16_simd256(y256[7],1);
-      y256[8]  = shiftright_int16_simd256(y256[8],1);
-      y256[9]  = shiftright_int16_simd256(y256[9],1);
-      y256[10] = shiftright_int16_simd256(y256[10],1);
-      y256[11] = shiftright_int16_simd256(y256[11],1);
-      y256[12] = shiftright_int16_simd256(y256[12],1);
-      y256[13] = shiftright_int16_simd256(y256[13],1);
-      y256[14] = shiftright_int16_simd256(y256[14],1);
-      y256[15] = shiftright_int16_simd256(y256[15],1);
+      y256[0]  = shiftright_int16_simd256(y256[0],scalec);
+      y256[1]  = shiftright_int16_simd256(y256[1],scalec);
+      y256[2]  = shiftright_int16_simd256(y256[2],scalec);
+      y256[3]  = shiftright_int16_simd256(y256[3],scalec);
+      y256[4]  = shiftright_int16_simd256(y256[4],scalec);
+      y256[5]  = shiftright_int16_simd256(y256[5],scalec);
+      y256[6]  = shiftright_int16_simd256(y256[6],scalec);
+      y256[7]  = shiftright_int16_simd256(y256[7],scalec);
+      y256[8]  = shiftright_int16_simd256(y256[8],scalec);
+      y256[9]  = shiftright_int16_simd256(y256[9],scalec);
+      y256[10] = shiftright_int16_simd256(y256[10],scalec);
+      y256[11] = shiftright_int16_simd256(y256[11],scalec);
+      y256[12] = shiftright_int16_simd256(y256[12],scalec);
+      y256[13] = shiftright_int16_simd256(y256[13],scalec);
+      y256[14] = shiftright_int16_simd256(y256[14],scalec);
+      y256[15] = shiftright_int16_simd256(y256[15],scalec);
 
       y256+=16;
     }
@@ -2300,7 +2459,7 @@ void dft1024(int16_t *x,int16_t *y,unsigned char scale)
 
 }
 
-void idft1024(int16_t *x,int16_t *y,unsigned char scale)
+void idft1024(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd256_q15_t xtmp[128],ytmp[128],*tw1024_256p=(simd256_q15_t *)tw1024,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y;
@@ -2312,10 +2471,12 @@ void idft1024(int16_t *x,int16_t *y,unsigned char scale)
   }
 
 
-  idft256((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  idft256((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),1);
-  idft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),1);
-  idft256((int16_t*)(xtmp+96),(int16_t*)(ytmp+96),1);
+  unsigned int *scale256=NULL;
+  if (scale) scale256=scale+1;
+  idft256((int16_t*)(xtmp),(int16_t*)(ytmp),scale256);
+  idft256((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),scale256);
+  idft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),scale256);
+  idft256((int16_t*)(xtmp+96),(int16_t*)(ytmp+96),scale256);
 
   for (i=0; i<32; i++) {
     ibfly4_256(ytmpp,ytmpp+32,ytmpp+64,ytmpp+96,
@@ -2326,25 +2487,25 @@ void idft1024(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
-
+  if (scale && *scale>0) {
+    unsigned int scalec=*scale;
     for (i=0; i<8; i++) {
-      y256[0]  = shiftright_int16_simd256(y256[0],1);
-      y256[1]  = shiftright_int16_simd256(y256[1],1);
-      y256[2]  = shiftright_int16_simd256(y256[2],1);
-      y256[3]  = shiftright_int16_simd256(y256[3],1);
-      y256[4]  = shiftright_int16_simd256(y256[4],1);
-      y256[5]  = shiftright_int16_simd256(y256[5],1);
-      y256[6]  = shiftright_int16_simd256(y256[6],1);
-      y256[7]  = shiftright_int16_simd256(y256[7],1);
-      y256[8]  = shiftright_int16_simd256(y256[8],1);
-      y256[9]  = shiftright_int16_simd256(y256[9],1);
-      y256[10] = shiftright_int16_simd256(y256[10],1);
-      y256[11] = shiftright_int16_simd256(y256[11],1);
-      y256[12] = shiftright_int16_simd256(y256[12],1);
-      y256[13] = shiftright_int16_simd256(y256[13],1);
-      y256[14] = shiftright_int16_simd256(y256[14],1);
-      y256[15] = shiftright_int16_simd256(y256[15],1);
+      y256[0]  = shiftright_int16_simd256(y256[0],scalec);
+      y256[1]  = shiftright_int16_simd256(y256[1],scalec);
+      y256[2]  = shiftright_int16_simd256(y256[2],scalec);
+      y256[3]  = shiftright_int16_simd256(y256[3],scalec);
+      y256[4]  = shiftright_int16_simd256(y256[4],scalec);
+      y256[5]  = shiftright_int16_simd256(y256[5],scalec);
+      y256[6]  = shiftright_int16_simd256(y256[6],scalec);
+      y256[7]  = shiftright_int16_simd256(y256[7],scalec);
+      y256[8]  = shiftright_int16_simd256(y256[8],scalec);
+      y256[9]  = shiftright_int16_simd256(y256[9],scalec);
+      y256[10] = shiftright_int16_simd256(y256[10],scalec);
+      y256[11] = shiftright_int16_simd256(y256[11],scalec);
+      y256[12] = shiftright_int16_simd256(y256[12],scalec);
+      y256[13] = shiftright_int16_simd256(y256[13],scalec);
+      y256[14] = shiftright_int16_simd256(y256[14],scalec);
+      y256[15] = shiftright_int16_simd256(y256[15],scalec);
 
       y256+=16;
     }
@@ -2355,7 +2516,7 @@ void idft1024(int16_t *x,int16_t *y,unsigned char scale)
 
 int16_t tw2048[2048] __attribute__((aligned(32)));
 
-void dft2048(int16_t *x,int16_t *y,unsigned char scale)
+void dft2048(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd256_q15_t xtmp[256],*xtmpp,*x256 = (simd256_q15_t *)x;
@@ -2404,8 +2565,10 @@ void dft2048(int16_t *x,int16_t *y,unsigned char scale)
     xtmpp+=32;
   }
 
-  dft1024((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  dft1024((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),1);
+  unsigned int *scale1024=NULL;
+  if (scale) scale1024=scale+1;
+  dft1024((int16_t*)(xtmp),(int16_t*)ytmp,scale1024);
+  dft1024((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),scale1024);
 
 
   for (i=0; i<128; i++) {
@@ -2417,33 +2580,55 @@ void dft2048(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) {
     y256p = y256;
-
-    for (i=0; i<16; i++) {
-      y256p[0]  = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128);
-      y256p[1]  = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128);
-      y256p[2]  = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128);
-      y256p[3]  = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128);
-      y256p[4]  = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128);
-      y256p[5]  = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128);
-      y256p[6]  = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128);
-      y256p[7]  = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128);
-      y256p[8]  = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128);
-      y256p[9]  = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128);
-      y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128);
-      y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128);
-      y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128);
-      y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128);
-      y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128);
-      y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128);
-      y256p+=16;
+    if (*scale>1) {
+      uint32_t scale2=*scale-1;
+      for (i=0; i<16; i++) {
+        y256p[0]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[0],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[1]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[1],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[2]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[2],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[3]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[3],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[4]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[4],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[5]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[5],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[6]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[6],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[7]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[7],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[8]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[8],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[9]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[9],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[10]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[10],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[11]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[11],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[12]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[12],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[13]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[13],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[14]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[14],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[15]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[15],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p+=16;
+      }
+    }
+    else {
+      for (i=0; i<16; i++) {
+        y256p[0]  = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128);
+        y256p[1]  = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128);
+        y256p[2]  = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128);
+        y256p[3]  = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128);
+        y256p[4]  = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128);
+        y256p[5]  = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128);
+        y256p[6]  = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128);
+        y256p[7]  = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128);
+        y256p[8]  = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128);
+        y256p[9]  = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128);
+        y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128);
+        y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128);
+        y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128);
+        y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128);
+        y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128);
+        y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128);
+        y256p+=16;
+      }
     }
   }
-
 }
 
-void idft2048(int16_t *x,int16_t *y,unsigned char scale)
+void idft2048(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd256_q15_t xtmp[256],*xtmpp,*x256 = (simd256_q15_t *)x;
@@ -2491,8 +2676,10 @@ void idft2048(int16_t *x,int16_t *y,unsigned char scale)
     xtmpp+=32;
   }
 
-  idft1024((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  idft1024((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),1);
+  unsigned int *scale1024=NULL;
+  if (scale) scale1024=scale+1;
+  idft1024((int16_t*)(xtmp),(int16_t*)ytmp,scale1024);
+  idft1024((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),scale1024);
 
 
   for (i=0; i<128; i++) {
@@ -2504,27 +2691,50 @@ void idft2048(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) {
     y256p = y256;
-
-    for (i=0; i<16; i++) {
-      y256p[0]  = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128);
-      y256p[1]  = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128);
-      y256p[2]  = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128);
-      y256p[3]  = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128);
-      y256p[4]  = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128);
-      y256p[5]  = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128);
-      y256p[6]  = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128);
-      y256p[7]  = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128);
-      y256p[8]  = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128);
-      y256p[9]  = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128);
-      y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128);
-      y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128);
-      y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128);
-      y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128);
-      y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128);
-      y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128);
-      y256p+=16;
+    if (*scale>1) {
+      uint32_t scale2=*scale-1;
+      for (i=0; i<16; i++) {
+        y256p[0]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[0],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[1]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[1],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[2]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[2],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[3]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[3],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[4]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[4],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[5]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[5],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[6]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[6],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[7]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[7],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[8]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[8],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[9]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[9],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[10]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[10],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[11]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[11],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[12]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[12],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[13]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[13],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[14]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[14],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[15]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[15],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p+=16;
+      }
+    }
+    else {
+      for (i=0; i<16; i++) {
+        y256p[0]  = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128);
+        y256p[1]  = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128);
+        y256p[2]  = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128);
+        y256p[3]  = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128);
+        y256p[4]  = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128);
+        y256p[5]  = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128);
+        y256p[6]  = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128);
+        y256p[7]  = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128);
+        y256p[8]  = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128);
+        y256p[9]  = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128);
+        y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128);
+        y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128);
+        y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128);
+        y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128);
+        y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128);
+        y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128);
+        y256p+=16;
+      }
     }
   }
 
@@ -2532,7 +2742,7 @@ void idft2048(int16_t *x,int16_t *y,unsigned char scale)
 
 int16_t tw4096[3*2*1024];
 
-void dft4096(int16_t *x,int16_t *y,unsigned char scale)
+void dft4096(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd256_q15_t xtmp[512],ytmp[512],*tw4096_256p=(simd256_q15_t *)tw4096,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y;
@@ -2544,10 +2754,12 @@ void dft4096(int16_t *x,int16_t *y,unsigned char scale)
   }
 
 
-  dft1024((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  dft1024((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),1);
-  dft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),1);
-  dft1024((int16_t*)(xtmp+384),(int16_t*)(ytmp+384),1);
+  unsigned int *scale1024=NULL;
+  if (scale) scale1024=scale+1;
+  dft1024((int16_t*)(xtmp),(int16_t*)(ytmp),scale1024);
+  dft1024((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),scale1024);
+  dft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),scale1024);
+  dft1024((int16_t*)(xtmp+384),(int16_t*)(ytmp+384),scale1024);
 
   for (i=0; i<128; i++) {
     bfly4_256(ytmpp,ytmpp+128,ytmpp+256,ytmpp+384,
@@ -2558,25 +2770,25 @@ void dft4096(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
-
+  if (scale && *scale>0) { // output scaling is skipped when scale is NULL or its first entry is 0
+    unsigned int scalec=*scale; // right-shift count for every output vector (the removed code always shifted by 1)
     for (i=0; i<32; i++) {
-      y256[0]  = shiftright_int16_simd256(y256[0],1);
-      y256[1]  = shiftright_int16_simd256(y256[1],1);
-      y256[2]  = shiftright_int16_simd256(y256[2],1);
-      y256[3]  = shiftright_int16_simd256(y256[3],1);
-      y256[4]  = shiftright_int16_simd256(y256[4],1);
-      y256[5]  = shiftright_int16_simd256(y256[5],1);
-      y256[6]  = shiftright_int16_simd256(y256[6],1);
-      y256[7]  = shiftright_int16_simd256(y256[7],1);
-      y256[8]  = shiftright_int16_simd256(y256[8],1);
-      y256[9]  = shiftright_int16_simd256(y256[9],1);
-      y256[10] = shiftright_int16_simd256(y256[10],1);
-      y256[11] = shiftright_int16_simd256(y256[11],1);
-      y256[12] = shiftright_int16_simd256(y256[12],1);
-      y256[13] = shiftright_int16_simd256(y256[13],1);
-      y256[14] = shiftright_int16_simd256(y256[14],1);
-      y256[15] = shiftright_int16_simd256(y256[15],1);
+      y256[0]  = shiftright_int16_simd256(y256[0],scalec);
+      y256[1]  = shiftright_int16_simd256(y256[1],scalec);
+      y256[2]  = shiftright_int16_simd256(y256[2],scalec);
+      y256[3]  = shiftright_int16_simd256(y256[3],scalec);
+      y256[4]  = shiftright_int16_simd256(y256[4],scalec);
+      y256[5]  = shiftright_int16_simd256(y256[5],scalec);
+      y256[6]  = shiftright_int16_simd256(y256[6],scalec);
+      y256[7]  = shiftright_int16_simd256(y256[7],scalec);
+      y256[8]  = shiftright_int16_simd256(y256[8],scalec);
+      y256[9]  = shiftright_int16_simd256(y256[9],scalec);
+      y256[10] = shiftright_int16_simd256(y256[10],scalec);
+      y256[11] = shiftright_int16_simd256(y256[11],scalec);
+      y256[12] = shiftright_int16_simd256(y256[12],scalec);
+      y256[13] = shiftright_int16_simd256(y256[13],scalec);
+      y256[14] = shiftright_int16_simd256(y256[14],scalec);
+      y256[15] = shiftright_int16_simd256(y256[15],scalec);
 
       y256+=16;
     }
@@ -2585,7 +2797,7 @@ void dft4096(int16_t *x,int16_t *y,unsigned char scale)
 
 }
 
-void idft4096(int16_t *x,int16_t *y,unsigned char scale)
+void idft4096(int16_t *x,int16_t *y,unsigned int *scale) // scale: NULL = no output scaling; otherwise *scale is this stage's right-shift and scale+1 is handed to idft1024
 {
 
   simd256_q15_t xtmp[512],ytmp[512],*tw4096_256p=(simd256_q15_t *)tw4096,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y;
@@ -2597,10 +2809,12 @@ void idft4096(int16_t *x,int16_t *y,unsigned char scale)
   }
 
 
-  idft1024((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  idft1024((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),1);
-  idft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),1);
-  idft1024((int16_t*)(xtmp+384),(int16_t*)(ytmp+384),1);
+  unsigned int *scale1024=NULL; // stays NULL when the caller supplied no scale array
+  if (scale) scale1024=scale+1; // the four 1024-point sub-transforms read the array from the next entry on
+  idft1024((int16_t*)(xtmp),(int16_t*)(ytmp),scale1024);
+  idft1024((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),scale1024);
+  idft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),scale1024);
+  idft1024((int16_t*)(xtmp+384),(int16_t*)(ytmp+384),scale1024);
 
   for (i=0; i<128; i++) {
     ibfly4_256(ytmpp,ytmpp+128,ytmpp+256,ytmpp+384,
@@ -2611,25 +2825,25 @@ void idft4096(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
-
+  if (scale && *scale>0) { // output scaling is skipped when scale is NULL or its first entry is 0
+    unsigned int scalec=*scale; // right-shift count for every output vector (the removed code always shifted by 1)
     for (i=0; i<32; i++) {
-      y256[0]  = shiftright_int16_simd256(y256[0],1);
-      y256[1]  = shiftright_int16_simd256(y256[1],1);
-      y256[2]  = shiftright_int16_simd256(y256[2],1);
-      y256[3]  = shiftright_int16_simd256(y256[3],1);
-      y256[4]  = shiftright_int16_simd256(y256[4],1);
-      y256[5]  = shiftright_int16_simd256(y256[5],1);
-      y256[6]  = shiftright_int16_simd256(y256[6],1);
-      y256[7]  = shiftright_int16_simd256(y256[7],1);
-      y256[8]  = shiftright_int16_simd256(y256[8],1);
-      y256[9]  = shiftright_int16_simd256(y256[9],1);
-      y256[10] = shiftright_int16_simd256(y256[10],1);
-      y256[11] = shiftright_int16_simd256(y256[11],1);
-      y256[12] = shiftright_int16_simd256(y256[12],1);
-      y256[13] = shiftright_int16_simd256(y256[13],1);
-      y256[14] = shiftright_int16_simd256(y256[14],1);
-      y256[15] = shiftright_int16_simd256(y256[15],1);
+      y256[0]  = shiftright_int16_simd256(y256[0],scalec);
+      y256[1]  = shiftright_int16_simd256(y256[1],scalec);
+      y256[2]  = shiftright_int16_simd256(y256[2],scalec);
+      y256[3]  = shiftright_int16_simd256(y256[3],scalec);
+      y256[4]  = shiftright_int16_simd256(y256[4],scalec);
+      y256[5]  = shiftright_int16_simd256(y256[5],scalec);
+      y256[6]  = shiftright_int16_simd256(y256[6],scalec);
+      y256[7]  = shiftright_int16_simd256(y256[7],scalec);
+      y256[8]  = shiftright_int16_simd256(y256[8],scalec);
+      y256[9]  = shiftright_int16_simd256(y256[9],scalec);
+      y256[10] = shiftright_int16_simd256(y256[10],scalec);
+      y256[11] = shiftright_int16_simd256(y256[11],scalec);
+      y256[12] = shiftright_int16_simd256(y256[12],scalec);
+      y256[13] = shiftright_int16_simd256(y256[13],scalec);
+      y256[14] = shiftright_int16_simd256(y256[14],scalec);
+      y256[15] = shiftright_int16_simd256(y256[15],scalec);
 
       y256+=16;
     }
@@ -2640,7 +2854,7 @@ void idft4096(int16_t *x,int16_t *y,unsigned char scale)
 
 int16_t tw8192[2*4096] __attribute__((aligned(32)));
 
-void dft8192(int16_t *x,int16_t *y,unsigned char scale)
+void dft8192(int16_t *x,int16_t *y,unsigned int *scale) // scale: NULL = no output scaling; otherwise *scale selects this stage's scaling and scale+1 is handed to dft4096
 {
 
   simd256_q15_t xtmp[1024],*xtmpp,*x256 = (simd256_q15_t *)x;
@@ -2689,8 +2903,10 @@ void dft8192(int16_t *x,int16_t *y,unsigned char scale)
     xtmpp+=32;
   }
 
-  dft4096((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  dft4096((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),1);
+  unsigned int *scale4096=NULL; // stays NULL when the caller supplied no scale array
+  if (scale) scale4096=scale+1; // both 4096-point sub-transforms read the array from the next entry on
+  dft4096((int16_t*)(xtmp),(int16_t*)ytmp,scale4096);
+  dft4096((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),scale4096);
 
 
   for (i=0; i<512; i++) {
@@ -2702,33 +2918,56 @@ void dft8192(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) { // output scaling is skipped when scale is NULL or its first entry is 0
     y256p = y256;
-
-    for (i=0; i<64; i++) {
-      y256p[0]  = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128);
-      y256p[1]  = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128);
-      y256p[2]  = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128);
-      y256p[3]  = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128);
-      y256p[4]  = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128);
-      y256p[5]  = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128);
-      y256p[6]  = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128);
-      y256p[7]  = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128);
-      y256p[8]  = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128);
-      y256p[9]  = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128);
-      y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128);
-      y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128);
-      y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128);
-      y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128);
-      y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128);
-      y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128);
-      y256p+=16;
+    if (*scale > 1) { // *scale > 1: right-shift by *scale-1 first, then apply the ONE_OVER_SQRT2_Q15_128 multiply
+      uint32_t scale2=*scale-1;
+      for (i=0; i<64; i++) {
+        y256p[0]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[0],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[1]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[1],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[2]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[2],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[3]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[3],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[4]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[4],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[5]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[5],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[6]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[6],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[7]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[7],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[8]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[8],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[9]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[9],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[10]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[10],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[11]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[11],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[12]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[12],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[13]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[13],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[14]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[14],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[15]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[15],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p+=16;
+      }
+    }
+    else { // *scale == 1: only the ONE_OVER_SQRT2_Q15_128 multiply, same as the removed code
+      for (i=0; i<64; i++) {
+        y256p[0]  = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128);
+        y256p[1]  = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128);
+        y256p[2]  = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128);
+        y256p[3]  = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128);
+        y256p[4]  = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128);
+        y256p[5]  = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128);
+        y256p[6]  = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128);
+        y256p[7]  = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128);
+        y256p[8]  = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128);
+        y256p[9]  = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128);
+        y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128);
+        y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128);
+        y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128);
+        y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128);
+        y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128);
+        y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128);
+        y256p+=16;
+      }
     }
   }
 
 }
 
-void idft8192(int16_t *x,int16_t *y,unsigned char scale)
+void idft8192(int16_t *x,int16_t *y,unsigned int *scale) // scale: NULL = no output scaling; otherwise *scale selects this stage's scaling and scale+1 is handed to idft4096
 {
 
   simd256_q15_t xtmp[1024],*xtmpp,*x256 = (simd256_q15_t *)x;
@@ -2776,8 +3015,10 @@ void idft8192(int16_t *x,int16_t *y,unsigned char scale)
     xtmpp+=32;
   }
 
-  idft4096((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  idft4096((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),1);
+  unsigned int *scale4096=NULL; // stays NULL when the caller supplied no scale array
+  if (scale) scale4096=scale+1; // both 4096-point sub-transforms read the array from the next entry on
+  idft4096((int16_t*)(xtmp),(int16_t*)ytmp,scale4096);
+  idft4096((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),scale4096);
 
 
   for (i=0; i<512; i++) {
@@ -2789,27 +3030,50 @@ void idft8192(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) { // output scaling is skipped when scale is NULL or its first entry is 0
     y256p = y256;
-
-    for (i=0; i<64; i++) {
-      y256p[0]  = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128);
-      y256p[1]  = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128);
-      y256p[2]  = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128);
-      y256p[3]  = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128);
-      y256p[4]  = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128);
-      y256p[5]  = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128);
-      y256p[6]  = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128);
-      y256p[7]  = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128);
-      y256p[8]  = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128);
-      y256p[9]  = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128);
-      y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128);
-      y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128);
-      y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128);
-      y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128);
-      y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128);
-      y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128);
-      y256p+=16;
+    if (*scale > 1) { // *scale > 1: right-shift by *scale-1 first, then apply the ONE_OVER_SQRT2_Q15_128 multiply
+      uint32_t scale2=*scale-1;
+      for (i=0; i<64; i++) {
+        y256p[0]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[0],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[1]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[1],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[2]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[2],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[3]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[3],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[4]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[4],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[5]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[5],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[6]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[6],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[7]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[7],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[8]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[8],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[9]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[9],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[10]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[10],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[11]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[11],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[12]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[12],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[13]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[13],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[14]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[14],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[15]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[15],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p+=16;
+      }
+    }
+    else { // *scale == 1: only the ONE_OVER_SQRT2_Q15_128 multiply, same as the removed code
+      for (i=0; i<64; i++) {
+        y256p[0]  = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128);
+        y256p[1]  = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128);
+        y256p[2]  = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128);
+        y256p[3]  = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128);
+        y256p[4]  = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128);
+        y256p[5]  = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128);
+        y256p[6]  = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128);
+        y256p[7]  = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128);
+        y256p[8]  = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128);
+        y256p[9]  = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128);
+        y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128);
+        y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128);
+        y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128);
+        y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128);
+        y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128);
+        y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128);
+        y256p+=16;
+      }
     }
   }
 
@@ -2817,7 +3081,7 @@ void idft8192(int16_t *x,int16_t *y,unsigned char scale)
 
 int16_t tw16384[3*2*4096] __attribute__((aligned(32)));
 
-void dft16384(int16_t *x,int16_t *y,unsigned char scale)
+void dft16384(int16_t *x,int16_t *y,unsigned int *scale) // scale: NULL = no output scaling; otherwise *scale is this stage's right-shift and scale+1 is handed to dft4096
 {
 
   simd256_q15_t xtmp[2048],ytmp[2048],*tw16384_256p=(simd256_q15_t *)tw16384,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y;
@@ -2829,10 +3093,12 @@ void dft16384(int16_t *x,int16_t *y,unsigned char scale)
   }
 
 
-  dft4096((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  dft4096((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),1);
-  dft4096((int16_t*)(xtmp+1024),(int16_t*)(ytmp+1024),1);
-  dft4096((int16_t*)(xtmp+1536),(int16_t*)(ytmp+1536),1);
+  unsigned int *scale4096=NULL; // stays NULL when the caller supplied no scale array
+  if (scale) scale4096=scale+1; // the four 4096-point sub-transforms read the array from the next entry on
+  dft4096((int16_t*)(xtmp),(int16_t*)(ytmp),scale4096);
+  dft4096((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),scale4096);
+  dft4096((int16_t*)(xtmp+1024),(int16_t*)(ytmp+1024),scale4096);
+  dft4096((int16_t*)(xtmp+1536),(int16_t*)(ytmp+1536),scale4096);
 
   for (i=0; i<512; i++) {
     bfly4_256(ytmpp,ytmpp+512,ytmpp+1024,ytmpp+1536,
@@ -2843,25 +3109,25 @@ void dft16384(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
-
+  if (scale && *scale>0) { // output scaling is skipped when scale is NULL or its first entry is 0
+    unsigned int scalec=*scale; // right-shift count for every output vector (the removed code always shifted by 1)
     for (i=0; i<128; i++) {
-      y256[0]  = shiftright_int16_simd256(y256[0],1);
-      y256[1]  = shiftright_int16_simd256(y256[1],1);
-      y256[2]  = shiftright_int16_simd256(y256[2],1);
-      y256[3]  = shiftright_int16_simd256(y256[3],1);
-      y256[4]  = shiftright_int16_simd256(y256[4],1);
-      y256[5]  = shiftright_int16_simd256(y256[5],1);
-      y256[6]  = shiftright_int16_simd256(y256[6],1);
-      y256[7]  = shiftright_int16_simd256(y256[7],1);
-      y256[8]  = shiftright_int16_simd256(y256[8],1);
-      y256[9]  = shiftright_int16_simd256(y256[9],1);
-      y256[10] = shiftright_int16_simd256(y256[10],1);
-      y256[11] = shiftright_int16_simd256(y256[11],1);
-      y256[12] = shiftright_int16_simd256(y256[12],1);
-      y256[13] = shiftright_int16_simd256(y256[13],1);
-      y256[14] = shiftright_int16_simd256(y256[14],1);
-      y256[15] = shiftright_int16_simd256(y256[15],1);
+      y256[0]  = shiftright_int16_simd256(y256[0],scalec);
+      y256[1]  = shiftright_int16_simd256(y256[1],scalec);
+      y256[2]  = shiftright_int16_simd256(y256[2],scalec);
+      y256[3]  = shiftright_int16_simd256(y256[3],scalec);
+      y256[4]  = shiftright_int16_simd256(y256[4],scalec);
+      y256[5]  = shiftright_int16_simd256(y256[5],scalec);
+      y256[6]  = shiftright_int16_simd256(y256[6],scalec);
+      y256[7]  = shiftright_int16_simd256(y256[7],scalec);
+      y256[8]  = shiftright_int16_simd256(y256[8],scalec);
+      y256[9]  = shiftright_int16_simd256(y256[9],scalec);
+      y256[10] = shiftright_int16_simd256(y256[10],scalec);
+      y256[11] = shiftright_int16_simd256(y256[11],scalec);
+      y256[12] = shiftright_int16_simd256(y256[12],scalec);
+      y256[13] = shiftright_int16_simd256(y256[13],scalec);
+      y256[14] = shiftright_int16_simd256(y256[14],scalec);
+      y256[15] = shiftright_int16_simd256(y256[15],scalec);
 
       y256+=16;
     }
@@ -2870,7 +3136,7 @@ void dft16384(int16_t *x,int16_t *y,unsigned char scale)
 
 }
 
-void idft16384(int16_t *x,int16_t *y,unsigned char scale)
+void idft16384(int16_t *x,int16_t *y,unsigned int *scale) // scale: NULL = no output scaling; otherwise *scale is this stage's right-shift and scale+1 is handed to idft4096
 {
 
   simd256_q15_t xtmp[2048],ytmp[2048],*tw16384_256p=(simd256_q15_t *)tw16384,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y;
@@ -2882,10 +3148,12 @@ void idft16384(int16_t *x,int16_t *y,unsigned char scale)
   }
 
 
-  idft4096((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  idft4096((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),1);
-  idft4096((int16_t*)(xtmp+1024),(int16_t*)(ytmp+1024),1);
-  idft4096((int16_t*)(xtmp+1536),(int16_t*)(ytmp+1536),1);
+  unsigned int *scale4096=NULL; // stays NULL when the caller supplied no scale array
+  if (scale) scale4096=scale+1; // the four 4096-point sub-transforms read the array from the next entry on
+  idft4096((int16_t*)(xtmp),(int16_t*)(ytmp),scale4096);
+  idft4096((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),scale4096);
+  idft4096((int16_t*)(xtmp+1024),(int16_t*)(ytmp+1024),scale4096);
+  idft4096((int16_t*)(xtmp+1536),(int16_t*)(ytmp+1536),scale4096);
 
   for (i=0; i<512; i++) {
     ibfly4_256(ytmpp,ytmpp+512,ytmpp+1024,ytmpp+1536,
@@ -2896,25 +3164,25 @@ void idft16384(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
-
+  if (scale && *scale>0) { // output scaling is skipped when scale is NULL or its first entry is 0
+    unsigned int scalec=*scale; // right-shift count for every output vector (the removed code always shifted by 1)
     for (i=0; i<128; i++) {
-      y256[0]  = shiftright_int16_simd256(y256[0],1);
-      y256[1]  = shiftright_int16_simd256(y256[1],1);
-      y256[2]  = shiftright_int16_simd256(y256[2],1);
-      y256[3]  = shiftright_int16_simd256(y256[3],1);
-      y256[4]  = shiftright_int16_simd256(y256[4],1);
-      y256[5]  = shiftright_int16_simd256(y256[5],1);
-      y256[6]  = shiftright_int16_simd256(y256[6],1);
-      y256[7]  = shiftright_int16_simd256(y256[7],1);
-      y256[8]  = shiftright_int16_simd256(y256[8],1);
-      y256[9]  = shiftright_int16_simd256(y256[9],1);
-      y256[10] = shiftright_int16_simd256(y256[10],1);
-      y256[11] = shiftright_int16_simd256(y256[11],1);
-      y256[12] = shiftright_int16_simd256(y256[12],1);
-      y256[13] = shiftright_int16_simd256(y256[13],1);
-      y256[14] = shiftright_int16_simd256(y256[14],1);
-      y256[15] = shiftright_int16_simd256(y256[15],1);
+      y256[0]  = shiftright_int16_simd256(y256[0],scalec);
+      y256[1]  = shiftright_int16_simd256(y256[1],scalec);
+      y256[2]  = shiftright_int16_simd256(y256[2],scalec);
+      y256[3]  = shiftright_int16_simd256(y256[3],scalec);
+      y256[4]  = shiftright_int16_simd256(y256[4],scalec);
+      y256[5]  = shiftright_int16_simd256(y256[5],scalec);
+      y256[6]  = shiftright_int16_simd256(y256[6],scalec);
+      y256[7]  = shiftright_int16_simd256(y256[7],scalec);
+      y256[8]  = shiftright_int16_simd256(y256[8],scalec);
+      y256[9]  = shiftright_int16_simd256(y256[9],scalec);
+      y256[10] = shiftright_int16_simd256(y256[10],scalec);
+      y256[11] = shiftright_int16_simd256(y256[11],scalec);
+      y256[12] = shiftright_int16_simd256(y256[12],scalec);
+      y256[13] = shiftright_int16_simd256(y256[13],scalec);
+      y256[14] = shiftright_int16_simd256(y256[14],scalec);
+      y256[15] = shiftright_int16_simd256(y256[15],scalec);
 
       y256+=16;
     }
@@ -2925,7 +3193,7 @@ void idft16384(int16_t *x,int16_t *y,unsigned char scale)
 
 int16_t tw32768[2*16384] __attribute__((aligned(32)));
 
-void dft32768(int16_t *x,int16_t *y,unsigned char scale)
+void dft32768(int16_t *x,int16_t *y,unsigned int *scale) // scale: NULL = no output scaling; otherwise *scale selects this stage's scaling and scale+1 is handed to dft16384
 {
 
   simd256_q15_t xtmp[4096],*xtmpp,*x256 = (simd256_q15_t *)x;
@@ -2974,8 +3242,10 @@ void dft32768(int16_t *x,int16_t *y,unsigned char scale)
     xtmpp+=32;
   }
 
-  dft16384((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  dft16384((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),1);
+  unsigned int *scale16384=NULL; // stays NULL when the caller supplied no scale array
+  if (scale) scale16384=scale+1; // both 16384-point sub-transforms read the array from the next entry on
+  dft16384((int16_t*)(xtmp),(int16_t*)ytmp,scale16384);
+  dft16384((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),scale16384);
 
 
   for (i=0; i<2048; i++) {
@@ -2987,33 +3257,56 @@ void dft32768(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) { // output scaling is skipped when scale is NULL or its first entry is 0
     y256p = y256;
-
-    for (i=0; i<64; i++) {
-      y256p[0]  = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128);
-      y256p[1]  = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128);
-      y256p[2]  = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128);
-      y256p[3]  = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128);
-      y256p[4]  = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128);
-      y256p[5]  = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128);
-      y256p[6]  = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128);
-      y256p[7]  = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128);
-      y256p[8]  = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128);
-      y256p[9]  = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128);
-      y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128);
-      y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128);
-      y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128);
-      y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128);
-      y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128);
-      y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128);
-      y256p+=16;
+    if (*scale > 1) { // *scale > 1: right-shift by *scale-1 first, then apply the ONE_OVER_SQRT2_Q15_128 multiply
+      uint32_t scale2=*scale-1;
+      for (i=0; i<256; i++) { // 256 x 16 = 4096 vectors (the size of xtmp[4096]); the removed loop stopped after 64 x 16 = 1024
+        y256p[0]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[0],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[1]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[1],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[2]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[2],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[3]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[3],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[4]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[4],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[5]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[5],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[6]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[6],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[7]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[7],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[8]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[8],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[9]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[9],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[10]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[10],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[11]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[11],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[12]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[12],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[13]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[13],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[14]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[14],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[15]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[15],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p+=16;
+      }
+    }
+    else { // *scale == 1: only the ONE_OVER_SQRT2_Q15_128 multiply
+      for (i=0; i<256; i++) { // 256 x 16 = 4096 vectors; the removed loop stopped after 64 x 16 = 1024
+        y256p[0]  = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128);
+        y256p[1]  = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128);
+        y256p[2]  = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128);
+        y256p[3]  = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128);
+        y256p[4]  = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128);
+        y256p[5]  = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128);
+        y256p[6]  = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128);
+        y256p[7]  = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128);
+        y256p[8]  = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128);
+        y256p[9]  = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128);
+        y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128);
+        y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128);
+        y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128);
+        y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128);
+        y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128);
+        y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128);
+        y256p+=16;
+      }
     }
   }
 
 }
 
-void idft32768(int16_t *x,int16_t *y,unsigned char scale)
+void idft32768(int16_t *x,int16_t *y,unsigned int *scale) // scale: NULL = no output scaling; otherwise *scale selects this stage's scaling and scale+1 is handed to idft16384
 {
 
   simd256_q15_t xtmp[4096],*xtmpp,*x256 = (simd256_q15_t *)x;
@@ -3061,8 +3354,10 @@ void idft32768(int16_t *x,int16_t *y,unsigned char scale)
     xtmpp+=32;
   }
 
-  idft16384((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  idft16384((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),1);
+  unsigned int *scale16384=NULL; // stays NULL when the caller supplied no scale array
+  if (scale) scale16384=scale+1; // both 16384-point sub-transforms read the array from the next entry on
+  idft16384((int16_t*)(xtmp),(int16_t*)ytmp,scale16384);
+  idft16384((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),scale16384);
 
 
   for (i=0; i<2048; i++) {
@@ -3074,36 +3369,58 @@ void idft32768(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) { // output scaling is skipped when scale is NULL or its first entry is 0
     y256p = y256;
-
-    for (i=0; i<256; i++) {
-      y256p[0]  = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128);
-      y256p[1]  = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128);
-      y256p[2]  = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128);
-      y256p[3]  = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128);
-      y256p[4]  = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128);
-      y256p[5]  = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128);
-      y256p[6]  = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128);
-      y256p[7]  = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128);
-      y256p[8]  = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128);
-      y256p[9]  = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128);
-      y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128);
-      y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128);
-      y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128);
-      y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128);
-      y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128);
-      y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128);
-      y256p+=16;
+    if (*scale > 1) { // *scale > 1: right-shift by *scale-1 first, then apply the ONE_OVER_SQRT2_Q15_128 multiply
+      uint32_t scale2=*scale-1;
+      for (i=0; i<256; i++) {
+        y256p[0]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[0],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[1]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[1],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[2]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[2],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[3]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[3],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[4]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[4],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[5]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[5],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[6]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[6],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[7]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[7],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[8]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[8],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[9]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[9],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[10]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[10],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[11]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[11],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[12]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[12],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[13]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[13],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[14]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[14],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p[15]  = mulhi_int16_simd256(shiftright_int16_simd256(y256p[15],scale2),ONE_OVER_SQRT2_Q15_128);
+        y256p+=16;
+      }
+    }
+    else { // *scale == 1: only the ONE_OVER_SQRT2_Q15_128 multiply, same as the removed code
+      for (i=0; i<256; i++) {
+        y256p[0]  = mulhi_int16_simd256(y256p[0],ONE_OVER_SQRT2_Q15_128);
+        y256p[1]  = mulhi_int16_simd256(y256p[1],ONE_OVER_SQRT2_Q15_128);
+        y256p[2]  = mulhi_int16_simd256(y256p[2],ONE_OVER_SQRT2_Q15_128);
+        y256p[3]  = mulhi_int16_simd256(y256p[3],ONE_OVER_SQRT2_Q15_128);
+        y256p[4]  = mulhi_int16_simd256(y256p[4],ONE_OVER_SQRT2_Q15_128);
+        y256p[5]  = mulhi_int16_simd256(y256p[5],ONE_OVER_SQRT2_Q15_128);
+        y256p[6]  = mulhi_int16_simd256(y256p[6],ONE_OVER_SQRT2_Q15_128);
+        y256p[7]  = mulhi_int16_simd256(y256p[7],ONE_OVER_SQRT2_Q15_128);
+        y256p[8]  = mulhi_int16_simd256(y256p[8],ONE_OVER_SQRT2_Q15_128);
+        y256p[9]  = mulhi_int16_simd256(y256p[9],ONE_OVER_SQRT2_Q15_128);
+        y256p[10] = mulhi_int16_simd256(y256p[10],ONE_OVER_SQRT2_Q15_128);
+        y256p[11] = mulhi_int16_simd256(y256p[11],ONE_OVER_SQRT2_Q15_128);
+        y256p[12] = mulhi_int16_simd256(y256p[12],ONE_OVER_SQRT2_Q15_128);
+        y256p[13] = mulhi_int16_simd256(y256p[13],ONE_OVER_SQRT2_Q15_128);
+        y256p[14] = mulhi_int16_simd256(y256p[14],ONE_OVER_SQRT2_Q15_128);
+        y256p[15] = mulhi_int16_simd256(y256p[15],ONE_OVER_SQRT2_Q15_128);
+        y256p+=16;
+      }
     }
   }
-
 }
 
 int16_t twa768[512],twb768[512];
 
 // 256 x 3
-void idft768(int16_t *input, int16_t *output, unsigned char scale)
+void idft768(int16_t *input, int16_t *output, unsigned int *scale) // scale may be NULL (no scaling); when set, scale+1 is forwarded to the idft256 calls
 {
   int i,i2,j;
   uint32_t tmp[3][256]__attribute__((aligned(32)));
@@ -3117,9 +3434,11 @@ void idft768(int16_t *input, int16_t *output, unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft256((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft256((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft256((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale256=NULL;
+  if (scale) scale256=scale+1;
+  idft256((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale256);
+  idft256((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale256);
+  idft256((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale256);
 
   for (i=0,i2=0; i<512; i+=8,i2+=4)  {
     ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]),
@@ -3128,7 +3447,7 @@ void idft768(int16_t *input, int16_t *output, unsigned char scale)
   }
 
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<12; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3152,7 +3471,7 @@ void idft768(int16_t *input, int16_t *output, unsigned char scale)
 
 }
 
-void dft768(int16_t *input, int16_t *output, unsigned char scale)
+void dft768(int16_t *input, int16_t *output, unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][256] __attribute__((aligned(32)));
@@ -3166,9 +3485,11 @@ void dft768(int16_t *input, int16_t *output, unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft256((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  dft256((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  dft256((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale256=NULL;
+  if (scale) scale256=scale+1;
+  dft256((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale256);
+  dft256((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale256);
+  dft256((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale256);
 
   /*
   for (i=1; i<512; i++) {
@@ -3189,7 +3510,7 @@ void dft768(int16_t *input, int16_t *output, unsigned char scale)
           (simd_q15_t*)(twa768+i),(simd_q15_t*)(twb768+i));
   }
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<12; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3215,7 +3536,7 @@ void dft768(int16_t *input, int16_t *output, unsigned char scale)
 int16_t twa1536[1024],twb1536[1024];
 
 // 512 x 3
-void idft1536(int16_t *input, int16_t *output, unsigned char scale)
+void idft1536(int16_t *input, int16_t *output, unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][512 ]__attribute__((aligned(32)));
@@ -3229,9 +3550,11 @@ void idft1536(int16_t *input, int16_t *output, unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft512((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft512((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft512((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale512=NULL;
+  if (scale) scale512=scale+1;
+  idft512((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale512);
+  idft512((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale512);
+  idft512((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale512);
 
   for (i=0,i2=0; i<1024; i+=8,i2+=4)  {
     ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]),
@@ -3240,7 +3563,7 @@ void idft1536(int16_t *input, int16_t *output, unsigned char scale)
   }
 
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<24; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3264,7 +3587,7 @@ void idft1536(int16_t *input, int16_t *output, unsigned char scale)
 
 }
 
-void dft1536(int16_t *input, int16_t *output, unsigned char scale)
+void dft1536(int16_t *input, int16_t *output, unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][512] __attribute__((aligned(32)));
@@ -3278,9 +3601,11 @@ void dft1536(int16_t *input, int16_t *output, unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft512((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  dft512((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  dft512((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale512=NULL;
+  if (scale) scale512=scale+1;
+  dft512((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale512);
+  dft512((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale512);
+  dft512((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale512);
 
   /*
   for (i=1; i<512; i++) {
@@ -3301,7 +3626,7 @@ void dft1536(int16_t *input, int16_t *output, unsigned char scale)
           (simd_q15_t*)(twa1536+i),(simd_q15_t*)(twb1536+i));
   }
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<24; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3328,7 +3653,7 @@ void dft1536(int16_t *input, int16_t *output, unsigned char scale)
 int16_t twa3072[2048] __attribute__((aligned(32)));
 int16_t twb3072[2048] __attribute__((aligned(32)));
 // 1024 x 3
-void dft3072(int16_t *input, int16_t *output,unsigned char scale)
+void dft3072(int16_t *input, int16_t *output,unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][1024] __attribute__((aligned(32)));
@@ -3342,9 +3667,11 @@ void dft3072(int16_t *input, int16_t *output,unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft1024((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  dft1024((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  dft1024((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale1024=NULL;
+  if (scale) scale1024=scale+1;
+  dft1024((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale1024);
+  dft1024((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale1024);
+  dft1024((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale1024);
 
   for (i=0,i2=0; i<2048; i+=8,i2+=4)  {
     bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]),
@@ -3352,7 +3679,7 @@ void dft3072(int16_t *input, int16_t *output,unsigned char scale)
           (simd_q15_t*)(twa3072+i),(simd_q15_t*)(twb3072+i));
   }
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<48; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3376,7 +3703,7 @@ void dft3072(int16_t *input, int16_t *output,unsigned char scale)
 
 }
 
-void idft3072(int16_t *input, int16_t *output,unsigned char scale)
+void idft3072(int16_t *input, int16_t *output,unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][1024]__attribute__((aligned(32)));
@@ -3389,9 +3716,11 @@ void idft3072(int16_t *input, int16_t *output,unsigned char scale)
     tmp[1][i] = ((uint32_t *)input)[j++];
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
-  idft1024((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft1024((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft1024((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale1024=NULL;
+  if (scale) scale1024=scale+1;
+  idft1024((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale1024);
+  idft1024((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale1024);
+  idft1024((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale1024);
 
   for (i=0,i2=0; i<2048; i+=8,i2+=4)  {
     ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]),
@@ -3400,7 +3729,7 @@ void idft3072(int16_t *input, int16_t *output,unsigned char scale)
   }
 
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<48; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3428,7 +3757,7 @@ void idft3072(int16_t *input, int16_t *output,unsigned char scale)
 int16_t twa6144[4096] __attribute__((aligned(32)));
 int16_t twb6144[4096] __attribute__((aligned(32)));
 
-void idft6144(int16_t *input, int16_t *output,unsigned char scale)
+void idft6144(int16_t *input, int16_t *output,unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][2048] __attribute__((aligned(32)));
@@ -3442,9 +3771,11 @@ void idft6144(int16_t *input, int16_t *output,unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft2048((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft2048((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft2048((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale2048=NULL;
+  if (scale) scale2048=scale+1;
+  idft2048((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale2048);
+  idft2048((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale2048);
+  idft2048((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale2048);
 #ifndef MR_MAIN
   if (LOG_DUMPFLAG(DEBUG_DFT)) {
     LOG_M("idft6144in.m","in",input,6144,1,1);
@@ -3460,7 +3791,7 @@ void idft6144(int16_t *input, int16_t *output,unsigned char scale)
   }
 
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<96; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3485,7 +3816,7 @@ void idft6144(int16_t *input, int16_t *output,unsigned char scale)
 }
 
 
-void dft6144(int16_t *input, int16_t *output,unsigned char scale)
+void dft6144(int16_t *input, int16_t *output,unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][2048] __attribute__((aligned(32)));
@@ -3499,9 +3830,11 @@ void dft6144(int16_t *input, int16_t *output,unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft2048((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  dft2048((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  dft2048((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale2048=NULL;
+  if (scale) scale2048=scale+1;
+  dft2048((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale2048);
+  dft2048((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale2048);
+  dft2048((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale2048);
 
   /*
   for (i=1; i<2048; i++) {
@@ -3522,7 +3855,7 @@ void dft6144(int16_t *input, int16_t *output,unsigned char scale)
           (simd_q15_t*)(twa6144+i),(simd_q15_t*)(twb6144+i));
   }
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<96; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3545,10 +3878,23 @@ void dft6144(int16_t *input, int16_t *output,unsigned char scale)
   }
 }
 
+int16_t twa9216[6144] __attribute__((aligned(32)));
+int16_t twb9216[6144] __attribute__((aligned(32)));
+// 3072 x 3 (presumably 9216 points split into three 3072-point transforms, as the other "N x 3" sizes here do) -- stub only
+void dft9216(int16_t *input, int16_t *output,uint32_t *scale) {
+  // Not implemented yet: input, output and scale are never read; the always-false check below fails on every call.
+  AssertFatal(1==0,"Need to do this ..\n");
+}
+
+void idft9216(int16_t *input, int16_t *output,uint32_t *scale) {
+  // Stub (presumably the inverse counterpart of dft9216): no argument is used; the always-false check below fails on every call.
+  AssertFatal(1==0,"Need to do this ..\n");
+}
+
 int16_t twa12288[8192] __attribute__((aligned(32)));
 int16_t twb12288[8192] __attribute__((aligned(32)));
 // 4096 x 3
-void dft12288(int16_t *input, int16_t *output,unsigned char scale)
+void dft12288(int16_t *input, int16_t *output,unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][4096] __attribute__((aligned(32)));
@@ -3562,9 +3908,11 @@ void dft12288(int16_t *input, int16_t *output,unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft4096((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale);
-  dft4096((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale);
-  dft4096((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale);
+  unsigned int *scale4096=NULL;
+  if (scale) scale4096=scale+1;
+  dft4096((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale4096);
+  dft4096((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale4096);
+  dft4096((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale4096);
   /*
   for (i=1; i<4096; i++) {
     tmpo[0][i] = tmpo[0][i<<1];
@@ -3584,7 +3932,7 @@ void dft12288(int16_t *input, int16_t *output,unsigned char scale)
           (simd_q15_t*)(twa12288+i),(simd_q15_t*)(twb12288+i));
   }
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<192; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3607,7 +3955,7 @@ void dft12288(int16_t *input, int16_t *output,unsigned char scale)
   }
 }
 
-void idft12288(int16_t *input, int16_t *output,unsigned char scale)
+void idft12288(int16_t *input, int16_t *output,unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][4096] __attribute__((aligned(32)));
@@ -3623,9 +3971,11 @@ void idft12288(int16_t *input, int16_t *output,unsigned char scale)
 
 
 
-  idft4096((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale);
-  idft4096((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale);
-  idft4096((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale);
+  unsigned int *scale4096=NULL;
+  if (scale) scale4096=scale+1;
+  idft4096((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale4096);
+  idft4096((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale4096);
+  idft4096((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale4096);
 #ifndef MR_MAIN
   if (LOG_DUMPFLAG(DEBUG_DFT)) {
     LOG_M("idft12288in.m","in",input,12288,1,1);
@@ -3640,7 +3990,7 @@ void idft12288(int16_t *input, int16_t *output,unsigned char scale)
           (simd_q15_t*)(twa12288+i),(simd_q15_t*)(twb12288+i));
   }
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<192; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3671,7 +4021,7 @@ void idft12288(int16_t *input, int16_t *output,unsigned char scale)
 int16_t twa18432[12288] __attribute__((aligned(32)));
 int16_t twb18432[12288] __attribute__((aligned(32)));
 // 6144 x 3
-void dft18432(int16_t *input, int16_t *output,unsigned char scale) {
+void dft18432(int16_t *input, int16_t *output,unsigned int *scale) {
 
   int i,i2,j;
   uint32_t tmp[3][6144] __attribute__((aligned(32)));
@@ -3685,16 +4035,18 @@ void dft18432(int16_t *input, int16_t *output,unsigned char scale) {
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft6144((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale);
-  dft6144((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale);
-  dft6144((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale);
+  unsigned int *scale6144=NULL;
+  if (scale) scale6144=scale+1;
+  dft6144((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale6144);
+  dft6144((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale6144);
+  dft6144((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale6144);
 
   for (i=0,i2=0; i<12288; i+=8,i2+=4)  {
     bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]),
           (simd_q15_t*)(output+i),(simd_q15_t*)(output+12288+i),(simd_q15_t*)(output+24576+i),
           (simd_q15_t*)(twa18432+i),(simd_q15_t*)(twb18432+i));
   }
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<288; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3717,7 +4069,7 @@ void dft18432(int16_t *input, int16_t *output,unsigned char scale) {
   }
 }
 
-void idft18432(int16_t *input, int16_t *output,unsigned char scale) {
+void idft18432(int16_t *input, int16_t *output,unsigned int *scale) {
 
   int i,i2,j;
   uint32_t tmp[3][6144] __attribute__((aligned(32)));
@@ -3731,16 +4083,18 @@ void idft18432(int16_t *input, int16_t *output,unsigned char scale) {
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft6144((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale);
-  idft6144((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale);
-  idft6144((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale);
+  unsigned int *scale6144=NULL;
+  if (scale) scale6144=scale+1;
+  idft6144((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale6144);
+  idft6144((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale6144);
+  idft6144((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale6144);
 
   for (i=0,i2=0; i<12288; i+=8,i2+=4)  {
     ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]),
 	   (simd_q15_t*)(output+i),(simd_q15_t*)(output+12288+i),(simd_q15_t*)(output+24576+i),
 	   (simd_q15_t*)(twa18432+i),(simd_q15_t*)(twb18432+i));
   }
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<288; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3767,7 +4121,7 @@ void idft18432(int16_t *input, int16_t *output,unsigned char scale) {
 int16_t twa24576[16384] __attribute__((aligned(32)));
 int16_t twb24576[16384] __attribute__((aligned(32)));
 // 8192 x 3
-void dft24576(int16_t *input, int16_t *output,unsigned char scale)
+void dft24576(int16_t *input, int16_t *output,unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][8192] __attribute__((aligned(32)));
@@ -3781,9 +4135,11 @@ void dft24576(int16_t *input, int16_t *output,unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft8192((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  dft8192((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  dft8192((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale8192=NULL;
+  if (scale) scale8192=scale+1;
+  dft8192((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale8192);
+  dft8192((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale8192);
+  dft8192((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale8192);
   /*
   for (i=1; i<8192; i++) {
     tmpo[0][i] = tmpo[0][i<<1];
@@ -3804,7 +4160,7 @@ void dft24576(int16_t *input, int16_t *output,unsigned char scale)
   }
 
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<384; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3832,7 +4188,7 @@ void dft24576(int16_t *input, int16_t *output,unsigned char scale)
 #endif
 }
 
-void idft24576(int16_t *input, int16_t *output,unsigned char scale)
+void idft24576(int16_t *input, int16_t *output,unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][8192] __attribute__((aligned(32)));
@@ -3846,9 +4202,11 @@ void idft24576(int16_t *input, int16_t *output,unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft8192((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft8192((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft8192((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale8192=NULL;
+  if (scale) scale8192=scale+1;
+  idft8192((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale8192);
+  idft8192((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale8192);
+  idft8192((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale8192);
  #ifndef MR_MAIN 
   if (LOG_DUMPFLAG(DEBUG_DFT)) {
     LOG_M("idft24576in.m","in",input,24576,1,1);
@@ -3862,7 +4220,7 @@ void idft24576(int16_t *input, int16_t *output,unsigned char scale)
           (simd_q15_t*)(output+i),(simd_q15_t*)(output+16384+i),(simd_q15_t*)(output+32768+i),
           (simd_q15_t*)(twa24576+i),(simd_q15_t*)(twb24576+i));
   }
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<384; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3894,7 +4252,7 @@ int16_t twa36864[24576] __attribute__((aligned(32)));
 int16_t twb36864[24576] __attribute__((aligned(32)));
 
 // 12288 x 3
-void dft36864(int16_t *input, int16_t *output,uint8_t scale) {
+void dft36864(int16_t *input, int16_t *output,uint32_t *scale) {
 
   int i,i2,j;
   uint32_t tmp[3][12288] __attribute__((aligned(32)));
@@ -3908,9 +4266,11 @@ void dft36864(int16_t *input, int16_t *output,uint8_t scale) {
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft12288((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  dft12288((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  dft12288((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale12288=NULL;
+  if (scale) scale12288=scale+1;
+  dft12288((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale12288);
+  dft12288((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale12288);
+  dft12288((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale12288);
 #ifndef MR_MAIN
   if (LOG_DUMPFLAG(DEBUG_DFT)) {
     LOG_M("dft36864out0.m","o0",tmpo[0],12288,1,1);
@@ -3924,7 +4284,7 @@ void dft36864(int16_t *input, int16_t *output,uint8_t scale) {
           (simd_q15_t*)(twa36864+i),(simd_q15_t*)(twb36864+i));
   }
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<576; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3952,7 +4312,7 @@ void dft36864(int16_t *input, int16_t *output,uint8_t scale) {
 #endif
 }
 
-void idft36864(int16_t *input, int16_t *output,uint8_t scale) {
+void idft36864(int16_t *input, int16_t *output,uint32_t *scale) {
 
   int i,i2,j;
   uint32_t tmp[3][12288] __attribute__((aligned(32)));
@@ -3966,16 +4326,18 @@ void idft36864(int16_t *input, int16_t *output,uint8_t scale) {
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft12288((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft12288((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft12288((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale12288=NULL;
+  if (scale) scale12288=scale+1;
+  idft12288((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale12288);
+  idft12288((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale12288);
+  idft12288((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale12288);
 
   for (i=0,i2=0; i<24576; i+=8,i2+=4)  {
     ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]),
           (simd_q15_t*)(output+i),(simd_q15_t*)(output+24576+i),(simd_q15_t*)(output+49152+i),
           (simd_q15_t*)(twa36864+i),(simd_q15_t*)(twb36864+i));
   }
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<576; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -4002,7 +4364,7 @@ int16_t twa49152[32768] __attribute__((aligned(32)));
 int16_t twb49152[32768] __attribute__((aligned(32)));
 
 // 16384 x 3
-void dft49152(int16_t *input, int16_t *output,uint8_t scale) {
+void dft49152(int16_t *input, int16_t *output,uint32_t *scale) {
 
   int i,i2,j;
   uint32_t tmp[3][16384] __attribute__((aligned(32)));
@@ -4016,16 +4378,18 @@ void dft49152(int16_t *input, int16_t *output,uint8_t scale) {
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft16384((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  dft16384((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  dft16384((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale16384=NULL;
+  if (scale) scale16384=scale+1;
+  dft16384((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale16384);
+  dft16384((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale16384);
+  dft16384((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale16384);
 
   for (i=0,i2=0; i<32768; i+=8,i2+=4)  {
     bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]),
           (simd_q15_t*)(output+i),(simd_q15_t*)(output+32768+i),(simd_q15_t*)(output+65536+i),
           (simd_q15_t*)(twa49152+i),(simd_q15_t*)(twb49152+i));
   }
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<768; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -4048,7 +4412,7 @@ void dft49152(int16_t *input, int16_t *output,uint8_t scale) {
   }
 }
 
-void idft49152(int16_t *input, int16_t *output,uint8_t scale) {
+void idft49152(int16_t *input, int16_t *output,uint32_t *scale) {
 
    int i,i2,j;
   uint32_t tmp[3][16384] __attribute__((aligned(32)));
@@ -4062,16 +4426,18 @@ void idft49152(int16_t *input, int16_t *output,uint8_t scale) {
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft16384((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft16384((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft16384((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale16384=NULL;
+  if (scale) scale16384=scale+1;
+  idft16384((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale16384);
+  idft16384((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale16384);
+  idft16384((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale16384);
 
   for (i=0,i2=0; i<32768; i+=8,i2+=4)  {
     ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]),
 	   (simd_q15_t*)(output+i),(simd_q15_t*)(output+32768+i),(simd_q15_t*)(output+65536+i),
 	   (simd_q15_t*)(twa49152+i),(simd_q15_t*)(twb49152+i));
   }
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<768; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -4096,7 +4462,7 @@ void idft49152(int16_t *input, int16_t *output,uint8_t scale) {
 
 int16_t tw65536[3*2*16384] __attribute__((aligned(32)));
 
-void idft65536(int16_t *x,int16_t *y,unsigned char scale)
+void idft65536(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd256_q15_t xtmp[8192],ytmp[8192],*tw65536_256p=(simd256_q15_t *)tw65536,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y;
@@ -4108,10 +4474,12 @@ void idft65536(int16_t *x,int16_t *y,unsigned char scale)
   }
 
 
-  idft16384((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  idft16384((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),1);
-  idft16384((int16_t*)(xtmp+4096),(int16_t*)(ytmp+4096),1);
-  idft16384((int16_t*)(xtmp+6144),(int16_t*)(ytmp+6144),1);
+  unsigned int *scale16384=NULL;
+  if (scale) scale16384=scale+1;
+  idft16384((int16_t*)(xtmp),(int16_t*)(ytmp),scale16384);
+  idft16384((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),scale16384);
+  idft16384((int16_t*)(xtmp+4096),(int16_t*)(ytmp+4096),scale16384);
+  idft16384((int16_t*)(xtmp+6144),(int16_t*)(ytmp+6144),scale16384);
 
   for (i=0; i<2048; i++) {
     ibfly4_256(ytmpp,ytmpp+2048,ytmpp+4096,ytmpp+6144,
@@ -4122,25 +4490,25 @@ void idft65536(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
-
+  if (scale && *scale>0) {
+    unsigned int scalec=*scale;
     for (i=0; i<512; i++) {
-      y256[0]  = shiftright_int16_simd256(y256[0],scale);
-      y256[1]  = shiftright_int16_simd256(y256[1],scale);
-      y256[2]  = shiftright_int16_simd256(y256[2],scale);
-      y256[3]  = shiftright_int16_simd256(y256[3],scale);
-      y256[4]  = shiftright_int16_simd256(y256[4],scale);
-      y256[5]  = shiftright_int16_simd256(y256[5],scale);
-      y256[6]  = shiftright_int16_simd256(y256[6],scale);
-      y256[7]  = shiftright_int16_simd256(y256[7],scale);
-      y256[8]  = shiftright_int16_simd256(y256[8],scale);
-      y256[9]  = shiftright_int16_simd256(y256[9],scale);
-      y256[10] = shiftright_int16_simd256(y256[10],scale);
-      y256[11] = shiftright_int16_simd256(y256[11],scale);
-      y256[12] = shiftright_int16_simd256(y256[12],scale);
-      y256[13] = shiftright_int16_simd256(y256[13],scale);
-      y256[14] = shiftright_int16_simd256(y256[14],scale);
-      y256[15] = shiftright_int16_simd256(y256[15],scale);
+      y256[0]  = shiftright_int16_simd256(y256[0],scalec);
+      y256[1]  = shiftright_int16_simd256(y256[1],scalec);
+      y256[2]  = shiftright_int16_simd256(y256[2],scalec);
+      y256[3]  = shiftright_int16_simd256(y256[3],scalec);
+      y256[4]  = shiftright_int16_simd256(y256[4],scalec);
+      y256[5]  = shiftright_int16_simd256(y256[5],scalec);
+      y256[6]  = shiftright_int16_simd256(y256[6],scalec);
+      y256[7]  = shiftright_int16_simd256(y256[7],scalec);
+      y256[8]  = shiftright_int16_simd256(y256[8],scalec);
+      y256[9]  = shiftright_int16_simd256(y256[9],scalec);
+      y256[10] = shiftright_int16_simd256(y256[10],scalec);
+      y256[11] = shiftright_int16_simd256(y256[11],scalec);
+      y256[12] = shiftright_int16_simd256(y256[12],scalec);
+      y256[13] = shiftright_int16_simd256(y256[13],scalec);
+      y256[14] = shiftright_int16_simd256(y256[14],scalec);
+      y256[15] = shiftright_int16_simd256(y256[15],scalec);
 
       y256+=16;
     }
@@ -4149,10 +4517,24 @@ void idft65536(int16_t *x,int16_t *y,unsigned char scale)
 
 }
 
+int16_t twa73728[49152] __attribute__((aligned(32)));
+int16_t twb73728[49152] __attribute__((aligned(32)));
+// 24576 x 3 (presumably 73728 points split into three 24576-point transforms, as the other "N x 3" sizes here do) -- stub only
+void dft73728(int16_t *input, int16_t *output,uint32_t *scale) {
+  // Not implemented yet: input, output and scale are never read; the always-false check below fails on every call.
+  AssertFatal(1==0,"Need to do this ..\n");
+}
+
+void idft73728(int16_t *input, int16_t *output,uint32_t *scale) {
+  // Stub (presumably the inverse counterpart of dft73728): no argument is used; the always-false check below fails on every call.
+  AssertFatal(1==0,"Need to do this ..\n");
+}
+
+
 int16_t twa98304[65536] __attribute__((aligned(32)));
 int16_t twb98304[65536] __attribute__((aligned(32)));
 // 32768 x 3
-void dft98304(int16_t *input, int16_t *output,uint8_t scale) {
+void dft98304(int16_t *input, int16_t *output,uint32_t *scale) {
 
   int i,i2,j;
   uint32_t tmp[3][32768] __attribute__((aligned(32)));
@@ -4166,16 +4548,18 @@ void dft98304(int16_t *input, int16_t *output,uint8_t scale) {
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft32768((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  dft32768((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  dft32768((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale32768=NULL;
+  if (scale) scale32768=scale+1;
+  dft32768((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale32768);
+  dft32768((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale32768);
+  dft32768((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale32768);
 
   for (i=0,i2=0; i<65536; i+=8,i2+=4)  {
     bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]),
           (simd_q15_t*)(output+i),(simd_q15_t*)(output+65536+i),(simd_q15_t*)(output+131072+i),
           (simd_q15_t*)(twa98304+i),(simd_q15_t*)(twb98304+i));
   }
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<1536; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -4198,7 +4582,7 @@ void dft98304(int16_t *input, int16_t *output,uint8_t scale) {
   }
 }
 
-void idft98304(int16_t *input, int16_t *output,uint8_t scale) {
+void idft98304(int16_t *input, int16_t *output,uint32_t *scale) {
 
   int i,i2,j;
   uint32_t tmp[3][32768] __attribute__((aligned(32)));
@@ -4212,16 +4596,18 @@ void idft98304(int16_t *input, int16_t *output,uint8_t scale) {
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft32768((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft32768((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft32768((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale32768=NULL;
+  if (scale) scale32768=scale+1;
+  idft32768((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale32768);
+  idft32768((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale32768);
+  idft32768((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale32768);
 
   for (i=0,i2=0; i<65536; i+=8,i2+=4)  {
     ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]),
 	   (simd_q15_t*)(output+i),(simd_q15_t*)(output+65536+i),(simd_q15_t*)(output+131072+i),
 	   (simd_q15_t*)(twa98304+i),(simd_q15_t*)(twb98304+i));
   }
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<1536; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -4366,7 +4752,7 @@ __attribute__((always_inline)) static inline void dft12f(simd_q15_t *x0,
 
 
 
-void dft12(int16_t *x,int16_t *y ,unsigned char scale_flag)
+void dft12(int16_t *x,int16_t *y ,unsigned int *scale_flag)
 {
 
   simd_q15_t *x128 = (simd_q15_t *)x,*y128 = (simd_q15_t *)y;
@@ -4568,7 +4954,7 @@ void dft12_simd256(int16_t *x,int16_t *y)
 
 static int16_t tw24[88]__attribute__((aligned(32)));
 
-void dft24(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft24(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -4648,7 +5034,7 @@ void dft24(int16_t *x,int16_t *y,unsigned char scale_flag)
     //    msg("dft24e\n");
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[1]);
 
     for (i=0; i<24; i++) {
@@ -4661,7 +5047,7 @@ void dft24(int16_t *x,int16_t *y,unsigned char scale_flag)
 static int16_t twa36[88]__attribute__((aligned(32)));
 static int16_t twb36[88]__attribute__((aligned(32)));
 
-void dft36(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft36(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -4768,7 +5154,7 @@ void dft36(int16_t *x,int16_t *y,unsigned char scale_flag)
           twb128+k);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[2]);
 
     for (i=0; i<36; i++) {
@@ -4782,7 +5168,7 @@ static int16_t twa48[88]__attribute__((aligned(32)));
 static int16_t twb48[88]__attribute__((aligned(32)));
 static int16_t twc48[88]__attribute__((aligned(32)));
 
-void dft48(int16_t *x, int16_t *y,unsigned char scale_flag)
+void dft48(int16_t *x, int16_t *y,unsigned int *scale_flag)
 {
 
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -4926,7 +5312,7 @@ void dft48(int16_t *x, int16_t *y,unsigned char scale_flag)
 
   }
 
-  if (scale_flag == 1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[3]);
 
     for (i=0; i<48; i++) {
@@ -4941,7 +5327,7 @@ static int16_t twb60[88]__attribute__((aligned(32)));
 static int16_t twc60[88]__attribute__((aligned(32)));
 static int16_t twd60[88]__attribute__((aligned(32)));
 
-void dft60(int16_t *x,int16_t *y,unsigned char scale)
+void dft60(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5107,7 +5493,7 @@ void dft60(int16_t *x,int16_t *y,unsigned char scale)
           twd128+k);
   }
 
-  if (scale == 1) {
+  if (scale) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[4]);
 
     for (i=0; i<60; i++) {
@@ -5120,7 +5506,7 @@ void dft60(int16_t *x,int16_t *y,unsigned char scale)
 
 static int16_t tw72[280]__attribute__((aligned(32)));
 
-void dft72(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft72(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -5136,8 +5522,8 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+36] = x128[j+1];  // odd inputs
   }
 
-  dft36((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft36((int16_t *)(x2128+36),(int16_t *)(ytmp128+36),1);
+  dft36((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft36((int16_t *)(x2128+36),(int16_t *)(ytmp128+36),scale_flag);
 
   bfly2_tw1(ytmp128,ytmp128+36,y128,y128+36);
 
@@ -5149,7 +5535,7 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag)
           tw128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[5]);
 
     for (i=0; i<72; i++) {
@@ -5161,7 +5547,7 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag)
 
 static int16_t tw96[376]__attribute__((aligned(32)));
 
-void dft96(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft96(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
 
@@ -5192,7 +5578,7 @@ void dft96(int16_t *x,int16_t *y,unsigned char scale_flag)
           tw128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[6]);
 
     for (i=0; i<96; i++) {
@@ -5205,7 +5591,7 @@ void dft96(int16_t *x,int16_t *y,unsigned char scale_flag)
 static int16_t twa108[280]__attribute__((aligned(32)));
 static int16_t twb108[280]__attribute__((aligned(32)));
 
-void dft108(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft108(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5240,7 +5626,7 @@ void dft108(int16_t *x,int16_t *y,unsigned char scale_flag)
 
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[7]);
 
     for (i=0; i<108; i++) {
@@ -5251,7 +5637,7 @@ void dft108(int16_t *x,int16_t *y,unsigned char scale_flag)
 }
 
 static int16_t tw120[472]__attribute__((aligned(32)));
-void dft120(int16_t *x,int16_t *y, unsigned char scale_flag)
+void dft120(int16_t *x,int16_t *y, unsigned int *scale_flag)
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5279,7 +5665,7 @@ void dft120(int16_t *x,int16_t *y, unsigned char scale_flag)
           tw128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[8]);
 
     for (i=0; i<120; i++) {
@@ -5292,7 +5678,7 @@ void dft120(int16_t *x,int16_t *y, unsigned char scale_flag)
 static int16_t twa144[376]__attribute__((aligned(32)));
 static int16_t twb144[376]__attribute__((aligned(32)));
 
-void dft144(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft144(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5310,9 +5696,9 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+96] = x128[j+2];
   }
 
-  dft48((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft48((int16_t *)(x2128+48),(int16_t *)(ytmp128+48),1);
-  dft48((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),1);
+  dft48((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft48((int16_t *)(x2128+48),(int16_t *)(ytmp128+48),scale_flag);
+  dft48((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+48,ytmp128+96,y128,y128+48,y128+96);
 
@@ -5327,7 +5713,7 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag)
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[9]);
 
     for (i=0; i<144; i++) {
@@ -5340,7 +5726,7 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag)
 static int16_t twa180[472]__attribute__((aligned(32)));
 static int16_t twb180[472]__attribute__((aligned(32)));
 
-void dft180(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft180(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -5359,9 +5745,9 @@ void dft180(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+120] = x128[j+2];
   }
 
-  dft60((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),1);
-  dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1);
+  dft60((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),scale_flag);
+  dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+60,ytmp128+120,y128,y128+60,y128+120);
 
@@ -5376,7 +5762,7 @@ void dft180(int16_t *x,int16_t *y,unsigned char scale_flag)
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[10]);
 
     for (i=0; i<180; i++) {
@@ -5390,7 +5776,7 @@ static int16_t twa192[376]__attribute__((aligned(32)));
 static int16_t twb192[376]__attribute__((aligned(32)));
 static int16_t twc192[376]__attribute__((aligned(32)));
 
-void dft192(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft192(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -5411,10 +5797,10 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+144] = x128[j+3];
   }
 
-  dft48((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft48((int16_t *)(x2128+48),(int16_t *)(ytmp128+48),1);
-  dft48((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),1);
-  dft48((int16_t *)(x2128+144),(int16_t *)(ytmp128+144),1);
+  dft48((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft48((int16_t *)(x2128+48),(int16_t *)(ytmp128+48),scale_flag);
+  dft48((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),scale_flag);
+  dft48((int16_t *)(x2128+144),(int16_t *)(ytmp128+144),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+48,ytmp128+96,ytmp128+144,y128,y128+48,y128+96,y128+144);
 
@@ -5432,7 +5818,7 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag)
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[11]);
 
     for (i=0; i<192; i++) {
@@ -5445,7 +5831,7 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag)
 static int16_t twa216[568]__attribute__((aligned(32)));
 static int16_t twb216[568]__attribute__((aligned(32)));
 
-void dft216(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft216(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -5464,9 +5850,9 @@ void dft216(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+144] = x128[j+2];
   }
 
-  dft72((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft72((int16_t *)(x2128+72),(int16_t *)(ytmp128+72),1);
-  dft72((int16_t *)(x2128+144),(int16_t *)(ytmp128+144),1);
+  dft72((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft72((int16_t *)(x2128+72),(int16_t *)(ytmp128+72),scale_flag);
+  dft72((int16_t *)(x2128+144),(int16_t *)(ytmp128+144),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+72,ytmp128+144,y128,y128+72,y128+144);
 
@@ -5481,7 +5867,7 @@ void dft216(int16_t *x,int16_t *y,unsigned char scale_flag)
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[12]);
 
     for (i=0; i<216; i++) {
@@ -5495,7 +5881,7 @@ static int16_t twa240[472]__attribute__((aligned(32)));
 static int16_t twb240[472]__attribute__((aligned(32)));
 static int16_t twc240[472]__attribute__((aligned(32)));
 
-void dft240(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft240(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -5516,10 +5902,10 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+180] = x128[j+3];
   }
 
-  dft60((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),1);
-  dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1);
-  dft60((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),1);
+  dft60((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),scale_flag);
+  dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag);
+  dft60((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+60,ytmp128+120,ytmp128+180,y128,y128+60,y128+120,y128+180);
 
@@ -5537,7 +5923,7 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag)
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[13]);
 
     for (i=0; i<240; i++) {
@@ -5550,7 +5936,7 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag)
 static int16_t twa288[760]__attribute__((aligned(32)));
 static int16_t twb288[760]__attribute__((aligned(32)));
 
-void dft288(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft288(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -5569,9 +5955,9 @@ void dft288(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+192] = x128[j+2];
   }
 
-  dft96((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft96((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),1);
-  dft96((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),1);
+  dft96((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft96((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),scale_flag);
+  dft96((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+96,ytmp128+192,y128,y128+96,y128+192);
 
@@ -5586,7 +5972,7 @@ void dft288(int16_t *x,int16_t *y,unsigned char scale_flag)
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<288; i++) {
@@ -5601,7 +5987,7 @@ static int16_t twb300[472]__attribute__((aligned(32)));
 static int16_t twc300[472]__attribute__((aligned(32)));
 static int16_t twd300[472]__attribute__((aligned(32)));
 
-void dft300(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft300(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -5624,11 +6010,11 @@ void dft300(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+240] = x128[j+4];
   }
 
-  dft60((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),1);
-  dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1);
-  dft60((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),1);
-  dft60((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),1);
+  dft60((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),scale_flag);
+  dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag);
+  dft60((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),scale_flag);
+  dft60((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),scale_flag);
 
   bfly5_tw1(ytmp128,ytmp128+60,ytmp128+120,ytmp128+180,ytmp128+240,y128,y128+60,y128+120,y128+180,y128+240);
 
@@ -5649,7 +6035,7 @@ void dft300(int16_t *x,int16_t *y,unsigned char scale_flag)
           twd128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[15]);
 
     for (i=0; i<300; i++) {
@@ -5662,7 +6048,7 @@ void dft300(int16_t *x,int16_t *y,unsigned char scale_flag)
 static int16_t twa324[107*2*4];
 static int16_t twb324[107*2*4];
 
-void dft324(int16_t *x,int16_t *y,unsigned char scale_flag)  // 108 x 3
+void dft324(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 108 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5680,9 +6066,9 @@ void dft324(int16_t *x,int16_t *y,unsigned char scale_flag)  // 108 x 3
     x2128[i+216] = x128[j+2];
   }
 
-  dft108((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft108((int16_t *)(x2128+108),(int16_t *)(ytmp128+108),1);
-  dft108((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),1);
+  dft108((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft108((int16_t *)(x2128+108),(int16_t *)(ytmp128+108),scale_flag);
+  dft108((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+108,ytmp128+216,y128,y128+108,y128+216);
 
@@ -5697,7 +6083,7 @@ void dft324(int16_t *x,int16_t *y,unsigned char scale_flag)  // 108 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<324; i++) {
@@ -5710,7 +6096,7 @@ void dft324(int16_t *x,int16_t *y,unsigned char scale_flag)  // 108 x 3
 static int16_t twa360[119*2*4];
 static int16_t twb360[119*2*4];
 
-void dft360(int16_t *x,int16_t *y,unsigned char scale_flag)  // 120 x 3
+void dft360(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 120 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5728,9 +6114,9 @@ void dft360(int16_t *x,int16_t *y,unsigned char scale_flag)  // 120 x 3
     x2128[i+240] = x128[j+2];
   }
 
-  dft120((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft120((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1);
-  dft120((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),1);
+  dft120((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft120((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag);
+  dft120((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+120,ytmp128+240,y128,y128+120,y128+240);
 
@@ -5745,7 +6131,7 @@ void dft360(int16_t *x,int16_t *y,unsigned char scale_flag)  // 120 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<360; i++) {
@@ -5759,7 +6145,7 @@ static int16_t twa384[95*2*4];
 static int16_t twb384[95*2*4];
 static int16_t twc384[95*2*4];
 
-void dft384(int16_t *x,int16_t *y,unsigned char scale_flag)  // 96 x 4
+void dft384(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 96 x 4
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5779,10 +6165,10 @@ void dft384(int16_t *x,int16_t *y,unsigned char scale_flag)  // 96 x 4
     x2128[i+288] = x128[j+3];
   }
 
-  dft96((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft96((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),1);
-  dft96((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),1);
-  dft96((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),1);
+  dft96((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft96((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),scale_flag);
+  dft96((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),scale_flag);
+  dft96((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+96,ytmp128+192,ytmp128+288,y128,y128+96,y128+192,y128+288);
 
@@ -5800,7 +6186,7 @@ void dft384(int16_t *x,int16_t *y,unsigned char scale_flag)  // 96 x 4
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(16384); // dft_norm_table[13]);
 
     for (i=0; i<384; i++) {
@@ -5814,7 +6200,7 @@ static int16_t twa432[107*2*4];
 static int16_t twb432[107*2*4];
 static int16_t twc432[107*2*4];
 
-void dft432(int16_t *x,int16_t *y,unsigned char scale_flag)  // 108 x 4
+void dft432(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 108 x 4
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5833,10 +6219,10 @@ void dft432(int16_t *x,int16_t *y,unsigned char scale_flag)  // 108 x 4
     x2128[i+324] = x128[j+3];
   }
 
-  dft108((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft108((int16_t *)(x2128+108),(int16_t *)(ytmp128+108),1);
-  dft108((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),1);
-  dft108((int16_t *)(x2128+324),(int16_t *)(ytmp128+324),1);
+  dft108((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft108((int16_t *)(x2128+108),(int16_t *)(ytmp128+108),scale_flag);
+  dft108((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),scale_flag);
+  dft108((int16_t *)(x2128+324),(int16_t *)(ytmp128+324),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+108,ytmp128+216,ytmp128+324,y128,y128+108,y128+216,y128+324);
 
@@ -5854,7 +6240,7 @@ void dft432(int16_t *x,int16_t *y,unsigned char scale_flag)  // 108 x 4
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(16384); // dft_norm_table[13]);
 
     for (i=0; i<432; i++) {
@@ -5867,7 +6253,7 @@ static int16_t twa480[119*2*4];
 static int16_t twb480[119*2*4];
 static int16_t twc480[119*2*4];
 
-void dft480(int16_t *x,int16_t *y,unsigned char scale_flag)  // 120 x 4
+void dft480(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 120 x 4
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5887,10 +6273,10 @@ void dft480(int16_t *x,int16_t *y,unsigned char scale_flag)  // 120 x 4
     x2128[i+360] = x128[j+3];
   }
 
-  dft120((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft120((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1);
-  dft120((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),1);
-  dft120((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),1);
+  dft120((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft120((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag);
+  dft120((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),scale_flag);
+  dft120((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+120,ytmp128+240,ytmp128+360,y128,y128+120,y128+240,y128+360);
 
@@ -5908,7 +6294,7 @@ void dft480(int16_t *x,int16_t *y,unsigned char scale_flag)  // 120 x 4
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(16384); // dft_norm_table[13]);
 
     for (i=0; i<480; i++) {
@@ -5922,7 +6308,7 @@ void dft480(int16_t *x,int16_t *y,unsigned char scale_flag)  // 120 x 4
 static int16_t twa540[179*2*4];
 static int16_t twb540[179*2*4];
 
-void dft540(int16_t *x,int16_t *y,unsigned char scale_flag)  // 180 x 3
+void dft540(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 180 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5940,9 +6326,9 @@ void dft540(int16_t *x,int16_t *y,unsigned char scale_flag)  // 180 x 3
     x2128[i+360] = x128[j+2];
   }
 
-  dft180((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft180((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),1);
-  dft180((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),1);
+  dft180((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft180((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),scale_flag);
+  dft180((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+180,ytmp128+360,y128,y128+180,y128+360);
 
@@ -5957,7 +6343,7 @@ void dft540(int16_t *x,int16_t *y,unsigned char scale_flag)  // 180 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<540; i++) {
@@ -5970,7 +6356,7 @@ void dft540(int16_t *x,int16_t *y,unsigned char scale_flag)  // 180 x 3
 static int16_t twa576[191*2*4];
 static int16_t twb576[191*2*4];
 
-void dft576(int16_t *x,int16_t *y,unsigned char scale_flag)  // 192 x 3
+void dft576(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 192 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5989,9 +6375,9 @@ void dft576(int16_t *x,int16_t *y,unsigned char scale_flag)  // 192 x 3
   }
 
 
-  dft192((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft192((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),1);
-  dft192((int16_t *)(x2128+384),(int16_t *)(ytmp128+384),1);
+  dft192((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft192((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),scale_flag);
+  dft192((int16_t *)(x2128+384),(int16_t *)(ytmp128+384),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+192,ytmp128+384,y128,y128+192,y128+384);
 
@@ -6006,7 +6392,7 @@ void dft576(int16_t *x,int16_t *y,unsigned char scale_flag)  // 192 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<576; i++) {
@@ -6019,7 +6405,7 @@ void dft576(int16_t *x,int16_t *y,unsigned char scale_flag)  // 192 x 3
 
 static int16_t twa600[299*2*4];
 
-void dft600(int16_t *x,int16_t *y,unsigned char scale_flag)  // 300 x 2
+void dft600(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 300 x 2
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6034,8 +6420,8 @@ void dft600(int16_t *x,int16_t *y,unsigned char scale_flag)  // 300 x 2
     x2128[i+300] = x128[j+1];
   }
 
-  dft300((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),1);
+  dft300((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),scale_flag);
 
 
   bfly2_tw1(ytmp128,ytmp128+300,y128,y128+300);
@@ -6048,7 +6434,7 @@ void dft600(int16_t *x,int16_t *y,unsigned char scale_flag)  // 300 x 2
           tw128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(ONE_OVER_SQRT2_Q15);
 
     for (i=0; i<600; i++) {
@@ -6062,7 +6448,7 @@ void dft600(int16_t *x,int16_t *y,unsigned char scale_flag)  // 300 x 2
 static int16_t twa648[215*2*4];
 static int16_t twb648[215*2*4];
 
-void dft648(int16_t *x,int16_t *y,unsigned char scale_flag)  // 216 x 3
+void dft648(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 216 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6080,9 +6466,9 @@ void dft648(int16_t *x,int16_t *y,unsigned char scale_flag)  // 216 x 3
     x2128[i+432] = x128[j+2];
   }
 
-  dft216((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft216((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),1);
-  dft216((int16_t *)(x2128+432),(int16_t *)(ytmp128+432),1);
+  dft216((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft216((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),scale_flag);
+  dft216((int16_t *)(x2128+432),(int16_t *)(ytmp128+432),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+216,ytmp128+432,y128,y128+216,y128+432);
 
@@ -6097,7 +6483,7 @@ void dft648(int16_t *x,int16_t *y,unsigned char scale_flag)  // 216 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<648; i++) {
@@ -6113,7 +6499,7 @@ static int16_t twb720[179*2*4];
 static int16_t twc720[179*2*4];
 
 
-void dft720(int16_t *x,int16_t *y,unsigned char scale_flag)  // 180 x 4
+void dft720(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 180 x 4
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6133,10 +6519,10 @@ void dft720(int16_t *x,int16_t *y,unsigned char scale_flag)  // 180 x 4
     x2128[i+540] = x128[j+3];
   }
 
-  dft180((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft180((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),1);
-  dft180((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),1);
-  dft180((int16_t *)(x2128+540),(int16_t *)(ytmp128+540),1);
+  dft180((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft180((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),scale_flag);
+  dft180((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),scale_flag);
+  dft180((int16_t *)(x2128+540),(int16_t *)(ytmp128+540),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+180,ytmp128+360,ytmp128+540,y128,y128+180,y128+360,y128+540);
 
@@ -6154,7 +6540,7 @@ void dft720(int16_t *x,int16_t *y,unsigned char scale_flag)  // 180 x 4
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(16384); // dft_norm_table[13]);
 
     for (i=0; i<720; i++) {
@@ -6168,7 +6554,7 @@ static int16_t twa768p[191*2*4];
 static int16_t twb768p[191*2*4];
 static int16_t twc768p[191*2*4];
 
-void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4;
+void dft768p(int16_t *x,int16_t *y,unsigned int *scale_flag) { // 192x 4;
 
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6188,10 +6574,10 @@ void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4;
     x2128[i+576] = x128[j+3];
   }
 
-  dft192((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft192((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),1);
-  dft192((int16_t *)(x2128+384),(int16_t *)(ytmp128+384),1);
-  dft192((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),1);
+  dft192((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft192((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),scale_flag);
+  dft192((int16_t *)(x2128+384),(int16_t *)(ytmp128+384),scale_flag);
+  dft192((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+192,ytmp128+384,ytmp128+576,y128,y128+192,y128+384,y128+576);
 
@@ -6209,7 +6595,7 @@ void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4;
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(16384); // dft_norm_table[13]);
 
     for (i=0; i<768; i++) {
@@ -6222,7 +6608,7 @@ void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4;
 static int16_t twa384i[256];
 static int16_t twb384i[256];
 // 128 x 3
-void idft384(int16_t *input, int16_t *output, unsigned char scale)
+void idft384(int16_t *input, int16_t *output, unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][128]__attribute__((aligned(32)));
@@ -6236,9 +6622,9 @@ void idft384(int16_t *input, int16_t *output, unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft128((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft128((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft128((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  idft128((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale);
+  idft128((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale);
+  idft128((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale);
 
   for (i=0,i2=0; i<256; i+=8,i2+=4)  {
     ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]),
@@ -6247,7 +6633,7 @@ void idft384(int16_t *input, int16_t *output, unsigned char scale)
   }
 
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<6; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -6275,7 +6661,7 @@ void idft384(int16_t *input, int16_t *output, unsigned char scale)
 static int16_t twa864[287*2*4];
 static int16_t twb864[287*2*4];
 
-void dft864(int16_t *x,int16_t *y,unsigned char scale_flag)  // 288 x 3
+void dft864(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 288 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6293,9 +6679,9 @@ void dft864(int16_t *x,int16_t *y,unsigned char scale_flag)  // 288 x 3
     x2128[i+576] = x128[j+2];
   }
 
-  dft288((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft288((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),1);
-  dft288((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),1);
+  dft288((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft288((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),scale_flag);
+  dft288((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+288,ytmp128+576,y128,y128+288,y128+576);
 
@@ -6310,7 +6696,7 @@ void dft864(int16_t *x,int16_t *y,unsigned char scale_flag)  // 288 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<864; i++) {
@@ -6323,7 +6709,7 @@ void dft864(int16_t *x,int16_t *y,unsigned char scale_flag)  // 288 x 3
 static int16_t twa900[299*2*4];
 static int16_t twb900[299*2*4];
 
-void dft900(int16_t *x,int16_t *y,unsigned char scale_flag)  // 300 x 3
+void dft900(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 300 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6341,9 +6727,9 @@ void dft900(int16_t *x,int16_t *y,unsigned char scale_flag)  // 300 x 3
     x2128[i+600] = x128[j+2];
   }
 
-  dft300((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),1);
-  dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1);
+  dft300((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),scale_flag);
+  dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+300,ytmp128+600,y128,y128+300,y128+600);
 
@@ -6358,7 +6744,7 @@ void dft900(int16_t *x,int16_t *y,unsigned char scale_flag)  // 300 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<900; i++) {
@@ -6374,7 +6760,7 @@ static int16_t twb960[239*2*4];
 static int16_t twc960[239*2*4];
 
 
-void dft960(int16_t *x,int16_t *y,unsigned char scale_flag)  // 240 x 4
+void dft960(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 240 x 4
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6394,10 +6780,10 @@ void dft960(int16_t *x,int16_t *y,unsigned char scale_flag)  // 240 x 4
     x2128[i+720] = x128[j+3];
   }
 
-  dft240((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft240((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),1);
-  dft240((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),1);
-  dft240((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),1);
+  dft240((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft240((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),scale_flag);
+  dft240((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),scale_flag);
+  dft240((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+240,ytmp128+480,ytmp128+720,y128,y128+240,y128+480,y128+720);
 
@@ -6415,7 +6801,7 @@ void dft960(int16_t *x,int16_t *y,unsigned char scale_flag)  // 240 x 4
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(16384); // dft_norm_table[13]);
 
     for (i=0; i<960; i++) {
@@ -6429,7 +6815,7 @@ void dft960(int16_t *x,int16_t *y,unsigned char scale_flag)  // 240 x 4
 static int16_t twa972[323*2*4];
 static int16_t twb972[323*2*4];
 
-void dft972(int16_t *x,int16_t *y,unsigned char scale_flag)  // 324 x 3
+void dft972(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 324 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6447,9 +6833,9 @@ void dft972(int16_t *x,int16_t *y,unsigned char scale_flag)  // 324 x 3
     x2128[i+648] = x128[j+2];
   }
 
-  dft324((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft324((int16_t *)(x2128+324),(int16_t *)(ytmp128+324),1);
-  dft324((int16_t *)(x2128+648),(int16_t *)(ytmp128+648),1);
+  dft324((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft324((int16_t *)(x2128+324),(int16_t *)(ytmp128+324),scale_flag);
+  dft324((int16_t *)(x2128+648),(int16_t *)(ytmp128+648),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+324,ytmp128+648,y128,y128+324,y128+648);
 
@@ -6464,7 +6850,7 @@ void dft972(int16_t *x,int16_t *y,unsigned char scale_flag)  // 324 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<972; i++) {
@@ -6477,7 +6863,7 @@ void dft972(int16_t *x,int16_t *y,unsigned char scale_flag)  // 324 x 3
 static int16_t twa1080[359*2*4];
 static int16_t twb1080[359*2*4];
 
-void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag)  // 360 x 3
+void dft1080(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 360 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6495,9 +6881,9 @@ void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag)  // 360 x 3
     x2128[i+720] = x128[j+2];
   }
 
-  dft360((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft360((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),1);
-  dft360((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),1);
+  dft360((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft360((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),scale_flag);
+  dft360((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+360,ytmp128+720,y128,y128+360,y128+720);
 
@@ -6512,7 +6898,7 @@ void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag)  // 360 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<1080; i++) {
@@ -6526,7 +6912,7 @@ static int16_t twa1152[287*2*4];
 static int16_t twb1152[287*2*4];
 static int16_t twc1152[287*2*4];
 
-void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag)  // 288 x 4
+void dft1152(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 288 x 4
 {
 
   int i,j;
@@ -6547,10 +6933,10 @@ void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag)  // 288 x 4
     x2128[i+864] = x128[j+3];
   }
 
-  dft288((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft288((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),1);
-  dft288((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),1);
-  dft288((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),1);
+  dft288((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft288((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),scale_flag);
+  dft288((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),scale_flag);
+  dft288((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+288,ytmp128+576,ytmp128+864,y128,y128+288,y128+576,y128+864);
 
@@ -6568,7 +6954,7 @@ void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag)  // 288 x 4
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(16384); // dft_norm_table[13]);
 
     for (i=0; i<1152; i++) {
@@ -6582,7 +6968,7 @@ int16_t twa1200[4784];
 int16_t twb1200[4784];
 int16_t twc1200[4784];
 
-void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft1200(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -6603,10 +6989,10 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+900] = x128[j+3];
   }
 
-  dft300((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),1);
-  dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1);
-  dft300((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),1);
+  dft300((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),scale_flag);
+  dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag);
+  dft300((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+300,ytmp128+600,ytmp128+900,y128,y128+300,y128+600,y128+900);
 
@@ -6624,7 +7010,7 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag)
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(16384); // dft_norm_table[13]);
     for (i=0; i<1200; i++) {
       y128[i] = mulhi_int16(y128[i],norm128);
@@ -6637,7 +7023,7 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag)
 static int16_t twa1296[431*2*4];
 static int16_t twb1296[431*2*4];
 
-void dft1296(int16_t *x,int16_t *y,unsigned char scale_flag) //432 * 3
+void dft1296(int16_t *x,int16_t *y,unsigned int *scale_flag) //432 * 3
 {
 
   int i,j;
@@ -6656,9 +7042,9 @@ void dft1296(int16_t *x,int16_t *y,unsigned char scale_flag) //432 * 3
     x2128[i+864] = x128[j+2];
   }
 
-  dft432((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft432((int16_t *)(x2128+432),(int16_t *)(ytmp128+432),1);
-  dft432((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),1);
+  dft432((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft432((int16_t *)(x2128+432),(int16_t *)(ytmp128+432),scale_flag);
+  dft432((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+432,ytmp128+864,y128,y128+432,y128+864);
 
@@ -6673,7 +7059,7 @@ void dft1296(int16_t *x,int16_t *y,unsigned char scale_flag) //432 * 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<1296; i++) {
@@ -6687,7 +7073,7 @@ void dft1296(int16_t *x,int16_t *y,unsigned char scale_flag) //432 * 3
 static int16_t twa1440[479*2*4];
 static int16_t twb1440[479*2*4];
 
-void dft1440(int16_t *x,int16_t *y,unsigned char scale_flag)  // 480 x 3
+void dft1440(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 480 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6705,9 +7091,9 @@ void dft1440(int16_t *x,int16_t *y,unsigned char scale_flag)  // 480 x 3
     x2128[i+960] = x128[j+2];
   }
 
-  dft480((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft480((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),1);
-  dft480((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),1);
+  dft480((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft480((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),scale_flag);
+  dft480((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+480,ytmp128+960,y128,y128+480,y128+960);
 
@@ -6722,7 +7108,7 @@ void dft1440(int16_t *x,int16_t *y,unsigned char scale_flag)  // 480 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<1440; i++) {
@@ -6737,7 +7123,7 @@ static int16_t twb1500[2392]__attribute__((aligned(32)));
 static int16_t twc1500[2392]__attribute__((aligned(32)));
 static int16_t twd1500[2392]__attribute__((aligned(32)));
 
-void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft1500(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -6760,11 +7146,11 @@ void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+1200] = x128[j+4];
   }
 
-  dft300((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),1);
-  dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1);
-  dft300((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),1);
-  dft300((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),1);
+  dft300((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),scale_flag);
+  dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag);
+  dft300((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),scale_flag);
+  dft300((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),scale_flag);
 
   bfly5_tw1(ytmp128,ytmp128+300,ytmp128+600,ytmp128+900,ytmp128+1200,y128,y128+300,y128+600,y128+900,y128+1200);
 
@@ -6785,7 +7171,7 @@ void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag)
           twd128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[15]);
 
     for (i=0; i<1500; i++) {
@@ -6798,7 +7184,7 @@ void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag)
 static int16_t twa1620[539*2*4];
 static int16_t twb1620[539*2*4];
 
-void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag)  // 540 x 3
+void dft1620(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 540 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6816,9 +7202,9 @@ void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag)  // 540 x 3
     x2128[i+1080] = x128[j+2];
   }
 
-  dft540((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft540((int16_t *)(x2128+540),(int16_t *)(ytmp128+540),1);
-  dft540((int16_t *)(x2128+1080),(int16_t *)(ytmp128+1080),1);
+  dft540((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft540((int16_t *)(x2128+540),(int16_t *)(ytmp128+540),scale_flag);
+  dft540((int16_t *)(x2128+1080),(int16_t *)(ytmp128+1080),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+540,ytmp128+1080,y128,y128+540,y128+1080);
 
@@ -6833,7 +7219,7 @@ void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag)  // 540 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<1620; i++) {
@@ -6846,7 +7232,7 @@ void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag)  // 540 x 3
 static int16_t twa1728[575*2*4];
 static int16_t twb1728[575*2*4];
 
-void dft1728(int16_t *x,int16_t *y,unsigned char scale_flag)  // 576 x 3
+void dft1728(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 576 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6864,9 +7250,9 @@ void dft1728(int16_t *x,int16_t *y,unsigned char scale_flag)  // 576 x 3
     x2128[i+1152] = x128[j+2];
   }
 
-  dft576((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft576((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),1);
-  dft576((int16_t *)(x2128+1152),(int16_t *)(ytmp128+1152),1);
+  dft576((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft576((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),scale_flag);
+  dft576((int16_t *)(x2128+1152),(int16_t *)(ytmp128+1152),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+576,ytmp128+1152,y128,y128+576,y128+1152);
 
@@ -6881,7 +7267,7 @@ void dft1728(int16_t *x,int16_t *y,unsigned char scale_flag)  // 576 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<1728; i++) {
@@ -6894,7 +7280,7 @@ void dft1728(int16_t *x,int16_t *y,unsigned char scale_flag)  // 576 x 3
 static int16_t twa1800[599*2*4];
 static int16_t twb1800[599*2*4];
 
-void dft1800(int16_t *x,int16_t *y,unsigned char scale_flag)  // 600 x 3
+void dft1800(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 600 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6912,9 +7298,9 @@ void dft1800(int16_t *x,int16_t *y,unsigned char scale_flag)  // 600 x 3
     x2128[i+1200] = x128[j+2];
   }
 
-  dft600((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1);
-  dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),1);
+  dft600((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag);
+  dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+600,ytmp128+1200,y128,y128+600,y128+1200);
 
@@ -6929,7 +7315,7 @@ void dft1800(int16_t *x,int16_t *y,unsigned char scale_flag)  // 600 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<1800; i++) {
@@ -6943,7 +7329,7 @@ static int16_t twa1920[479*2*4];
 static int16_t twb1920[479*2*4];
 static int16_t twc1920[479*2*4];
 
-void dft1920(int16_t *x,int16_t *y,unsigned char scale_flag)  // 480 x 4
+void dft1920(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 480 x 4
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6963,10 +7349,10 @@ void dft1920(int16_t *x,int16_t *y,unsigned char scale_flag)  // 480 x 4
     x2128[i+1440] = x128[j+3];
   }
 
-  dft480((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft480((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),1);
-  dft480((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),1);
-  dft480((int16_t *)(x2128+1440),(int16_t *)(ytmp128+1440),1);
+  dft480((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft480((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),scale_flag);
+  dft480((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),scale_flag);
+  dft480((int16_t *)(x2128+1440),(int16_t *)(ytmp128+1440),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+480,ytmp128+960,ytmp128+1440,y128,y128+480,y128+960,y128+1440);
 
@@ -6984,7 +7370,7 @@ void dft1920(int16_t *x,int16_t *y,unsigned char scale_flag)  // 480 x 4
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[13]);
     for (i=0; i<1920; i++) {
       y128[i] = mulhi_int16(y128[i],norm128);
@@ -6996,7 +7382,7 @@ void dft1920(int16_t *x,int16_t *y,unsigned char scale_flag)  // 480 x 4
 static int16_t twa1944[647*2*4];
 static int16_t twb1944[647*2*4];
 
-void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag)  // 648 x 3
+void dft1944(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 648 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -7014,9 +7400,9 @@ void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag)  // 648 x 3
     x2128[i+1296] = x128[j+2];
   }
 
-  dft648((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft648((int16_t *)(x2128+648),(int16_t *)(ytmp128+648),1);
-  dft648((int16_t *)(x2128+1296),(int16_t *)(ytmp128+1296),1);
+  dft648((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft648((int16_t *)(x2128+648),(int16_t *)(ytmp128+648),scale_flag);
+  dft648((int16_t *)(x2128+1296),(int16_t *)(ytmp128+1296),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+648,ytmp128+1296,y128,y128+648,y128+1296);
 
@@ -7031,7 +7417,7 @@ void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag)  // 648 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<1944; i++) {
@@ -7044,7 +7430,7 @@ void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag)  // 648 x 3
 static int16_t twa2160[719*2*4];
 static int16_t twb2160[719*2*4];
 
-void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag)  // 720 x 3
+void dft2160(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 720 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -7062,9 +7448,9 @@ void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag)  // 720 x 3
     x2128[i+1440] = x128[j+2];
   }
 
-  dft720((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft720((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),1);
-  dft720((int16_t *)(x2128+1440),(int16_t *)(ytmp128+1440),1);
+  dft720((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft720((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),scale_flag);
+  dft720((int16_t *)(x2128+1440),(int16_t *)(ytmp128+1440),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+720,ytmp128+1440,y128,y128+720,y128+1440);
 
@@ -7079,7 +7465,7 @@ void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag)  // 720 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<2160; i++) {
@@ -7092,7 +7478,7 @@ void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag)  // 720 x 3
 static int16_t twa2304[767*2*4];
 static int16_t twb2304[767*2*4];
 
-void dft2304(int16_t *x,int16_t *y,unsigned char scale_flag)  // 768 x 3
+void dft2304(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 768 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -7110,9 +7496,9 @@ void dft2304(int16_t *x,int16_t *y,unsigned char scale_flag)  // 768 x 3
     x2128[i+1536] = x128[j+2];
   }
 
-  dft768((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft768((int16_t *)(x2128+768),(int16_t *)(ytmp128+768),1);
-  dft768((int16_t *)(x2128+1536),(int16_t *)(ytmp128+1536),1);
+  dft768((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft768((int16_t *)(x2128+768),(int16_t *)(ytmp128+768),scale_flag);
+  dft768((int16_t *)(x2128+1536),(int16_t *)(ytmp128+1536),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+768,ytmp128+1536,y128,y128+768,y128+1536);
 
@@ -7127,7 +7513,7 @@ void dft2304(int16_t *x,int16_t *y,unsigned char scale_flag)  // 768 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<2304; i++) {
@@ -7141,7 +7527,7 @@ static int16_t twa2400[599*2*4];
 static int16_t twb2400[599*2*4];
 static int16_t twc2400[599*2*4];
 
-void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag)  // 600 x 4
+void dft2400(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 600 x 4
 {
 
   int i,j;
@@ -7162,10 +7548,10 @@ void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag)  // 600 x 4
     x2128[i+1800] = x128[j+3];
   }
 
-  dft600((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1);
-  dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),1);
-  dft600((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),1);
+  dft600((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag);
+  dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),scale_flag);
+  dft600((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+600,ytmp128+1200,ytmp128+1800,y128,y128+600,y128+1200,y128+1800);
 
@@ -7183,7 +7569,7 @@ void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag)  // 600 x 4
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[13]);
     for (i=0; i<2400; i++) {
       y128[i] = mulhi_int16(y128[i],norm128);
@@ -7195,7 +7581,7 @@ void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag)  // 600 x 4
 static int16_t twa2592[863*2*4];
 static int16_t twb2592[863*2*4];
 
-void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag)  // 864 x 3
+void dft2592(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 864 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -7213,9 +7599,9 @@ void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag)  // 864 x 3
     x2128[i+1728] = x128[j+2];
   }
 
-  dft864((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft864((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),1);
-  dft864((int16_t *)(x2128+1728),(int16_t *)(ytmp128+1728),1);
+  dft864((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft864((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),scale_flag);
+  dft864((int16_t *)(x2128+1728),(int16_t *)(ytmp128+1728),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+864,ytmp128+1728,y128,y128+864,y128+1728);
 
@@ -7230,7 +7616,7 @@ void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag)  // 864 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<2592; i++) {
@@ -7243,7 +7629,7 @@ void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag)  // 864 x 3
 static int16_t twa2700[899*2*4];
 static int16_t twb2700[899*2*4];
 
-void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag)  // 900 x 3
+void dft2700(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 900 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -7261,9 +7647,9 @@ void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag)  // 900 x 3
     x2128[i+1800] = x128[j+2];
   }
 
-  dft900((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft900((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),1);
-  dft900((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),1);
+  dft900((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft900((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),scale_flag);
+  dft900((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+900,ytmp128+1800,y128,y128+900,y128+1800);
 
@@ -7278,7 +7664,7 @@ void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag)  // 900 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<2700; i++) {
@@ -7291,7 +7677,7 @@ void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag)  // 900 x 3
 static int16_t twa2880[959*2*4];
 static int16_t twb2880[959*2*4];
 
-void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag)  // 960 x 3
+void dft2880(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 960 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -7309,9 +7695,9 @@ void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag)  // 960 x 3
     x2128[i+1920] = x128[j+2];
   }
 
-  dft960((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft960((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),1);
-  dft960((int16_t *)(x2128+1920),(int16_t *)(ytmp128+1920),1);
+  dft960((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft960((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),scale_flag);
+  dft960((int16_t *)(x2128+1920),(int16_t *)(ytmp128+1920),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+960,ytmp128+1920,y128,y128+960,y128+1920);
 
@@ -7326,7 +7712,7 @@ void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag)  // 960 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<2880; i++) {
@@ -7339,7 +7725,7 @@ void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag)  // 960 x 3
 static int16_t twa2916[971*2*4];
 static int16_t twb2916[971*2*4];
 
-void dft2916(int16_t *x,int16_t *y,unsigned char scale_flag)  // 972 x 3
+void dft2916(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 972 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -7357,9 +7743,9 @@ void dft2916(int16_t *x,int16_t *y,unsigned char scale_flag)  // 972 x 3
     x2128[i+1944] = x128[j+2];
   }
 
-  dft972((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft972((int16_t *)(x2128+972),(int16_t *)(ytmp128+972),1);
-  dft972((int16_t *)(x2128+1944),(int16_t *)(ytmp128+1944),1);
+  dft972((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft972((int16_t *)(x2128+972),(int16_t *)(ytmp128+972),scale_flag);
+  dft972((int16_t *)(x2128+1944),(int16_t *)(ytmp128+1944),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+972,ytmp128+1944,y128,y128+972,y128+1944);
 
@@ -7374,7 +7760,7 @@ void dft2916(int16_t *x,int16_t *y,unsigned char scale_flag)  // 972 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<2916; i++) {
@@ -7389,7 +7775,7 @@ static int16_t twb3000[599*8]__attribute__((aligned(32)));
 static int16_t twc3000[599*8]__attribute__((aligned(32)));
 static int16_t twd3000[599*8]__attribute__((aligned(32)));
 
-void dft3000(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 * 5
+void dft3000(int16_t *x,int16_t *y,unsigned int *scale_flag) // 600 * 5
 {
 
   int i,j;
@@ -7412,11 +7798,11 @@ void dft3000(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 * 5
     x2128[i+2400] = x128[j+4];
   }
 
-  dft600((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1);
-  dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),1);
-  dft600((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),1);
-  dft600((int16_t *)(x2128+2400),(int16_t *)(ytmp128+2400),1);
+  dft600((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag);
+  dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),scale_flag);
+  dft600((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),scale_flag);
+  dft600((int16_t *)(x2128+2400),(int16_t *)(ytmp128+2400),scale_flag);
 
   bfly5_tw1(ytmp128,ytmp128+600,ytmp128+1200,ytmp128+1800,ytmp128+2400,y128,y128+600,y128+1200,y128+1800,y128+2400);
 
@@ -7437,7 +7823,7 @@ void dft3000(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 * 5
           twd128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[15]);
 
     for (i=0; i<3000; i++) {
@@ -7450,7 +7836,7 @@ void dft3000(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 * 5
 static int16_t twa3240[1079*2*4];
 static int16_t twb3240[1079*2*4];
 
-void dft3240(int16_t *x,int16_t *y,unsigned char scale_flag)  // 1080 x 3
+void dft3240(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 1080 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -7468,9 +7854,9 @@ void dft3240(int16_t *x,int16_t *y,unsigned char scale_flag)  // 1080 x 3
     x2128[i+2160] = x128[j+2];
   }
 
-  dft1080((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft1080((int16_t *)(x2128+1080),(int16_t *)(ytmp128+1080),1);
-  dft1080((int16_t *)(x2128+2160),(int16_t *)(ytmp128+2160),1);
+  dft1080((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft1080((int16_t *)(x2128+1080),(int16_t *)(ytmp128+1080),scale_flag);
+  dft1080((int16_t *)(x2128+2160),(int16_t *)(ytmp128+2160),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+1080,ytmp128+2160,y128,y128+1080,y128+2160);
 
@@ -7485,7 +7871,7 @@ void dft3240(int16_t *x,int16_t *y,unsigned char scale_flag)  // 1080 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     const simd_q15_t norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<3240; i++) {
@@ -7697,7 +8083,7 @@ int dfts_autoinit(void)
 
 #ifndef MR_MAIN
 
-void dft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsigned char scale_flag)
+void dft_implementation(uint8_t sizeidx, int16_t *input,int16_t *output,unsigned int *scale_flag)
 {
   AssertFatal((sizeidx >= 0 && sizeidx<DFT_SIZE_IDXTABLESIZE),"Invalid dft size index %i\n",sizeidx);
         int algn=0xF;
@@ -7716,7 +8102,7 @@ void dft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsign
           dft_ftab[sizeidx].func(input,output,scale_flag);
 };
 
-void idft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsigned char scale_flag)
+void idft_implementation(uint8_t sizeidx, int16_t *input,int16_t *output,unsigned int *scale_flag)
 {
   AssertFatal((sizeidx>=0 && sizeidx<DFT_SIZE_IDXTABLESIZE),"Invalid idft size index %i\n",sizeidx);
         int algn=0xF;
@@ -7739,9 +8125,26 @@ void idft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsig
 #ifdef MR_MAIN
 #include <string.h>
 #include <stdio.h>
+#include "../../../common/config/config_paramdesc.h"
+#include "../openair1/SIMULATION/TOOLS/sim.h"
+#include "../common/utils/utils.h"
+
+configmodule_interface_t *uniqCfg = NULL; // config handle defined (as NULL) inside the MR_MAIN standalone test build
+extern int bitrev4096[4096],bitrev2048[2048],bitrev1024[1024],bitrev512[512],bitrev256[256],bitrev128[128]; // presumably defined next to init_bitrev() - confirm
+void init_bitrev(void); // (void) makes this a real prototype: an empty () would leave the argument list unchecked
+void radix2(cd_t *x, int N); // radix2() and normalize() are called by compute_error() to build its reference result
+void normalize(cd_t *x,cd_t *y,int *bitrev, int N);
+
+void exit_function(const char *file, const char *function, const int line, const char *s, const int assert) {
+  exit(-1); // MR_MAIN-only stub: every argument is ignored and the process ends through exit(-1)
+}
+
+int config_get(configmodule_interface_t *cfg, paramdef_t *params, int numparams, char *prefix) {
+  return 0; // MR_MAIN-only stub: reads nothing, leaves params untouched and always returns 0
+}
 
-#define LOG_M write_output
-int write_output(const char *fname,const char *vname,void *data,int length,int dec,char format)
+//#define LOG_M write_output
+int write_file_matlab(const char *fname,const char *vname,void *data,int length,int dec,unsigned int format,int dummy)
 {
 
   FILE *fp=NULL;
@@ -7895,7 +8298,36 @@ int write_output(const char *fname,const char *vname,void *data,int length,int d
 
   return 0;
 }
+double compute_error(int16_t *x, int16_t *y, int N, int *bitrev, int idft) {
+  // Ratio (linear, not dB) of the mean input power of x to the mean squared difference between y and a
+  // floating-point reference obtained by radix2()+normalize() on x; x and y hold N interleaved re/im pairs.
+  cd_t ref_in[N], ref_out[N];
+  double sig_pow = 0, err_pow = 0;
+  for (int k = 0; k < N; k++) {
+    const double re = (double)x[2*k], im = (double)x[2*k+1];
+    ref_in[k].r = re;
+    ref_in[k].i = (idft == 1) ? -im : im; // idft==1: the reference is run on the conjugated input
+    sig_pow += pow(ref_in[k].r,2.0) + pow(ref_in[k].i,2.0);
+  }
+  sig_pow /= N;
+  radix2(ref_in,N);
+  normalize(ref_in,ref_out,bitrev,N);
+  const double im_sign = (idft == 0) ? 1.0 : -1.0; // idft!=0: y is compared against the conjugated reference
+  for (int k = 0; k < N; k++) {
+    const double d_re = ref_out[k].r - (double)y[2*k];
+    const double d_im = ref_out[k].i - im_sign*(double)y[2*k+1];
+    err_pow += pow(d_re,2.0) + pow(d_im,2.0);
+  }
+  return sig_pow/(err_pow/N);
+}
 
+void fill_gauss(c16_t *x,int N,double dBFS) { // writes x[0..N-1]: re then im, each gaussZiggurat(0,1.0)*SHRT_MAX*10^(dBFS/20)
+  const double lin = pow(10.0,dBFS*.05); // loop-invariant dBFS -> linear amplitude factor, computed once
+  for (int i=0; i < N; i++) { // clamp first: converting an out-of-range double to int16_t is undefined behaviour
+     x[i].r = (int16_t)fmax(SHRT_MIN,fmin(SHRT_MAX,gaussZiggurat(0,1.0)*SHRT_MAX*lin));
+     x[i].i = (int16_t)fmax(SHRT_MIN,fmin(SHRT_MAX,gaussZiggurat(0,1.0)*SHRT_MAX*lin));
+  }
+}
 
 int main(int argc, char**argv)
 {
@@ -7906,8 +8338,12 @@ int main(int argc, char**argv)
   int i;
   simd_q15_t *x128=(simd_q15_t*)x,*y128=(simd_q15_t*)y;
 
+  double sqnr;
+
   dfts_autoinit();
 
+  init_bitrev();
+
   set_taus_seed(0);
   cpu_meas_enabled = 1;
   /*
@@ -8081,7 +8517,6 @@ int main(int argc, char**argv)
     printf("\n");
  
   memset((void*)&x[0],0,2048*4);
-      
   for (i=0; i<2048; i+=4) {
      ((int16_t*)x)[i<<1] = 1024;
      ((int16_t*)x)[1+(i<<1)] = 0;
@@ -8092,18 +8527,6 @@ int main(int argc, char**argv)
      ((int16_t*)x)[6+(i<<1)] = 0;
      ((int16_t*)x)[7+(i<<1)] = -1024;
      }
-  /*
-  for (i=0; i<2048; i+=2) {
-     ((int16_t*)x)[i<<1] = 1024;
-     ((int16_t*)x)[1+(i<<1)] = 0;
-     ((int16_t*)x)[2+(i<<1)] = -1024;
-     ((int16_t*)x)[3+(i<<1)] = 0;
-     }
-       
-  for (i=0;i<2048*2;i++) {
-    ((int16_t*)x)[i] = i/2;//(int16_t)((taus()&0xffff))>>5;
-  }
-     */
   memset((void*)&x[0],0,64*sizeof(int32_t));
   for (i=2;i<36;i++) {
     if ((taus() & 1)==0)
@@ -8117,7 +8540,8 @@ int main(int argc, char**argv)
     else
       ((int16_t*)x)[i] = -364;
   }
-  idft64((int16_t *)x,(int16_t *)y,1);
+  uint32_t scale64 = 3;
+  idft64((int16_t *)x,(int16_t *)y,&scale64);
   
 
   printf("64-point\n");
@@ -8134,14 +8558,14 @@ int main(int argc, char**argv)
   
 
 
-  idft64((int16_t *)x,(int16_t *)y,1);
-  idft64((int16_t *)x,(int16_t *)y,1);
-  idft64((int16_t *)x,(int16_t *)y,1);
+  idft64((int16_t *)x,(int16_t *)y,&scale64);
+  idft64((int16_t *)x,(int16_t *)y,&scale64);
+  idft64((int16_t *)x,(int16_t *)y,&scale64);
   reset_meas(&ts);
 
   for (i=0; i<10000000; i++) {
     start_meas(&ts);
-    idft64((int16_t *)x,(int16_t *)y,1);
+    idft64((int16_t *)x,(int16_t *)y,&scale64);
     stop_meas(&ts);
 
   }
@@ -8186,12 +8610,16 @@ int main(int argc, char**argv)
   }
   reset_meas(&ts);
 
+  uint32_t scale128_tx[2] = {4,0};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft128((int16_t *)x,(int16_t *)y,1);
+    idft128((int16_t *)x,(int16_t *)y,scale128_tx);
     stop_meas(&ts);
   }
 
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,128,bitrev128,1);
+
+  printf("128 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr));
   printf("\n\n128-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
   LOG_M("y128.m","y128",y,128,1,1);
   LOG_M("x128.m","x128",x,128,1,1);
@@ -8227,10 +8655,11 @@ int main(int argc, char**argv)
       ((int16_t*)x)[i] = -364;
   }
   reset_meas(&ts);
+  uint32_t scale256_tx[3]={4,0};
 
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft256((int16_t *)x,(int16_t *)y,1);
+    idft256((int16_t *)x,(int16_t *)y,scale256_tx);
     stop_meas(&ts);
   }
 
@@ -8238,6 +8667,9 @@ int main(int argc, char**argv)
   LOG_M("y256.m","y256",y,256,1,1);
   LOG_M("x256.m","x256",x,256,1,1);
 
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,256,bitrev256,1);
+
+  printf("256 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr));
   memset((void*)&x[0],0,512*sizeof(int32_t));
   for (i=2;i<302;i++) {
     if ((taus() & 1)==0)
@@ -8253,15 +8685,21 @@ int main(int argc, char**argv)
   }
 
   reset_meas(&ts);
+  uint32_t scale512_tx[4]={4,1,0};
+
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft512((int16_t *)x,(int16_t *)y,1);
+    idft512((int16_t *)x,(int16_t *)y,scale512_tx);
     stop_meas(&ts);
   }
 
   printf("\n\n512-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
   LOG_M("y512.m","y512",y,512,1,1);
   LOG_M("x512.m","x512",x,512,1,1);
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,512,bitrev512,1);
+
+  printf("512 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr));
+  memset((void*)x,0,1024*sizeof(int32_t));
   /*
   printf("X: ");
   for (i=0;i<64;i++)
@@ -8288,9 +8726,10 @@ int main(int argc, char**argv)
   }
   reset_meas(&ts);
 
+  uint32_t scale1024_tx[4]={4,1,0};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft1024((int16_t *)x,(int16_t *)y,1);
+    idft1024((int16_t *)x,(int16_t *)y,scale1024_tx);
     stop_meas(&ts);
   }
 
@@ -8298,6 +8737,9 @@ int main(int argc, char**argv)
   LOG_M("y1024.m","y1024",y,1024,1,1);
   LOG_M("x1024.m","x1024",x,1024,1,1);
 
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,1024,bitrev1024,1);
+
+  printf("1024 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr));
 
   memset((void*)x,0,1536*sizeof(int32_t));
   for (i=2;i<1202;i++) {
@@ -8314,15 +8756,16 @@ int main(int argc, char**argv)
   }
   reset_meas(&ts);
 
+  uint32_t scale1536[4]={1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft1536((int16_t *)x,(int16_t *)y,1);
+    idft1536((int16_t *)x,(int16_t *)y,scale1536);
     stop_meas(&ts);
   }
 
   printf("\n\n1536-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
-  write_output("y1536.m","y1536",y,1536,1,1);
-  write_output("x1536.m","x1536",x,1536,1,1);
+  LOG_M("y1536.m","y1536",y,1536,1,1);
+  LOG_M("x1536.m","x1536",x,1536,1,1);
 
 
   memset((void*)x,0,2048*sizeof(int32_t));
@@ -8340,9 +8783,10 @@ int main(int argc, char**argv)
   }
   reset_meas(&ts);
 
+  uint32_t scale2048_tx[4]={3,2,1,0};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    dft2048((int16_t *)x,(int16_t *)y,1);
+    idft2048((int16_t *)x,(int16_t *)y,scale2048_tx);
     stop_meas(&ts);
   }
 
@@ -8350,6 +8794,9 @@ int main(int argc, char**argv)
   LOG_M("y2048.m","y2048",y,2048,1,1);
   LOG_M("x2048.m","x2048",x,2048,1,1);
 
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,2048,bitrev2048,1);
+
+  printf("2048 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr));
 // NR 80Mhz, 217 PRB, 3/4 sampling
   memset((void*)x, 0, 3072*sizeof(int32_t));
   for (i=2;i<2506;i++) {
@@ -8367,15 +8814,16 @@ int main(int argc, char**argv)
 
   reset_meas(&ts);
 
+  uint32_t scale3072[4]={1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft3072((int16_t *)x,(int16_t *)y,1);
+    idft3072((int16_t *)x,(int16_t *)y,scale3072);
     stop_meas(&ts);
   }
 
   printf("\n\n3072-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
-  write_output("y3072.m","y3072",y,3072,1,1);
-  write_output("x3072.m","x3072",x,3072,1,1);
+  LOG_M("y3072.m","y3072",y,3072,1,1);
+  LOG_M("x3072.m","x3072",x,3072,1,1);
 
 
   memset((void*)x,0,4096*sizeof(int32_t));
@@ -8393,9 +8841,10 @@ int main(int argc, char**argv)
   }
   reset_meas(&ts);
 
+  uint32_t scale4096_tx[4]={3,2,1,0};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft4096((int16_t *)x,(int16_t *)y,1);
+    idft4096((int16_t *)x,(int16_t *)y,scale4096_tx);
     stop_meas(&ts);
   }
 
@@ -8403,9 +8852,29 @@ int main(int argc, char**argv)
   LOG_M("y4096.m","y4096",y,4096,1,1);
   LOG_M("x4096.m","x4096",x,4096,1,1);
 
-  dft4096((int16_t *)y,(int16_t *)x2,1);
-  LOG_M("x4096_2.m","x4096_2",x2,4096,1,1);
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,4096,bitrev4096,1);
 
+  printf("4096 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr));
+
+  float sqrt2 = 0.70711; // NOTE: despite the name this is 1/sqrt(2), used as a multiplicative factor below
+  float sqrt170 = 0.076696; // likewise 1/sqrt(170)
+
+  for (i=0;i<2400;i++) { // fills int16 entries 0..2399 of x; NOTE(review): no memset precedes this fill in this hunk, so the other entries keep earlier contents - confirm intended
+    uint32_t n=taus();
+    ((int16_t*)x)[i]   = (short)((1-2*(n&1))*(8-(1-2*((n>>1)&1))*(4-(1-2*((n>>2)&1))*(2-(1-2*((n>>3)&1))))))*512*sqrt170*sqrt2; // 4 low bits of n select an odd level in -15..15; the (short) cast binds to that integer only, the scaled float product is converted on assignment
+  }
+  for (i=2*(4096-1200);i<8192;i++) { // fills int16 entries 5792..8191 of x with the same mapping
+    uint32_t n=taus();
+    ((int16_t*)x)[i]   = (short)((1-2*(n&1))*(8-(1-2*((n>>1)&1))*(4-(1-2*((n>>2)&1))*(2-(1-2*((n>>3)&1))))))*512*sqrt170*sqrt2;
+  }
+
+  uint32_t scale4096_tx256qam[4]={3,2,1,0}; // same values as scale4096_tx used for the QPSK run above
+  idft4096((int16_t *)x,(int16_t *)y,scale4096_tx256qam);
+  LOG_M("y4096_256qam.m","y4096_256qam",y,4096,1,1);
+  LOG_M("x4096_256qam.m","x4096_256qam",x,4096,1,1);
+
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,4096,bitrev4096,1); // last argument is 1 for the IDFT checks and 0 for the DFT checks in this file
+  printf("4096 point IDFT SQNR (256QAM) : %f dB\n",10*log10(sqnr));
 // NR 160Mhz, 434 PRB, 3/4 sampling
   memset((void*)x, 0, 6144*sizeof(int32_t));
   for (i=2;i<5010;i++) {
@@ -8423,15 +8892,16 @@ int main(int argc, char**argv)
 
   reset_meas(&ts);
 
+  uint32_t scale6144[5]={1,1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft6144((int16_t *)x,(int16_t *)y,1);
+    idft6144((int16_t *)x,(int16_t *)y,scale6144);
     stop_meas(&ts);
   }
 
   printf("\n\n6144-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
-  write_output("y6144.m","y6144",y,6144,1,1);
-  write_output("x6144.m","x6144",x,6144,1,1);
+  LOG_M("y6144.m","y6144",y,6144,1,1);
+  LOG_M("x6144.m","x6144",x,6144,1,1);
 
   memset((void*)x,0,8192*sizeof(int32_t));
   for (i=2;i<4802;i++) {
@@ -8447,9 +8917,10 @@ int main(int argc, char**argv)
       ((int16_t*)x)[i] = -364;
   }
   reset_meas(&ts);
+  uint32_t scale8192[5]={1,1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft8192((int16_t *)x,(int16_t *)y,1);
+    idft8192((int16_t *)x,(int16_t *)y,scale8192);
     stop_meas(&ts);
   }
 
@@ -8471,9 +8942,10 @@ int main(int argc, char**argv)
       ((int16_t*)x)[i] = -364;
   }
   reset_meas(&ts);
+  uint32_t scale16384[5]={1,1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    dft16384((int16_t *)x,(int16_t *)y,1);
+    dft16384((int16_t *)x,(int16_t *)y,scale16384);
     stop_meas(&ts);
   }
 
@@ -8481,82 +8953,6 @@ int main(int argc, char**argv)
   LOG_M("y16384.m","y16384",y,16384,1,1);
   LOG_M("x16384.m","x16384",x,16384,1,1);
 
-  memset((void*)x,0,1536*sizeof(int32_t));
-  for (i=2;i<1202;i++) {
-    if ((taus() & 1)==0)
-      ((int16_t*)x)[i] = 364;
-    else
-      ((int16_t*)x)[i] = -364;
-  }
-  for (i=2*(1536-600);i<3072;i++) {
-    if ((taus() & 1)==0)
-      ((int16_t*)x)[i] = 364;
-    else
-      ((int16_t*)x)[i] = -364;
-  }
-  reset_meas(&ts);
-  for (i=0; i<10000; i++) {
-    start_meas(&ts);
-    idft1536((int16_t *)x,(int16_t *)y,1);
-    stop_meas(&ts);
-  }
-
-  printf("\n\n1536-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
-  LOG_M("y1536.m","y1536",y,1536,1,1);
-  LOG_M("x1536.m","x1536",x,1536,1,1);
-
-  printf("\n\n1536-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
-  LOG_M("y8192.m","y8192",y,8192,1,1);
-  LOG_M("x8192.m","x8192",x,8192,1,1);
-
-  memset((void*)x,0,3072*sizeof(int32_t));
-  for (i=2;i<1202;i++) {
-    if ((taus() & 1)==0)
-      ((int16_t*)x)[i] = 364;
-    else
-      ((int16_t*)x)[i] = -364;
-  }
-  for (i=2*(3072-600);i<3072;i++) {
-    if ((taus() & 1)==0)
-      ((int16_t*)x)[i] = 364;
-    else
-      ((int16_t*)x)[i] = -364;
-  }
-  reset_meas(&ts);
-  for (i=0; i<10000; i++) {
-    start_meas(&ts);
-    idft3072((int16_t *)x,(int16_t *)y,1);
-    stop_meas(&ts);
-  }
-
-  printf("\n\n3072-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
-  LOG_M("y3072.m","y3072",y,3072,1,1);
-  LOG_M("x3072.m","x3072",x,3072,1,1);
-
-  memset((void*)x,0,6144*sizeof(int32_t));
-  for (i=2;i<4802;i++) {
-    if ((taus() & 1)==0)
-      ((int16_t*)x)[i] = 364;
-    else
-      ((int16_t*)x)[i] = -364;
-  }
-  for (i=2*(6144-2400);i<12288;i++) {
-    if ((taus() & 1)==0)
-      ((int16_t*)x)[i] = 364;
-    else
-      ((int16_t*)x)[i] = -364;
-  }
-  reset_meas(&ts);
-  for (i=0; i<10000; i++) {
-    start_meas(&ts);
-    idft6144((int16_t *)x,(int16_t *)y,1);
-    stop_meas(&ts);
-  }
-
-  printf("\n\n6144-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
-  LOG_M("y6144.m","y6144",y,6144,1,1);
-  LOG_M("x6144.m","x6144",x,6144,1,1);
-
   memset((void*)x,0,12288*sizeof(int32_t));
   for (i=2;i<9602;i++) {
     if ((taus() & 1)==0)
@@ -8571,9 +8967,10 @@ int main(int argc, char**argv)
       ((int16_t*)x)[i] = -364;
   }
   reset_meas(&ts);
+  uint32_t scale12288[5]={1,1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft12288((int16_t *)x,(int16_t *)y,1);
+    idft12288((int16_t *)x,(int16_t *)y,scale12288);
     stop_meas(&ts);
   }
 
@@ -8595,9 +8992,11 @@ int main(int argc, char**argv)
       ((int16_t*)x)[i] = -364;
   }
   reset_meas(&ts);
+
+  uint32_t scale18432[6]={1,1,1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft18432((int16_t *)x,(int16_t *)y,1);
+    idft18432((int16_t *)x,(int16_t *)y,scale18432);
     stop_meas(&ts);
   }
 
@@ -8619,9 +9018,11 @@ int main(int argc, char**argv)
       ((int16_t*)x)[i] = -364;
   }
   reset_meas(&ts);
+
+  uint32_t scale24576[6]={1,1,1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft24576((int16_t *)x,(int16_t *)y,1);
+    idft24576((int16_t *)x,(int16_t *)y,scale24576);
     stop_meas(&ts);
   }
 
@@ -8644,9 +9045,10 @@ int main(int argc, char**argv)
       ((int16_t*)x)[i] = -364;
   }
   reset_meas(&ts);
+  uint32_t scale36864[6] = {1,1,1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    dft36864((int16_t *)x,(int16_t *)y,1);
+    dft36864((int16_t *)x,(int16_t *)y,scale36864);
     stop_meas(&ts);
   }
 
@@ -8669,9 +9071,10 @@ int main(int argc, char**argv)
       ((int16_t*)x)[i] = -364;
   }
   reset_meas(&ts);
+  uint32_t scale49152[6]={1,1,1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft49152((int16_t *)x,(int16_t *)y,1);
+    idft49152((int16_t *)x,(int16_t *)y,scale49152);
     stop_meas(&ts);
   }
 
@@ -8679,6 +9082,268 @@ int main(int argc, char**argv)
   LOG_M("y49152.m","y49152",y,49152,1,1);
   LOG_M("x49152.m","x49152",x,49152,1,1);
 
+  memset((void*)x,0,128*sizeof(int32_t));
+  for (i=0;i<128;i++) {
+    ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/128));
+    ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/128));
+  } 
+  uint32_t scale128_rx[3]={1,3,0};
+  dft128((int16_t*)x,(int16_t*)y,scale128_rx);
+  LOG_M("x128_exp.m","x128_exp",x,128,1,1); 
+  LOG_M("y128_exp.m","y128_exp",y,128,1,1); 
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,128,bitrev128,0);
+
+  printf("128 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr));
+
+  // scaling optimization: for each input level dBFS (-80..-2, 2 dB steps) try every scale128_rx = (s0,s1,s2) with s0>=1 and s0+s1+s2==4, and print the one with the highest mean compute_error() value (reported as SQNR)
+  for (double dBFS = -80;dBFS < 0; dBFS+=2) {
+    int scale128_min[3]; // despite the "_min" suffix, holds the schedule that produced max_sqnr
+    double sqnr,max_sqnr=-99; // this sqnr shadows the sqnr variable used earlier in main
+    double input_lev=0; // running sum of r^2+i^2 over every input sample generated for this dBFS
+    int inputcnt=0; // number of samples accumulated into input_lev
+    for (scale128_rx[0]=1;scale128_rx[0]<=4;scale128_rx[0]++) 
+      for (scale128_rx[1]=0;scale128_rx[1]<=4-scale128_rx[0];scale128_rx[1]++) {
+         scale128_rx[2]=4-scale128_rx[0]-scale128_rx[1]; // last entry takes the remainder so the entries always sum to 4
+         sqnr=0;
+         int n;
+         for (n=0;n<16384/128;n++) { // 128 trials per candidate schedule
+           fill_gauss((c16_t*)x,128,dBFS); // presumably Gaussian samples at level dBFS - confirm against fill_gauss
+           for (i=0;i<128;i++) input_lev += pow((double)(((c16_t*)x)[i].r),2.0) + pow((double)(((c16_t*)x)[i].i),2.0);
+           dft128((int16_t*)x,(int16_t*)y,scale128_rx);
+           sqnr += compute_error((int16_t*)x,(int16_t*)y,128,bitrev128,0);
+         }
+         sqnr/=n; // n equals the trial count after the loop, so this is the mean over the trials
+         inputcnt+=(n*128);
+         if (sqnr>max_sqnr) {
+           max_sqnr = sqnr;
+           scale128_min[0]=scale128_rx[0]; scale128_min[1]=scale128_rx[1]; scale128_min[2]=scale128_rx[2];
+         }
+      }
+    printf("128-point dBFS %f(input lev %f) dB SQNR %f(%f):  (%d,%d,%d)\n",dBFS,10*log10(input_lev/inputcnt),10*log10(max_sqnr),max_sqnr,scale128_min[0],scale128_min[1],scale128_min[2]); // input lev is the mean sample energy over all candidates, in dB
+  } // note: scale128_rx is left at its loop-exit values (scale128_rx[0]==5), not at the best schedule
+
+  memset((void*)x,0,256*sizeof(int32_t));
+  for (i=0;i<256;i++) {
+    ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/256));
+    ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/256));
+  } 
+  uint32_t scale256_rx[3]={0,2,2};
+  dft256((int16_t*)x,(int16_t*)y,scale256_rx);
+  LOG_M("x256_exp.m","x256_exp",x,256,1,1); 
+  LOG_M("y256_exp.m","y256_exp",y,256,1,1); 
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,256,bitrev256,0);
+
+  printf("256 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr));
+
+  // scaling optimization: for each input level dBFS (-80..-2, 2 dB steps) try every scale256_rx = (s0,s1,s2) with s0+s1+s2==4, and print the one with the highest mean compute_error() value (reported as SQNR)
+  for (double dBFS = -80;dBFS < 0; dBFS+=2) {
+    int scale256_min[3]; // despite the "_min" suffix, holds the schedule that produced max_sqnr
+    double sqnr,max_sqnr=-99; // this sqnr shadows the sqnr variable used earlier in main
+    double input_lev=0; // running sum of r^2+i^2 over every input sample generated for this dBFS
+    int inputcnt=0; // number of samples accumulated into input_lev
+    for (scale256_rx[0]=0;scale256_rx[0]<=4;scale256_rx[0]++) 
+      for (scale256_rx[1]=0;scale256_rx[1]<=4-scale256_rx[0];scale256_rx[1]++) {
+         scale256_rx[2]=4-scale256_rx[0]-scale256_rx[1]; // last entry takes the remainder so the entries always sum to 4
+         sqnr=0;
+         int n;
+         for (n=0;n<16384/256;n++) { // 64 trials per candidate schedule
+           fill_gauss((c16_t*)x,256,dBFS); // presumably Gaussian samples at level dBFS - confirm against fill_gauss
+           for (i=0;i<256;i++) input_lev += pow((double)(((c16_t*)x)[i].r),2.0) + pow((double)(((c16_t*)x)[i].i),2.0);
+           dft256((int16_t*)x,(int16_t*)y,scale256_rx);
+           sqnr += compute_error((int16_t*)x,(int16_t*)y,256,bitrev256,0);
+         }
+         sqnr/=n; // n equals the trial count after the loop, so this is the mean over the trials
+         inputcnt+=(n*256);
+         if (sqnr>max_sqnr) {
+           max_sqnr = sqnr;
+           scale256_min[0]=scale256_rx[0]; scale256_min[1]=scale256_rx[1]; scale256_min[2]=scale256_rx[2];
+         }
+      }
+    printf("256-point dBFS %f(input lev %f) dB SQNR %f(%f):  (%d,%d,%d)\n",dBFS,10*log10(input_lev/inputcnt),10*log10(max_sqnr),max_sqnr,scale256_min[0],scale256_min[1],scale256_min[2]); // input lev is the mean sample energy over all candidates, in dB
+  } // note: scale256_rx is left at its loop-exit values (scale256_rx[0]==5), not at the best schedule
+
+  memset((void*)x,0,512*sizeof(int32_t));
+  for (i=0;i<512;i++) {
+    ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/512));
+    ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/512));
+  } 
+  uint32_t scale512_rx[4]={1,1,1,2};
+
+  dft512((int16_t*)x,(int16_t*)y,scale512_rx);
+  LOG_M("x512_exp.m","x512_exp",x,512,1,1); 
+  LOG_M("y512_exp.m","y512_exp",y,512,1,1); 
+
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,512,bitrev512,0);
+
+  printf("512 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr));
+  // scaling optimization: for each input level dBFS (-80..-2, 2 dB steps) try every scale512_rx = (s0,s1,s2,s3) with s0>=1 and a total of 5, and print the one with the highest mean compute_error() value (reported as SQNR)
+  for (double dBFS = -80;dBFS < 0; dBFS+=2) {
+    int scale512_min[4]; // despite the "_min" suffix, holds the schedule that produced max_sqnr
+    double sqnr,max_sqnr=-99; // this sqnr shadows the sqnr variable used earlier in main
+    double input_lev=0; // running sum of r^2+i^2 over every input sample generated for this dBFS
+    int inputcnt=0; // number of samples accumulated into input_lev
+    for (scale512_rx[0]=1;scale512_rx[0]<=5;scale512_rx[0]++) 
+      for (scale512_rx[1]=0;scale512_rx[1]<=5-scale512_rx[0];scale512_rx[1]++) 
+        for (scale512_rx[2]=0;scale512_rx[2]<=5-scale512_rx[0]-scale512_rx[1];scale512_rx[2]++) {
+         scale512_rx[3]=5-scale512_rx[0]-scale512_rx[1]-scale512_rx[2]; // last entry takes the remainder so the entries always sum to 5
+         sqnr=0;
+         int n;
+         for (n=0;n<16384/512;n++) { // 32 trials per candidate schedule
+           fill_gauss((c16_t*)x,512,dBFS); // presumably Gaussian samples at level dBFS - confirm against fill_gauss
+           for (i=0;i<512;i++) input_lev += pow((double)(((c16_t*)x)[i].r),2.0) + pow((double)(((c16_t*)x)[i].i),2.0);
+           dft512((int16_t*)x,(int16_t*)y,scale512_rx);
+           sqnr += compute_error((int16_t*)x,(int16_t*)y,512,bitrev512,0);
+         }
+         sqnr/=n; // n equals the trial count after the loop, so this is the mean over the trials
+         inputcnt+=(n*512);
+         if (sqnr>max_sqnr) {
+           max_sqnr = sqnr;
+           scale512_min[0]=scale512_rx[0]; scale512_min[1]=scale512_rx[1]; scale512_min[2]=scale512_rx[2]; scale512_min[3]=scale512_rx[3];
+         }
+      }
+    printf("512-point dBFS %f(input lev %f) dB SQNR %f(%f):  (%d,%d,%d,%d)\n",dBFS,10*log10(input_lev/inputcnt),10*log10(max_sqnr),max_sqnr,scale512_min[0],scale512_min[1],scale512_min[2],scale512_min[3]); // input lev is the mean sample energy over all candidates, in dB
+  } // note: scale512_rx is left at its loop-exit values (scale512_rx[0]==6), not at the best schedule
+
+  memset((void*)x,0,1024*sizeof(int32_t));
+  uint32_t scale1024_rx[4]={1,2,2,0};
+  for (i=0;i<1024;i++) {
+    ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/1024));
+    ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/1024));
+  } 
+  dft1024((int16_t*)x,(int16_t*)y,scale1024_rx);
+  LOG_M("x1024_exp.m","x1024_exp",x,1024,1,1); 
+  LOG_M("y1024_exp.m","y1024_exp",y,1024,1,1); 
+
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,1024,bitrev1024,0);
+
+  printf("1024 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr));
+  // scaling optimization: for each input level dBFS (-80..-2, 2 dB steps) try every scale1024_rx = (s0,s1,s2,s3) with a total of 5, and print the one with the highest mean compute_error() value (reported as SQNR)
+  for (double dBFS = -80;dBFS < 0; dBFS+=2) {
+    int scale1024_min[4]; // despite the "_min" suffix, holds the schedule that produced max_sqnr
+    double sqnr,max_sqnr=-99; // this sqnr shadows the sqnr variable used earlier in main
+    double input_lev=0; // running sum of r^2+i^2 over every input sample generated for this dBFS
+    int inputcnt=0; // number of samples accumulated into input_lev
+    for (scale1024_rx[0]=0;scale1024_rx[0]<=5;scale1024_rx[0]++) 
+      for (scale1024_rx[1]=0;scale1024_rx[1]<=5-scale1024_rx[0];scale1024_rx[1]++) 
+        for (scale1024_rx[2]=0;scale1024_rx[2]<=5-scale1024_rx[0]-scale1024_rx[1];scale1024_rx[2]++) {
+         scale1024_rx[3]=5-scale1024_rx[0]-scale1024_rx[1]-scale1024_rx[2]; // last entry takes the remainder so the entries always sum to 5
+         sqnr=0;
+         int n;
+         for (n=0;n<16384/1024;n++) { // 16 trials per candidate schedule
+           fill_gauss((c16_t*)x,1024,dBFS); // presumably Gaussian samples at level dBFS - confirm against fill_gauss
+           for (i=0;i<1024;i++) input_lev += pow((double)(((c16_t*)x)[i].r),2.0) + pow((double)(((c16_t*)x)[i].i),2.0);
+           dft1024((int16_t*)x,(int16_t*)y,scale1024_rx);
+           sqnr += compute_error((int16_t*)x,(int16_t*)y,1024,bitrev1024,0);
+         }
+         sqnr/=n; // n equals the trial count after the loop, so this is the mean over the trials
+         inputcnt+=(n*1024);
+         if (sqnr>max_sqnr) {
+           max_sqnr = sqnr;
+           scale1024_min[0]=scale1024_rx[0]; scale1024_min[1]=scale1024_rx[1]; scale1024_min[2]=scale1024_rx[2]; scale1024_min[3]=scale1024_rx[3];
+         }
+      }
+    printf("1024-point dBFS %f(input lev %f) dB SQNR %f(%f):  (%d,%d,%d,%d)\n",dBFS,10*log10(input_lev/inputcnt),10*log10(max_sqnr),max_sqnr,scale1024_min[0],scale1024_min[1],scale1024_min[2],scale1024_min[3]); // input lev is the mean sample energy over all candidates, in dB
+  } // note: scale1024_rx is left at its loop-exit values (scale1024_rx[0]==6), not at the best schedule
+  memset((void*)x,0,1536*sizeof(int32_t));
+  for (i=0;i<1536;i++) {
+    ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/1536));
+    ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/1536));
+  } 
+  dft1536((int16_t*)x,(int16_t*)y,scale1536);
+  LOG_M("x1536_exp.m","x1536_exp",x,1536,1,1); 
+  LOG_M("y1536_exp.m","y1536_exp",y,1536,1,1);
+ 
+  memset((void*)x,0,2048*sizeof(int32_t));
+  for (i=0;i<2048;i++) {
+    ((int16_t*)x)[i<<1] = (int16_t)(384 * cos(2*M_PI*3*i/2048));
+    ((int16_t*)x)[1+(i<<1)] = (int16_t)(384 * sin(2*M_PI*3*i/2048));
+  } 
+  uint32_t scale2048_rx[5]={1,1,1,1,2};
+
+  dft2048((int16_t*)x,(int16_t*)y,scale2048_rx);
+  LOG_M("x2048_exp.m","x2048_exp",x,2048,1,1); 
+  LOG_M("y2048_exp.m","y2048_exp",y,2048,1,1); 
+
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,2048,bitrev2048,0);
+
+  printf("2048 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr));
+  // scaling optimization: for each input level dBFS (-80..-2, 2 dB steps) try every scale2048_rx = (s0..s4) with s0>=1 and a total of 6, and print the one with the highest mean compute_error() value (reported as SQNR)
+  for (double dBFS = -80;dBFS < 0; dBFS+=2) {
+    int scale2048_min[5]; // despite the "_min" suffix, holds the schedule that produced max_sqnr
+    double sqnr,max_sqnr=-99; // this sqnr shadows the sqnr variable used earlier in main
+    double input_lev=0; // running sum of r^2+i^2 over every input sample generated for this dBFS
+    int inputcnt=0; // number of samples accumulated into input_lev
+    for (scale2048_rx[0]=1;scale2048_rx[0]<=6;scale2048_rx[0]++) 
+      for (scale2048_rx[1]=0;scale2048_rx[1]<=6-scale2048_rx[0];scale2048_rx[1]++) 
+        for (scale2048_rx[2]=0;scale2048_rx[2]<=6-scale2048_rx[0]-scale2048_rx[1];scale2048_rx[2]++) 
+          for (scale2048_rx[3]=0;scale2048_rx[3]<=6-scale2048_rx[0]-scale2048_rx[1]-scale2048_rx[2];scale2048_rx[3]++) {
+             scale2048_rx[4]=6-scale2048_rx[0]-scale2048_rx[1]-scale2048_rx[2]-scale2048_rx[3]; // last entry takes the remainder so the entries always sum to 6
+             sqnr=0;
+             int n;
+             for (n=0;n<16384/2048;n++) { // 8 trials per candidate schedule
+                fill_gauss((c16_t*)x,2048,dBFS); // presumably Gaussian samples at level dBFS - confirm against fill_gauss
+                for (i=0;i<2048;i++) input_lev += pow((double)(((c16_t*)x)[i].r),2.0) + pow((double)(((c16_t*)x)[i].i),2.0);
+                dft2048((int16_t*)x,(int16_t*)y,scale2048_rx);
+                sqnr += compute_error((int16_t*)x,(int16_t*)y,2048,bitrev2048,0);
+             }
+             sqnr/=n; // n equals the trial count after the loop, so this is the mean over the trials
+             inputcnt+=(n*2048);
+             if (sqnr>max_sqnr) {
+                max_sqnr = sqnr;
+                scale2048_min[0]=scale2048_rx[0]; scale2048_min[1]=scale2048_rx[1]; scale2048_min[2]=scale2048_rx[2]; scale2048_min[3]=scale2048_rx[3]; scale2048_min[4]=scale2048_rx[4];
+             }
+          }
+    printf("2048-point dBFS %f(input lev %f) dB SQNR %f(%f):  (%d,%d,%d,%d,%d)\n",dBFS,10*log10(input_lev/inputcnt),10*log10(max_sqnr),max_sqnr,scale2048_min[0],scale2048_min[1],scale2048_min[2],scale2048_min[3],scale2048_min[4]); // input lev is the mean sample energy over all candidates, in dB
+  } // note: scale2048_rx is left at its loop-exit values (scale2048_rx[0]==7), not at the best schedule
+  memset((void*)x,0,3072*sizeof(int32_t));
+  for (i=0;i<3072;i++) {
+    ((int16_t*)x)[i<<1] = (int16_t)(200 * cos(2*M_PI*3*i/3072));
+    ((int16_t*)x)[1+(i<<1)] = (int16_t)(200 * sin(2*M_PI*3*i/3072));
+  } 
+  dft3072((int16_t*)x,(int16_t*)y,scale3072);
+  LOG_M("x3072_exp.m","x3072_exp",x,3072,1,1); 
+  LOG_M("y3072_exp.m","y3072_exp",y,3072,1,1); 
+
+  memset((void*)x,0,4096*sizeof(int32_t));
+  for (i=0;i<4096;i++) {
+    ((int16_t*)x)[i<<1] = (int16_t)(384 * cos(2*M_PI*331*i/4096));
+    ((int16_t*)x)[1+(i<<1)] = (int16_t)(384 * sin(2*M_PI*331*i/4096));
+  } 
+  uint32_t scale4096_rx[5]={0,1,1,2,2};
+  dft4096((int16_t*)x,(int16_t*)y,scale4096_rx);
+  LOG_M("x4096_exp.m","x4096_exp",x,4096,1,1); 
+  LOG_M("y4096_exp.m","y4096_exp",y,4096,1,1); 
+
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,4096,bitrev4096,0);
+  printf("4096 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr));
+  // scaling optimization: for each input level dBFS (-80..-2, 2 dB steps) try every scale4096_rx = (s0..s4) with a total of 6, and print the one with the highest mean compute_error() value (reported as SQNR)
+  for (double dBFS = -80;dBFS < 0; dBFS+=2) {
+    int scale4096_min[5]; // despite the "_min" suffix, holds the schedule that produced max_sqnr
+    double sqnr,max_sqnr=-99; // this sqnr shadows the sqnr variable used earlier in main
+    double input_lev=0; // running sum of r^2+i^2 over every input sample generated for this dBFS
+    int inputcnt=0; // number of samples accumulated into input_lev
+    for (scale4096_rx[0]=0;scale4096_rx[0]<=6;scale4096_rx[0]++) 
+      for (scale4096_rx[1]=0;scale4096_rx[1]<=6-scale4096_rx[0];scale4096_rx[1]++) 
+        for (scale4096_rx[2]=0;scale4096_rx[2]<=6-scale4096_rx[0]-scale4096_rx[1];scale4096_rx[2]++) 
+          for (scale4096_rx[3]=0;scale4096_rx[3]<=6-scale4096_rx[0]-scale4096_rx[1]-scale4096_rx[2];scale4096_rx[3]++)  {
+             scale4096_rx[4]=6-scale4096_rx[0]-scale4096_rx[1]-scale4096_rx[2]-scale4096_rx[3]; // last entry takes the remainder so the entries always sum to 6
+             sqnr=0;
+             int n;
+             for (n=0;n<16384/4096;n++) { // 4 trials per candidate schedule
+                fill_gauss((c16_t*)x,4096,dBFS); // presumably Gaussian samples at level dBFS - confirm against fill_gauss
+                for (i=0;i<4096;i++) input_lev += pow((double)(((c16_t*)x)[i].r),2.0) + pow((double)(((c16_t*)x)[i].i),2.0);
+                dft4096((int16_t*)x,(int16_t*)y,scale4096_rx);
+                sqnr += compute_error((int16_t*)x,(int16_t*)y,4096,bitrev4096,0);
+             }
+             sqnr/=n; // n equals the trial count after the loop, so this is the mean over the trials
+             inputcnt+=(n*4096);
+             if (sqnr>max_sqnr) {
+                max_sqnr = sqnr;
+                scale4096_min[0]=scale4096_rx[0]; scale4096_min[1]=scale4096_rx[1]; scale4096_min[2]=scale4096_rx[2]; scale4096_min[3]=scale4096_rx[3];scale4096_min[4]=scale4096_rx[4];
+             }
+          }
+    printf("4096-point dBFS %f(input lev %f) dB SQNR %f(%f):  (%d,%d,%d,%d,%d)\n",dBFS,10*log10(input_lev/inputcnt),10*log10(max_sqnr),max_sqnr,scale4096_min[0],scale4096_min[1],scale4096_min[2],scale4096_min[3],scale4096_min[4]); // input lev is the mean sample energy over all candidates, in dB
+  } // note: scale4096_rx is left at its loop-exit values (scale4096_rx[0]==7), not at the best schedule
   return(0);
 }
 
diff --git a/openair1/PHY/TOOLS/oai_dfts_neon.c b/openair1/PHY/TOOLS/oai_dfts_neon.c
index ddf8a59bf524dd99c0a44cb08d535c86a115abeb..b746a3d628d9398a3278ae46653a2dd364aca3d4 100644
--- a/openair1/PHY/TOOLS/oai_dfts_neon.c
+++ b/openair1/PHY/TOOLS/oai_dfts_neon.c
@@ -925,7 +925,7 @@ const static int16_t tw64c[96] __attribute__((aligned(32))) = {
 #define set1_int16(a) vdupq_n_s16(a)
 #define mulhi_int16(a,b) vqdmulhq_s16(a,b);
 
-void dft64(int16_t *x,int16_t *y,unsigned char scale)
+void dft64(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd_q15_t xtmp[16],ytmp[16],*tw64a_128=(simd_q15_t *)tw64a,*tw64b_128=(simd_q15_t *)tw64b,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y;
@@ -1018,23 +1018,24 @@ void dft64(int16_t *x,int16_t *y,unsigned char scale)
 #endif
 
 
-  if (scale>0) {
-    y128[0]  = shiftright_int16(y128[0],3);
-    y128[1]  = shiftright_int16(y128[1],3);
-    y128[2]  = shiftright_int16(y128[2],3);
-    y128[3]  = shiftright_int16(y128[3],3);
-    y128[4]  = shiftright_int16(y128[4],3);
-    y128[5]  = shiftright_int16(y128[5],3);
-    y128[6]  = shiftright_int16(y128[6],3);
-    y128[7]  = shiftright_int16(y128[7],3);
-    y128[8]  = shiftright_int16(y128[8],3);
-    y128[9]  = shiftright_int16(y128[9],3);
-    y128[10] = shiftright_int16(y128[10],3);
-    y128[11] = shiftright_int16(y128[11],3);
-    y128[12] = shiftright_int16(y128[12],3);
-    y128[13] = shiftright_int16(y128[13],3);
-    y128[14] = shiftright_int16(y128[14],3);
-    y128[15] = shiftright_int16(y128[15],3);
+  if (scale && *scale>0) { // a NULL scale pointer or *scale==0 leaves the output unscaled
+    unsigned int scalec=*scale; // run-time shift count applied to all 16 output vectors; NOTE(review): if shiftright_int16 maps to an immediate-only intrinsic (e.g. vshrq_n_s16) a non-constant count will not compile - confirm the macro definition
+    y128[0]  = shiftright_int16(y128[0],scalec);
+    y128[1]  = shiftright_int16(y128[1],scalec);
+    y128[2]  = shiftright_int16(y128[2],scalec);
+    y128[3]  = shiftright_int16(y128[3],scalec);
+    y128[4]  = shiftright_int16(y128[4],scalec);
+    y128[5]  = shiftright_int16(y128[5],scalec);
+    y128[6]  = shiftright_int16(y128[6],scalec);
+    y128[7]  = shiftright_int16(y128[7],scalec);
+    y128[8]  = shiftright_int16(y128[8],scalec);
+    y128[9]  = shiftright_int16(y128[9],scalec);
+    y128[10] = shiftright_int16(y128[10],scalec);
+    y128[11] = shiftright_int16(y128[11],scalec);
+    y128[12] = shiftright_int16(y128[12],scalec);
+    y128[13] = shiftright_int16(y128[13],scalec);
+    y128[14] = shiftright_int16(y128[14],scalec);
+    y128[15] = shiftright_int16(y128[15],scalec);
   }
 
   
@@ -1042,7 +1043,7 @@ void dft64(int16_t *x,int16_t *y,unsigned char scale)
 
 }
 
-void idft64(int16_t *x,int16_t *y,unsigned char scale)
+void idft64(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd_q15_t xtmp[16],ytmp[16],*tw64a_128=(simd_q15_t *)tw64,*tw64b_128=(simd_q15_t *)tw64c,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y;
@@ -1111,24 +1112,24 @@ void idft64(int16_t *x,int16_t *y,unsigned char scale)
 #endif
 
 
-  if (scale>0) {
-
-    y128[0]  = shiftright_int16(y128[0],3);
-    y128[1]  = shiftright_int16(y128[1],3);
-    y128[2]  = shiftright_int16(y128[2],3);
-    y128[3]  = shiftright_int16(y128[3],3);
-    y128[4]  = shiftright_int16(y128[4],3);
-    y128[5]  = shiftright_int16(y128[5],3);
-    y128[6]  = shiftright_int16(y128[6],3);
-    y128[7]  = shiftright_int16(y128[7],3);
-    y128[8]  = shiftright_int16(y128[8],3);
-    y128[9]  = shiftright_int16(y128[9],3);
-    y128[10] = shiftright_int16(y128[10],3);
-    y128[11] = shiftright_int16(y128[11],3);
-    y128[12] = shiftright_int16(y128[12],3);
-    y128[13] = shiftright_int16(y128[13],3);
-    y128[14] = shiftright_int16(y128[14],3);
-    y128[15] = shiftright_int16(y128[15],3);
+  if (scale && *scale>0) { // a NULL scale pointer or *scale==0 leaves the output unscaled
+    unsigned int scalec=*scale; // run-time shift count applied to all 16 output vectors; NOTE(review): if shiftright_int16 maps to an immediate-only intrinsic (e.g. vshrq_n_s16) a non-constant count will not compile - confirm the macro definition
+    y128[0]  = shiftright_int16(y128[0],scalec);
+    y128[1]  = shiftright_int16(y128[1],scalec);
+    y128[2]  = shiftright_int16(y128[2],scalec);
+    y128[3]  = shiftright_int16(y128[3],scalec);
+    y128[4]  = shiftright_int16(y128[4],scalec);
+    y128[5]  = shiftright_int16(y128[5],scalec);
+    y128[6]  = shiftright_int16(y128[6],scalec);
+    y128[7]  = shiftright_int16(y128[7],scalec);
+    y128[8]  = shiftright_int16(y128[8],scalec);
+    y128[9]  = shiftright_int16(y128[9],scalec);
+    y128[10] = shiftright_int16(y128[10],scalec);
+    y128[11] = shiftright_int16(y128[11],scalec);
+    y128[12] = shiftright_int16(y128[12],scalec);
+    y128[13] = shiftright_int16(y128[13],scalec);
+    y128[14] = shiftright_int16(y128[14],scalec);
+    y128[15] = shiftright_int16(y128[15],scalec);
 
   }
 
@@ -1145,7 +1146,7 @@ int16_t tw128b[128] __attribute__((aligned(32))) = {0,32767,-1608,32727,-3212,32
 
 int16_t tw128c[128] __attribute__((aligned(32))) = {0,32767,1608,32727,3212,32609,4808,32412,6393,32137,7962,31785,9512,31356,11039,30851,12540,30272,14010,29621,15447,28897,16846,28105,18205,27244,19520,26318,20788,25329,22005,24278,23170,23169,24279,22004,25330,20787,26319,19519,27245,18204,28106,16845,28898,15446,29622,14009,30273,12539,30852,11038,31357,9511,31786,7961,32138,6392,32413,4807,32610,3211,32728,1607,32767,0,32728,-1608,32610,-3212,32413,-4808,32138,-6393,31786,-7962,31357,-9512,30852,-11039,30273,-12540,29622,-14010,28898,-15447,28106,-16846,27245,-18205,26319,-19520,25330,-20788,24279,-22005,23170,-23170,22005,-24279,20788,-25330,19520,-26319,18205,-27245,16846,-28106,15447,-28898,14010,-29622,12540,-30273,11039,-30852,9512,-31357,7962,-31786,6393,-32138,4808,-32413,3212,-32610,1608,-32728};
 
-void dft128(int16_t *x,int16_t *y,unsigned char scale)
+void dft128(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simdshort_q15_t xtmp[64],*x64 = (simdshort_q15_t *)x;
@@ -1188,8 +1189,10 @@ void dft128(int16_t *x,int16_t *y,unsigned char scale)
   transpose4_ooff(x64+60,xtmp+30,32);
   transpose4_ooff(x64+62,xtmp+31,32);
 
-  dft64((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  dft64((int16_t*)(xtmp+32),(int16_t*)(ytmp+16),1);
+  unsigned int *scale64=NULL; // scale pointer handed to the two 64-point sub-transforms; stays NULL when the caller passes no scale array
+  if (scale) scale64=scale+1; // the 64-point transforms read their shift from scale[1]
+  dft64((int16_t*)(xtmp),(int16_t*)ytmp,scale64);
+  dft64((int16_t*)(xtmp+32),(int16_t*)(ytmp+16),scale64);
 #ifndef MR_MAIN
   if (LOG_DUMPFLAG(DEBUG_DFT)) {
     LOG_M("dft128a.m","dfta",ytmp,64,1,1);
@@ -1207,41 +1210,76 @@ void dft128(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
-
-    y128[0] = mulhi_int16(y128[0],ONE_OVER_SQRT2_Q15_128);
-    y128[1] = mulhi_int16(y128[1],ONE_OVER_SQRT2_Q15_128);
-    y128[2] = mulhi_int16(y128[2],ONE_OVER_SQRT2_Q15_128);
-    y128[3] = mulhi_int16(y128[3],ONE_OVER_SQRT2_Q15_128);
-    y128[4] = mulhi_int16(y128[4],ONE_OVER_SQRT2_Q15_128);
-    y128[5] = mulhi_int16(y128[5],ONE_OVER_SQRT2_Q15_128);
-    y128[6] = mulhi_int16(y128[6],ONE_OVER_SQRT2_Q15_128);
-    y128[7] = mulhi_int16(y128[7],ONE_OVER_SQRT2_Q15_128);
-    y128[8] = mulhi_int16(y128[8],ONE_OVER_SQRT2_Q15_128);
-    y128[9] = mulhi_int16(y128[9],ONE_OVER_SQRT2_Q15_128);
-    y128[10] = mulhi_int16(y128[10],ONE_OVER_SQRT2_Q15_128);
-    y128[11] = mulhi_int16(y128[11],ONE_OVER_SQRT2_Q15_128);
-    y128[12] = mulhi_int16(y128[12],ONE_OVER_SQRT2_Q15_128);
-    y128[13] = mulhi_int16(y128[13],ONE_OVER_SQRT2_Q15_128);
-    y128[14] = mulhi_int16(y128[14],ONE_OVER_SQRT2_Q15_128);
-    y128[15] = mulhi_int16(y128[15],ONE_OVER_SQRT2_Q15_128);
-    y128[16] = mulhi_int16(y128[16],ONE_OVER_SQRT2_Q15_128);
-    y128[17] = mulhi_int16(y128[17],ONE_OVER_SQRT2_Q15_128);
-    y128[18] = mulhi_int16(y128[18],ONE_OVER_SQRT2_Q15_128);
-    y128[19] = mulhi_int16(y128[19],ONE_OVER_SQRT2_Q15_128);
-    y128[20] = mulhi_int16(y128[20],ONE_OVER_SQRT2_Q15_128);
-    y128[21] = mulhi_int16(y128[21],ONE_OVER_SQRT2_Q15_128);
-    y128[22] = mulhi_int16(y128[22],ONE_OVER_SQRT2_Q15_128);
-    y128[23] = mulhi_int16(y128[23],ONE_OVER_SQRT2_Q15_128);
-    y128[24] = mulhi_int16(y128[24],ONE_OVER_SQRT2_Q15_128);
-    y128[25] = mulhi_int16(y128[25],ONE_OVER_SQRT2_Q15_128);
-    y128[26] = mulhi_int16(y128[26],ONE_OVER_SQRT2_Q15_128);
-    y128[27] = mulhi_int16(y128[27],ONE_OVER_SQRT2_Q15_128);
-    y128[28] = mulhi_int16(y128[28],ONE_OVER_SQRT2_Q15_128);
-    y128[29] = mulhi_int16(y128[29],ONE_OVER_SQRT2_Q15_128);
-    y128[30] = mulhi_int16(y128[30],ONE_OVER_SQRT2_Q15_128);
-    y128[31] = mulhi_int16(y128[31],ONE_OVER_SQRT2_Q15_128);
-
+  if (scale && *scale>0) {
+    if (*scale>1) {
+      uint32_t scale2=*scale-1;
+      y128[0] = mulhi_int16(shiftright_int16(y128[0],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[1] = mulhi_int16(shiftright_int16(y128[1],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[2] = mulhi_int16(shiftright_int16(y128[2],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[3] = mulhi_int16(shiftright_int16(y128[3],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[4] = mulhi_int16(shiftright_int16(y128[4],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[5] = mulhi_int16(shiftright_int16(y128[5],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[6] = mulhi_int16(shiftright_int16(y128[6],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[7] = mulhi_int16(shiftright_int16(y128[7],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[8] = mulhi_int16(shiftright_int16(y128[8],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[9] = mulhi_int16(shiftright_int16(y128[9],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[10] = mulhi_int16(shiftright_int16(y128[10],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[11] = mulhi_int16(shiftright_int16(y128[11],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[12] = mulhi_int16(shiftright_int16(y128[12],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[13] = mulhi_int16(shiftright_int16(y128[13],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[14] = mulhi_int16(shiftright_int16(y128[14],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[15] = mulhi_int16(shiftright_int16(y128[15],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[16] = mulhi_int16(shiftright_int16(y128[16],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[17] = mulhi_int16(shiftright_int16(y128[17],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[18] = mulhi_int16(shiftright_int16(y128[18],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[19] = mulhi_int16(shiftright_int16(y128[19],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[20] = mulhi_int16(shiftright_int16(y128[20],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[21] = mulhi_int16(shiftright_int16(y128[21],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[22] = mulhi_int16(shiftright_int16(y128[22],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[23] = mulhi_int16(shiftright_int16(y128[23],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[24] = mulhi_int16(shiftright_int16(y128[24],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[25] = mulhi_int16(shiftright_int16(y128[25],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[26] = mulhi_int16(shiftright_int16(y128[26],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[27] = mulhi_int16(shiftright_int16(y128[27],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[28] = mulhi_int16(shiftright_int16(y128[28],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[29] = mulhi_int16(shiftright_int16(y128[29],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[30] = mulhi_int16(shiftright_int16(y128[30],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[31] = mulhi_int16(shiftright_int16(y128[31],scale2),ONE_OVER_SQRT2_Q15_128);
+    }
+    else {
+      y128[0] = mulhi_int16(y128[0],ONE_OVER_SQRT2_Q15_128);
+      y128[1] = mulhi_int16(y128[1],ONE_OVER_SQRT2_Q15_128);
+      y128[2] = mulhi_int16(y128[2],ONE_OVER_SQRT2_Q15_128);
+      y128[3] = mulhi_int16(y128[3],ONE_OVER_SQRT2_Q15_128);
+      y128[4] = mulhi_int16(y128[4],ONE_OVER_SQRT2_Q15_128);
+      y128[5] = mulhi_int16(y128[5],ONE_OVER_SQRT2_Q15_128);
+      y128[6] = mulhi_int16(y128[6],ONE_OVER_SQRT2_Q15_128);
+      y128[7] = mulhi_int16(y128[7],ONE_OVER_SQRT2_Q15_128);
+      y128[8] = mulhi_int16(y128[8],ONE_OVER_SQRT2_Q15_128);
+      y128[9] = mulhi_int16(y128[9],ONE_OVER_SQRT2_Q15_128);
+      y128[10] = mulhi_int16(y128[10],ONE_OVER_SQRT2_Q15_128);
+      y128[11] = mulhi_int16(y128[11],ONE_OVER_SQRT2_Q15_128);
+      y128[12] = mulhi_int16(y128[12],ONE_OVER_SQRT2_Q15_128);
+      y128[13] = mulhi_int16(y128[13],ONE_OVER_SQRT2_Q15_128);
+      y128[14] = mulhi_int16(y128[14],ONE_OVER_SQRT2_Q15_128);
+      y128[15] = mulhi_int16(y128[15],ONE_OVER_SQRT2_Q15_128);
+      y128[16] = mulhi_int16(y128[16],ONE_OVER_SQRT2_Q15_128);
+      y128[17] = mulhi_int16(y128[17],ONE_OVER_SQRT2_Q15_128);
+      y128[18] = mulhi_int16(y128[18],ONE_OVER_SQRT2_Q15_128);
+      y128[19] = mulhi_int16(y128[19],ONE_OVER_SQRT2_Q15_128);
+      y128[20] = mulhi_int16(y128[20],ONE_OVER_SQRT2_Q15_128);
+      y128[21] = mulhi_int16(y128[21],ONE_OVER_SQRT2_Q15_128);
+      y128[22] = mulhi_int16(y128[22],ONE_OVER_SQRT2_Q15_128);
+      y128[23] = mulhi_int16(y128[23],ONE_OVER_SQRT2_Q15_128);
+      y128[24] = mulhi_int16(y128[24],ONE_OVER_SQRT2_Q15_128);
+      y128[25] = mulhi_int16(y128[25],ONE_OVER_SQRT2_Q15_128);
+      y128[26] = mulhi_int16(y128[26],ONE_OVER_SQRT2_Q15_128);
+      y128[27] = mulhi_int16(y128[27],ONE_OVER_SQRT2_Q15_128);
+      y128[28] = mulhi_int16(y128[28],ONE_OVER_SQRT2_Q15_128);
+      y128[29] = mulhi_int16(y128[29],ONE_OVER_SQRT2_Q15_128);
+      y128[30] = mulhi_int16(y128[30],ONE_OVER_SQRT2_Q15_128);
+      y128[31] = mulhi_int16(y128[31],ONE_OVER_SQRT2_Q15_128);
+    }
 
   }
 #ifndef MR_MAIN
@@ -1252,7 +1290,7 @@ void dft128(int16_t *x,int16_t *y,unsigned char scale)
 #endif
 }
 
-void idft128(int16_t *x,int16_t *y,unsigned char scale)
+void idft128(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simdshort_q15_t xtmp[64],*x64 = (simdshort_q15_t *)x;
@@ -1295,8 +1333,10 @@ void idft128(int16_t *x,int16_t *y,unsigned char scale)
   transpose4_ooff(x64+60,xtmp+30,32);
   transpose4_ooff(x64+62,xtmp+31,32);
 
-  idft64((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  idft64((int16_t*)(xtmp+32),(int16_t*)(ytmp+16),1);
+  unsigned int *scale64=NULL;
+  if (scale) scale64=scale+1;
+  idft64((int16_t*)(xtmp),(int16_t*)ytmp,scale64);
+  idft64((int16_t*)(xtmp+32),(int16_t*)(ytmp+16),scale64);
 
 
   for (i=0; i<16; i++) {
@@ -1308,41 +1348,76 @@ void idft128(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
-
-    y128[0]  = mulhi_int16(y128[0],ONE_OVER_SQRT2_Q15_128);
-    y128[1]  = mulhi_int16(y128[1],ONE_OVER_SQRT2_Q15_128);
-    y128[2]  = mulhi_int16(y128[2],ONE_OVER_SQRT2_Q15_128);
-    y128[3]  = mulhi_int16(y128[3],ONE_OVER_SQRT2_Q15_128);
-    y128[4]  = mulhi_int16(y128[4],ONE_OVER_SQRT2_Q15_128);
-    y128[5]  = mulhi_int16(y128[5],ONE_OVER_SQRT2_Q15_128);
-    y128[6]  = mulhi_int16(y128[6],ONE_OVER_SQRT2_Q15_128);
-    y128[7]  = mulhi_int16(y128[7],ONE_OVER_SQRT2_Q15_128);
-    y128[8]  = mulhi_int16(y128[8],ONE_OVER_SQRT2_Q15_128);
-    y128[9]  = mulhi_int16(y128[9],ONE_OVER_SQRT2_Q15_128);
-    y128[10] = mulhi_int16(y128[10],ONE_OVER_SQRT2_Q15_128);
-    y128[11] = mulhi_int16(y128[11],ONE_OVER_SQRT2_Q15_128);
-    y128[12] = mulhi_int16(y128[12],ONE_OVER_SQRT2_Q15_128);
-    y128[13] = mulhi_int16(y128[13],ONE_OVER_SQRT2_Q15_128);
-    y128[14] = mulhi_int16(y128[14],ONE_OVER_SQRT2_Q15_128);
-    y128[15] = mulhi_int16(y128[15],ONE_OVER_SQRT2_Q15_128);
-    y128[16] = mulhi_int16(y128[16],ONE_OVER_SQRT2_Q15_128);
-    y128[17] = mulhi_int16(y128[17],ONE_OVER_SQRT2_Q15_128);
-    y128[18] = mulhi_int16(y128[18],ONE_OVER_SQRT2_Q15_128);
-    y128[19] = mulhi_int16(y128[19],ONE_OVER_SQRT2_Q15_128);
-    y128[20] = mulhi_int16(y128[20],ONE_OVER_SQRT2_Q15_128);
-    y128[21] = mulhi_int16(y128[21],ONE_OVER_SQRT2_Q15_128);
-    y128[22] = mulhi_int16(y128[22],ONE_OVER_SQRT2_Q15_128);
-    y128[23] = mulhi_int16(y128[23],ONE_OVER_SQRT2_Q15_128);
-    y128[24] = mulhi_int16(y128[24],ONE_OVER_SQRT2_Q15_128);
-    y128[25] = mulhi_int16(y128[25],ONE_OVER_SQRT2_Q15_128);
-    y128[26] = mulhi_int16(y128[26],ONE_OVER_SQRT2_Q15_128);
-    y128[27] = mulhi_int16(y128[27],ONE_OVER_SQRT2_Q15_128);
-    y128[28] = mulhi_int16(y128[28],ONE_OVER_SQRT2_Q15_128);
-    y128[29] = mulhi_int16(y128[29],ONE_OVER_SQRT2_Q15_128);
-    y128[30] = mulhi_int16(y128[30],ONE_OVER_SQRT2_Q15_128);
-    y128[31] = mulhi_int16(y128[31],ONE_OVER_SQRT2_Q15_128);
-
+  if (scale && *scale>0) {
+    if (*scale>1) {
+      uint32_t scale2=*scale-1;
+      y128[0] = mulhi_int16(shiftright_int16(y128[0],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[1] = mulhi_int16(shiftright_int16(y128[1],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[2] = mulhi_int16(shiftright_int16(y128[2],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[3] = mulhi_int16(shiftright_int16(y128[3],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[4] = mulhi_int16(shiftright_int16(y128[4],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[5] = mulhi_int16(shiftright_int16(y128[5],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[6] = mulhi_int16(shiftright_int16(y128[6],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[7] = mulhi_int16(shiftright_int16(y128[7],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[8] = mulhi_int16(shiftright_int16(y128[8],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[9] = mulhi_int16(shiftright_int16(y128[9],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[10] = mulhi_int16(shiftright_int16(y128[10],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[11] = mulhi_int16(shiftright_int16(y128[11],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[12] = mulhi_int16(shiftright_int16(y128[12],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[13] = mulhi_int16(shiftright_int16(y128[13],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[14] = mulhi_int16(shiftright_int16(y128[14],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[15] = mulhi_int16(shiftright_int16(y128[15],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[16] = mulhi_int16(shiftright_int16(y128[16],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[17] = mulhi_int16(shiftright_int16(y128[17],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[18] = mulhi_int16(shiftright_int16(y128[18],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[19] = mulhi_int16(shiftright_int16(y128[19],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[20] = mulhi_int16(shiftright_int16(y128[20],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[21] = mulhi_int16(shiftright_int16(y128[21],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[22] = mulhi_int16(shiftright_int16(y128[22],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[23] = mulhi_int16(shiftright_int16(y128[23],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[24] = mulhi_int16(shiftright_int16(y128[24],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[25] = mulhi_int16(shiftright_int16(y128[25],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[26] = mulhi_int16(shiftright_int16(y128[26],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[27] = mulhi_int16(shiftright_int16(y128[27],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[28] = mulhi_int16(shiftright_int16(y128[28],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[29] = mulhi_int16(shiftright_int16(y128[29],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[30] = mulhi_int16(shiftright_int16(y128[30],scale2),ONE_OVER_SQRT2_Q15_128);
+      y128[31] = mulhi_int16(shiftright_int16(y128[31],scale2),ONE_OVER_SQRT2_Q15_128);
+    }
+    else {
+      y128[0]  = mulhi_int16(y128[0],ONE_OVER_SQRT2_Q15_128);
+      y128[1]  = mulhi_int16(y128[1],ONE_OVER_SQRT2_Q15_128);
+      y128[2]  = mulhi_int16(y128[2],ONE_OVER_SQRT2_Q15_128);
+      y128[3]  = mulhi_int16(y128[3],ONE_OVER_SQRT2_Q15_128);
+      y128[4]  = mulhi_int16(y128[4],ONE_OVER_SQRT2_Q15_128);
+      y128[5]  = mulhi_int16(y128[5],ONE_OVER_SQRT2_Q15_128);
+      y128[6]  = mulhi_int16(y128[6],ONE_OVER_SQRT2_Q15_128);
+      y128[7]  = mulhi_int16(y128[7],ONE_OVER_SQRT2_Q15_128);
+      y128[8]  = mulhi_int16(y128[8],ONE_OVER_SQRT2_Q15_128);
+      y128[9]  = mulhi_int16(y128[9],ONE_OVER_SQRT2_Q15_128);
+      y128[10] = mulhi_int16(y128[10],ONE_OVER_SQRT2_Q15_128);
+      y128[11] = mulhi_int16(y128[11],ONE_OVER_SQRT2_Q15_128);
+      y128[12] = mulhi_int16(y128[12],ONE_OVER_SQRT2_Q15_128);
+      y128[13] = mulhi_int16(y128[13],ONE_OVER_SQRT2_Q15_128);
+      y128[14] = mulhi_int16(y128[14],ONE_OVER_SQRT2_Q15_128);
+      y128[15] = mulhi_int16(y128[15],ONE_OVER_SQRT2_Q15_128);
+      y128[16] = mulhi_int16(y128[16],ONE_OVER_SQRT2_Q15_128);
+      y128[17] = mulhi_int16(y128[17],ONE_OVER_SQRT2_Q15_128);
+      y128[18] = mulhi_int16(y128[18],ONE_OVER_SQRT2_Q15_128);
+      y128[19] = mulhi_int16(y128[19],ONE_OVER_SQRT2_Q15_128);
+      y128[20] = mulhi_int16(y128[20],ONE_OVER_SQRT2_Q15_128);
+      y128[21] = mulhi_int16(y128[21],ONE_OVER_SQRT2_Q15_128);
+      y128[22] = mulhi_int16(y128[22],ONE_OVER_SQRT2_Q15_128);
+      y128[23] = mulhi_int16(y128[23],ONE_OVER_SQRT2_Q15_128);
+      y128[24] = mulhi_int16(y128[24],ONE_OVER_SQRT2_Q15_128);
+      y128[25] = mulhi_int16(y128[25],ONE_OVER_SQRT2_Q15_128);
+      y128[26] = mulhi_int16(y128[26],ONE_OVER_SQRT2_Q15_128);
+      y128[27] = mulhi_int16(y128[27],ONE_OVER_SQRT2_Q15_128);
+      y128[28] = mulhi_int16(y128[28],ONE_OVER_SQRT2_Q15_128);
+      y128[29] = mulhi_int16(y128[29],ONE_OVER_SQRT2_Q15_128);
+      y128[30] = mulhi_int16(y128[30],ONE_OVER_SQRT2_Q15_128);
+      y128[31] = mulhi_int16(y128[31],ONE_OVER_SQRT2_Q15_128);
+    }
   }
 
 }
@@ -1361,7 +1436,7 @@ int16_t tw256b[384] __attribute__((aligned(32))) = {0,32767,-805,32757,-1608,327
                                                     0,32767,-1608,32727,-3212,32609,-4808,32412,-6393,32137,-7962,31785,-9512,31356,-11039,30851,-12540,30272,-14010,29621,-15447,28897,-16846,28105,-18205,27244,-19520,26318,-20788,25329,-22005,24278,-23170,23169,-24279,22004,-25330,20787,-26319,19519,-27245,18204,-28106,16845,-28898,15446,-29622,14009,-30273,12539,-30852,11038,-31357,9511,-31786,7961,-32138,6392,-32413,4807,-32610,3211,-32728,1607,-32767,0,-32728,-1608,-32610,-3212,-32413,-4808,-32138,-6393,-31786,-7962,-31357,-9512,-30852,-11039,-30273,-12540,-29622,-14010,-28898,-15447,-28106,-16846,-27245,-18205,-26319,-19520,-25330,-20788,-24279,-22005,-23170,-23170,-22005,-24279,-20788,-25330,-19520,-26319,-18205,-27245,-16846,-28106,-15447,-28898,-14010,-29622,-12540,-30273,-11039,-30852,-9512,-31357,-7962,-31786,-6393,-32138,-4808,-32413,-3212,-32610,-1608,-32728,
                                                     0,32767,-2411,32678,-4808,32412,-7180,31970,-9512,31356,-11793,30571,-14010,29621,-16151,28510,-18205,27244,-20160,25831,-22005,24278,-23732,22594,-25330,20787,-26790,18867,-28106,16845,-29269,14732,-30273,12539,-31114,10278,-31786,7961,-32285,5601,-32610,3211,-32758,804,-32728,-1608,-32521,-4012,-32138,-6393,-31581,-8740,-30852,-11039,-29956,-13279,-28898,-15447,-27684,-17531,-26319,-19520,-24812,-21403,-23170,-23170,-21403,-24812,-19520,-26319,-17531,-27684,-15447,-28898,-13279,-29956,-11039,-30852,-8740,-31581,-6393,-32138,-4012,-32521,-1608,-32728,804,-32758,3211,-32610,5601,-32285,7961,-31786,10278,-31114,12539,-30273,14732,-29269,16845,-28106,18867,-26790,20787,-25330,22594,-23732,24278,-22005,25831,-20160,27244,-18205,28510,-16151,29621,-14010,30571,-11793,31356,-9512,31970,-7180,32412,-4808,32678,-2411
                                                    };
-void dft256(int16_t *x,int16_t *y,unsigned char scale)
+void dft256(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd_q15_t xtmp[64],ytmp[64],*tw256a_128p=(simd_q15_t *)tw256a,*tw256b_128p=(simd_q15_t *)tw256b,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y;
@@ -1403,10 +1478,12 @@ void dft256(int16_t *x,int16_t *y,unsigned char scale)
   start_meas(&ts_d);
 #endif
 
-  dft64((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  dft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),1);
-  dft64((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),1);
-  dft64((int16_t*)(xtmp+48),(int16_t*)(ytmp+48),1);
+  unsigned int *scale64=NULL;
+  if (scale) scale64=scale+1;
+  dft64((int16_t*)(xtmp),(int16_t*)(ytmp),scale64);
+  dft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),scale64);
+  dft64((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),scale64);
+  dft64((int16_t*)(xtmp+48),(int16_t*)(ytmp+48),scale64);
 
 #ifdef D256STATS
   stop_meas(&ts_d);
@@ -1445,25 +1522,26 @@ void dft256(int16_t *x,int16_t *y,unsigned char scale)
 #endif
 #endif
 
-  if (scale>0) {
+  if (scale && *scale>0) {
+    unsigned int scalec=*scale;
 
     for (i=0; i<4; i++) {
-      y128[0]  = shiftright_int16(y128[0],1);
-      y128[1]  = shiftright_int16(y128[1],1);
-      y128[2]  = shiftright_int16(y128[2],1);
-      y128[3]  = shiftright_int16(y128[3],1);
-      y128[4]  = shiftright_int16(y128[4],1);
-      y128[5]  = shiftright_int16(y128[5],1);
-      y128[6]  = shiftright_int16(y128[6],1);
-      y128[7]  = shiftright_int16(y128[7],1);
-      y128[8]  = shiftright_int16(y128[8],1);
-      y128[9]  = shiftright_int16(y128[9],1);
-      y128[10] = shiftright_int16(y128[10],1);
-      y128[11] = shiftright_int16(y128[11],1);
-      y128[12] = shiftright_int16(y128[12],1);
-      y128[13] = shiftright_int16(y128[13],1);
-      y128[14] = shiftright_int16(y128[14],1);
-      y128[15] = shiftright_int16(y128[15],1);
+      y128[0]  = shiftright_int16(y128[0],scalec);
+      y128[1]  = shiftright_int16(y128[1],scalec);
+      y128[2]  = shiftright_int16(y128[2],scalec);
+      y128[3]  = shiftright_int16(y128[3],scalec);
+      y128[4]  = shiftright_int16(y128[4],scalec);
+      y128[5]  = shiftright_int16(y128[5],scalec);
+      y128[6]  = shiftright_int16(y128[6],scalec);
+      y128[7]  = shiftright_int16(y128[7],scalec);
+      y128[8]  = shiftright_int16(y128[8],scalec);
+      y128[9]  = shiftright_int16(y128[9],scalec);
+      y128[10] = shiftright_int16(y128[10],scalec);
+      y128[11] = shiftright_int16(y128[11],scalec);
+      y128[12] = shiftright_int16(y128[12],scalec);
+      y128[13] = shiftright_int16(y128[13],scalec);
+      y128[14] = shiftright_int16(y128[14],scalec);
+      y128[15] = shiftright_int16(y128[15],scalec);
 
       y128+=16;
     }
@@ -1473,7 +1551,7 @@ void dft256(int16_t *x,int16_t *y,unsigned char scale)
 
 
 
-void idft256(int16_t *x,int16_t *y,unsigned char scale)
+void idft256(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd_q15_t xtmp[64],ytmp[64],*tw256_128p=(simd_q15_t *)tw256,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y;
@@ -1485,10 +1563,12 @@ void idft256(int16_t *x,int16_t *y,unsigned char scale)
   }
 
 
-  idft64((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  idft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),1);
-  idft64((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),1);
-  idft64((int16_t*)(xtmp+48),(int16_t*)(ytmp+48),1);
+  unsigned int *scale64=NULL;
+  if (scale) scale64=scale+1;
+  idft64((int16_t*)(xtmp),(int16_t*)(ytmp),scale64);
+  idft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),scale64);
+  idft64((int16_t*)(xtmp+32),(int16_t*)(ytmp+32),scale64);
+  idft64((int16_t*)(xtmp+48),(int16_t*)(ytmp+48),scale64);
 
   for (i=0; i<16; i++) {
     ibfly4(ytmpp,ytmpp+16,ytmpp+32,ytmpp+48,
@@ -1499,25 +1579,26 @@ void idft256(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) {
+    unsigned int scalec=*scale;
 
     for (i=0; i<4; i++) {
-      y128[0]  = shiftright_int16(y128[0],1);
-      y128[1]  = shiftright_int16(y128[1],1);
-      y128[2]  = shiftright_int16(y128[2],1);
-      y128[3]  = shiftright_int16(y128[3],1);
-      y128[4]  = shiftright_int16(y128[4],1);
-      y128[5]  = shiftright_int16(y128[5],1);
-      y128[6]  = shiftright_int16(y128[6],1);
-      y128[7]  = shiftright_int16(y128[7],1);
-      y128[8]  = shiftright_int16(y128[8],1);
-      y128[9]  = shiftright_int16(y128[9],1);
-      y128[10] = shiftright_int16(y128[10],1);
-      y128[11] = shiftright_int16(y128[11],1);
-      y128[12] = shiftright_int16(y128[12],1);
-      y128[13] = shiftright_int16(y128[13],1);
-      y128[14] = shiftright_int16(y128[14],1);
-      y128[15] = shiftright_int16(y128[15],1);
+      y128[0]  = shiftright_int16(y128[0],scalec);
+      y128[1]  = shiftright_int16(y128[1],scalec);
+      y128[2]  = shiftright_int16(y128[2],scalec);
+      y128[3]  = shiftright_int16(y128[3],scalec);
+      y128[4]  = shiftright_int16(y128[4],scalec);
+      y128[5]  = shiftright_int16(y128[5],scalec);
+      y128[6]  = shiftright_int16(y128[6],scalec);
+      y128[7]  = shiftright_int16(y128[7],scalec);
+      y128[8]  = shiftright_int16(y128[8],scalec);
+      y128[9]  = shiftright_int16(y128[9],scalec);
+      y128[10] = shiftright_int16(y128[10],scalec);
+      y128[11] = shiftright_int16(y128[11],scalec);
+      y128[12] = shiftright_int16(y128[12],scalec);
+      y128[13] = shiftright_int16(y128[13],scalec);
+      y128[14] = shiftright_int16(y128[14],scalec);
+      y128[15] = shiftright_int16(y128[15],scalec);
 
       y128+=16;
     }
@@ -1542,7 +1623,7 @@ int16_t tw512c[512] __attribute__((aligned(32))) = {
   0,32767,403,32764,805,32757,1207,32744,1608,32727,2010,32705,2411,32678,2812,32646,3212,32609,3612,32567,4012,32520,4410,32468,4808,32412,5206,32350,5602,32284,5998,32213,6393,32137,6787,32056,7180,31970,7572,31880,7962,31785,8352,31684,8740,31580,9127,31470,9512,31356,9896,31236,10279,31113,10660,30984,11039,30851,11417,30713,11793,30571,12167,30424,12540,30272,12910,30116,13279,29955,13646,29790,14010,29621,14373,29446,14733,29268,15091,29085,15447,28897,15800,28706,16151,28510,16500,28309,16846,28105,17190,27896,17531,27683,17869,27466,18205,27244,18538,27019,18868,26789,19195,26556,19520,26318,19841,26077,20160,25831,20475,25582,20788,25329,21097,25072,21403,24811,21706,24546,22005,24278,22302,24006,22595,23731,22884,23452,23170,23169,23453,22883,23732,22594,24007,22301,24279,22004,24547,21705,24812,21402,25073,21096,25330,20787,25583,20474,25832,20159,26078,19840,26319,19519,26557,19194,26790,18867,27020,18537,27245,18204,27467,17868,27684,17530,27897,17189,28106,16845,28310,16499,28511,16150,28707,15799,28898,15446,29086,15090,29269,14732,29447,14372,29622,14009,29791,13645,29956,13278,30117,12909,30273,12539,30425,12166,30572,11792,30714,11416,30852,11038,30985,10659,31114,10278,31237,9895,31357,9511,31471,9126,31581,8739,31685,8351,31786,7961,31881,7571,31971,7179,32057,6786,32138,6392,32214,5997,32285,5601,32351,5205,32413,4807,32469,4409,32521,4011,32568,3611,32610,3211,32647,2811,32679,2410,32706,2009,32728,1607,32745,1206,32758,804,32765,402,32767,0,32765,-403,32758,-805,32745,-1207,32728,-1608,32706,-2010,32679,-2411,32647,-2812,32610,-3212,32568,-3612,32521,-4012,32469,-4410,32413,-4808,32351,-5206,32285,-5602,32214,-5998,32138,-6393,32057,-6787,31971,-7180,31881,-7572,31786,-7962,31685,-8352,31581,-8740,31471,-9127,31357,-9512,31237,-9896,31114,-10279,30985,-10660,30852,-11039,30714,-11417,30572,-11793,30425,-12167,30273,-12540,30117,-12910,29956,-13279,29791,-13646,29622,-14010,29447,-14373,29269,-14733,29086,-15091,28898,-15447,28707,-15800,28511
,-16151,28310,-16500,28106,-16846,27897,-17190,27684,-17531,27467,-17869,27245,-18205,27020,-18538,26790,-18868,26557,-19195,26319,-19520,26078,-19841,25832,-20160,25583,-20475,25330,-20788,25073,-21097,24812,-21403,24547,-21706,24279,-22005,24007,-22302,23732,-22595,23453,-22884,23170,-23170,22884,-23453,22595,-23732,22302,-24007,22005,-24279,21706,-24547,21403,-24812,21097,-25073,20788,-25330,20475,-25583,20160,-25832,19841,-26078,19520,-26319,19195,-26557,18868,-26790,18538,-27020,18205,-27245,17869,-27467,17531,-27684,17190,-27897,16846,-28106,16500,-28310,16151,-28511,15800,-28707,15447,-28898,15091,-29086,14733,-29269,14373,-29447,14010,-29622,13646,-29791,13279,-29956,12910,-30117,12540,-30273,12167,-30425,11793,-30572,11417,-30714,11039,-30852,10660,-30985,10279,-31114,9896,-31237,9512,-31357,9127,-31471,8740,-31581,8352,-31685,7962,-31786,7572,-31881,7180,-31971,6787,-32057,6393,-32138,5998,-32214,5602,-32285,5206,-32351,4808,-32413,4410,-32469,4012,-32521,3612,-32568,3212,-32610,2812,-32647,2411,-32679,2010,-32706,1608,-32728,1207,-32745,805,-32758,403,-32765
 };
 
-void dft512(int16_t *x,int16_t *y,unsigned char scale)
+void dft512(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simdshort_q15_t xtmp[256],*xtmpp,*x64 = (simdshort_q15_t *)x;
@@ -1590,8 +1671,10 @@ void dft512(int16_t *x,int16_t *y,unsigned char scale)
     xtmpp+=32;
   }
 
-  dft256((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  dft256((int16_t*)(xtmp+128),(int16_t*)(ytmp+64),1);
+  unsigned int *scale256=NULL;
+  if (scale) scale256=scale+1;
+  dft256((int16_t*)(xtmp),(int16_t*)ytmp,scale256);
+  dft256((int16_t*)(xtmp+128),(int16_t*)(ytmp+64),scale256);
 
 
   for (i=0; i<64; i+=8) {
@@ -1633,32 +1716,56 @@ void dft512(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp+=8;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) {
     y128p = y128;
 
-    for (i=0; i<8; i++) {
-      y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128);
-      y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128);
-      y128p[2]  = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128);
-      y128p[3]  = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128);
-      y128p[4]  = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128);
-      y128p[5]  = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128);
-      y128p[6]  = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128);
-      y128p[7]  = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128);
-      y128p[8]  = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128);
-      y128p[9]  = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128);
-      y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128);
-      y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128);
-      y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128);
-      y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128);
-      y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128);
-      y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128);
-      y128p+=16;
+    if (*scale>1) {
+      uint32_t scale2=*scale-1;
+      for (i=0; i<8; i++) {
+        y128p[0]  = mulhi_int16(shiftright_int16(y128p[0],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[1]  = mulhi_int16(shiftright_int16(y128p[1],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[2]  = mulhi_int16(shiftright_int16(y128p[2],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[3]  = mulhi_int16(shiftright_int16(y128p[3],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[4]  = mulhi_int16(shiftright_int16(y128p[4],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[5]  = mulhi_int16(shiftright_int16(y128p[5],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[6]  = mulhi_int16(shiftright_int16(y128p[6],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[7]  = mulhi_int16(shiftright_int16(y128p[7],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[8]  = mulhi_int16(shiftright_int16(y128p[8],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[9]  = mulhi_int16(shiftright_int16(y128p[9],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[10]  = mulhi_int16(shiftright_int16(y128p[10],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[11]  = mulhi_int16(shiftright_int16(y128p[11],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[12]  = mulhi_int16(shiftright_int16(y128p[12],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[13]  = mulhi_int16(shiftright_int16(y128p[13],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[14]  = mulhi_int16(shiftright_int16(y128p[14],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[15]  = mulhi_int16(shiftright_int16(y128p[15],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p+=16;
+      }
+    }
+    else {
+      for (i=0; i<8; i++) {
+        y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128);
+        y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128);
+        y128p[2]  = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128);
+        y128p[3]  = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128);
+        y128p[4]  = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128);
+        y128p[5]  = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128);
+        y128p[6]  = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128);
+        y128p[7]  = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128);
+        y128p[8]  = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128);
+        y128p[9]  = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128);
+        y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128);
+        y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128);
+        y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128);
+        y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128);
+        y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128);
+        y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128);
+        y128p+=16;
+      }
     }
   }
 }
 
-void idft512(int16_t *x,int16_t *y,unsigned char scale)
+void idft512(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simdshort_q15_t xtmp[256],*xtmpp,*x64 = (simdshort_q15_t *)x;
@@ -1706,8 +1813,10 @@ void idft512(int16_t *x,int16_t *y,unsigned char scale)
     xtmpp+=32;
   }
 
-  idft256((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  idft256((int16_t*)(xtmp+128),(int16_t*)(ytmp+64),1);
+  unsigned int *scale256=NULL;
+  if (scale) scale256=scale+1;
+  idft256((int16_t*)(xtmp),(int16_t*)ytmp,scale256);
+  idft256((int16_t*)(xtmp+128),(int16_t*)(ytmp+64),scale256);
 
 
   for (i=0; i<64; i++) {
@@ -1719,34 +1828,58 @@ void idft512(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) {
     y128p = y128;
 
-    for (i=0; i<8; i++) {
-      y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128);
-      y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128);
-      y128p[2]  = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128);
-      y128p[3]  = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128);
-      y128p[4]  = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128);
-      y128p[5]  = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128);
-      y128p[6]  = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128);
-      y128p[7]  = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128);
-      y128p[8]  = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128);
-      y128p[9]  = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128);
-      y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128);
-      y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128);
-      y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128);
-      y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128);
-      y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128);
-      y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128);
-      y128p+=16;
+    if (*scale>1) {
+      uint32_t scale2=*scale-1;
+      for (i=0; i<8; i++) {
+        y128p[0]  = mulhi_int16(shiftright_int16(y128p[0],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[1]  = mulhi_int16(shiftright_int16(y128p[1],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[2]  = mulhi_int16(shiftright_int16(y128p[2],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[3]  = mulhi_int16(shiftright_int16(y128p[3],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[4]  = mulhi_int16(shiftright_int16(y128p[4],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[5]  = mulhi_int16(shiftright_int16(y128p[5],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[6]  = mulhi_int16(shiftright_int16(y128p[6],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[7]  = mulhi_int16(shiftright_int16(y128p[7],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[8]  = mulhi_int16(shiftright_int16(y128p[8],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[9]  = mulhi_int16(shiftright_int16(y128p[9],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[10]  = mulhi_int16(shiftright_int16(y128p[10],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[11]  = mulhi_int16(shiftright_int16(y128p[11],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[12]  = mulhi_int16(shiftright_int16(y128p[12],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[13]  = mulhi_int16(shiftright_int16(y128p[13],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[14]  = mulhi_int16(shiftright_int16(y128p[14],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[15]  = mulhi_int16(shiftright_int16(y128p[15],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p+=16;
+      }
+    }
+    else {
+      for (i=0; i<8; i++) {
+        y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128);
+        y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128);
+        y128p[2]  = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128);
+        y128p[3]  = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128);
+        y128p[4]  = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128);
+        y128p[5]  = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128);
+        y128p[6]  = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128);
+        y128p[7]  = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128);
+        y128p[8]  = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128);
+        y128p[9]  = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128);
+        y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128);
+        y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128);
+        y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128);
+        y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128);
+        y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128);
+        y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128);
+        y128p+=16;
+      }
     }
   }
 }
 
 int16_t tw1024[1536] __attribute__((aligned(32)));
 
-void dft1024(int16_t *x,int16_t *y,unsigned char scale)
+void dft1024(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd_q15_t xtmp[256],ytmp[256],*tw1024_128p=(simd_q15_t *)tw1024,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y;
@@ -1758,10 +1891,12 @@ void dft1024(int16_t *x,int16_t *y,unsigned char scale)
   }
 
 
-  dft256((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  dft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),1);
-  dft256((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),1);
-  dft256((int16_t*)(xtmp+192),(int16_t*)(ytmp+192),1);
+  unsigned int *scale256=NULL;
+  if (scale) scale256=scale+1;
+  dft256((int16_t*)(xtmp),(int16_t*)(ytmp),scale256);
+  dft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),scale256);
+  dft256((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),scale256);
+  dft256((int16_t*)(xtmp+192),(int16_t*)(ytmp+192),scale256);
 
   for (i=0; i<64; i++) {
     bfly4(ytmpp,ytmpp+64,ytmpp+128,ytmpp+192,
@@ -1772,32 +1907,33 @@ void dft1024(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) {
+    unsigned int scalec=*scale;
 
     for (i=0; i<16; i++) {
-      y128[0]  = shiftright_int16(y128[0],1);
-      y128[1]  = shiftright_int16(y128[1],1);
-      y128[2]  = shiftright_int16(y128[2],1);
-      y128[3]  = shiftright_int16(y128[3],1);
-      y128[4]  = shiftright_int16(y128[4],1);
-      y128[5]  = shiftright_int16(y128[5],1);
-      y128[6]  = shiftright_int16(y128[6],1);
-      y128[7]  = shiftright_int16(y128[7],1);
-      y128[8]  = shiftright_int16(y128[8],1);
-      y128[9]  = shiftright_int16(y128[9],1);
-      y128[10] = shiftright_int16(y128[10],1);
-      y128[11] = shiftright_int16(y128[11],1);
-      y128[12] = shiftright_int16(y128[12],1);
-      y128[13] = shiftright_int16(y128[13],1);
-      y128[14] = shiftright_int16(y128[14],1);
-      y128[15] = shiftright_int16(y128[15],1);
+      y128[0]  = shiftright_int16(y128[0],scalec);
+      y128[1]  = shiftright_int16(y128[1],scalec);
+      y128[2]  = shiftright_int16(y128[2],scalec);
+      y128[3]  = shiftright_int16(y128[3],scalec);
+      y128[4]  = shiftright_int16(y128[4],scalec);
+      y128[5]  = shiftright_int16(y128[5],scalec);
+      y128[6]  = shiftright_int16(y128[6],scalec);
+      y128[7]  = shiftright_int16(y128[7],scalec);
+      y128[8]  = shiftright_int16(y128[8],scalec);
+      y128[9]  = shiftright_int16(y128[9],scalec);
+      y128[10] = shiftright_int16(y128[10],scalec);
+      y128[11] = shiftright_int16(y128[11],scalec);
+      y128[12] = shiftright_int16(y128[12],scalec);
+      y128[13] = shiftright_int16(y128[13],scalec);
+      y128[14] = shiftright_int16(y128[14],scalec);
+      y128[15] = shiftright_int16(y128[15],scalec);
 
       y128+=16;
     }
   }
 }
 
-void idft1024(int16_t *x,int16_t *y,unsigned char scale)
+void idft1024(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd_q15_t xtmp[256],ytmp[256],*tw1024_128p=(simd_q15_t *)tw1024,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y;
@@ -1809,10 +1945,12 @@ void idft1024(int16_t *x,int16_t *y,unsigned char scale)
   }
 
 
-  idft256((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  idft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),1);
-  idft256((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),1);
-  idft256((int16_t*)(xtmp+192),(int16_t*)(ytmp+192),1);
+  unsigned int *scale256=NULL;
+  if (scale) scale256=scale+1;
+  idft256((int16_t*)(xtmp),(int16_t*)(ytmp),scale256);
+  idft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),scale256);
+  idft256((int16_t*)(xtmp+128),(int16_t*)(ytmp+128),scale256);
+  idft256((int16_t*)(xtmp+192),(int16_t*)(ytmp+192),scale256);
 
   for (i=0; i<64; i++) {
     ibfly4(ytmpp,ytmpp+64,ytmpp+128,ytmpp+192,
@@ -1823,25 +1961,26 @@ void idft1024(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) {
+    unsigned int scalec=*scale;
 
     for (i=0; i<16; i++) {
-      y128[0]  = shiftright_int16(y128[0],1);
-      y128[1]  = shiftright_int16(y128[1],1);
-      y128[2]  = shiftright_int16(y128[2],1);
-      y128[3]  = shiftright_int16(y128[3],1);
-      y128[4]  = shiftright_int16(y128[4],1);
-      y128[5]  = shiftright_int16(y128[5],1);
-      y128[6]  = shiftright_int16(y128[6],1);
-      y128[7]  = shiftright_int16(y128[7],1);
-      y128[8]  = shiftright_int16(y128[8],1);
-      y128[9]  = shiftright_int16(y128[9],1);
-      y128[10] = shiftright_int16(y128[10],1);
-      y128[11] = shiftright_int16(y128[11],1);
-      y128[12] = shiftright_int16(y128[12],1);
-      y128[13] = shiftright_int16(y128[13],1);
-      y128[14] = shiftright_int16(y128[14],1);
-      y128[15] = shiftright_int16(y128[15],1);
+      y128[0]  = shiftright_int16(y128[0],scalec);
+      y128[1]  = shiftright_int16(y128[1],scalec);
+      y128[2]  = shiftright_int16(y128[2],scalec);
+      y128[3]  = shiftright_int16(y128[3],scalec);
+      y128[4]  = shiftright_int16(y128[4],scalec);
+      y128[5]  = shiftright_int16(y128[5],scalec);
+      y128[6]  = shiftright_int16(y128[6],scalec);
+      y128[7]  = shiftright_int16(y128[7],scalec);
+      y128[8]  = shiftright_int16(y128[8],scalec);
+      y128[9]  = shiftright_int16(y128[9],scalec);
+      y128[10] = shiftright_int16(y128[10],scalec);
+      y128[11] = shiftright_int16(y128[11],scalec);
+      y128[12] = shiftright_int16(y128[12],scalec);
+      y128[13] = shiftright_int16(y128[13],scalec);
+      y128[14] = shiftright_int16(y128[14],scalec);
+      y128[15] = shiftright_int16(y128[15],scalec);
 
       y128+=16;
     }
@@ -1850,7 +1989,7 @@ void idft1024(int16_t *x,int16_t *y,unsigned char scale)
 
 int16_t tw2048[2048] __attribute__((aligned(32)));
 
-void dft2048(int16_t *x,int16_t *y,unsigned char scale)
+void dft2048(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simdshort_q15_t xtmp[1024],*xtmpp,*x64 = (simdshort_q15_t *)x;
@@ -1898,8 +2037,10 @@ void dft2048(int16_t *x,int16_t *y,unsigned char scale)
     xtmpp+=32;
   }
 
-  dft1024((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  dft1024((int16_t*)(xtmp+512),(int16_t*)(ytmp+256),1);
+  unsigned int *scale1024=NULL;
+  if (scale) scale1024=scale+1;
+  dft1024((int16_t*)(xtmp),(int16_t*)ytmp,scale1024);
+  dft1024((int16_t*)(xtmp+512),(int16_t*)(ytmp+256),scale1024);
 
 
   for (i=0; i<256; i++) {
@@ -1911,32 +2052,56 @@ void dft2048(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) {
     y128p = y128;
 
-    for (i=0; i<32; i++) {
-      y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128);
-      y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128);
-      y128p[2]  = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128);
-      y128p[3]  = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128);
-      y128p[4]  = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128);
-      y128p[5]  = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128);
-      y128p[6]  = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128);
-      y128p[7]  = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128);
-      y128p[8]  = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128);
-      y128p[9]  = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128);
-      y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128);
-      y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128);
-      y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128);
-      y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128);
-      y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128);
-      y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128);
-      y128p+=16;
+    if (*scale>1) {
+      uint32_t scale2=*scale-1;
+      for (i=0; i<32; i++) {
+        y128p[0]  = mulhi_int16(shiftright_int16(y128p[0],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[1]  = mulhi_int16(shiftright_int16(y128p[1],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[2]  = mulhi_int16(shiftright_int16(y128p[2],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[3]  = mulhi_int16(shiftright_int16(y128p[3],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[4]  = mulhi_int16(shiftright_int16(y128p[4],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[5]  = mulhi_int16(shiftright_int16(y128p[5],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[6]  = mulhi_int16(shiftright_int16(y128p[6],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[7]  = mulhi_int16(shiftright_int16(y128p[7],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[8]  = mulhi_int16(shiftright_int16(y128p[8],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[9]  = mulhi_int16(shiftright_int16(y128p[9],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[10]  = mulhi_int16(shiftright_int16(y128p[10],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[11]  = mulhi_int16(shiftright_int16(y128p[11],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[12]  = mulhi_int16(shiftright_int16(y128p[12],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[13]  = mulhi_int16(shiftright_int16(y128p[13],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[14]  = mulhi_int16(shiftright_int16(y128p[14],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[15]  = mulhi_int16(shiftright_int16(y128p[15],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p+=16;
+      }
+    }
+    else {
+      for (i=0; i<32; i++) {
+        y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128);
+        y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128);
+        y128p[2]  = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128);
+        y128p[3]  = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128);
+        y128p[4]  = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128);
+        y128p[5]  = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128);
+        y128p[6]  = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128);
+        y128p[7]  = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128);
+        y128p[8]  = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128);
+        y128p[9]  = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128);
+        y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128);
+        y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128);
+        y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128);
+        y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128);
+        y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128);
+        y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128);
+        y128p+=16;
+      }
     }
   }
 }
 
-void idft2048(int16_t *x,int16_t *y,unsigned char scale)
+void idft2048(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simdshort_q15_t xtmp[1024],*xtmpp,*x64 = (simdshort_q15_t *)x;
@@ -1984,8 +2149,10 @@ void idft2048(int16_t *x,int16_t *y,unsigned char scale)
     xtmpp+=32;
   }
 
-  idft1024((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  idft1024((int16_t*)(xtmp+512),(int16_t*)(ytmp+256),1);
+  unsigned int *scale1024=NULL;
+  if (scale) scale1024=scale+1;
+  idft1024((int16_t*)(xtmp),(int16_t*)ytmp,scale1024);
+  idft1024((int16_t*)(xtmp+512),(int16_t*)(ytmp+256),scale1024);
 
 
   for (i=0; i<256; i++) {
@@ -1997,34 +2164,58 @@ void idft2048(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) {
     y128p = y128;
 
-    for (i=0; i<32; i++) {
-      y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128);
-      y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128);
-      y128p[2]  = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128);
-      y128p[3]  = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128);
-      y128p[4]  = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128);
-      y128p[5]  = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128);
-      y128p[6]  = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128);
-      y128p[7]  = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128);
-      y128p[8]  = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128);
-      y128p[9]  = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128);
-      y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128);
-      y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128);
-      y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128);
-      y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128);
-      y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128);
-      y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128);
-      y128p+=16;
+    if (*scale>1) {
+      uint32_t scale2=*scale-1;
+      for (i=0; i<32; i++) {
+        y128p[0]  = mulhi_int16(shiftright_int16(y128p[0],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[1]  = mulhi_int16(shiftright_int16(y128p[1],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[2]  = mulhi_int16(shiftright_int16(y128p[2],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[3]  = mulhi_int16(shiftright_int16(y128p[3],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[4]  = mulhi_int16(shiftright_int16(y128p[4],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[5]  = mulhi_int16(shiftright_int16(y128p[5],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[6]  = mulhi_int16(shiftright_int16(y128p[6],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[7]  = mulhi_int16(shiftright_int16(y128p[7],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[8]  = mulhi_int16(shiftright_int16(y128p[8],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[9]  = mulhi_int16(shiftright_int16(y128p[9],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[10]  = mulhi_int16(shiftright_int16(y128p[10],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[11]  = mulhi_int16(shiftright_int16(y128p[11],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[12]  = mulhi_int16(shiftright_int16(y128p[12],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[13]  = mulhi_int16(shiftright_int16(y128p[13],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[14]  = mulhi_int16(shiftright_int16(y128p[14],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[15]  = mulhi_int16(shiftright_int16(y128p[15],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p+=16;
+      }
+    }
+    else {
+      for (i=0; i<32; i++) {
+        y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128);
+        y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128);
+        y128p[2]  = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128);
+        y128p[3]  = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128);
+        y128p[4]  = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128);
+        y128p[5]  = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128);
+        y128p[6]  = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128);
+        y128p[7]  = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128);
+        y128p[8]  = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128);
+        y128p[9]  = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128);
+        y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128);
+        y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128);
+        y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128);
+        y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128);
+        y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128);
+        y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128);
+        y128p+=16;
+      }
     }
   }
 }
 
 int16_t tw4096[3*2*1024];
 
-void dft4096(int16_t *x,int16_t *y,unsigned char scale)
+void dft4096(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd_q15_t xtmp[1024],ytmp[1024],*tw4096_128p=(simd_q15_t *)tw4096,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y;
@@ -2036,10 +2227,12 @@ void dft4096(int16_t *x,int16_t *y,unsigned char scale)
   }
 
 
-  dft1024((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  dft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),1);
-  dft1024((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),1);
-  dft1024((int16_t*)(xtmp+768),(int16_t*)(ytmp+768),1);
+  unsigned int *scale1024=NULL;
+  if (scale) scale1024=scale+1;
+  dft1024((int16_t*)(xtmp),(int16_t*)(ytmp),scale1024);
+  dft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),scale1024);
+  dft1024((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),scale1024);
+  dft1024((int16_t*)(xtmp+768),(int16_t*)(ytmp+768),scale1024);
 
   for (i=0; i<256; i++) {
     bfly4(ytmpp,ytmpp+256,ytmpp+512,ytmpp+768,
@@ -2050,25 +2243,26 @@ void dft4096(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) {
+    unsigned int scalec=*scale;
 
     for (i=0; i<64; i++) {
-      y128[0]  = shiftright_int16(y128[0],1);
-      y128[1]  = shiftright_int16(y128[1],1);
-      y128[2]  = shiftright_int16(y128[2],1);
-      y128[3]  = shiftright_int16(y128[3],1);
-      y128[4]  = shiftright_int16(y128[4],1);
-      y128[5]  = shiftright_int16(y128[5],1);
-      y128[6]  = shiftright_int16(y128[6],1);
-      y128[7]  = shiftright_int16(y128[7],1);
-      y128[8]  = shiftright_int16(y128[8],1);
-      y128[9]  = shiftright_int16(y128[9],1);
-      y128[10] = shiftright_int16(y128[10],1);
-      y128[11] = shiftright_int16(y128[11],1);
-      y128[12] = shiftright_int16(y128[12],1);
-      y128[13] = shiftright_int16(y128[13],1);
-      y128[14] = shiftright_int16(y128[14],1);
-      y128[15] = shiftright_int16(y128[15],1);
+      y128[0]  = shiftright_int16(y128[0],scalec);
+      y128[1]  = shiftright_int16(y128[1],scalec);
+      y128[2]  = shiftright_int16(y128[2],scalec);
+      y128[3]  = shiftright_int16(y128[3],scalec);
+      y128[4]  = shiftright_int16(y128[4],scalec);
+      y128[5]  = shiftright_int16(y128[5],scalec);
+      y128[6]  = shiftright_int16(y128[6],scalec);
+      y128[7]  = shiftright_int16(y128[7],scalec);
+      y128[8]  = shiftright_int16(y128[8],scalec);
+      y128[9]  = shiftright_int16(y128[9],scalec);
+      y128[10] = shiftright_int16(y128[10],scalec);
+      y128[11] = shiftright_int16(y128[11],scalec);
+      y128[12] = shiftright_int16(y128[12],scalec);
+      y128[13] = shiftright_int16(y128[13],scalec);
+      y128[14] = shiftright_int16(y128[14],scalec);
+      y128[15] = shiftright_int16(y128[15],scalec);
 
       y128+=16;
     }
@@ -2077,7 +2271,7 @@ void dft4096(int16_t *x,int16_t *y,unsigned char scale)
 
  
 
-void idft4096(int16_t *x,int16_t *y,unsigned char scale)
+void idft4096(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd_q15_t xtmp[1024],ytmp[1024],*tw4096_128p=(simd_q15_t *)tw4096,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y;
@@ -2089,10 +2283,12 @@ void idft4096(int16_t *x,int16_t *y,unsigned char scale)
   }
 
 
-  idft1024((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  idft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),1);
-  idft1024((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),1);
-  idft1024((int16_t*)(xtmp+768),(int16_t*)(ytmp+768),1);
+  unsigned int *scale1024=NULL;
+  if (scale) scale1024=scale+1;
+  idft1024((int16_t*)(xtmp),(int16_t*)(ytmp),scale1024);
+  idft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),scale1024);
+  idft1024((int16_t*)(xtmp+512),(int16_t*)(ytmp+512),scale1024);
+  idft1024((int16_t*)(xtmp+768),(int16_t*)(ytmp+768),scale1024);
 
   for (i=0; i<256; i++) {
     ibfly4(ytmpp,ytmpp+256,ytmpp+512,ytmpp+768,
@@ -2103,25 +2299,26 @@ void idft4096(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) {
+    unsigned int scalec=*scale;
 
     for (i=0; i<64; i++) {
-      y128[0]  = shiftright_int16(y128[0],scale);
-      y128[1]  = shiftright_int16(y128[1],scale);
-      y128[2]  = shiftright_int16(y128[2],scale);
-      y128[3]  = shiftright_int16(y128[3],scale);
-      y128[4]  = shiftright_int16(y128[4],scale);
-      y128[5]  = shiftright_int16(y128[5],scale);
-      y128[6]  = shiftright_int16(y128[6],scale);
-      y128[7]  = shiftright_int16(y128[7],scale);
-      y128[8]  = shiftright_int16(y128[8],scale);
-      y128[9]  = shiftright_int16(y128[9],scale);
-      y128[10] = shiftright_int16(y128[10],scale);
-      y128[11] = shiftright_int16(y128[11],scale);
-      y128[12] = shiftright_int16(y128[12],scale);
-      y128[13] = shiftright_int16(y128[13],scale);
-      y128[14] = shiftright_int16(y128[14],scale);
-      y128[15] = shiftright_int16(y128[15],scale);
+      y128[0]  = shiftright_int16(y128[0],scalec);
+      y128[1]  = shiftright_int16(y128[1],scalec);
+      y128[2]  = shiftright_int16(y128[2],scalec);
+      y128[3]  = shiftright_int16(y128[3],scalec);
+      y128[4]  = shiftright_int16(y128[4],scalec);
+      y128[5]  = shiftright_int16(y128[5],scalec);
+      y128[6]  = shiftright_int16(y128[6],scalec);
+      y128[7]  = shiftright_int16(y128[7],scalec);
+      y128[8]  = shiftright_int16(y128[8],scalec);
+      y128[9]  = shiftright_int16(y128[9],scalec);
+      y128[10] = shiftright_int16(y128[10],scalec);
+      y128[11] = shiftright_int16(y128[11],scalec);
+      y128[12] = shiftright_int16(y128[12],scalec);
+      y128[13] = shiftright_int16(y128[13],scalec);
+      y128[14] = shiftright_int16(y128[14],scalec);
+      y128[15] = shiftright_int16(y128[15],scalec);
 
       y128+=16;
     }
@@ -2130,7 +2327,7 @@ void idft4096(int16_t *x,int16_t *y,unsigned char scale)
 
 int16_t tw8192[2*4096] __attribute__((aligned(32)));
 
-void dft8192(int16_t *x,int16_t *y,unsigned char scale)
+void dft8192(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simdshort_q15_t xtmp[4096],*xtmpp,*x64 = (simdshort_q15_t *)x;
@@ -2178,8 +2375,10 @@ void dft8192(int16_t *x,int16_t *y,unsigned char scale)
     xtmpp+=32;
   }
 
-  dft4096((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  dft4096((int16_t*)(xtmp+2048),(int16_t*)(ytmp+1024),1);
+  unsigned int *scale4096=NULL;
+  if (scale) scale4096=scale+1;
+  dft4096((int16_t*)(xtmp),(int16_t*)ytmp,scale4096);
+  dft4096((int16_t*)(xtmp+2048),(int16_t*)(ytmp+1024),scale4096);
 
 
   for (i=0; i<1024; i++) {
@@ -2191,32 +2390,56 @@ void dft8192(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) { // test the stage's scale value, not the pointer: *scale==0 must skip output scaling
     y128p = y128;
 
-    for (i=0; i<128; i++) {
-      y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128);
-      y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128);
-      y128p[2]  = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128);
-      y128p[3]  = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128);
-      y128p[4]  = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128);
-      y128p[5]  = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128);
-      y128p[6]  = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128);
-      y128p[7]  = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128);
-      y128p[8]  = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128);
-      y128p[9]  = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128);
-      y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128);
-      y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128);
-      y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128);
-      y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128);
-      y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128);
-      y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128);
-      y128p+=16;
+    if (*scale>1) {
+      uint32_t scale2=*scale-1;
+      for (i=0; i<128; i++) {
+        y128p[0]  = mulhi_int16(shiftright_int16(y128p[0],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[1]  = mulhi_int16(shiftright_int16(y128p[1],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[2]  = mulhi_int16(shiftright_int16(y128p[2],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[3]  = mulhi_int16(shiftright_int16(y128p[3],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[4]  = mulhi_int16(shiftright_int16(y128p[4],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[5]  = mulhi_int16(shiftright_int16(y128p[5],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[6]  = mulhi_int16(shiftright_int16(y128p[6],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[7]  = mulhi_int16(shiftright_int16(y128p[7],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[8]  = mulhi_int16(shiftright_int16(y128p[8],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[9]  = mulhi_int16(shiftright_int16(y128p[9],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[10]  = mulhi_int16(shiftright_int16(y128p[10],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[11]  = mulhi_int16(shiftright_int16(y128p[11],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[12]  = mulhi_int16(shiftright_int16(y128p[12],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[13]  = mulhi_int16(shiftright_int16(y128p[13],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[14]  = mulhi_int16(shiftright_int16(y128p[14],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[15]  = mulhi_int16(shiftright_int16(y128p[15],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p+=16;
+      }
+    }
+    else {
+      for (i=0; i<128; i++) {
+        y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128);
+        y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128);
+        y128p[2]  = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128);
+        y128p[3]  = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128);
+        y128p[4]  = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128);
+        y128p[5]  = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128);
+        y128p[6]  = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128);
+        y128p[7]  = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128);
+        y128p[8]  = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128);
+        y128p[9]  = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128);
+        y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128);
+        y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128);
+        y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128);
+        y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128);
+        y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128);
+        y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128);
+        y128p+=16;
+      }
     }
   }
 }
 
-void idft8192(int16_t *x,int16_t *y,unsigned char scale)
+void idft8192(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simdshort_q15_t xtmp[4096],*xtmpp,*x64 = (simdshort_q15_t *)x;
@@ -2264,8 +2487,10 @@ void idft8192(int16_t *x,int16_t *y,unsigned char scale)
     xtmpp+=32;
   }
 
-  idft4096((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  idft4096((int16_t*)(xtmp+2048),(int16_t*)(ytmp+1024),1);
+  unsigned int *scale4096=NULL;
+  if (scale) scale4096=scale+1;
+  idft4096((int16_t*)(xtmp),(int16_t*)ytmp,scale4096);
+  idft4096((int16_t*)(xtmp+2048),(int16_t*)(ytmp+1024),scale4096);
 
 
   for (i=0; i<1024; i++) {
@@ -2277,34 +2502,58 @@ void idft8192(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) { // scale is a pointer now: test the pointed-to value, not the address
     y128p = y128;
 
-    for (i=0; i<128; i++) {
-      y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128);
-      y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128);
-      y128p[2]  = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128);
-      y128p[3]  = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128);
-      y128p[4]  = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128);
-      y128p[5]  = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128);
-      y128p[6]  = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128);
-      y128p[7]  = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128);
-      y128p[8]  = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128);
-      y128p[9]  = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128);
-      y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128);
-      y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128);
-      y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128);
-      y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128);
-      y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128);
-      y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128);
-      y128p+=16;
+    if (*scale>1) {
+      uint32_t scale2=*scale-1;
+      for (i=0; i<128; i++) {
+        y128p[0]  = mulhi_int16(shiftright_int16(y128p[0],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[1]  = mulhi_int16(shiftright_int16(y128p[1],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[2]  = mulhi_int16(shiftright_int16(y128p[2],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[3]  = mulhi_int16(shiftright_int16(y128p[3],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[4]  = mulhi_int16(shiftright_int16(y128p[4],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[5]  = mulhi_int16(shiftright_int16(y128p[5],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[6]  = mulhi_int16(shiftright_int16(y128p[6],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[7]  = mulhi_int16(shiftright_int16(y128p[7],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[8]  = mulhi_int16(shiftright_int16(y128p[8],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[9]  = mulhi_int16(shiftright_int16(y128p[9],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[10]  = mulhi_int16(shiftright_int16(y128p[10],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[11]  = mulhi_int16(shiftright_int16(y128p[11],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[12]  = mulhi_int16(shiftright_int16(y128p[12],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[13]  = mulhi_int16(shiftright_int16(y128p[13],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[14]  = mulhi_int16(shiftright_int16(y128p[14],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[15]  = mulhi_int16(shiftright_int16(y128p[15],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p+=16;
+      }
+    }
+    else {
+      for (i=0; i<128; i++) {
+        y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128);
+        y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128);
+        y128p[2]  = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128);
+        y128p[3]  = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128);
+        y128p[4]  = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128);
+        y128p[5]  = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128);
+        y128p[6]  = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128);
+        y128p[7]  = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128);
+        y128p[8]  = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128);
+        y128p[9]  = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128);
+        y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128);
+        y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128);
+        y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128);
+        y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128);
+        y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128);
+        y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128);
+        y128p+=16;
+      }
     }
   }
 }
 
 int16_t tw16384[3*2*4096];
 
-void dft16384(int16_t *x,int16_t *y,unsigned char scale)
+void dft16384(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd_q15_t xtmp[4096],ytmp[4096],*tw16384_128p=(simd_q15_t *)tw16384,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y;
@@ -2316,10 +2565,12 @@ void dft16384(int16_t *x,int16_t *y,unsigned char scale)
   }
 
 
-  dft4096((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  dft4096((int16_t*)(xtmp+1024),(int16_t*)(ytmp+1024),1);
-  dft4096((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),1);
-  dft4096((int16_t*)(xtmp+3072),(int16_t*)(ytmp+3072),1);
+  unsigned int *scale4096=NULL;
+  if (scale) scale4096=scale+1;
+  dft4096((int16_t*)(xtmp),(int16_t*)(ytmp),scale4096);
+  dft4096((int16_t*)(xtmp+1024),(int16_t*)(ytmp+1024),scale4096);
+  dft4096((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),scale4096);
+  dft4096((int16_t*)(xtmp+3072),(int16_t*)(ytmp+3072),scale4096);
 
   for (i=0; i<1024; i++) {
     bfly4(ytmpp,ytmpp+1024,ytmpp+2048,ytmpp+3072,
@@ -2330,39 +2581,35 @@ void dft16384(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) {
+    unsigned int scalec=*scale;
 
     for (i=0; i<256; i++) {
-      y128[0]  = shiftright_int16(y128[0],1);
-      y128[1]  = shiftright_int16(y128[1],1);
-      y128[2]  = shiftright_int16(y128[2],1);
-      y128[3]  = shiftright_int16(y128[3],1);
-      y128[4]  = shiftright_int16(y128[4],1);
-      y128[5]  = shiftright_int16(y128[5],1);
-      y128[6]  = shiftright_int16(y128[6],1);
-      y128[7]  = shiftright_int16(y128[7],1);
-      y128[8]  = shiftright_int16(y128[8],1);
-      y128[9]  = shiftright_int16(y128[9],1);
-      y128[10] = shiftright_int16(y128[10],1);
-      y128[11] = shiftright_int16(y128[11],1);
-      y128[12] = shiftright_int16(y128[12],1);
-      y128[13] = shiftright_int16(y128[13],1);
-      y128[14] = shiftright_int16(y128[14],1);
-      y128[15] = shiftright_int16(y128[15],1);
+      y128[0]  = shiftright_int16(y128[0],scalec);
+      y128[1]  = shiftright_int16(y128[1],scalec);
+      y128[2]  = shiftright_int16(y128[2],scalec);
+      y128[3]  = shiftright_int16(y128[3],scalec);
+      y128[4]  = shiftright_int16(y128[4],scalec);
+      y128[5]  = shiftright_int16(y128[5],scalec);
+      y128[6]  = shiftright_int16(y128[6],scalec);
+      y128[7]  = shiftright_int16(y128[7],scalec);
+      y128[8]  = shiftright_int16(y128[8],scalec);
+      y128[9]  = shiftright_int16(y128[9],scalec);
+      y128[10] = shiftright_int16(y128[10],scalec);
+      y128[11] = shiftright_int16(y128[11],scalec);
+      y128[12] = shiftright_int16(y128[12],scalec);
+      y128[13] = shiftright_int16(y128[13],scalec);
+      y128[14] = shiftright_int16(y128[14],scalec);
+      y128[15] = shiftright_int16(y128[15],scalec);
 
       y128+=16;
     }
-
   }
-
-  
-  
-
 }
 
  
 
-void idft16384(int16_t *x,int16_t *y,unsigned char scale)
+void idft16384(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd_q15_t xtmp[4096],ytmp[4096],*tw16384_128p=(simd_q15_t *)tw16384,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y;
@@ -2374,10 +2621,12 @@ void idft16384(int16_t *x,int16_t *y,unsigned char scale)
   }
 
 
-  idft4096((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  idft4096((int16_t*)(xtmp+1024),(int16_t*)(ytmp+1024),1);
-  idft4096((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),1);
-  idft4096((int16_t*)(xtmp+3072),(int16_t*)(ytmp+3072),1);
+  unsigned int *scale4096=NULL;
+  if (scale) scale4096=scale+1;
+  idft4096((int16_t*)(xtmp),(int16_t*)(ytmp),scale4096);
+  idft4096((int16_t*)(xtmp+1024),(int16_t*)(ytmp+1024),scale4096);
+  idft4096((int16_t*)(xtmp+2048),(int16_t*)(ytmp+2048),scale4096);
+  idft4096((int16_t*)(xtmp+3072),(int16_t*)(ytmp+3072),scale4096);
 
   for (i=0; i<1024; i++) {
     ibfly4(ytmpp,ytmpp+1024,ytmpp+2048,ytmpp+3072,
@@ -2388,25 +2637,26 @@ void idft16384(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) {
+    unsigned int scalec=*scale;
 
     for (i=0; i<256; i++) {
-      y128[0]  = shiftright_int16(y128[0],scale);
-      y128[1]  = shiftright_int16(y128[1],scale);
-      y128[2]  = shiftright_int16(y128[2],scale);
-      y128[3]  = shiftright_int16(y128[3],scale);
-      y128[4]  = shiftright_int16(y128[4],scale);
-      y128[5]  = shiftright_int16(y128[5],scale);
-      y128[6]  = shiftright_int16(y128[6],scale);
-      y128[7]  = shiftright_int16(y128[7],scale);
-      y128[8]  = shiftright_int16(y128[8],scale);
-      y128[9]  = shiftright_int16(y128[9],scale);
-      y128[10] = shiftright_int16(y128[10],scale);
-      y128[11] = shiftright_int16(y128[11],scale);
-      y128[12] = shiftright_int16(y128[12],scale);
-      y128[13] = shiftright_int16(y128[13],scale);
-      y128[14] = shiftright_int16(y128[14],scale);
-      y128[15] = shiftright_int16(y128[15],scale);
+      y128[0]  = shiftright_int16(y128[0],scalec);
+      y128[1]  = shiftright_int16(y128[1],scalec);
+      y128[2]  = shiftright_int16(y128[2],scalec);
+      y128[3]  = shiftright_int16(y128[3],scalec);
+      y128[4]  = shiftright_int16(y128[4],scalec);
+      y128[5]  = shiftright_int16(y128[5],scalec);
+      y128[6]  = shiftright_int16(y128[6],scalec);
+      y128[7]  = shiftright_int16(y128[7],scalec);
+      y128[8]  = shiftright_int16(y128[8],scalec);
+      y128[9]  = shiftright_int16(y128[9],scalec);
+      y128[10] = shiftright_int16(y128[10],scalec);
+      y128[11] = shiftright_int16(y128[11],scalec);
+      y128[12] = shiftright_int16(y128[12],scalec);
+      y128[13] = shiftright_int16(y128[13],scalec);
+      y128[14] = shiftright_int16(y128[14],scalec);
+      y128[15] = shiftright_int16(y128[15],scalec);
 
       y128+=16;
     }
@@ -2416,7 +2666,7 @@ void idft16384(int16_t *x,int16_t *y,unsigned char scale)
 
 int16_t tw32768[2*16384] __attribute__((aligned(32)));
 
-void dft32768(int16_t *x,int16_t *y,unsigned char scale)
+void dft32768(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simdshort_q15_t xtmp[16384],*xtmpp,*x64 = (simdshort_q15_t *)x;
@@ -2464,8 +2714,10 @@ void dft32768(int16_t *x,int16_t *y,unsigned char scale)
     xtmpp+=32;
   }
 
-  dft16384((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  dft16384((int16_t*)(xtmp+8192),(int16_t*)(ytmp+4096),1);
+  unsigned int *scale16384=NULL;
+  if (scale) scale16384=scale+1;
+  dft16384((int16_t*)(xtmp),(int16_t*)ytmp,scale16384);
+  dft16384((int16_t*)(xtmp+8192),(int16_t*)(ytmp+4096),scale16384);
 
 
   for (i=0; i<4096; i++) {
@@ -2477,32 +2729,55 @@ void dft32768(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) {
     y128p = y128;
-
-    for (i=0; i<512; i++) {
-      y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128);
-      y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128);
-      y128p[2]  = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128);
-      y128p[3]  = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128);
-      y128p[4]  = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128);
-      y128p[5]  = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128);
-      y128p[6]  = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128);
-      y128p[7]  = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128);
-      y128p[8]  = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128);
-      y128p[9]  = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128);
-      y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128);
-      y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128);
-      y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128);
-      y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128);
-      y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128);
-      y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128);
-      y128p+=16;
+    if (*scale>1) {
+      uint32_t scale2=*scale-1;
+      for (i=0; i<512; i++) {
+        y128p[0]  = mulhi_int16(shiftright_int16(y128p[0],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[1]  = mulhi_int16(shiftright_int16(y128p[1],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[2]  = mulhi_int16(shiftright_int16(y128p[2],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[3]  = mulhi_int16(shiftright_int16(y128p[3],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[4]  = mulhi_int16(shiftright_int16(y128p[4],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[5]  = mulhi_int16(shiftright_int16(y128p[5],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[6]  = mulhi_int16(shiftright_int16(y128p[6],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[7]  = mulhi_int16(shiftright_int16(y128p[7],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[8]  = mulhi_int16(shiftright_int16(y128p[8],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[9]  = mulhi_int16(shiftright_int16(y128p[9],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[10]  = mulhi_int16(shiftright_int16(y128p[10],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[11]  = mulhi_int16(shiftright_int16(y128p[11],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[12]  = mulhi_int16(shiftright_int16(y128p[12],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[13]  = mulhi_int16(shiftright_int16(y128p[13],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[14]  = mulhi_int16(shiftright_int16(y128p[14],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[15]  = mulhi_int16(shiftright_int16(y128p[15],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p+=16;
+      }
+    }
+    else {
+      for (i=0; i<512; i++) {
+        y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128);
+        y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128);
+        y128p[2]  = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128);
+        y128p[3]  = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128);
+        y128p[4]  = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128);
+        y128p[5]  = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128);
+        y128p[6]  = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128);
+        y128p[7]  = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128);
+        y128p[8]  = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128);
+        y128p[9]  = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128);
+        y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128);
+        y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128);
+        y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128);
+        y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128);
+        y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128);
+        y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128);
+        y128p+=16;
+      }
     }
   }
 }
 
-void idft32768(int16_t *x,int16_t *y,unsigned char scale)
+void idft32768(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simdshort_q15_t xtmp[16384],*xtmpp,*x64 = (simdshort_q15_t *)x;
@@ -2550,8 +2825,10 @@ void idft32768(int16_t *x,int16_t *y,unsigned char scale)
     xtmpp+=32;
   }
 
-  idft16384((int16_t*)(xtmp),(int16_t*)ytmp,1);
-  idft16384((int16_t*)(xtmp+8192),(int16_t*)(ytmp+4096),1);
+  unsigned int *scale16384=NULL;
+  if (scale) scale16384=scale+1;
+  idft16384((int16_t*)(xtmp),(int16_t*)ytmp,scale16384);
+  idft16384((int16_t*)(xtmp+8192),(int16_t*)(ytmp+4096),scale16384);
 
 
   for (i=0; i<4096; i++) {
@@ -2563,27 +2840,50 @@ void idft32768(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) {
     y128p = y128;
-
-    for (i=0; i<512; i++) {
-      y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128);
-      y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128);
-      y128p[2]  = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128);
-      y128p[3]  = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128);
-      y128p[4]  = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128);
-      y128p[5]  = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128);
-      y128p[6]  = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128);
-      y128p[7]  = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128);
-      y128p[8]  = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128);
-      y128p[9]  = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128);
-      y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128);
-      y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128);
-      y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128);
-      y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128);
-      y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128);
-      y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128);
-      y128p+=16;
+    if (*scale>1) {
+      uint32_t scale2=*scale-1;
+      for (i=0; i<512; i++) {
+        y128p[0]  = mulhi_int16(shiftright_int16(y128p[0],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[1]  = mulhi_int16(shiftright_int16(y128p[1],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[2]  = mulhi_int16(shiftright_int16(y128p[2],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[3]  = mulhi_int16(shiftright_int16(y128p[3],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[4]  = mulhi_int16(shiftright_int16(y128p[4],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[5]  = mulhi_int16(shiftright_int16(y128p[5],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[6]  = mulhi_int16(shiftright_int16(y128p[6],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[7]  = mulhi_int16(shiftright_int16(y128p[7],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[8]  = mulhi_int16(shiftright_int16(y128p[8],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[9]  = mulhi_int16(shiftright_int16(y128p[9],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[10]  = mulhi_int16(shiftright_int16(y128p[10],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[11]  = mulhi_int16(shiftright_int16(y128p[11],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[12]  = mulhi_int16(shiftright_int16(y128p[12],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[13]  = mulhi_int16(shiftright_int16(y128p[13],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[14]  = mulhi_int16(shiftright_int16(y128p[14],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p[15]  = mulhi_int16(shiftright_int16(y128p[15],scale2),ONE_OVER_SQRT2_Q15_128);
+        y128p+=16;
+      }
+    }
+    else {
+      for (i=0; i<512; i++) {
+        y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128);
+        y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128);
+        y128p[2]  = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128);
+        y128p[3]  = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128);
+        y128p[4]  = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128);
+        y128p[5]  = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128);
+        y128p[6]  = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128);
+        y128p[7]  = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128);
+        y128p[8]  = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128);
+        y128p[9]  = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128);
+        y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128);
+        y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128);
+        y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128);
+        y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128);
+        y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128);
+        y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128);
+        y128p+=16;
+      }
     }
   }
 }
@@ -2591,7 +2891,7 @@ void idft32768(int16_t *x,int16_t *y,unsigned char scale)
 int16_t twa768[512],twb768[512];
 
 // 256 x 3
-void idft768(int16_t *input, int16_t *output, unsigned char scale)
+void idft768(int16_t *input, int16_t *output, unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][256]__attribute__((aligned(32)));
@@ -2605,9 +2905,11 @@ void idft768(int16_t *input, int16_t *output, unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft256((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft256((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft256((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale256=NULL;
+  if (scale) scale256=scale+1;
+  idft256((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale256);
+  idft256((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale256);
+  idft256((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale256);
 
   for (i=0,i2=0; i<512; i+=8,i2+=4)  {
     ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]),
@@ -2616,7 +2918,7 @@ void idft768(int16_t *input, int16_t *output, unsigned char scale)
   }
 
 
-  if (scale==1) {
+  if (scale && *scale>0) { // same enable test as dft768; '>1' skipped the 1/sqrt(3) step for *scale==1
     for (i=0; i<12; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -2639,7 +2941,7 @@ void idft768(int16_t *input, int16_t *output, unsigned char scale)
   }
 }
 
-void dft768(int16_t *input, int16_t *output, unsigned char scale)
+void dft768(int16_t *input, int16_t *output, unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][256] __attribute__((aligned(32)));
@@ -2653,9 +2955,11 @@ void dft768(int16_t *input, int16_t *output, unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft256((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  dft256((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  dft256((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale256=NULL;
+  if (scale) scale256=scale+1;
+  dft256((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale256);
+  dft256((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale256);
+  dft256((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale256);
 
   /*
   for (i=1; i<512; i++) {
@@ -2676,7 +2980,7 @@ void dft768(int16_t *input, int16_t *output, unsigned char scale)
           (simd_q15_t*)(twa768+i),(simd_q15_t*)(twb768+i));
   }
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<12; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -2701,7 +3005,7 @@ void dft768(int16_t *input, int16_t *output, unsigned char scale)
 int16_t twa1536[1024],twb1536[1024];
 
 // 512 x 3
-void idft1536(int16_t *input, int16_t *output, unsigned char scale)
+void idft1536(int16_t *input, int16_t *output, unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][512 ]__attribute__((aligned(32)));
@@ -2715,9 +3019,11 @@ void idft1536(int16_t *input, int16_t *output, unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft512((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft512((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft512((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale512=NULL;
+  if (scale) scale512=scale+1;
+  idft512((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale512);
+  idft512((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale512);
+  idft512((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale512);
 
   for (i=0,i2=0; i<1024; i+=8,i2+=4)  {
     ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]),
@@ -2726,7 +3032,7 @@ void idft1536(int16_t *input, int16_t *output, unsigned char scale)
   }
 
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<24; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -2749,7 +3055,7 @@ void idft1536(int16_t *input, int16_t *output, unsigned char scale)
   }
 }
 
-void dft1536(int16_t *input, int16_t *output, unsigned char scale)
+void dft1536(int16_t *input, int16_t *output, unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][512] __attribute__((aligned(32)));
@@ -2763,9 +3069,11 @@ void dft1536(int16_t *input, int16_t *output, unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft512((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  dft512((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  dft512((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale512=NULL;
+  if (scale) scale512=scale+1;
+  dft512((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale512);
+  dft512((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale512);
+  dft512((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale512);
 
   /*
   for (i=1; i<512; i++) {
@@ -2786,7 +3094,7 @@ void dft1536(int16_t *input, int16_t *output, unsigned char scale)
           (simd_q15_t*)(twa1536+i),(simd_q15_t*)(twb1536+i));
   }
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<24; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -2812,7 +3120,7 @@ void dft1536(int16_t *input, int16_t *output, unsigned char scale)
 int16_t twa3072[2048] __attribute__((aligned(32)));
 int16_t twb3072[2048] __attribute__((aligned(32)));
 // 1024 x 3
-void dft3072(int16_t *input, int16_t *output,unsigned char scale)
+void dft3072(int16_t *input, int16_t *output,unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][1024] __attribute__((aligned(32)));
@@ -2826,9 +3134,11 @@ void dft3072(int16_t *input, int16_t *output,unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft1024((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  dft1024((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  dft1024((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale1024=NULL;
+  if (scale) scale1024=scale+1;
+  dft1024((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale1024);
+  dft1024((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale1024);
+  dft1024((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale1024);
 
   for (i=0,i2=0; i<2048; i+=8,i2+=4)  {
     bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]),
@@ -2836,7 +3146,7 @@ void dft3072(int16_t *input, int16_t *output,unsigned char scale)
           (simd_q15_t*)(twa3072+i),(simd_q15_t*)(twb3072+i));
   }
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<48; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -2859,7 +3169,7 @@ void dft3072(int16_t *input, int16_t *output,unsigned char scale)
   }
 }
 
-void idft3072(int16_t *input, int16_t *output,unsigned char scale)
+void idft3072(int16_t *input, int16_t *output,unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][1024]__attribute__((aligned(32)));
@@ -2872,9 +3182,11 @@ void idft3072(int16_t *input, int16_t *output,unsigned char scale)
     tmp[1][i] = ((uint32_t *)input)[j++];
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
-  idft1024((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft1024((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft1024((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale1024=NULL;
+  if (scale) scale1024=scale+1;
+  idft1024((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale1024);
+  idft1024((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale1024);
+  idft1024((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale1024);
 
   for (i=0,i2=0; i<2048; i+=8,i2+=4)  {
     ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]),
@@ -2883,7 +3195,7 @@ void idft3072(int16_t *input, int16_t *output,unsigned char scale)
   }
 
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<48; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -2910,7 +3222,7 @@ void idft3072(int16_t *input, int16_t *output,unsigned char scale)
 int16_t twa6144[4096] __attribute__((aligned(32)));
 int16_t twb6144[4096] __attribute__((aligned(32)));
 
-void idft6144(int16_t *input, int16_t *output,unsigned char scale)
+void idft6144(int16_t *input, int16_t *output,unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][2048] __attribute__((aligned(32)));
@@ -2924,9 +3236,11 @@ void idft6144(int16_t *input, int16_t *output,unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft2048((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft2048((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft2048((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale2048=NULL;
+  if (scale) scale2048=scale+1;
+  idft2048((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale2048);
+  idft2048((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale2048);
+  idft2048((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale2048);
 #ifndef MR_MAIN
   if (LOG_DUMPFLAG(DEBUG_DFT)) {
     LOG_M("idft6144in.m","in",input,6144,1,1);
@@ -2942,7 +3256,7 @@ void idft6144(int16_t *input, int16_t *output,unsigned char scale)
   }
 
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<96; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -2966,7 +3280,7 @@ void idft6144(int16_t *input, int16_t *output,unsigned char scale)
 }
 
 
-void dft6144(int16_t *input, int16_t *output,unsigned char scale)
+void dft6144(int16_t *input, int16_t *output,unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][2048] __attribute__((aligned(32)));
@@ -2980,9 +3294,11 @@ void dft6144(int16_t *input, int16_t *output,unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft2048((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  dft2048((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  dft2048((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale2048=NULL;
+  if (scale) scale2048=scale+1;
+  dft2048((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale2048);
+  dft2048((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale2048);
+  dft2048((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale2048);
 
   /*
   for (i=1; i<2048; i++) {
@@ -3003,7 +3319,7 @@ void dft6144(int16_t *input, int16_t *output,unsigned char scale)
           (simd_q15_t*)(twa6144+i),(simd_q15_t*)(twb6144+i));
   }
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<96; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3029,7 +3345,7 @@ void dft6144(int16_t *input, int16_t *output,unsigned char scale)
 int16_t twa12288[8192] __attribute__((aligned(32)));
 int16_t twb12288[8192] __attribute__((aligned(32)));
 // 4096 x 3
-void dft12288(int16_t *input, int16_t *output,unsigned char scale)
+void dft12288(int16_t *input, int16_t *output,unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][4096] __attribute__((aligned(32)));
@@ -3043,9 +3359,11 @@ void dft12288(int16_t *input, int16_t *output,unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft4096((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale);
-  dft4096((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale);
-  dft4096((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale);
+  unsigned int *scale4096=NULL;
+  if (scale) scale4096=scale+1;
+  dft4096((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale4096);
+  dft4096((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale4096);
+  dft4096((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale4096);
   /*
   for (i=1; i<4096; i++) {
     tmpo[0][i] = tmpo[0][i<<1];
@@ -3065,7 +3383,7 @@ void dft12288(int16_t *input, int16_t *output,unsigned char scale)
           (simd_q15_t*)(twa12288+i),(simd_q15_t*)(twb12288+i));
   }
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<192; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3088,7 +3406,7 @@ void dft12288(int16_t *input, int16_t *output,unsigned char scale)
   }
 }
 
-void idft12288(int16_t *input, int16_t *output,unsigned char scale)
+void idft12288(int16_t *input, int16_t *output,unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][4096] __attribute__((aligned(32)));
@@ -3104,9 +3422,11 @@ void idft12288(int16_t *input, int16_t *output,unsigned char scale)
 
 
 
-  idft4096((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale);
-  idft4096((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale);
-  idft4096((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale);
+  unsigned int *scale4096=NULL;
+  if (scale) scale4096=scale+1;
+  idft4096((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale4096);
+  idft4096((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale4096);
+  idft4096((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale4096);
 #ifndef MR_MAIN
   if (LOG_DUMPFLAG(DEBUG_DFT)) {
     LOG_M("idft12288in.m","in",input,12288,1,1);
@@ -3121,7 +3441,7 @@ void idft12288(int16_t *input, int16_t *output,unsigned char scale)
           (simd_q15_t*)(twa12288+i),(simd_q15_t*)(twb12288+i));
   }
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<192; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3154,7 +3474,7 @@ void idft12288(int16_t *input, int16_t *output,unsigned char scale)
 int16_t twa18432[12288] __attribute__((aligned(32)));
 int16_t twb18432[12288] __attribute__((aligned(32)));
 // 6144 x 3
-void dft18432(int16_t *input, int16_t *output,unsigned char scale) {
+void dft18432(int16_t *input, int16_t *output,unsigned int *scale) {
 
   int i,i2,j;
   uint32_t tmp[3][6144] __attribute__((aligned(32)));
@@ -3168,16 +3488,18 @@ void dft18432(int16_t *input, int16_t *output,unsigned char scale) {
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft6144((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale);
-  dft6144((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale);
-  dft6144((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale);
+  unsigned int *scale6144=NULL;
+  if (scale) scale6144=scale+1;
+  dft6144((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale6144);
+  dft6144((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale6144);
+  dft6144((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale6144);
 
   for (i=0,i2=0; i<12288; i+=8,i2+=4)  {
     bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]),
           (simd_q15_t*)(output+i),(simd_q15_t*)(output+12288+i),(simd_q15_t*)(output+24576+i),
           (simd_q15_t*)(twa18432+i),(simd_q15_t*)(twb18432+i));
   }
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<288; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3200,7 +3522,7 @@ void dft18432(int16_t *input, int16_t *output,unsigned char scale) {
   }
 }
 
-void idft18432(int16_t *input, int16_t *output,unsigned char scale) {
+void idft18432(int16_t *input, int16_t *output,unsigned int *scale) {
 
   int i,i2,j;
   uint32_t tmp[3][6144] __attribute__((aligned(32)));
@@ -3214,16 +3536,18 @@ void idft18432(int16_t *input, int16_t *output,unsigned char scale) {
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft6144((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale);
-  idft6144((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale);
-  idft6144((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale);
+  unsigned int *scale6144=NULL;
+  if (scale) scale6144=scale+1;
+  idft6144((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale6144);
+  idft6144((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale6144);
+  idft6144((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale6144);
 
   for (i=0,i2=0; i<12288; i+=8,i2+=4)  {
     ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]),
 	   (simd_q15_t*)(output+i),(simd_q15_t*)(output+12288+i),(simd_q15_t*)(output+24576+i),
 	   (simd_q15_t*)(twa18432+i),(simd_q15_t*)(twb18432+i));
   }
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<288; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3250,7 +3574,7 @@ void idft18432(int16_t *input, int16_t *output,unsigned char scale) {
 int16_t twa24576[16384] __attribute__((aligned(32)));
 int16_t twb24576[16384] __attribute__((aligned(32)));
 // 8192 x 3
-void dft24576(int16_t *input, int16_t *output,unsigned char scale)
+void dft24576(int16_t *input, int16_t *output,unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][8192] __attribute__((aligned(32)));
@@ -3264,9 +3588,11 @@ void dft24576(int16_t *input, int16_t *output,unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft8192((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  dft8192((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  dft8192((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale8192=NULL;
+  if (scale) scale8192=scale+1;
+  dft8192((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale8192);
+  dft8192((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale8192);
+  dft8192((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale8192);
   /*
   for (i=1; i<8192; i++) {
     tmpo[0][i] = tmpo[0][i<<1];
@@ -3287,7 +3613,7 @@ void dft24576(int16_t *input, int16_t *output,unsigned char scale)
   }
 
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<384; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3315,7 +3641,7 @@ void dft24576(int16_t *input, int16_t *output,unsigned char scale)
 #endif
 }
 
-void idft24576(int16_t *input, int16_t *output,unsigned char scale)
+void idft24576(int16_t *input, int16_t *output,unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][8192] __attribute__((aligned(32)));
@@ -3329,9 +3655,11 @@ void idft24576(int16_t *input, int16_t *output,unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft8192((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft8192((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft8192((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale8192=NULL;
+  if (scale) scale8192=scale+1;
+  idft8192((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale8192);
+  idft8192((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale8192);
+  idft8192((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale8192);
  #ifndef MR_MAIN 
   if (LOG_DUMPFLAG(DEBUG_DFT)) {
     LOG_M("idft24576in.m","in",input,24576,1,1);
@@ -3345,7 +3673,7 @@ void idft24576(int16_t *input, int16_t *output,unsigned char scale)
           (simd_q15_t*)(output+i),(simd_q15_t*)(output+16384+i),(simd_q15_t*)(output+32768+i),
           (simd_q15_t*)(twa24576+i),(simd_q15_t*)(twb24576+i));
   }
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<384; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3377,7 +3705,7 @@ int16_t twa36864[24576] __attribute__((aligned(32)));
 int16_t twb36864[24576] __attribute__((aligned(32)));
 
 // 12288 x 3
-void dft36864(int16_t *input, int16_t *output,uint8_t scale) {
+void dft36864(int16_t *input, int16_t *output,uint32_t *scale) {
 
   int i,i2,j;
   uint32_t tmp[3][12288] __attribute__((aligned(32)));
@@ -3391,9 +3719,11 @@ void dft36864(int16_t *input, int16_t *output,uint8_t scale) {
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft12288((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  dft12288((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  dft12288((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale12288=NULL;
+  if (scale) scale12288=scale+1;
+  dft12288((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale12288);
+  dft12288((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale12288);
+  dft12288((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale12288);
 #ifndef MR_MAIN
   if (LOG_DUMPFLAG(DEBUG_DFT)) {
     LOG_M("dft36864out0.m","o0",tmpo[0],12288,1,1);
@@ -3407,7 +3737,7 @@ void dft36864(int16_t *input, int16_t *output,uint8_t scale) {
           (simd_q15_t*)(twa36864+i),(simd_q15_t*)(twb36864+i));
   }
 
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<576; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3435,7 +3765,7 @@ void dft36864(int16_t *input, int16_t *output,uint8_t scale) {
 #endif
 }
 
-void idft36864(int16_t *input, int16_t *output,uint8_t scale) {
+void idft36864(int16_t *input, int16_t *output,uint32_t *scale) {
 
   int i,i2,j;
   uint32_t tmp[3][12288] __attribute__((aligned(32)));
@@ -3449,16 +3779,18 @@ void idft36864(int16_t *input, int16_t *output,uint8_t scale) {
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft12288((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft12288((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft12288((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale12288=NULL;
+  if (scale) scale12288=scale+1;
+  idft12288((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale12288);
+  idft12288((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale12288);
+  idft12288((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale12288);
 
   for (i=0,i2=0; i<24576; i+=8,i2+=4)  {
     ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]),
           (simd_q15_t*)(output+i),(simd_q15_t*)(output+24576+i),(simd_q15_t*)(output+49152+i),
           (simd_q15_t*)(twa36864+i),(simd_q15_t*)(twb36864+i));
   }
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<576; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3485,7 +3817,7 @@ int16_t twa49152[32768] __attribute__((aligned(32)));
 int16_t twb49152[32768] __attribute__((aligned(32)));
 
 // 16384 x 3
-void dft49152(int16_t *input, int16_t *output,uint8_t scale) {
+void dft49152(int16_t *input, int16_t *output,uint32_t *scale) {
 
   int i,i2,j;
   uint32_t tmp[3][16384] __attribute__((aligned(32)));
@@ -3499,16 +3831,18 @@ void dft49152(int16_t *input, int16_t *output,uint8_t scale) {
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft16384((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  dft16384((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  dft16384((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale16384=NULL;
+  if (scale) scale16384=scale+1;
+  dft16384((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale16384);
+  dft16384((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale16384);
+  dft16384((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale16384);
 
   for (i=0,i2=0; i<32768; i+=8,i2+=4)  {
     bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]),
           (simd_q15_t*)(output+i),(simd_q15_t*)(output+32768+i),(simd_q15_t*)(output+65536+i),
           (simd_q15_t*)(twa49152+i),(simd_q15_t*)(twb49152+i));
   }
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<768; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3531,7 +3865,7 @@ void dft49152(int16_t *input, int16_t *output,uint8_t scale) {
   }
 }
 
-void idft49152(int16_t *input, int16_t *output,uint8_t scale) {
+void idft49152(int16_t *input, int16_t *output,uint32_t *scale) {
 
    int i,i2,j;
   uint32_t tmp[3][16384] __attribute__((aligned(32)));
@@ -3545,16 +3879,18 @@ void idft49152(int16_t *input, int16_t *output,uint8_t scale) {
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft16384((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft16384((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft16384((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale16384=NULL;
+  if (scale) scale16384=scale+1;
+  idft16384((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale16384);
+  idft16384((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale16384);
+  idft16384((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale16384);
 
   for (i=0,i2=0; i<32768; i+=8,i2+=4)  {
     ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]),
 	   (simd_q15_t*)(output+i),(simd_q15_t*)(output+32768+i),(simd_q15_t*)(output+65536+i),
 	   (simd_q15_t*)(twa49152+i),(simd_q15_t*)(twb49152+i));
   }
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<768; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3579,7 +3915,7 @@ void idft49152(int16_t *input, int16_t *output,uint8_t scale) {
 
 int16_t tw65536[3*2*16384] __attribute__((aligned(32)));
 
-void idft65536(int16_t *x,int16_t *y,unsigned char scale)
+void idft65536(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd_q15_t xtmp[16384],ytmp[16384],*tw65536_128p=(simd_q15_t *)tw65536,*x128=(simd_q15_t *)x,*y128p=(simd_q15_t *)y;
@@ -3591,10 +3927,12 @@ void idft65536(int16_t *x,int16_t *y,unsigned char scale)
   }
 
 
-  idft16384((int16_t*)(xtmp),(int16_t*)(ytmp),1);
-  idft16384((int16_t*)(xtmp+4096),(int16_t*)(ytmp+4096),1);
-  idft16384((int16_t*)(xtmp+8192),(int16_t*)(ytmp+8192),1);
-  idft16384((int16_t*)(xtmp+12299),(int16_t*)(ytmp+12288),1);
+  unsigned int *scale16384=NULL;
+  if (scale) scale16384=scale+1;
+  idft16384((int16_t*)(xtmp),(int16_t*)(ytmp),scale16384);
+  idft16384((int16_t*)(xtmp+4096),(int16_t*)(ytmp+4096),scale16384);
+  idft16384((int16_t*)(xtmp+8192),(int16_t*)(ytmp+8192),scale16384);
+  idft16384((int16_t*)(xtmp+12288),(int16_t*)(ytmp+12288),scale16384);
 
   for (i=0; i<4096; i++) {
     ibfly4(ytmpp,ytmpp+4096,ytmpp+8192,ytmpp+12288,
@@ -3605,25 +3943,26 @@ void idft65536(int16_t *x,int16_t *y,unsigned char scale)
     ytmpp++;
   }
 
-  if (scale>0) {
+  if (scale && *scale>0) {
+    unsigned int scalec=*scale;
 
     for (i=0; i<1024; i++) {
-      y128p[0]  = shiftright_int16(y128p[0],scale);
-      y128p[1]  = shiftright_int16(y128p[1],scale);
-      y128p[2]  = shiftright_int16(y128p[2],scale);
-      y128p[3]  = shiftright_int16(y128p[3],scale);
-      y128p[4]  = shiftright_int16(y128p[4],scale);
-      y128p[5]  = shiftright_int16(y128p[5],scale);
-      y128p[6]  = shiftright_int16(y128p[6],scale);
-      y128p[7]  = shiftright_int16(y128p[7],scale);
-      y128p[8]  = shiftright_int16(y128p[8],scale);
-      y128p[9]  = shiftright_int16(y128p[9],scale);
-      y128p[10] = shiftright_int16(y128p[10],scale);
-      y128p[11] = shiftright_int16(y128p[11],scale);
-      y128p[12] = shiftright_int16(y128p[12],scale);
-      y128p[13] = shiftright_int16(y128p[13],scale);
-      y128p[14] = shiftright_int16(y128p[14],scale);
-      y128p[15] = shiftright_int16(y128p[15],scale);
+      y128p[0]  = shiftright_int16(y128p[0],scalec);
+      y128p[1]  = shiftright_int16(y128p[1],scalec);
+      y128p[2]  = shiftright_int16(y128p[2],scalec);
+      y128p[3]  = shiftright_int16(y128p[3],scalec);
+      y128p[4]  = shiftright_int16(y128p[4],scalec);
+      y128p[5]  = shiftright_int16(y128p[5],scalec);
+      y128p[6]  = shiftright_int16(y128p[6],scalec);
+      y128p[7]  = shiftright_int16(y128p[7],scalec);
+      y128p[8]  = shiftright_int16(y128p[8],scalec);
+      y128p[9]  = shiftright_int16(y128p[9],scalec);
+      y128p[10] = shiftright_int16(y128p[10],scalec);
+      y128p[11] = shiftright_int16(y128p[11],scalec);
+      y128p[12] = shiftright_int16(y128p[12],scalec);
+      y128p[13] = shiftright_int16(y128p[13],scalec);
+      y128p[14] = shiftright_int16(y128p[14],scalec);
+      y128p[15] = shiftright_int16(y128p[15],scalec);
 
       y128p+=16;
     }
@@ -3635,7 +3974,7 @@ void idft65536(int16_t *x,int16_t *y,unsigned char scale)
 int16_t twa98304[65536] __attribute__((aligned(32)));
 int16_t twb98304[65536] __attribute__((aligned(32)));
 // 32768 x 3
-void dft98304(int16_t *input, int16_t *output,uint8_t scale) {
+void dft98304(int16_t *input, int16_t *output,uint32_t *scale) {
 
   int i,i2,j;
   uint32_t tmp[3][32768] __attribute__((aligned(32)));
@@ -3649,16 +3988,18 @@ void dft98304(int16_t *input, int16_t *output,uint8_t scale) {
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  dft32768((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  dft32768((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  dft32768((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale32768=NULL;
+  if (scale) scale32768=scale+1;
+  dft32768((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale32768);
+  dft32768((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale32768);
+  dft32768((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale32768);
 
   for (i=0,i2=0; i<65536; i+=8,i2+=4)  {
     bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]),
           (simd_q15_t*)(output+i),(simd_q15_t*)(output+65536+i),(simd_q15_t*)(output+131072+i),
           (simd_q15_t*)(twa98304+i),(simd_q15_t*)(twb98304+i));
   }
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<1536; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3681,7 +4022,7 @@ void dft98304(int16_t *input, int16_t *output,uint8_t scale) {
   }
 }
 
-void idft98304(int16_t *input, int16_t *output,uint8_t scale) {
+void idft98304(int16_t *input, int16_t *output,uint32_t *scale) {
 
   int i,i2,j;
   uint32_t tmp[3][32768] __attribute__((aligned(32)));
@@ -3695,16 +4036,18 @@ void idft98304(int16_t *input, int16_t *output,uint8_t scale) {
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft32768((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft32768((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft32768((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  unsigned int *scale32768=NULL;
+  if (scale) scale32768=scale+1;
+  idft32768((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale32768);
+  idft32768((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale32768);
+  idft32768((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale32768);
 
   for (i=0,i2=0; i<65536; i+=8,i2+=4)  {
     ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]),
 	   (simd_q15_t*)(output+i),(simd_q15_t*)(output+65536+i),(simd_q15_t*)(output+131072+i),
 	   (simd_q15_t*)(twa98304+i),(simd_q15_t*)(twb98304+i));
   }
-  if (scale==1) {
+  if (scale && *scale>0) {
     for (i=0; i<1536; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -3877,7 +4220,7 @@ static inline void dft12f(simd_q15_t *x0,
 
 
 
-void dft12(int16_t *x,int16_t *y ,unsigned char scale_flag)
+void dft12(int16_t *x,int16_t *y ,unsigned int *scale_flag)
 {
 
   simd_q15_t *x128 = (simd_q15_t *)x,*y128 = (simd_q15_t *)y;
@@ -3910,7 +4253,7 @@ void dft12(int16_t *x,int16_t *y ,unsigned char scale_flag)
 
 static int16_t tw24[88]__attribute__((aligned(32)));
 
-void dft24(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft24(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -3990,7 +4333,7 @@ void dft24(int16_t *x,int16_t *y,unsigned char scale_flag)
     //    msg("dft24e\n");
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[1]);
 
     for (i=0; i<24; i++) {
@@ -4006,7 +4349,7 @@ void dft24(int16_t *x,int16_t *y,unsigned char scale_flag)
 static int16_t twa36[88]__attribute__((aligned(32)));
 static int16_t twb36[88]__attribute__((aligned(32)));
 
-void dft36(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft36(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -4113,7 +4456,7 @@ void dft36(int16_t *x,int16_t *y,unsigned char scale_flag)
           twb128+k);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[2]);
 
     for (i=0; i<36; i++) {
@@ -4126,7 +4469,7 @@ static int16_t twa48[88]__attribute__((aligned(32)));
 static int16_t twb48[88]__attribute__((aligned(32)));
 static int16_t twc48[88]__attribute__((aligned(32)));
 
-void dft48(int16_t *x, int16_t *y,unsigned char scale_flag)
+void dft48(int16_t *x, int16_t *y,unsigned int *scale_flag)
 {
 
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -4270,7 +4613,7 @@ void dft48(int16_t *x, int16_t *y,unsigned char scale_flag)
 
   }
 
-  if (scale_flag == 1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[3]);
 
     for (i=0; i<48; i++) {
@@ -4284,7 +4627,7 @@ static int16_t twb60[88]__attribute__((aligned(32)));
 static int16_t twc60[88]__attribute__((aligned(32)));
 static int16_t twd60[88]__attribute__((aligned(32)));
 
-void dft60(int16_t *x,int16_t *y,unsigned char scale)
+void dft60(int16_t *x,int16_t *y,unsigned int *scale)
 {
 
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -4450,7 +4793,7 @@ void dft60(int16_t *x,int16_t *y,unsigned char scale)
           twd128+k);
   }
 
-  if (scale == 1) {
+  if (scale) {
     norm128 = set1_int16(dft_norm_table[4]);
 
     for (i=0; i<60; i++) {
@@ -4462,7 +4805,7 @@ void dft60(int16_t *x,int16_t *y,unsigned char scale)
 
 static int16_t tw72[280]__attribute__((aligned(32)));
 
-void dft72(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft72(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -4478,8 +4821,8 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+36] = x128[j+1];  // odd inputs
   }
 
-  dft36((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft36((int16_t *)(x2128+36),(int16_t *)(ytmp128+36),1);
+  dft36((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft36((int16_t *)(x2128+36),(int16_t *)(ytmp128+36),scale_flag);
 
   bfly2_tw1(ytmp128,ytmp128+36,y128,y128+36);
 
@@ -4491,7 +4834,7 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag)
           tw128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[5]);
 
     for (i=0; i<72; i++) {
@@ -4502,7 +4845,7 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag)
 
 static int16_t tw96[376]__attribute__((aligned(32)));
 
-void dft96(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft96(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
 
@@ -4533,7 +4876,7 @@ void dft96(int16_t *x,int16_t *y,unsigned char scale_flag)
           tw128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[6]);
 
     for (i=0; i<96; i++) {
@@ -4545,7 +4888,7 @@ void dft96(int16_t *x,int16_t *y,unsigned char scale_flag)
 static int16_t twa108[280]__attribute__((aligned(32)));
 static int16_t twb108[280]__attribute__((aligned(32)));
 
-void dft108(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft108(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -4580,7 +4923,7 @@ void dft108(int16_t *x,int16_t *y,unsigned char scale_flag)
 
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[7]);
 
     for (i=0; i<108; i++) {
@@ -4590,7 +4933,7 @@ void dft108(int16_t *x,int16_t *y,unsigned char scale_flag)
 }
 
 static int16_t tw120[472]__attribute__((aligned(32)));
-void dft120(int16_t *x,int16_t *y, unsigned char scale_flag)
+void dft120(int16_t *x,int16_t *y, unsigned int *scale_flag)
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -4618,7 +4961,7 @@ void dft120(int16_t *x,int16_t *y, unsigned char scale_flag)
           tw128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[8]);
 
     for (i=0; i<120; i++) {
@@ -4630,7 +4973,7 @@ void dft120(int16_t *x,int16_t *y, unsigned char scale_flag)
 static int16_t twa144[376]__attribute__((aligned(32)));
 static int16_t twb144[376]__attribute__((aligned(32)));
 
-void dft144(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft144(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -4648,9 +4991,9 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+96] = x128[j+2];
   }
 
-  dft48((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft48((int16_t *)(x2128+48),(int16_t *)(ytmp128+48),1);
-  dft48((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),1);
+  dft48((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft48((int16_t *)(x2128+48),(int16_t *)(ytmp128+48),scale_flag);
+  dft48((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+48,ytmp128+96,y128,y128+48,y128+96);
 
@@ -4665,7 +5008,7 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag)
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[9]);
 
     for (i=0; i<144; i++) {
@@ -4677,7 +5020,7 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag)
 static int16_t twa180[472]__attribute__((aligned(32)));
 static int16_t twb180[472]__attribute__((aligned(32)));
 
-void dft180(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft180(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -4696,9 +5039,9 @@ void dft180(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+120] = x128[j+2];
   }
 
-  dft60((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),1);
-  dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1);
+  dft60((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),scale_flag);
+  dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+60,ytmp128+120,y128,y128+60,y128+120);
 
@@ -4713,7 +5056,7 @@ void dft180(int16_t *x,int16_t *y,unsigned char scale_flag)
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[10]);
 
     for (i=0; i<180; i++) {
@@ -4726,7 +5069,7 @@ static int16_t twa192[376]__attribute__((aligned(32)));
 static int16_t twb192[376]__attribute__((aligned(32)));
 static int16_t twc192[376]__attribute__((aligned(32)));
 
-void dft192(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft192(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -4747,10 +5090,10 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+144] = x128[j+3];
   }
 
-  dft48((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft48((int16_t *)(x2128+48),(int16_t *)(ytmp128+48),1);
-  dft48((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),1);
-  dft48((int16_t *)(x2128+144),(int16_t *)(ytmp128+144),1);
+  dft48((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft48((int16_t *)(x2128+48),(int16_t *)(ytmp128+48),scale_flag);
+  dft48((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),scale_flag);
+  dft48((int16_t *)(x2128+144),(int16_t *)(ytmp128+144),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+48,ytmp128+96,ytmp128+144,y128,y128+48,y128+96,y128+144);
 
@@ -4768,7 +5111,7 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag)
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[11]);
 
     for (i=0; i<192; i++) {
@@ -4780,7 +5123,7 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag)
 static int16_t twa216[568]__attribute__((aligned(32)));
 static int16_t twb216[568]__attribute__((aligned(32)));
 
-void dft216(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft216(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -4799,9 +5142,9 @@ void dft216(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+144] = x128[j+2];
   }
 
-  dft72((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft72((int16_t *)(x2128+72),(int16_t *)(ytmp128+72),1);
-  dft72((int16_t *)(x2128+144),(int16_t *)(ytmp128+144),1);
+  dft72((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft72((int16_t *)(x2128+72),(int16_t *)(ytmp128+72),scale_flag);
+  dft72((int16_t *)(x2128+144),(int16_t *)(ytmp128+144),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+72,ytmp128+144,y128,y128+72,y128+144);
 
@@ -4816,7 +5159,7 @@ void dft216(int16_t *x,int16_t *y,unsigned char scale_flag)
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[12]);
 
     for (i=0; i<216; i++) {
@@ -4829,7 +5172,7 @@ static int16_t twa240[472]__attribute__((aligned(32)));
 static int16_t twb240[472]__attribute__((aligned(32)));
 static int16_t twc240[472]__attribute__((aligned(32)));
 
-void dft240(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft240(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -4850,10 +5193,10 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+180] = x128[j+3];
   }
 
-  dft60((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),1);
-  dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1);
-  dft60((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),1);
+  dft60((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),scale_flag);
+  dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag);
+  dft60((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+60,ytmp128+120,ytmp128+180,y128,y128+60,y128+120,y128+180);
 
@@ -4871,7 +5214,7 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag)
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[13]);
 
     for (i=0; i<240; i++) {
@@ -4883,7 +5226,7 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag)
 static int16_t twa288[760]__attribute__((aligned(32)));
 static int16_t twb288[760]__attribute__((aligned(32)));
 
-void dft288(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft288(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -4902,9 +5245,9 @@ void dft288(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+192] = x128[j+2];
   }
 
-  dft96((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft96((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),1);
-  dft96((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),1);
+  dft96((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft96((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),scale_flag);
+  dft96((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+96,ytmp128+192,y128,y128+96,y128+192);
 
@@ -4919,7 +5262,7 @@ void dft288(int16_t *x,int16_t *y,unsigned char scale_flag)
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<288; i++) {
@@ -4933,7 +5276,7 @@ static int16_t twb300[472]__attribute__((aligned(32)));
 static int16_t twc300[472]__attribute__((aligned(32)));
 static int16_t twd300[472]__attribute__((aligned(32)));
 
-void dft300(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft300(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -4956,11 +5299,11 @@ void dft300(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+240] = x128[j+4];
   }
 
-  dft60((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),1);
-  dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1);
-  dft60((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),1);
-  dft60((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),1);
+  dft60((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft60((int16_t *)(x2128+60),(int16_t *)(ytmp128+60),scale_flag);
+  dft60((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag);
+  dft60((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),scale_flag);
+  dft60((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),scale_flag);
 
   bfly5_tw1(ytmp128,ytmp128+60,ytmp128+120,ytmp128+180,ytmp128+240,y128,y128+60,y128+120,y128+180,y128+240);
 
@@ -4981,7 +5324,7 @@ void dft300(int16_t *x,int16_t *y,unsigned char scale_flag)
           twd128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[15]);
 
     for (i=0; i<300; i++) {
@@ -4993,7 +5336,7 @@ void dft300(int16_t *x,int16_t *y,unsigned char scale_flag)
 static int16_t twa324[107*2*4];
 static int16_t twb324[107*2*4];
 
-void dft324(int16_t *x,int16_t *y,unsigned char scale_flag)  // 108 x 3
+void dft324(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 108 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5011,9 +5354,9 @@ void dft324(int16_t *x,int16_t *y,unsigned char scale_flag)  // 108 x 3
     x2128[i+216] = x128[j+2];
   }
 
-  dft108((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft108((int16_t *)(x2128+108),(int16_t *)(ytmp128+108),1);
-  dft108((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),1);
+  dft108((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft108((int16_t *)(x2128+108),(int16_t *)(ytmp128+108),scale_flag);
+  dft108((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+108,ytmp128+216,y128,y128+108,y128+216);
 
@@ -5028,7 +5371,7 @@ void dft324(int16_t *x,int16_t *y,unsigned char scale_flag)  // 108 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<324; i++) {
@@ -5040,7 +5383,7 @@ void dft324(int16_t *x,int16_t *y,unsigned char scale_flag)  // 108 x 3
 static int16_t twa360[119*2*4];
 static int16_t twb360[119*2*4];
 
-void dft360(int16_t *x,int16_t *y,unsigned char scale_flag)  // 120 x 3
+void dft360(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 120 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5058,9 +5401,9 @@ void dft360(int16_t *x,int16_t *y,unsigned char scale_flag)  // 120 x 3
     x2128[i+240] = x128[j+2];
   }
 
-  dft120((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft120((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1);
-  dft120((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),1);
+  dft120((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft120((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag);
+  dft120((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+120,ytmp128+240,y128,y128+120,y128+240);
 
@@ -5075,7 +5418,7 @@ void dft360(int16_t *x,int16_t *y,unsigned char scale_flag)  // 120 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<360; i++) {
@@ -5088,7 +5431,7 @@ static int16_t twa384[95*2*4];
 static int16_t twb384[95*2*4];
 static int16_t twc384[95*2*4];
 
-void dft384(int16_t *x,int16_t *y,unsigned char scale_flag)  // 96 x 4
+void dft384(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 96 x 4
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5108,10 +5451,10 @@ void dft384(int16_t *x,int16_t *y,unsigned char scale_flag)  // 96 x 4
     x2128[i+288] = x128[j+3];
   }
 
-  dft96((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft96((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),1);
-  dft96((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),1);
-  dft96((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),1);
+  dft96((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft96((int16_t *)(x2128+96),(int16_t *)(ytmp128+96),scale_flag);
+  dft96((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),scale_flag);
+  dft96((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+96,ytmp128+192,ytmp128+288,y128,y128+96,y128+192,y128+288);
 
@@ -5129,7 +5472,7 @@ void dft384(int16_t *x,int16_t *y,unsigned char scale_flag)  // 96 x 4
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(16384);//dft_norm_table[13]);
 
     for (i=0; i<384; i++) {
@@ -5142,7 +5485,7 @@ static int16_t twa432[107*2*4];
 static int16_t twb432[107*2*4];
 static int16_t twc432[107*2*4];
 
-void dft432(int16_t *x,int16_t *y,unsigned char scale_flag)  // 108 x 4
+void dft432(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 108 x 4
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5161,10 +5504,10 @@ void dft432(int16_t *x,int16_t *y,unsigned char scale_flag)  // 108 x 4
     x2128[i+324] = x128[j+3];
   }
 
-  dft108((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft108((int16_t *)(x2128+108),(int16_t *)(ytmp128+108),1);
-  dft108((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),1);
-  dft108((int16_t *)(x2128+324),(int16_t *)(ytmp128+324),1);
+  dft108((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft108((int16_t *)(x2128+108),(int16_t *)(ytmp128+108),scale_flag);
+  dft108((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),scale_flag);
+  dft108((int16_t *)(x2128+324),(int16_t *)(ytmp128+324),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+108,ytmp128+216,ytmp128+324,y128,y128+108,y128+216,y128+324);
 
@@ -5182,7 +5525,7 @@ void dft432(int16_t *x,int16_t *y,unsigned char scale_flag)  // 108 x 4
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(16384);//dft_norm_table[13]);
 
     for (i=0; i<432; i++) {
@@ -5194,7 +5537,7 @@ static int16_t twa480[119*2*4];
 static int16_t twb480[119*2*4];
 static int16_t twc480[119*2*4];
 
-void dft480(int16_t *x,int16_t *y,unsigned char scale_flag)  // 120 x 4
+void dft480(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 120 x 4
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5214,10 +5557,10 @@ void dft480(int16_t *x,int16_t *y,unsigned char scale_flag)  // 120 x 4
     x2128[i+360] = x128[j+3];
   }
 
-  dft120((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft120((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),1);
-  dft120((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),1);
-  dft120((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),1);
+  dft120((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft120((int16_t *)(x2128+120),(int16_t *)(ytmp128+120),scale_flag);
+  dft120((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),scale_flag);
+  dft120((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+120,ytmp128+240,ytmp128+360,y128,y128+120,y128+240,y128+360);
 
@@ -5235,7 +5578,7 @@ void dft480(int16_t *x,int16_t *y,unsigned char scale_flag)  // 120 x 4
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(16384);//dft_norm_table[13]);
 
     for (i=0; i<480; i++) {
@@ -5248,7 +5591,7 @@ void dft480(int16_t *x,int16_t *y,unsigned char scale_flag)  // 120 x 4
 static int16_t twa540[179*2*4];
 static int16_t twb540[179*2*4];
 
-void dft540(int16_t *x,int16_t *y,unsigned char scale_flag)  // 180 x 3
+void dft540(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 180 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5266,9 +5609,9 @@ void dft540(int16_t *x,int16_t *y,unsigned char scale_flag)  // 180 x 3
     x2128[i+360] = x128[j+2];
   }
 
-  dft180((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft180((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),1);
-  dft180((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),1);
+  dft180((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft180((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),scale_flag);
+  dft180((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+180,ytmp128+360,y128,y128+180,y128+360);
 
@@ -5283,7 +5626,7 @@ void dft540(int16_t *x,int16_t *y,unsigned char scale_flag)  // 180 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<540; i++) {
@@ -5295,7 +5638,7 @@ void dft540(int16_t *x,int16_t *y,unsigned char scale_flag)  // 180 x 3
 static int16_t twa576[191*2*4];
 static int16_t twb576[191*2*4];
 
-void dft576(int16_t *x,int16_t *y,unsigned char scale_flag)  // 192 x 3
+void dft576(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 192 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5314,9 +5657,9 @@ void dft576(int16_t *x,int16_t *y,unsigned char scale_flag)  // 192 x 3
   }
 
 
-  dft192((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft192((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),1);
-  dft192((int16_t *)(x2128+384),(int16_t *)(ytmp128+384),1);
+  dft192((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft192((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),scale_flag);
+  dft192((int16_t *)(x2128+384),(int16_t *)(ytmp128+384),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+192,ytmp128+384,y128,y128+192,y128+384);
 
@@ -5331,7 +5674,7 @@ void dft576(int16_t *x,int16_t *y,unsigned char scale_flag)  // 192 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<576; i++) {
@@ -5343,7 +5686,7 @@ void dft576(int16_t *x,int16_t *y,unsigned char scale_flag)  // 192 x 3
 
 static int16_t twa600[299*2*4];
 
-void dft600(int16_t *x,int16_t *y,unsigned char scale_flag)  // 300 x 2
+void dft600(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 300 x 2
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5358,8 +5701,8 @@ void dft600(int16_t *x,int16_t *y,unsigned char scale_flag)  // 300 x 2
     x2128[i+300] = x128[j+1];
   }
 
-  dft300((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),1);
+  dft300((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),scale_flag);
 
 
   bfly2_tw1(ytmp128,ytmp128+300,y128,y128+300);
@@ -5372,7 +5715,7 @@ void dft600(int16_t *x,int16_t *y,unsigned char scale_flag)  // 300 x 2
           tw128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(ONE_OVER_SQRT2_Q15);
 
     for (i=0; i<600; i++) {
@@ -5385,7 +5728,7 @@ void dft600(int16_t *x,int16_t *y,unsigned char scale_flag)  // 300 x 2
 static int16_t twa648[215*2*4];
 static int16_t twb648[215*2*4];
 
-void dft648(int16_t *x,int16_t *y,unsigned char scale_flag)  // 216 x 3
+void dft648(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 216 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5403,9 +5746,9 @@ void dft648(int16_t *x,int16_t *y,unsigned char scale_flag)  // 216 x 3
     x2128[i+432] = x128[j+2];
   }
 
-  dft216((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft216((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),1);
-  dft216((int16_t *)(x2128+432),(int16_t *)(ytmp128+432),1);
+  dft216((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft216((int16_t *)(x2128+216),(int16_t *)(ytmp128+216),scale_flag);
+  dft216((int16_t *)(x2128+432),(int16_t *)(ytmp128+432),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+216,ytmp128+432,y128,y128+216,y128+432);
 
@@ -5420,7 +5763,7 @@ void dft648(int16_t *x,int16_t *y,unsigned char scale_flag)  // 216 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<648; i++) {
@@ -5439,7 +5782,7 @@ static int16_t twb720[179*2*4];
 static int16_t twc720[179*2*4];
 
 
-void dft720(int16_t *x,int16_t *y,unsigned char scale_flag)  // 180 x 4
+void dft720(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 180 x 4
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5459,10 +5802,10 @@ void dft720(int16_t *x,int16_t *y,unsigned char scale_flag)  // 180 x 4
     x2128[i+540] = x128[j+3];
   }
 
-  dft180((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft180((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),1);
-  dft180((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),1);
-  dft180((int16_t *)(x2128+540),(int16_t *)(ytmp128+540),1);
+  dft180((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft180((int16_t *)(x2128+180),(int16_t *)(ytmp128+180),scale_flag);
+  dft180((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),scale_flag);
+  dft180((int16_t *)(x2128+540),(int16_t *)(ytmp128+540),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+180,ytmp128+360,ytmp128+540,y128,y128+180,y128+360,y128+540);
 
@@ -5480,7 +5823,7 @@ void dft720(int16_t *x,int16_t *y,unsigned char scale_flag)  // 180 x 4
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(16384);//dft_norm_table[13]);
 
     for (i=0; i<720; i++) {
@@ -5497,7 +5840,7 @@ static int16_t twa768p[191*2*4];
 static int16_t twb768p[191*2*4];
 static int16_t twc768p[191*2*4];
 
-void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4;
+void dft768p(int16_t *x,int16_t *y,unsigned int *scale_flag) { // 192x 4;
 
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5517,10 +5860,10 @@ void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4;
     x2128[i+576] = x128[j+3];
   }
 
-  dft192((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft192((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),1);
-  dft192((int16_t *)(x2128+384),(int16_t *)(ytmp128+384),1);
-  dft192((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),1);
+  dft192((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft192((int16_t *)(x2128+192),(int16_t *)(ytmp128+192),scale_flag);
+  dft192((int16_t *)(x2128+384),(int16_t *)(ytmp128+384),scale_flag);
+  dft192((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+192,ytmp128+384,ytmp128+576,y128,y128+192,y128+384,y128+576);
 
@@ -5538,7 +5881,7 @@ void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4;
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(16384);//dft_norm_table[13]);
 
     for (i=0; i<768; i++) {
@@ -5556,7 +5899,7 @@ void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4;
 static int16_t twa384i[256];
 static int16_t twb384i[256];
 // 128 x 3
-void idft384(int16_t *input, int16_t *output, unsigned char scale)
+void idft384(int16_t *input, int16_t *output, unsigned int *scale)
 {
   int i,i2,j;
   uint32_t tmp[3][128]__attribute__((aligned(32)));
@@ -5570,9 +5913,9 @@ void idft384(int16_t *input, int16_t *output, unsigned char scale)
     tmp[2][i] = ((uint32_t *)input)[j++];
   }
 
-  idft128((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),1);
-  idft128((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),1);
-  idft128((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),1);
+  idft128((int16_t*)(tmp[0]),(int16_t*)(tmpo[0]),scale);
+  idft128((int16_t*)(tmp[1]),(int16_t*)(tmpo[1]),scale);
+  idft128((int16_t*)(tmp[2]),(int16_t*)(tmpo[2]),scale);
 
   for (i=0,i2=0; i<256; i+=8,i2+=4)  {
     ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]),
@@ -5581,7 +5924,7 @@ void idft384(int16_t *input, int16_t *output, unsigned char scale)
   }
 
 
-  if (scale==1) {
+  if (scale) {
     for (i=0; i<6; i++) {
       y128p[0]  = mulhi_int16(y128p[0],ONE_OVER_SQRT3_Q15_128);
       y128p[1]  = mulhi_int16(y128p[1],ONE_OVER_SQRT3_Q15_128);
@@ -5606,7 +5949,7 @@ void idft384(int16_t *input, int16_t *output, unsigned char scale)
 static int16_t twa864[287*2*4];
 static int16_t twb864[287*2*4];
 
-void dft864(int16_t *x,int16_t *y,unsigned char scale_flag)  // 288 x 3
+void dft864(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 288 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5624,9 +5967,9 @@ void dft864(int16_t *x,int16_t *y,unsigned char scale_flag)  // 288 x 3
     x2128[i+576] = x128[j+2];
   }
 
-  dft288((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft288((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),1);
-  dft288((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),1);
+  dft288((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft288((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),scale_flag);
+  dft288((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+288,ytmp128+576,y128,y128+288,y128+576);
 
@@ -5641,7 +5984,7 @@ void dft864(int16_t *x,int16_t *y,unsigned char scale_flag)  // 288 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<864; i++) {
@@ -5657,7 +6000,7 @@ void dft864(int16_t *x,int16_t *y,unsigned char scale_flag)  // 288 x 3
 static int16_t twa900[299*2*4];
 static int16_t twb900[299*2*4];
 
-void dft900(int16_t *x,int16_t *y,unsigned char scale_flag)  // 300 x 3
+void dft900(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 300 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5675,9 +6018,9 @@ void dft900(int16_t *x,int16_t *y,unsigned char scale_flag)  // 300 x 3
     x2128[i+600] = x128[j+2];
   }
 
-  dft300((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),1);
-  dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1);
+  dft300((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),scale_flag);
+  dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+300,ytmp128+600,y128,y128+300,y128+600);
 
@@ -5692,7 +6035,7 @@ void dft900(int16_t *x,int16_t *y,unsigned char scale_flag)  // 300 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<900; i++) {
@@ -5711,7 +6054,7 @@ static int16_t twb960[239*2*4];
 static int16_t twc960[239*2*4];
 
 
-void dft960(int16_t *x,int16_t *y,unsigned char scale_flag)  // 240 x 4
+void dft960(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 240 x 4
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5731,10 +6074,10 @@ void dft960(int16_t *x,int16_t *y,unsigned char scale_flag)  // 240 x 4
     x2128[i+720] = x128[j+3];
   }
 
-  dft240((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft240((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),1);
-  dft240((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),1);
-  dft240((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),1);
+  dft240((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft240((int16_t *)(x2128+240),(int16_t *)(ytmp128+240),scale_flag);
+  dft240((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),scale_flag);
+  dft240((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+240,ytmp128+480,ytmp128+720,y128,y128+240,y128+480,y128+720);
 
@@ -5752,7 +6095,7 @@ void dft960(int16_t *x,int16_t *y,unsigned char scale_flag)  // 240 x 4
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(16384);//dft_norm_table[13]);
 
     for (i=0; i<960; i++) {
@@ -5769,7 +6112,7 @@ void dft960(int16_t *x,int16_t *y,unsigned char scale_flag)  // 240 x 4
 static int16_t twa972[323*2*4];
 static int16_t twb972[323*2*4];
 
-void dft972(int16_t *x,int16_t *y,unsigned char scale_flag)  // 324 x 3
+void dft972(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 324 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5787,9 +6130,9 @@ void dft972(int16_t *x,int16_t *y,unsigned char scale_flag)  // 324 x 3
     x2128[i+648] = x128[j+2];
   }
 
-  dft324((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft324((int16_t *)(x2128+324),(int16_t *)(ytmp128+324),1);
-  dft324((int16_t *)(x2128+648),(int16_t *)(ytmp128+648),1);
+  dft324((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft324((int16_t *)(x2128+324),(int16_t *)(ytmp128+324),scale_flag);
+  dft324((int16_t *)(x2128+648),(int16_t *)(ytmp128+648),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+324,ytmp128+648,y128,y128+324,y128+648);
 
@@ -5804,7 +6147,7 @@ void dft972(int16_t *x,int16_t *y,unsigned char scale_flag)  // 324 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<972; i++) {
@@ -5820,7 +6163,7 @@ void dft972(int16_t *x,int16_t *y,unsigned char scale_flag)  // 324 x 3
 static int16_t twa1080[359*2*4];
 static int16_t twb1080[359*2*4];
 
-void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag)  // 360 x 3
+void dft1080(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 360 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -5838,9 +6181,9 @@ void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag)  // 360 x 3
     x2128[i+720] = x128[j+2];
   }
 
-  dft360((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft360((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),1);
-  dft360((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),1);
+  dft360((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft360((int16_t *)(x2128+360),(int16_t *)(ytmp128+360),scale_flag);
+  dft360((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+360,ytmp128+720,y128,y128+360,y128+720);
 
@@ -5855,7 +6198,7 @@ void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag)  // 360 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<1080; i++) {
@@ -5872,7 +6215,7 @@ static int16_t twa1152[287*2*4];
 static int16_t twb1152[287*2*4];
 static int16_t twc1152[287*2*4];
 
-void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag)  // 288 x 4
+void dft1152(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 288 x 4
 {
 
   int i,j;
@@ -5893,10 +6236,10 @@ void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag)  // 288 x 4
     x2128[i+864] = x128[j+3];
   }
 
-  dft288((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft288((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),1);
-  dft288((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),1);
-  dft288((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),1);
+  dft288((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft288((int16_t *)(x2128+288),(int16_t *)(ytmp128+288),scale_flag);
+  dft288((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),scale_flag);
+  dft288((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+288,ytmp128+576,ytmp128+864,y128,y128+288,y128+576,y128+864);
 
@@ -5914,7 +6257,7 @@ void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag)  // 288 x 4
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(16384);//dft_norm_table[13]);
 
     for (i=0; i<1152; i++) {
@@ -5930,7 +6273,7 @@ int16_t twa1200[4784];
 int16_t twb1200[4784];
 int16_t twc1200[4784];
 
-void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft1200(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -5951,10 +6294,10 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+900] = x128[j+3];
   }
 
-  dft300((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),1);
-  dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1);
-  dft300((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),1);
+  dft300((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),scale_flag);
+  dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag);
+  dft300((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+300,ytmp128+600,ytmp128+900,y128,y128+300,y128+600,y128+900);
 
@@ -5972,7 +6315,7 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag)
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(16384);//dft_norm_table[13]);
     for (i=0; i<1200; i++) {
       y128[i] = mulhi_int16(y128[i],norm128);
@@ -5988,7 +6331,7 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag)
 static int16_t twa1296[431*2*4];
 static int16_t twb1296[431*2*4];
 
-void dft1296(int16_t *x,int16_t *y,unsigned char scale_flag) //432 * 3
+void dft1296(int16_t *x,int16_t *y,unsigned int *scale_flag) //432 * 3
 {
 
   int i,j;
@@ -6007,9 +6350,9 @@ void dft1296(int16_t *x,int16_t *y,unsigned char scale_flag) //432 * 3
     x2128[i+864] = x128[j+2];
   }
 
-  dft432((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft432((int16_t *)(x2128+432),(int16_t *)(ytmp128+432),1);
-  dft432((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),1);
+  dft432((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft432((int16_t *)(x2128+432),(int16_t *)(ytmp128+432),scale_flag);
+  dft432((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+432,ytmp128+864,y128,y128+432,y128+864);
 
@@ -6024,7 +6367,7 @@ void dft1296(int16_t *x,int16_t *y,unsigned char scale_flag) //432 * 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<1296; i++) {
@@ -6041,7 +6384,7 @@ void dft1296(int16_t *x,int16_t *y,unsigned char scale_flag) //432 * 3
 static int16_t twa1440[479*2*4];
 static int16_t twb1440[479*2*4];
 
-void dft1440(int16_t *x,int16_t *y,unsigned char scale_flag)  // 480 x 3
+void dft1440(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 480 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6059,9 +6402,9 @@ void dft1440(int16_t *x,int16_t *y,unsigned char scale_flag)  // 480 x 3
     x2128[i+960] = x128[j+2];
   }
 
-  dft480((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft480((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),1);
-  dft480((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),1);
+  dft480((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft480((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),scale_flag);
+  dft480((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+480,ytmp128+960,y128,y128+480,y128+960);
 
@@ -6076,7 +6419,7 @@ void dft1440(int16_t *x,int16_t *y,unsigned char scale_flag)  // 480 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<1440; i++) {
@@ -6094,7 +6437,7 @@ static int16_t twb1500[2392]__attribute__((aligned(32)));
 static int16_t twc1500[2392]__attribute__((aligned(32)));
 static int16_t twd1500[2392]__attribute__((aligned(32)));
 
-void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag)
+void dft1500(int16_t *x,int16_t *y,unsigned int *scale_flag)
 {
 
   int i,j;
@@ -6117,11 +6460,11 @@ void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag)
     x2128[i+1200] = x128[j+4];
   }
 
-  dft300((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),1);
-  dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1);
-  dft300((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),1);
-  dft300((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),1);
+  dft300((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft300((int16_t *)(x2128+300),(int16_t *)(ytmp128+300),scale_flag);
+  dft300((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag);
+  dft300((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),scale_flag);
+  dft300((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),scale_flag);
 
   bfly5_tw1(ytmp128,ytmp128+300,ytmp128+600,ytmp128+900,ytmp128+1200,y128,y128+300,y128+600,y128+900,y128+1200);
 
@@ -6142,7 +6485,7 @@ void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag)
           twd128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[15]);
 
     for (i=0; i<1500; i++) {
@@ -6158,7 +6501,7 @@ void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag)
 static int16_t twa1620[539*2*4];
 static int16_t twb1620[539*2*4];
 
-void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag)  // 540 x 3
+void dft1620(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 540 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6176,9 +6519,9 @@ void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag)  // 540 x 3
     x2128[i+1080] = x128[j+2];
   }
 
-  dft540((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft540((int16_t *)(x2128+540),(int16_t *)(ytmp128+540),1);
-  dft540((int16_t *)(x2128+1080),(int16_t *)(ytmp128+1080),1);
+  dft540((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft540((int16_t *)(x2128+540),(int16_t *)(ytmp128+540),scale_flag);
+  dft540((int16_t *)(x2128+1080),(int16_t *)(ytmp128+1080),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+540,ytmp128+1080,y128,y128+540,y128+1080);
 
@@ -6193,7 +6536,7 @@ void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag)  // 540 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<1620; i++) {
@@ -6209,7 +6552,7 @@ void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag)  // 540 x 3
 static int16_t twa1728[575*2*4];
 static int16_t twb1728[575*2*4];
 
-void dft1728(int16_t *x,int16_t *y,unsigned char scale_flag)  // 576 x 3
+void dft1728(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 576 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6227,9 +6570,9 @@ void dft1728(int16_t *x,int16_t *y,unsigned char scale_flag)  // 576 x 3
     x2128[i+1152] = x128[j+2];
   }
 
-  dft576((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft576((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),1);
-  dft576((int16_t *)(x2128+1152),(int16_t *)(ytmp128+1152),1);
+  dft576((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft576((int16_t *)(x2128+576),(int16_t *)(ytmp128+576),scale_flag);
+  dft576((int16_t *)(x2128+1152),(int16_t *)(ytmp128+1152),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+576,ytmp128+1152,y128,y128+576,y128+1152);
 
@@ -6244,7 +6587,7 @@ void dft1728(int16_t *x,int16_t *y,unsigned char scale_flag)  // 576 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<1728; i++) {
@@ -6260,7 +6603,7 @@ void dft1728(int16_t *x,int16_t *y,unsigned char scale_flag)  // 576 x 3
 static int16_t twa1800[599*2*4];
 static int16_t twb1800[599*2*4];
 
-void dft1800(int16_t *x,int16_t *y,unsigned char scale_flag)  // 600 x 3
+void dft1800(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 600 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6278,9 +6621,9 @@ void dft1800(int16_t *x,int16_t *y,unsigned char scale_flag)  // 600 x 3
     x2128[i+1200] = x128[j+2];
   }
 
-  dft600((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1);
-  dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),1);
+  dft600((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag);
+  dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+600,ytmp128+1200,y128,y128+600,y128+1200);
 
@@ -6295,7 +6638,7 @@ void dft1800(int16_t *x,int16_t *y,unsigned char scale_flag)  // 600 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<1800; i++) {
@@ -6312,7 +6655,7 @@ static int16_t twa1920[479*2*4];
 static int16_t twb1920[479*2*4];
 static int16_t twc1920[479*2*4];
 
-void dft1920(int16_t *x,int16_t *y,unsigned char scale_flag)  // 480 x 4
+void dft1920(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 480 x 4
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6332,10 +6675,10 @@ void dft1920(int16_t *x,int16_t *y,unsigned char scale_flag)  // 480 x 4
     x2128[i+1440] = x128[j+3];
   }
 
-  dft480((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft480((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),1);
-  dft480((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),1);
-  dft480((int16_t *)(x2128+1440),(int16_t *)(ytmp128+1440),1);
+  dft480((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft480((int16_t *)(x2128+480),(int16_t *)(ytmp128+480),scale_flag);
+  dft480((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),scale_flag);
+  dft480((int16_t *)(x2128+1440),(int16_t *)(ytmp128+1440),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+480,ytmp128+960,ytmp128+1440,y128,y128+480,y128+960,y128+1440);
 
@@ -6353,7 +6696,7 @@ void dft1920(int16_t *x,int16_t *y,unsigned char scale_flag)  // 480 x 4
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[13]);
     for (i=0; i<1920; i++) {
       y128[i] = mulhi_int16(y128[i],norm128);
@@ -6368,7 +6711,7 @@ void dft1920(int16_t *x,int16_t *y,unsigned char scale_flag)  // 480 x 4
 static int16_t twa1944[647*2*4];
 static int16_t twb1944[647*2*4];
 
-void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag)  // 648 x 3
+void dft1944(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 648 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6386,9 +6729,9 @@ void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag)  // 648 x 3
     x2128[i+1296] = x128[j+2];
   }
 
-  dft648((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft648((int16_t *)(x2128+648),(int16_t *)(ytmp128+648),1);
-  dft648((int16_t *)(x2128+1296),(int16_t *)(ytmp128+1296),1);
+  dft648((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft648((int16_t *)(x2128+648),(int16_t *)(ytmp128+648),scale_flag);
+  dft648((int16_t *)(x2128+1296),(int16_t *)(ytmp128+1296),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+648,ytmp128+1296,y128,y128+648,y128+1296);
 
@@ -6403,7 +6746,7 @@ void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag)  // 648 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<1944; i++) {
@@ -6419,7 +6762,7 @@ void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag)  // 648 x 3
 static int16_t twa2160[719*2*4];
 static int16_t twb2160[719*2*4];
 
-void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag)  // 720 x 3
+void dft2160(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 720 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6437,9 +6780,9 @@ void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag)  // 720 x 3
     x2128[i+1440] = x128[j+2];
   }
 
-  dft720((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft720((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),1);
-  dft720((int16_t *)(x2128+1440),(int16_t *)(ytmp128+1440),1);
+  dft720((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft720((int16_t *)(x2128+720),(int16_t *)(ytmp128+720),scale_flag);
+  dft720((int16_t *)(x2128+1440),(int16_t *)(ytmp128+1440),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+720,ytmp128+1440,y128,y128+720,y128+1440);
 
@@ -6454,7 +6797,7 @@ void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag)  // 720 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<2160; i++) {
@@ -6470,7 +6813,7 @@ void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag)  // 720 x 3
 static int16_t twa2304[767*2*4];
 static int16_t twb2304[767*2*4];
 
-void dft2304(int16_t *x,int16_t *y,unsigned char scale_flag)  // 768 x 3
+void dft2304(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 768 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6488,9 +6831,9 @@ void dft2304(int16_t *x,int16_t *y,unsigned char scale_flag)  // 768 x 3
     x2128[i+1536] = x128[j+2];
   }
 
-  dft768((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft768((int16_t *)(x2128+768),(int16_t *)(ytmp128+768),1);
-  dft768((int16_t *)(x2128+1536),(int16_t *)(ytmp128+1536),1);
+  dft768((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft768((int16_t *)(x2128+768),(int16_t *)(ytmp128+768),scale_flag);
+  dft768((int16_t *)(x2128+1536),(int16_t *)(ytmp128+1536),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+768,ytmp128+1536,y128,y128+768,y128+1536);
 
@@ -6505,7 +6848,7 @@ void dft2304(int16_t *x,int16_t *y,unsigned char scale_flag)  // 768 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<2304; i++) {
@@ -6522,7 +6865,7 @@ static int16_t twa2400[599*2*4];
 static int16_t twb2400[599*2*4];
 static int16_t twc2400[599*2*4];
 
-void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag)  // 600 x 4
+void dft2400(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 600 x 4
 {
 
   int i,j;
@@ -6543,10 +6886,10 @@ void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag)  // 600 x 4
     x2128[i+1800] = x128[j+3];
   }
 
-  dft600((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1);
-  dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),1);
-  dft600((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),1);
+  dft600((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag);
+  dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),scale_flag);
+  dft600((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),scale_flag);
 
   bfly4_tw1(ytmp128,ytmp128+600,ytmp128+1200,ytmp128+1800,y128,y128+600,y128+1200,y128+1800);
 
@@ -6564,7 +6907,7 @@ void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag)  // 600 x 4
           twc128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[13]);
     for (i=0; i<2400; i++) {
       y128[i] = mulhi_int16(y128[i],norm128);
@@ -6579,7 +6922,7 @@ void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag)  // 600 x 4
 static int16_t twa2592[863*2*4];
 static int16_t twb2592[863*2*4];
 
-void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag)  // 864 x 3
+void dft2592(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 864 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6597,9 +6940,9 @@ void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag)  // 864 x 3
     x2128[i+1728] = x128[j+2];
   }
 
-  dft864((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft864((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),1);
-  dft864((int16_t *)(x2128+1728),(int16_t *)(ytmp128+1728),1);
+  dft864((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft864((int16_t *)(x2128+864),(int16_t *)(ytmp128+864),scale_flag);
+  dft864((int16_t *)(x2128+1728),(int16_t *)(ytmp128+1728),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+864,ytmp128+1728,y128,y128+864,y128+1728);
 
@@ -6614,7 +6957,7 @@ void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag)  // 864 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<2592; i++) {
@@ -6630,7 +6973,7 @@ void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag)  // 864 x 3
 static int16_t twa2700[899*2*4];
 static int16_t twb2700[899*2*4];
 
-void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag)  // 900 x 3
+void dft2700(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 900 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6648,9 +6991,9 @@ void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag)  // 900 x 3
     x2128[i+1800] = x128[j+2];
   }
 
-  dft900((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft900((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),1);
-  dft900((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),1);
+  dft900((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft900((int16_t *)(x2128+900),(int16_t *)(ytmp128+900),scale_flag);
+  dft900((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+900,ytmp128+1800,y128,y128+900,y128+1800);
 
@@ -6665,7 +7008,7 @@ void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag)  // 900 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<2700; i++) {
@@ -6681,7 +7024,7 @@ void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag)  // 900 x 3
 static int16_t twa2880[959*2*4];
 static int16_t twb2880[959*2*4];
 
-void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag)  // 960 x 3
+void dft2880(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 960 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6699,9 +7042,9 @@ void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag)  // 960 x 3
     x2128[i+1920] = x128[j+2];
   }
 
-  dft960((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft960((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),1);
-  dft960((int16_t *)(x2128+1920),(int16_t *)(ytmp128+1920),1);
+  dft960((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft960((int16_t *)(x2128+960),(int16_t *)(ytmp128+960),scale_flag);
+  dft960((int16_t *)(x2128+1920),(int16_t *)(ytmp128+1920),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+960,ytmp128+1920,y128,y128+960,y128+1920);
 
@@ -6716,7 +7059,7 @@ void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag)  // 960 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<2880; i++) {
@@ -6732,7 +7075,7 @@ void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag)  // 960 x 3
 static int16_t twa2916[971*2*4];
 static int16_t twb2916[971*2*4];
 
-void dft2916(int16_t *x,int16_t *y,unsigned char scale_flag)  // 972 x 3
+void dft2916(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 972 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6750,9 +7093,9 @@ void dft2916(int16_t *x,int16_t *y,unsigned char scale_flag)  // 972 x 3
     x2128[i+1944] = x128[j+2];
   }
 
-  dft972((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft972((int16_t *)(x2128+972),(int16_t *)(ytmp128+972),1);
-  dft972((int16_t *)(x2128+1944),(int16_t *)(ytmp128+1944),1);
+  dft972((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft972((int16_t *)(x2128+972),(int16_t *)(ytmp128+972),scale_flag);
+  dft972((int16_t *)(x2128+1944),(int16_t *)(ytmp128+1944),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+972,ytmp128+1944,y128,y128+972,y128+1944);
 
@@ -6767,7 +7110,7 @@ void dft2916(int16_t *x,int16_t *y,unsigned char scale_flag)  // 972 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<2916; i++) {
@@ -6785,7 +7128,7 @@ static int16_t twb3000[599*8]__attribute__((aligned(32)));
 static int16_t twc3000[599*8]__attribute__((aligned(32)));
 static int16_t twd3000[599*8]__attribute__((aligned(32)));
 
-void dft3000(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 * 5
+void dft3000(int16_t *x,int16_t *y,unsigned int *scale_flag) // 600 * 5
 {
 
   int i,j;
@@ -6808,11 +7151,11 @@ void dft3000(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 * 5
     x2128[i+2400] = x128[j+4];
   }
 
-  dft600((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),1);
-  dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),1);
-  dft600((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),1);
-  dft600((int16_t *)(x2128+2400),(int16_t *)(ytmp128+2400),1);
+  dft600((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft600((int16_t *)(x2128+600),(int16_t *)(ytmp128+600),scale_flag);
+  dft600((int16_t *)(x2128+1200),(int16_t *)(ytmp128+1200),scale_flag);
+  dft600((int16_t *)(x2128+1800),(int16_t *)(ytmp128+1800),scale_flag);
+  dft600((int16_t *)(x2128+2400),(int16_t *)(ytmp128+2400),scale_flag);
 
   bfly5_tw1(ytmp128,ytmp128+600,ytmp128+1200,ytmp128+1800,ytmp128+2400,y128,y128+600,y128+1200,y128+1800,y128+2400);
 
@@ -6833,7 +7176,7 @@ void dft3000(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 * 5
           twd128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[15]);
 
     for (i=0; i<3000; i++) {
@@ -6849,7 +7192,7 @@ void dft3000(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 * 5
 static int16_t twa3240[1079*2*4];
 static int16_t twb3240[1079*2*4];
 
-void dft3240(int16_t *x,int16_t *y,unsigned char scale_flag)  // 1080 x 3
+void dft3240(int16_t *x,int16_t *y,unsigned int *scale_flag)  // 1080 x 3
 {
   int i,j;
   simd_q15_t *x128=(simd_q15_t *)x;
@@ -6867,9 +7210,9 @@ void dft3240(int16_t *x,int16_t *y,unsigned char scale_flag)  // 1080 x 3
     x2128[i+2160] = x128[j+2];
   }
 
-  dft1080((int16_t *)x2128,(int16_t *)ytmp128,1);
-  dft1080((int16_t *)(x2128+1080),(int16_t *)(ytmp128+1080),1);
-  dft1080((int16_t *)(x2128+2160),(int16_t *)(ytmp128+2160),1);
+  dft1080((int16_t *)x2128,(int16_t *)ytmp128,scale_flag);
+  dft1080((int16_t *)(x2128+1080),(int16_t *)(ytmp128+1080),scale_flag);
+  dft1080((int16_t *)(x2128+2160),(int16_t *)(ytmp128+2160),scale_flag);
 
   bfly3_tw1(ytmp128,ytmp128+1080,ytmp128+2160,y128,y128+1080,y128+2160);
 
@@ -6884,7 +7227,7 @@ void dft3240(int16_t *x,int16_t *y,unsigned char scale_flag)  // 1080 x 3
           twb128+j);
   }
 
-  if (scale_flag==1) {
+  if (scale_flag) {
     norm128 = set1_int16(dft_norm_table[14]);
 
     for (i=0; i<3240; i++) {
@@ -7094,7 +7437,7 @@ int dfts_autoinit(void)
 
 #ifndef MR_MAIN
 
-void dft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsigned char scale_flag)
+void dft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsigned int *scale_flag)
 {
   AssertFatal((sizeidx >= 0 && sizeidx<DFT_SIZE_IDXTABLESIZE),"Invalid dft size index %i\n",sizeidx);
         int algn=0xF;
@@ -7111,7 +7454,7 @@ void dft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsign
           dft_ftab[sizeidx].func(input,output,scale_flag);
 };
 
-void idft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsigned char scale_flag)
+void idft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsigned int *scale_flag)
 {
   AssertFatal((sizeidx>=0 && sizeidx<DFT_SIZE_IDXTABLESIZE),"Invalid idft size index %i\n",sizeidx);
         int algn=0xF;
@@ -7133,9 +7476,23 @@ void idft_implementation(uint8_t sizeidx, int16_t *input, int16_t *output, unsig
 #ifdef MR_MAIN
 #include <string.h>
 #include <stdio.h>
-/*
-#define LOG_M write_output
-int write_output(const char *fname,const char *vname,void *data,int length,int dec,char format)
+#include "../../../common/config/config_paramdesc.h"
+
+struct configmodule_interface_s *uniqCfg = NULL; // defined (as NULL) for the MR_MAIN build; presumably satisfies an external reference from the config code — confirm
+extern int bitrev4096[4096],bitrev2048[2048],bitrev1024[1024],bitrev512[512],bitrev256[256],bitrev128[128]; // tables declared extern here; main() passes them to compute_error() as its bitrev argument
+void init_bitrev(); // declaration only; main() calls it once after dfts_autoinit()
+void radix2(cd_t *x, int N); // declaration only; called by compute_error()
+void normalize(cd_t *x,cd_t *y,int *bitrev, int N); // declaration only; called by compute_error() right after radix2()
+
+void exit_function(const char *file, const char *function, const int line, const char *s, const int assert) { // MR_MAIN-only definition: ignores every argument; presumably stands in for the project-wide exit_function so this file links on its own — confirm
+exit(-1); // terminate the standalone test program with exit(-1)
+}
+int config_get(paramdef_t *params,int numparams, char *prefix) { // MR_MAIN-only stub: reads no configuration and ignores params, numparams and prefix
+return(0); // always returns 0
+}
+
+//#define LOG_M write_output
+int write_file_matlab(const char *fname,const char *vname,void *data,int length,int dec,unsigned int format,int dummy)
 {
 
   FILE *fp=NULL;
@@ -7289,24 +7646,43 @@ int write_output(const char *fname,const char *vname,void *data,int length,int d
 
   return 0;
 }
-*/
-#include "common/config/config_paramdesc.h"
-void exit_function(const char *file, const char *function, const int line, const char *s, const int assert) { return; }
-int oai_exit=0;
-int config_get(paramdef_t *params,int numparams, char *prefix) { return;}
-int config_check_unknown_cmdlineopt(char *prefix) { return; }
+double compute_error(int16_t *x, int16_t *y, int N, int *bitrev, int idft) { // returns (mean input power)/(mean squared error) as a linear ratio, comparing the fixed-point result y against a reference computed in doubles from x by radix2()+normalize(); idft==1 selects the inverse-transform comparison
+
+  int i;
+  cd_t xcd[N],ycd[N]; // VLAs sized by N: xcd holds the converted input, ycd is the buffer handed to normalize() and read back below
+
+  double error=0;
+
+  for (i=0;i<N;i++) { // unpack N interleaved (re,im) int16 pairs from x into doubles
+    xcd[i].r = (double)(((int16_t *)x)[i<<1]); 
+    xcd[i].i = (double)(((int16_t *)x)[1+(i<<1)]);
+    if (idft==1) xcd[i].i=-xcd[i].i; // inverse case: conjugate the input before the reference transform
+  }
+  
+  double input_lev=0;
+  for (i=0;i<N;i++) input_lev += pow(xcd[i].r,2.0) + pow(xcd[i].i,2.0); // sum of |x[i]|^2 (after the optional conjugation, which does not change the magnitude)
+  input_lev/=N; // mean input power per sample
+  radix2(xcd,N); // presumably an in-place radix-2 FFT of xcd — confirm against its definition
+  normalize(xcd,ycd,bitrev,N); // presumably reorders via bitrev and scales xcd into ycd — confirm against its definition
+  if (idft==0) for (i=0;i<N;i++) error += pow((ycd[i].r - (double)((int16_t*)y)[i<<1]),2.0) + pow(ycd[i].i-(double)((int16_t*)y)[1+(i<<1)],2.0); // forward case: accumulate squared distance between ycd and y
+  else         for (i=0;i<N;i++) error += pow((ycd[i].r - (double)((int16_t*)y)[i<<1]),2.0) + pow(ycd[i].i+(double)((int16_t*)y)[1+(i<<1)],2.0); // inverse case: imaginary parts are added, i.e. y is compared against the conjugate of ycd
+  return(input_lev/(error/N)); // linear ratio; callers in main() print 10*log10() of it. NOTE(review): divides by zero when error==0
+}
 
 int main(int argc, char**argv)
 {
 
 
   time_stats_t ts;
-  simd_q15_t x[32768],y[32768],tw0,tw1,tw2,tw3;
+  simde__m256i x[16384],x2[16384],y[16384],tw0,tw1,tw2,tw3;
   int i;
-  simd_q15_t *x128=(simd_q15_t*)x,*y128=(simd_q15_t*)y;
+
+  double sqnr;
 
   dfts_autoinit();
 
+  init_bitrev();
+
   set_taus_seed(0);
   cpu_meas_enabled = 1;
   /*
@@ -7347,8 +7723,8 @@ int main(int argc, char**argv)
      ((int16_t *)&tw3)[7] = 0;
   */
   for (i = 0; i < 300; i++) {
-    x[i] = (int16x8_t)vdupq_n_s32(taus());
-    x[i] = vshrq_n_s16(x[i], 4);
+    x[i] = simde_mm256_set1_epi32(taus());
+    x[i] = simde_mm256_srai_epi16(x[i], 4);
     }
       /*
     bfly2_tw1(x,x+1,y,y+1);
@@ -7491,18 +7867,6 @@ int main(int argc, char**argv)
      ((int16_t*)x)[6+(i<<1)] = 0;
      ((int16_t*)x)[7+(i<<1)] = -1024;
      }
-  /*
-  for (i=0; i<2048; i+=2) {
-     ((int16_t*)x)[i<<1] = 1024;
-     ((int16_t*)x)[1+(i<<1)] = 0;
-     ((int16_t*)x)[2+(i<<1)] = -1024;
-     ((int16_t*)x)[3+(i<<1)] = 0;
-     }
-       
-  for (i=0;i<2048*2;i++) {
-    ((int16_t*)x)[i] = i/2;//(int16_t)((taus()&0xffff))>>5;
-  }
-     */
   memset((void*)&x[0],0,64*sizeof(int32_t));
   for (i=2;i<36;i++) {
     if ((taus() & 1)==0)
@@ -7516,7 +7880,8 @@ int main(int argc, char**argv)
     else
       ((int16_t*)x)[i] = -364;
   }
-  idft64((int16_t *)x,(int16_t *)y,1);
+  uint32_t scale64 = 3;
+  idft64((int16_t *)x,(int16_t *)y,&scale64);
   
 
   printf("64-point\n");
@@ -7533,14 +7898,14 @@ int main(int argc, char**argv)
   
 
 
-  idft64((int16_t *)x,(int16_t *)y,1);
-  idft64((int16_t *)x,(int16_t *)y,1);
-  idft64((int16_t *)x,(int16_t *)y,1);
+  idft64((int16_t *)x,(int16_t *)y,&scale64);
+  idft64((int16_t *)x,(int16_t *)y,&scale64);
+  idft64((int16_t *)x,(int16_t *)y,&scale64);
   reset_meas(&ts);
 
   for (i=0; i<10000000; i++) {
     start_meas(&ts);
-    idft64((int16_t *)x,(int16_t *)y,1);
+    idft64((int16_t *)x,(int16_t *)y,&scale64);
     stop_meas(&ts);
 
   }
@@ -7585,12 +7950,16 @@ int main(int argc, char**argv)
   }
   reset_meas(&ts);
 
+  uint32_t scale128_tx[2] = {4,0};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft128((int16_t *)x,(int16_t *)y,1);
+    idft128((int16_t *)x,(int16_t *)y,scale128_tx);
     stop_meas(&ts);
   }
 
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,128,bitrev128,1);
+
+  printf("128 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr));
   printf("\n\n128-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
   LOG_M("y128.m","y128",y,128,1,1);
   LOG_M("x128.m","x128",x,128,1,1);
@@ -7626,10 +7995,11 @@ int main(int argc, char**argv)
       ((int16_t*)x)[i] = -364;
   }
   reset_meas(&ts);
+  uint32_t scale256_tx[3]={4,0};
 
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft256((int16_t *)x,(int16_t *)y,1);
+    idft256((int16_t *)x,(int16_t *)y,scale256_tx);
     stop_meas(&ts);
   }
 
@@ -7637,6 +8007,9 @@ int main(int argc, char**argv)
   LOG_M("y256.m","y256",y,256,1,1);
   LOG_M("x256.m","x256",x,256,1,1);
 
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,256,bitrev256,1);
+
+  printf("256 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr));
   memset((void*)&x[0],0,512*sizeof(int32_t));
   for (i=2;i<302;i++) {
     if ((taus() & 1)==0)
@@ -7652,19 +8025,21 @@ int main(int argc, char**argv)
   }
 
   reset_meas(&ts);
+  uint32_t scale512_tx[4]={4,1,0};
+
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft512((int16_t *)x,(int16_t *)y,1);
+    idft512((int16_t *)x,(int16_t *)y,scale512_tx);
     stop_meas(&ts);
   }
 
   printf("\n\n512-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
   LOG_M("y512.m","y512",y,512,1,1);
   LOG_M("x512.m","x512",x,512,1,1);
-  dft512((int16_t*)y,(int16_t*)x,1);
-  LOG_M("y512_dft.m","y512",y,512,1,1);
-  LOG_M("x512_dft.m","x512",x,512,1,1);
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,512,bitrev512,1);
 
+  printf("512 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr));
+  memset((void*)x,0,1024*sizeof(int32_t));
   /*
   printf("X: ");
   for (i=0;i<64;i++)
@@ -7691,9 +8066,10 @@ int main(int argc, char**argv)
   }
   reset_meas(&ts);
 
+  uint32_t scale1024_tx[4]={4,1,0};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft1024((int16_t *)x,(int16_t *)y,1);
+    idft1024((int16_t *)x,(int16_t *)y,scale1024_tx);
     stop_meas(&ts);
   }
 
@@ -7701,6 +8077,9 @@ int main(int argc, char**argv)
   LOG_M("y1024.m","y1024",y,1024,1,1);
   LOG_M("x1024.m","x1024",x,1024,1,1);
 
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,1024,bitrev1024,1);
+
+  printf("1024 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr));
 
   memset((void*)x,0,1536*sizeof(int32_t));
   for (i=2;i<1202;i++) {
@@ -7717,15 +8096,16 @@ int main(int argc, char**argv)
   }
   reset_meas(&ts);
 
+  uint32_t scale1536[4]={1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft1536((int16_t *)x,(int16_t *)y,1);
+    idft1536((int16_t *)x,(int16_t *)y,scale1536);
     stop_meas(&ts);
   }
 
   printf("\n\n1536-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
-  write_output("y1536.m","y1536",y,1536,1,1);
-  write_output("x1536.m","x1536",x,1536,1,1);
+  LOG_M("y1536.m","y1536",y,1536,1,1);
+  LOG_M("x1536.m","x1536",x,1536,1,1);
 
 
   memset((void*)x,0,2048*sizeof(int32_t));
@@ -7743,9 +8123,10 @@ int main(int argc, char**argv)
   }
   reset_meas(&ts);
 
+  uint32_t scale2048_tx[4]={3,2,1,0};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    dft2048((int16_t *)x,(int16_t *)y,1);
+    idft2048((int16_t *)x,(int16_t *)y,scale2048_tx);
     stop_meas(&ts);
   }
 
@@ -7753,6 +8134,9 @@ int main(int argc, char**argv)
   LOG_M("y2048.m","y2048",y,2048,1,1);
   LOG_M("x2048.m","x2048",x,2048,1,1);
 
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,2048,bitrev2048,1);
+
+  printf("2048 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr));
 // NR 80Mhz, 217 PRB, 3/4 sampling
   memset((void*)x, 0, 3072*sizeof(int32_t));
   for (i=2;i<2506;i++) {
@@ -7770,15 +8154,16 @@ int main(int argc, char**argv)
 
   reset_meas(&ts);
 
+  uint32_t scale3072[4]={1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft3072((int16_t *)x,(int16_t *)y,1);
+    idft3072((int16_t *)x,(int16_t *)y,scale3072);
     stop_meas(&ts);
   }
 
   printf("\n\n3072-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
-  write_output("y3072.m","y3072",y,3072,1,1);
-  write_output("x3072.m","x3072",x,3072,1,1);
+  LOG_M("y3072.m","y3072",y,3072,1,1);
+  LOG_M("x3072.m","x3072",x,3072,1,1);
 
 
   memset((void*)x,0,4096*sizeof(int32_t));
@@ -7796,9 +8181,10 @@ int main(int argc, char**argv)
   }
   reset_meas(&ts);
 
+  uint32_t scale4096_tx[4]={3,2,1,0};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft4096((int16_t *)x,(int16_t *)y,1);
+    idft4096((int16_t *)x,(int16_t *)y,scale4096_tx);
     stop_meas(&ts);
   }
 
@@ -7806,9 +8192,29 @@ int main(int argc, char**argv)
   LOG_M("y4096.m","y4096",y,4096,1,1);
   LOG_M("x4096.m","x4096",x,4096,1,1);
 
-  dft4096((int16_t *)y,(int16_t *)x,1);
-  LOG_M("x4096_2.m","x4096_2",x,4096,1,1);
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,4096,bitrev4096,1);
+
+  printf("4096 point IDFT SQNR (QPSK) : %f dB\n",10*log10(sqnr));
+
+  float sqrt2 = 0.70711;
+  float sqrt170 = 0.076696;
+
+  for (i=0;i<2400;i++) {
+    uint32_t n=taus();
+    ((int16_t*)x)[i]   = (short)((1-2*(n&1))*(8-(1-2*((n>>1)&1))*(4-(1-2*((n>>2)&1))*(2-(1-2*((n>>3)&1))))))*512*sqrt170*sqrt2;
+  }
+  for (i=2*(4096-1200);i<8192;i++) {
+    uint32_t n=taus();
+    ((int16_t*)x)[i]   = (short)((1-2*(n&1))*(8-(1-2*((n>>1)&1))*(4-(1-2*((n>>2)&1))*(2-(1-2*((n>>3)&1))))))*512*sqrt170*sqrt2;
+  }
+
+  uint32_t scale4096_tx256qam[4]={3,2,1,0};
+  idft4096((int16_t *)x,(int16_t *)y,scale4096_tx256qam);
+  LOG_M("y4096_256qam.m","y4096_256qam",y,4096,1,1);
+  LOG_M("x4096_256qam.m","x4096_256qam",x,4096,1,1);
 
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,4096,bitrev4096,1);
+  printf("4096 point IDFT SQNR (256QAM) : %f dB\n",10*log10(sqnr));
 // NR 160Mhz, 434 PRB, 3/4 sampling
   memset((void*)x, 0, 6144*sizeof(int32_t));
   for (i=2;i<5010;i++) {
@@ -7826,15 +8232,16 @@ int main(int argc, char**argv)
 
   reset_meas(&ts);
 
+  uint32_t scale6144[5]={1,1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft6144((int16_t *)x,(int16_t *)y,1);
+    idft6144((int16_t *)x,(int16_t *)y,scale6144);
     stop_meas(&ts);
   }
 
   printf("\n\n6144-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
-  write_output("y6144.m","y6144",y,6144,1,1);
-  write_output("x6144.m","x6144",x,6144,1,1);
+  LOG_M("y6144.m","y6144",y,6144,1,1);
+  LOG_M("x6144.m","x6144",x,6144,1,1);
 
   memset((void*)x,0,8192*sizeof(int32_t));
   for (i=2;i<4802;i++) {
@@ -7850,9 +8257,10 @@ int main(int argc, char**argv)
       ((int16_t*)x)[i] = -364;
   }
   reset_meas(&ts);
+  uint32_t scale8192[5]={1,1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft8192((int16_t *)x,(int16_t *)y,1);
+    idft8192((int16_t *)x,(int16_t *)y,scale8192);
     stop_meas(&ts);
   }
 
@@ -7874,9 +8282,10 @@ int main(int argc, char**argv)
       ((int16_t*)x)[i] = -364;
   }
   reset_meas(&ts);
+  uint32_t scale16384[5]={1,1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    dft16384((int16_t *)x,(int16_t *)y,1);
+    dft16384((int16_t *)x,(int16_t *)y,scale16384);
     stop_meas(&ts);
   }
 
@@ -7884,82 +8293,6 @@ int main(int argc, char**argv)
   LOG_M("y16384.m","y16384",y,16384,1,1);
   LOG_M("x16384.m","x16384",x,16384,1,1);
 
-  memset((void*)x,0,1536*sizeof(int32_t));
-  for (i=2;i<1202;i++) {
-    if ((taus() & 1)==0)
-      ((int16_t*)x)[i] = 364;
-    else
-      ((int16_t*)x)[i] = -364;
-  }
-  for (i=2*(1536-600);i<3072;i++) {
-    if ((taus() & 1)==0)
-      ((int16_t*)x)[i] = 364;
-    else
-      ((int16_t*)x)[i] = -364;
-  }
-  reset_meas(&ts);
-  for (i=0; i<10000; i++) {
-    start_meas(&ts);
-    idft1536((int16_t *)x,(int16_t *)y,1);
-    stop_meas(&ts);
-  }
-
-  printf("\n\n1536-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
-  LOG_M("y1536.m","y1536",y,1536,1,1);
-  LOG_M("x1536.m","x1536",x,1536,1,1);
-
-  printf("\n\n1536-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
-  LOG_M("y8192.m","y8192",y,8192,1,1);
-  LOG_M("x8192.m","x8192",x,8192,1,1);
-
-  memset((void*)x,0,3072*sizeof(int32_t));
-  for (i=2;i<1202;i++) {
-    if ((taus() & 1)==0)
-      ((int16_t*)x)[i] = 364;
-    else
-      ((int16_t*)x)[i] = -364;
-  }
-  for (i=2*(3072-600);i<3072;i++) {
-    if ((taus() & 1)==0)
-      ((int16_t*)x)[i] = 364;
-    else
-      ((int16_t*)x)[i] = -364;
-  }
-  reset_meas(&ts);
-  for (i=0; i<10000; i++) {
-    start_meas(&ts);
-    idft3072((int16_t *)x,(int16_t *)y,1);
-    stop_meas(&ts);
-  }
-
-  printf("\n\n3072-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
-  LOG_M("y3072.m","y3072",y,3072,1,1);
-  LOG_M("x3072.m","x3072",x,3072,1,1);
-
-  memset((void*)x,0,6144*sizeof(int32_t));
-  for (i=2;i<4802;i++) {
-    if ((taus() & 1)==0)
-      ((int16_t*)x)[i] = 364;
-    else
-      ((int16_t*)x)[i] = -364;
-  }
-  for (i=2*(6144-2400);i<12288;i++) {
-    if ((taus() & 1)==0)
-      ((int16_t*)x)[i] = 364;
-    else
-      ((int16_t*)x)[i] = -364;
-  }
-  reset_meas(&ts);
-  for (i=0; i<10000; i++) {
-    start_meas(&ts);
-    idft6144((int16_t *)x,(int16_t *)y,1);
-    stop_meas(&ts);
-  }
-
-  printf("\n\n6144-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
-  LOG_M("y6144.m","y6144",y,6144,1,1);
-  LOG_M("x6144.m","x6144",x,6144,1,1);
-
   memset((void*)x,0,12288*sizeof(int32_t));
   for (i=2;i<9602;i++) {
     if ((taus() & 1)==0)
@@ -7974,9 +8307,10 @@ int main(int argc, char**argv)
       ((int16_t*)x)[i] = -364;
   }
   reset_meas(&ts);
+  uint32_t scale12288[5]={1,1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft12288((int16_t *)x,(int16_t *)y,1);
+    idft12288((int16_t *)x,(int16_t *)y,scale12288);
     stop_meas(&ts);
   }
 
@@ -7998,9 +8332,11 @@ int main(int argc, char**argv)
       ((int16_t*)x)[i] = -364;
   }
   reset_meas(&ts);
+
+  uint32_t scale18432[6]={1,1,1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft18432((int16_t *)x,(int16_t *)y,1);
+    idft18432((int16_t *)x,(int16_t *)y,scale18432);
     stop_meas(&ts);
   }
 
@@ -8022,9 +8358,11 @@ int main(int argc, char**argv)
       ((int16_t*)x)[i] = -364;
   }
   reset_meas(&ts);
+
+  uint32_t scale24576[6]={1,1,1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft24576((int16_t *)x,(int16_t *)y,1);
+    idft24576((int16_t *)x,(int16_t *)y,scale24576);
     stop_meas(&ts);
   }
 
@@ -8047,9 +8385,10 @@ int main(int argc, char**argv)
       ((int16_t*)x)[i] = -364;
   }
   reset_meas(&ts);
+  uint32_t scale36864[6] = {1,1,1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    dft36864((int16_t *)x,(int16_t *)y,1);
+    dft36864((int16_t *)x,(int16_t *)y,scale36864);
     stop_meas(&ts);
   }
 
@@ -8072,50 +8411,138 @@ int main(int argc, char**argv)
       ((int16_t*)x)[i] = -364;
   }
   reset_meas(&ts);
+  uint32_t scale49152[6]={1,1,1,1,1,3};
   for (i=0; i<10000; i++) {
     start_meas(&ts);
-    idft49152((int16_t *)x,(int16_t *)y,1);
+    idft49152((int16_t *)x,(int16_t *)y,scale49152);
     stop_meas(&ts);
   }
 
   printf("\n\n49152-point(%f cycles)\n",(double)ts.diff/(double)ts.trials);
   LOG_M("y49152.m","y49152",y,49152,1,1);
   LOG_M("x49152.m","x49152",x,49152,1,1);
-  /*
-  int dftsizes[33]={24,36,48,60,72,96,108,120,144,180,192,216,240,288,300,324,360,384,432,480,540,576,600,648,720,768,864,900,960,972,1080,1152,1200};
-  void (*dft)(int16_t *x,int16_t *y,uint8_t scale)[33] = {dft24,dft36,dft48,dft60,dft72,dft96,dft108,dft120,dft144,dft180,dft192,dft216,dft240,dft288,dft300,dft324,dft360,dft384,dft432,dft480,dft540,dft576,dft600,dft648,dft720,dft768,dft864,dft900,dft960,dft972,dft1080,dft1152,dft1200};
-  for (int n=0;n<33;n++) {
-    // 4xN-point DFT
-    memset((void*)x,0,dftsizes[n]*8*sizeof(int16_t));
-    for (i=0;i<dftsizes[n]*8;i+=8) {
-      if ((taus() & 1)==0)
-	((int16_t*)x)[i]   = 364;
-      else
-	((int16_t*)x)[i]   = -364;
-      if ((taus() & 1)==0)
-	((int16_t*)x)[i+1] = 364;
-      else
-	((int16_t*)x)[i+1] = -364;
-    }
-    
-    reset_meas(&ts);
-    for (i=0; i<10000; i++) {
-      start_meas(&ts);
-      (dft[n])((int16_t *)x,(int16_t *)y,1);
-      stop_meas(&ts);
-    }
-    
-    printf("\n\n4x%d-point(%f cycles)\n",dftsizes[n],(double)ts.diff/(double)ts.trials);
-    char ystr[5],xstr[5],ystr2[5],xstr2[5];
-    sprintf(ystr,"y%d.m",dftsizes[n]);
-    sprintf(xstr,"x%d.m",dftsizes[n]);
-    sprintf(ystr2,"y%d",dftsizes[n]);
-    sprintf(xstr2,"x%d",dftsizes[n]);
-    LOG_M(ystr,ystr2,y,dftsizes[n]*4,1,1);
-    LOG_M(xstr,xstr2,x,dftsizes[n]*4,1,1);
-  }
-  */
 
+  memset((void*)x,0,128*sizeof(int32_t));
+  for (i=0;i<128;i++) {
+    ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/128));
+    ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/128));
+  } 
+#ifdef USE_DFT16_SHIFT
+  uint32_t scale128_rx[3]={2,0};
+#else
+  uint32_t scale128_rx[3]={2,2};
+#endif
+  dft128((int16_t*)x,(int16_t*)y,scale128_rx);
+  LOG_M("x128_exp.m","x128_exp",x,128,1,1); 
+  LOG_M("y128_exp.m","y128_exp",y,128,1,1); 
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,128,bitrev128,0);
+
+  printf("128 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr));
+
+  memset((void*)x,0,256*sizeof(int32_t));
+  for (i=0;i<256;i++) {
+    ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/256));
+    ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/256));
+  } 
+#ifdef USE_DFT16_SHIFT
+  uint32_t scale256_rx[3]={2,0};
+#else
+  uint32_t scale256_rx[3]={2,2};
+#endif
+  dft256((int16_t*)x,(int16_t*)y,scale256_rx);
+  LOG_M("x256_exp.m","x256_exp",x,256,1,1); 
+  LOG_M("y256_exp.m","y256_exp",y,256,1,1); 
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,256,bitrev256,0);
+
+  printf("256 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr));
+
+
+  memset((void*)x,0,512*sizeof(int32_t));
+  for (i=0;i<512;i++) {
+    ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/512));
+    ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/512));
+  } 
+#ifdef USE_DFT16_SHIFT
+  uint32_t scale512_rx[3]={1,2,0};
+#else
+  uint32_t scale512_rx[3]={1,2,2};
+#endif
+  dft512((int16_t*)x,(int16_t*)y,scale512_rx);
+  LOG_M("x512_exp.m","x512_exp",x,512,1,1); 
+  LOG_M("y512_exp.m","y512_exp",y,512,1,1); 
+
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,512,bitrev512,0);
+
+  printf("512 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr));
+  memset((void*)x,0,1024*sizeof(int32_t));
+#ifdef USE_DFT16_SHIFT
+  uint32_t scale1024_rx[3]={1,2,0};
+#else
+  uint32_t scale1024_rx[3]={1,2,2};
+#endif
+  for (i=0;i<1024;i++) {
+    ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/1024));
+    ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/1024));
+  } 
+  dft1024((int16_t*)x,(int16_t*)y,scale1024_rx);
+  LOG_M("x1024_exp.m","x1024_exp",x,1024,1,1); 
+  LOG_M("y1024_exp.m","y1024_exp",y,1024,1,1); 
+
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,1024,bitrev1024,0);
+
+  printf("1024 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr));
+  memset((void*)x,0,1536*sizeof(int32_t));
+  for (i=0;i<1536;i++) {
+    ((int16_t*)x)[i<<1] = (int16_t)(364 * cos(2*M_PI*3*i/1536));
+    ((int16_t*)x)[1+(i<<1)] = (int16_t)(364 * sin(2*M_PI*3*i/1536));
+  } 
+  dft1536((int16_t*)x,(int16_t*)y,scale1536);
+  LOG_M("x1536_exp.m","x1536_exp",x,1536,1,1); 
+  LOG_M("y1536_exp.m","y1536_exp",y,1536,1,1);
+ 
+  memset((void*)x,0,2048*sizeof(int32_t));
+  for (i=0;i<2048;i++) {
+    ((int16_t*)x)[i<<1] = (int16_t)(384 * cos(2*M_PI*3*i/2048));
+    ((int16_t*)x)[1+(i<<1)] = (int16_t)(384 * sin(2*M_PI*3*i/2048));
+  } 
+#ifdef USE_DFT16_SHIFT
+  uint32_t scale2048_rx[4]={1,0,3,0};
+#else
+  uint32_t scale2048_rx[4]={1,0,3,2};
+#endif
+
+  dft2048((int16_t*)x,(int16_t*)y,scale2048_rx);
+  LOG_M("x2048_exp.m","x2048_exp",x,2048,1,1); 
+  LOG_M("y2048_exp.m","y2048_exp",y,2048,1,1); 
+
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,2048,bitrev2048,0);
+
+  printf("2048 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr));
+  memset((void*)x,0,3072*sizeof(int32_t));
+  for (i=0;i<3072;i++) {
+    ((int16_t*)x)[i<<1] = (int16_t)(200 * cos(2*M_PI*3*i/3072));
+    ((int16_t*)x)[1+(i<<1)] = (int16_t)(200 * sin(2*M_PI*3*i/3072));
+  } 
+  dft3072((int16_t*)x,(int16_t*)y,scale3072);
+  LOG_M("x3072_exp.m","x3072_exp",x,3072,1,1); 
+  LOG_M("y3072_exp.m","y3072_exp",y,3072,1,1); 
+
+  memset((void*)x,0,4096*sizeof(int32_t));
+  for (i=0;i<4096;i++) {
+    ((int16_t*)x)[i<<1] = (int16_t)(384 * cos(2*M_PI*331*i/4096));
+    ((int16_t*)x)[1+(i<<1)] = (int16_t)(384 * sin(2*M_PI*331*i/4096));
+  } 
+#ifndef USE_DFT16_SHIFT
+  uint32_t scale4096_rx[4]={0,0,3,3};
+#else
+  uint32_t scale4096_rx[4]={0,0,3,1};
+#endif
+  dft4096((int16_t*)x,(int16_t*)y,scale4096_rx);
+  LOG_M("x4096_exp.m","x4096_exp",x,4096,1,1); 
+  LOG_M("y4096_exp.m","y4096_exp",y,4096,1,1); 
+
+  sqnr = compute_error((int16_t*)x,(int16_t*)y,4096,bitrev4096,0);
+  printf("4096 point DFT SQNR (cplx sinusoid) : %f dB\n",10*log10(sqnr));
   return(0);
 }
 
diff --git a/openair1/PHY/TOOLS/tests/test_dft.c b/openair1/PHY/TOOLS/tests/test_dft.c
index 8b4c344c6c153e9ea2f2edc6ddb223d8aafc701d..19f2e734f66b539a4b3e0cce6bc07ede101e1e41 100644
--- a/openair1/PHY/TOOLS/tests/test_dft.c
+++ b/openair1/PHY/TOOLS/tests/test_dft.c
@@ -15,6 +15,7 @@
   SZ_DEF(1024)                        \
   SZ_DEF(1536)                        \
   SZ_DEF(2048)                        \
+  SZ_DEF(3072)                        \
   SZ_DEF(4096)                        \
   SZ_DEF(6144)                        \
   SZ_DEF(8192)                        \
@@ -44,83 +45,148 @@ bool error(c16_t v16, cd_t vd, double percent)
   return false;
 }
 
-void math_dft(cd_t *in, cd_t *out, int len)
+void math_dft(c16_t *in, cd_t *out, int len,int dir,int norm) // dir: 0 = negative exponent, else positive; norm: 0 = no scaling, else divide by sqrt(len)
 {
   for (int k = 0; k < len; k++) {
     cd_t tmp = {0};
     // wrote this way to help gcc to generate SIMD
     double phi[len], sint[len], cost[len];
     for (int n = 0; n < len; n++)
-      phi[n] = -2 * M_PI * ((double)k / len) * n;
+      if (dir ==0) phi[n] = -2 * M_PI * ((double)k / len) * n; // forward kernel: exp(-j*2*pi*k*n/len)
+      else         phi[n] =  2* M_PI * ((double)k/len)*n; // opposite direction: exp(+j*2*pi*k*n/len)
     for (int n = 0; n < len; n++)
       sint[n] = sin(phi[n]);
     for (int n = 0; n < len; n++)
       cost[n] = cos(phi[n]);
     for (int n = 0; n < len; n++) {
       cd_t coeff = {.r = cost[n], .i = sint[n]};
-      cd_t component = cdMul(coeff, in[n]);
+      cd_t in16q = {.r = (double)in[n].r, .i = (double)in[n].i}; // widen the c16_t input sample to double before multiplying
+      cd_t component = cdMul(coeff, in16q);
       tmp.r += component.r;
       tmp.i += component.i;
     }
-    out[k].r = tmp.r / sqrt(len);
-    out[k].i = tmp.i / sqrt(len);
+    out[k].r = tmp.r / ((norm==0) ? 1.0 : sqrt(len));
+    out[k].i = tmp.i / ((norm==0) ? 1.0 : sqrt(len));
   }
 }
 
+// Draw one random constellation point for fill_qam().
+// mod==0   : QPSK, real and imaginary parts are each +/-1/sqrt(2).
+// otherwise: 256QAM, each part is an odd level in -15..15 scaled by 1/sqrt(170).
+// Both mappings have unit average power; each call makes two taus() draws, real part first.
+static cd_t random_qam_symbol(int mod)
+{
+  cd_t sym;
+  if (mod == 0) {
+    int rv = taus() & 1;
+    sym.r = (1/sqrt(2.0)) * ((rv<<1) - 1);
+    rv = taus() & 1;
+    sym.i = (1/sqrt(2.0)) * ((rv<<1) - 1);
+  } else {
+    const double sqrt170 = 1.0/sqrt(170);
+    int rvi = taus() & 15;
+    int rvq = taus() & 15;
+    sym.r = ((1-2*(rvi&1))*(8-(1-2*((rvi>>1)&1))*(4-(1-2*((rvi>>2)&1))*(2-(1-2*((rvi>>3)&1))))))*sqrt170;
+    sym.i = ((1-2*(rvq&1))*(8-(1-2*((rvq>>1)&1))*(4-(1-2*((rvq>>2)&1))*(2-(1-2*((rvq>>3)&1))))))*sqrt170;
+  }
+  return sym;
+}
+
+// Fill the n-point frequency-domain buffer x with random QAM symbols.
+// n   : FFT length; must be one of the sizes in the switch below, otherwise exit(-1).
+// x   : output buffer of n entries; zeroed first, then the first size/2 and the
+//       last size/2 entries are loaded, where size is the per-length count below.
+// mod : 0 = QPSK, 1 = 256QAM; any other value prints an error and exits.
+void fill_qam(int n, cd_t *x, int mod) {
+  int size;
+  if (mod < 0 || mod >1) {
+    printf("Illegal modulation %d\n",mod);
+    exit(-1);
+  }
+  memset(x,0,n*sizeof(cd_t));
+  switch (n) {
+    case 128:  size=72;   break;
+    case 256:  size=180;  break;
+    case 512:  size=300;  break;
+    case 768:  size=612;  break;
+    case 1024: size=612;  break;
+    case 1536: size=900;  break;
+    case 2048: size=1596; break;
+    case 3072: size=2556; break;
+    case 4096: size=3276; break;
+    default:   printf("Illegal FFT length %d\n",n); exit(-1);
+  }
+  for (int i=0;i<size/2;i++) x[i] = random_qam_symbol(mod);
+  for (int i=n-(size/2);i<n;i++) x[i] = random_qam_symbol(mod);
+}
+
+
 int main(void)
 {
   int ret = 0;
   load_dftslib();
-  c16_t *d16 = malloc16(12 * dftFtab[sizeofArray(dftFtab) - 1].size * sizeof(*d16));
+  c16_t *d16   = malloc16(12 * dftFtab[sizeofArray(dftFtab) - 1].size * sizeof(*d16));
+  c16_t *d16_2 = malloc16(12 * sizeof(*d16_2));
   c16_t *o16 = malloc16(12 * dftFtab[sizeofArray(dftFtab) - 1].size * sizeof(*d16));
+  set_taus_seed(0);
   for (int sz = 0; sz < sizeofArray(dftFtab); sz++) {
     const int n = dftFtab[sz].size;
     cd_t data[n];
-    double coeffs[] = {0.25, 0.5, 1, 1.5, 2, 2.5, 3};
+    double coeffs[] = {30,40,50,60,70};
+    printf("Testing size %d\n",n);
     cd_t out[n];
     for (int i = 0; i < n; i++) {
       data[i].r = gaussZiggurat(0, 1.0); // gaussZiggurat not used paramters, to fix
       data[i].i = gaussZiggurat(0, 1.0);
     }
-    math_dft(data, out, n);
     double evm[sizeofArray(coeffs)] = {0};
+    double sqnr[sizeofArray(coeffs)] = {0};
     double samples[sizeofArray(coeffs)] = {0};
+    double samples_out[sizeofArray(coeffs)] = {0};
     for (int coeff = 0; coeff < sizeofArray(coeffs); coeff++) {
-      double expand = coeffs[coeff] * SHRT_MAX / sqrt(n);
+      double expand = pow(10.0,.05*coeffs[coeff])/sqrt(2);
       if (n == 12) {
-        for (int i = 0; i < n; i++)
+        for (int i = 0; i < n; i++) {
           for (int j = 0; j < 4; j++) {
             d16[i * 4 + j].r = expand * data[i].r;
             d16[i * 4 + j].i = expand * data[i].i;
           }
+          d16_2[i].r = d16[i * 4 ].r;
+          d16_2[i].i = d16[i * 4 ].i;
+        }
       } else {
         for (int i = 0; i < n; i++) {
           d16[i].r = expand * data[i].r;
           d16[i].i = expand * data[i].i;
         }
       }
-      dft(get_dft(n), (int16_t *)d16, (int16_t *)o16, 1);
+      if (n==12) math_dft(d16_2,out,n,0,0);
+      else       math_dft(d16, out, n,0,1);
+      dft(get_dft(n), (int16_t *)d16, (int16_t *)o16,get_dft_scaling(n,(int32_t)(coeffs[coeff])));
       if (n == 12) {
         for (int i = 0; i < n; i++) {
-          cd_t error = {.r = o16[i * 4].r / (expand * sqrt(n)) - out[i].r, .i = o16[i * 4].i / (expand * sqrt(n)) - out[i].i};
+          cd_t error = {.r = o16[i * 4].r - out[i].r, .i = o16[i * 4].i - out[i].i};
+          sqnr[coeff] += squaredMod(error);
           evm[coeff] += sqrt(squaredMod(error)) / sqrt(squaredMod(out[i]));
-          samples[coeff] += sqrt(squaredMod(d16[i]));
+          samples_out[coeff] += (squaredMod(out[i])/n);
+          samples[coeff] += squaredMod(d16_2[i]);
         }
       } else {
         for (int i = 0; i < n; i++) {
-          cd_t error = {.r = o16[i].r / expand - out[i].r, .i = o16[i].i / expand - out[i].i};
+          cd_t error = {.r = o16[i].r - out[i].r , .i = o16[i].i - out[i].i};
           evm[coeff] += sqrt(squaredMod(error)) / sqrt(squaredMod(out[i]));
-          samples[coeff] += sqrt(squaredMod(d16[i]));
-          /*
-            if (error(o16[i], out[i], 5))
-            printf("Error in dft %d at %d, (%d, %d) != %f, %f)\n", n, i, o16[i].r, o16[i].i, gslout[i].r, gslout[i].i);
-          */
+          double error_dB = 10*log10(squaredMod(error));
+          if (coeffs[coeff] == 50 && n==4096 && error_dB >= 10) printf("error in DFT pos %d : in %f dB %f dB \n",i,coeffs[coeff],error_dB);
+          sqnr[coeff] += squaredMod(error);
+          samples[coeff] += squaredMod(d16[i]);
+          samples_out[coeff] += squaredMod(out[i]);
         }
       }
+      sqnr[coeff] = samples_out[coeff] / sqnr[coeff];
     }
-    printf("done DFT size %d (evm (%%), avg samples amplitude) = ", n);
+    printf("done DFT size %d (evm (%%), SQNRdB, avg in samples amplitude, avg out samples amplitude) = ", n);
     for (int coeff = 0; coeff < sizeofArray(coeffs); coeff++)
-      printf("(%.2f, %.0f) ", (evm[coeff] / n) * 100, samples[coeff] / n);
+      printf("input_lev %f (%.2f, %f, %.1f, %.1f) ", coeffs[coeff],(evm[coeff] / n) * 100, 10*log10(sqnr[coeff]),10*log10(samples[coeff] / n), 10*log10(samples_out[coeff] / n));
     printf("\n");
     int i;
     for (i = 0; i < sizeofArray(coeffs); i++)
@@ -132,7 +198,56 @@ int main(void)
     }
     fflush(stdout);
   }
+
+  // TX test: modulate all used sizes with QPSK and 256QAM. Compute the IDFT using the
+  // QAM levels and IDFT scaling used in the gNB transmit chain. Use a double-precision DFT
+  // to bring the result back to the frequency domain. Compute EVM and SQNR against the
+  // transmitted waveform.
+  for (int sz = 0; sz < sizeofArray(dftFtab); sz++) {
+    const int n = dftFtab[sz].size;
+    cd_t data[n];
+    if (n > 4096) break;
+    if (n < 128) continue;
+    printf("Testing IDFT size %d\n",n);
+    cd_t out[n];
+    for (int mod=0;mod<2;mod++) {
+      fill_qam(n,data,mod);
+      int16_t amp=512;
+      for (int i = 0; i < n; i++) {
+        d16[i].r = (int16_t)(amp*data[i].r); 
+        d16[i].i = (int16_t)(amp*data[i].i);
+      }
+      idft(get_idft(n), (int16_t *)d16, (int16_t *)o16,get_idft_scaling(n,0));
+      math_dft(o16, out, n,0,1);
+      double evm = 0;
+      double sqnr = 0;
+      double samples = 0;
+      double samples_out = 0;
+      int nz=0;
+      for (int i = 0; i < n; i++) {
+        if (data[i].r != 0) {
+            cd_t error = {.r = (double)d16[i].r - out[i].r, .i =(double) d16[i].i - out[i].i};
+            evm += sqrt(squaredMod(error)) / sqrt(squaredMod(out[i]));
+            sqnr += squaredMod(error);
+            samples += sqrt(squaredMod(d16[i]));
+            samples_out += squaredMod(out[i]);
+            nz++;
+        }
+      }
+      sqnr = samples_out / sqnr;
+      printf("done IDFT size %d nz %d mod %s (evm (%%), SQNRdB, avg samples amplitude) = ", n,nz, mod==0?"QPSK":"256QAM");
+      printf("(%.2f, %f, %.1f) ", (evm / nz) * 100, 10*log10(sqnr),10*log10(samples_out/ nz));
+      printf("\n");
+      if (evm / nz > 0.01){
+        printf("IDFT size: %d/ mod %s, minimum error is more than 1%%, setting the test as failed\n", n, mod==0?"QPSK":"256QAM");
+        ret = 1;
+        break;
+      }
+    }
+    fflush(stdout);
+  }
   free(d16);
   free(o16);
+  free(d16_2);
   return ret;
 }
diff --git a/openair1/PHY/TOOLS/tools_defs.h b/openair1/PHY/TOOLS/tools_defs.h
index d7e738a9f44fc5cc39f8c25013187d8f61425c33..0b2e3d56d643b47fab969a4353d9d11709857f68 100644
--- a/openair1/PHY/TOOLS/tools_defs.h
+++ b/openair1/PHY/TOOLS/tools_defs.h
@@ -591,8 +591,59 @@ void init_fft(uint16_t size,
   SZ_DEF(65536)                \
   SZ_DEF(98304)
 
-typedef  void(*dftfunc_t)(uint8_t sizeidx,int16_t *sigF,int16_t *sig,unsigned char scale_flag);
-typedef void (*idftfunc_t)(uint8_t sizeidx, int16_t *sigF, int16_t *sig, unsigned char scale_flag);
+extern uint32_t DFT_SCALING_64[5][2];
+extern uint32_t DFT_SCALING_128[5][3];
+extern uint32_t DFT_SCALING_256[5][3];
+extern uint32_t DFT_SCALING_512[7][4];
+extern int32_t DFT_SCALING_512_THRES[7];
+extern uint32_t DFT_SCALING_768[5][4];
+extern uint32_t DFT_SCALING_1024[5][4];
+extern int32_t DFT_SCALING_1024_THRES[5];
+extern uint32_t DFT_SCALING_1536[5][5];
+extern uint32_t DFT_SCALING_2048[10][5];
+extern int32_t DFT_SCALING_2048_THRES[10];
+extern uint32_t DFT_SCALING_3072[5][5];
+extern uint32_t DFT_SCALING_4096[8][5];
+extern int32_t DFT_SCALING_4096_THRES[8];
+extern uint32_t DFT_SCALING_6144[5][6];
+extern uint32_t DFT_SCALING_8192[5][6];
+extern uint32_t DFT_SCALING_9216[5][6];
+extern uint32_t DFT_SCALING_12288[5][6];
+extern uint32_t DFT_SCALING_16384[5][6];
+extern uint32_t DFT_SCALING_18432[5][7];
+extern uint32_t DFT_SCALING_24576[5][7];
+extern uint32_t DFT_SCALING_32768[5][7];
+extern uint32_t DFT_SCALING_36864[5][7];
+extern uint32_t DFT_SCALING_49152[5][7];
+extern uint32_t DFT_SCALING_65536[5][7];
+extern uint32_t DFT_SCALING_73728[5][8];
+extern uint32_t DFT_SCALING_98304[5][8];
+
+extern uint32_t IDFT_SCALING_128[2][2];
+extern uint32_t IDFT_SCALING_256[2][2];
+extern uint32_t IDFT_SCALING_512[2][3];
+extern uint32_t IDFT_SCALING_768[2][3];
+extern uint32_t IDFT_SCALING_1024[2][3];
+extern uint32_t IDFT_SCALING_1536[2][4];
+extern uint32_t IDFT_SCALING_2048[2][4];
+extern uint32_t IDFT_SCALING_3072[2][4];
+extern uint32_t IDFT_SCALING_4096[2][4];
+extern uint32_t IDFT_SCALING_6144[2][5];
+extern uint32_t IDFT_SCALING_8192[2][5];
+extern uint32_t IDFT_SCALING_9216[2][5];
+extern uint32_t IDFT_SCALING_12288[2][5];
+extern uint32_t IDFT_SCALING_16384[2][5];
+extern uint32_t IDFT_SCALING_18432[2][6];
+extern uint32_t IDFT_SCALING_24576[2][6];
+extern uint32_t IDFT_SCALING_32768[2][6];
+extern uint32_t IDFT_SCALING_36864[2][6];
+extern uint32_t IDFT_SCALING_49152[2][6];
+extern uint32_t IDFT_SCALING_65536[2][6];
+extern uint32_t IDFT_SCALING_73728[2][7];
+extern uint32_t IDFT_SCALING_98304[2][7];
+
+typedef  void(*dftfunc_t)(uint8_t sizeidx,int16_t *sigF,int16_t *sig,unsigned int *scale);
+typedef void (*idftfunc_t)(uint8_t sizeidx, int16_t *sigF, int16_t *sig, unsigned int *scale);
 extern dftfunc_t dft;
 extern idftfunc_t idft;
 int load_dftslib(void);
@@ -629,6 +680,146 @@ static inline dft_size_idx_t get_dft(int size)
   return DFT_SIZE_IDXTABLESIZE;
 }
 
+/*******************************************************************
+*
+* NAME :         get_dft_scaling
+*
+* PARAMETERS :   ofdm_symbol_size : DFT length; levdB : input level (dB) compared against the DFT_SCALING_<size>_THRES tables
+*
+* RETURN :       pointer to a scaling schedule; (uint32_t*)1 (not dereferenceable) for sizes without a case below
+*
+* DESCRIPTION :  sizes 512/1024/2048/4096 pick the first row whose threshold exceeds levdB (last row if none does); other sizes use row 0
+*
+*********************************************************************/
+static inline
+uint32_t *get_dft_scaling(int ofdm_symbol_size,int32_t levdB)
+{
+  size_t i=0;
+  switch (ofdm_symbol_size) {
+    case 64:
+      return DFT_SCALING_64[0];
+    case 128:
+      return DFT_SCALING_128[0];
+    case 256:
+      return DFT_SCALING_256[0];
+    case 512:
+      while (i+1<sizeof(DFT_SCALING_512_THRES)/sizeof(DFT_SCALING_512_THRES[0])) { // stop at the last row so the index stays in bounds
+        if (levdB < DFT_SCALING_512_THRES[i]) break;
+        i++;
+      }
+      return DFT_SCALING_512[i];
+    case 768:
+      return DFT_SCALING_768[0];
+    case 1024:
+      while (i+1<sizeof(DFT_SCALING_1024_THRES)/sizeof(DFT_SCALING_1024_THRES[0])) { // stop at the last row so the index stays in bounds
+        if (levdB < DFT_SCALING_1024_THRES[i]) break;
+        i++;
+      }
+      return DFT_SCALING_1024[i];
+    case 1536:
+      return DFT_SCALING_1536[0];
+    case 2048:
+      while (i+1<sizeof(DFT_SCALING_2048_THRES)/sizeof(DFT_SCALING_2048_THRES[0])) { // stop at the last row so the index stays in bounds
+        if (levdB < DFT_SCALING_2048_THRES[i]) break;
+        i++;
+      }
+      return DFT_SCALING_2048[i];
+    case 3072:
+      return DFT_SCALING_3072[0];
+    case 4096:
+      while (i+1<sizeof(DFT_SCALING_4096_THRES)/sizeof(DFT_SCALING_4096_THRES[0])) { // bound by the 4096 table, not the longer 2048 one
+        if (levdB < DFT_SCALING_4096_THRES[i]) break;
+        i++;
+      }
+      return DFT_SCALING_4096[i]; // schedule must come from the 4096 table
+    case 6144:
+      return DFT_SCALING_6144[0];
+    case 8192:
+      return DFT_SCALING_8192[0];
+    case 9216:
+      return DFT_SCALING_9216[0];
+    case 12288:
+      return DFT_SCALING_12288[0];
+    case 18432:
+      return DFT_SCALING_18432[0];
+    case 24576:
+      return DFT_SCALING_24576[0];
+    case 36864:
+      return DFT_SCALING_36864[0];
+    case 49152:
+      return DFT_SCALING_49152[0];
+    case 73728:
+      return DFT_SCALING_73728[0];
+    case 98304:
+      return DFT_SCALING_98304[0];
+    default:
+      return (uint32_t*)1;
+      break;
+  }
+  return NULL;
+}
+
+/*******************************************************************
+*
+* NAME :         get_idft_scaling
+*
+* PARAMETERS :   ofdm_symbol_size : IDFT length; lev_ind : row (0 or 1) of the per-size IDFT_SCALING table
+*
+* RETURN :       pointer to the selected scaling schedule (row lev_ind of IDFT_SCALING_<size>)
+*
+* DESCRIPTION :  return the IDFT scaling schedule for a given length; asserts on lev_ind >= 2 or on a size without a case below
+*
+*********************************************************************/
+static inline
+uint32_t *get_idft_scaling(int ofdm_symbol_size,unsigned int lev_ind)
+{
+  AssertFatal(lev_ind < 2, "Illegal lev_ind %u\n",lev_ind);
+  switch (ofdm_symbol_size) {
+    case 128:
+      return IDFT_SCALING_128[lev_ind];
+    case 256:
+      return IDFT_SCALING_256[lev_ind];
+    case 512:
+      return IDFT_SCALING_512[lev_ind];
+    case 768:
+      return IDFT_SCALING_768[lev_ind];
+    case 1024:
+      return IDFT_SCALING_1024[lev_ind];
+    case 1536:
+      return IDFT_SCALING_1536[lev_ind];
+    case 2048:
+      return IDFT_SCALING_2048[lev_ind];
+    case 3072:
+      return IDFT_SCALING_3072[lev_ind];
+    case 4096:
+      return IDFT_SCALING_4096[lev_ind];
+    case 6144:
+      return IDFT_SCALING_6144[lev_ind];
+    case 8192:
+      return IDFT_SCALING_8192[lev_ind];
+    case 9216:
+      return IDFT_SCALING_9216[lev_ind];
+    case 12288:
+      return IDFT_SCALING_12288[lev_ind];
+    case 18432:
+      return IDFT_SCALING_18432[lev_ind];
+    case 24576:
+      return IDFT_SCALING_24576[lev_ind];
+    case 36864:
+      return IDFT_SCALING_36864[lev_ind];
+    case 49152:
+      return IDFT_SCALING_49152[lev_ind];
+    case 73728:
+      return IDFT_SCALING_73728[lev_ind];
+    case 98304:
+      return IDFT_SCALING_98304[lev_ind];
+    default:
+      // report via AssertFatal like the lev_ind check above; a plain assert(0) is compiled out under NDEBUG
+      AssertFatal(0, "function get_idft_scaling : unsupported ofdm symbol size %d\n", ofdm_symbol_size);
+      break;
+  }
+  return NULL;
+}
 #define SZ_iENUM(Sz) IDFT_##Sz,
 typedef enum idft_size_idx {
   FOREACH_IDFTSZ(SZ_iENUM)
@@ -636,14 +827,14 @@ typedef enum idft_size_idx {
 }  idft_size_idx_t;
 
 #ifdef OAIDFTS_MAIN
-typedef void (*adftfunc_t)(int16_t *sigF, int16_t *sig, unsigned char scale_flag);
-typedef void (*aidftfunc_t)(int16_t *sigF, int16_t *sig, unsigned char scale_flag);
+typedef void (*adftfunc_t)(int16_t *sigF, int16_t *sig, unsigned int *scale);
+typedef void (*aidftfunc_t)(int16_t *sigF, int16_t *sig, unsigned int *scale);
 
-#define SZ_FUNC(Sz) void dft##Sz(int16_t *x, int16_t *y, uint8_t scale_flag);
+#define SZ_FUNC(Sz) void dft##Sz(int16_t *x, int16_t *y, unsigned int *scale);
 
 FOREACH_DFTSZ(SZ_FUNC)
 
-#define SZ_iFUNC(Sz) void idft##Sz(int16_t *x, int16_t *y, uint8_t scale_flag);
+#define SZ_iFUNC(Sz) void idft##Sz(int16_t *x, int16_t *y, unsigned int *scale);
 
 FOREACH_IDFTSZ(SZ_iFUNC)
 #define SZ_PTR(Sz) {dft ## Sz,Sz},
diff --git a/openair1/PHY/defs_RU.h b/openair1/PHY/defs_RU.h
index 1b0182bffde8d921cf9a03a5c5de95f8fc8d50b7..43fabc055e158642433ffe0d1e96ed47f6049b07 100644
--- a/openair1/PHY/defs_RU.h
+++ b/openair1/PHY/defs_RU.h
@@ -666,6 +666,7 @@ typedef struct RU_t_s {
   /// structure for analyzing high-level RT measurements
   rt_ru_profiling_t rt_ru_profiling;
   void* scopeData;
+  int32_t dft_in_levdB;
 } RU_t;
 
 
diff --git a/openair1/PHY/defs_UE.h b/openair1/PHY/defs_UE.h
index 5a090752d5f982b7a1e5d79d763a74557a5c294c..ac093056e4f682710fa3d650feae6c73699eb596 100644
--- a/openair1/PHY/defs_UE.h
+++ b/openair1/PHY/defs_UE.h
@@ -840,6 +840,7 @@ typedef struct {
 
   openair0_device rfdevice;
   void *scopeData;
+  int dft_in_levdB;
 } PHY_VARS_UE;
 
 /* this structure is used to pass both UE phy vars and
diff --git a/openair1/PHY/defs_nr_UE.h b/openair1/PHY/defs_nr_UE.h
index dd035a73e3b5111609e0f4ddc35467c62697e43f..8e1332c126734c5cc612f179976ff372086bc114 100644
--- a/openair1/PHY/defs_nr_UE.h
+++ b/openair1/PHY/defs_nr_UE.h
@@ -552,6 +552,7 @@ typedef struct PHY_VARS_NR_UE_s {
   Actor_t dl_actors[NUM_DL_ACTORS];
   Actor_t ul_actor;
   ntn_config_message_t* ntn_config_message;
+  int32_t dft_in_levdB;
 } PHY_VARS_NR_UE;
 
 typedef struct {
diff --git a/openair1/PHY/nr_phy_common/src/nr_phy_common.c b/openair1/PHY/nr_phy_common/src/nr_phy_common.c
index 9aa45e20b385f1092739671d25f5f643315ce7da..3493c9872c42d5d6ac1001c930e5fc5867905542 100644
--- a/openair1/PHY/nr_phy_common/src/nr_phy_common.c
+++ b/openair1/PHY/nr_phy_common/src/nr_phy_common.c
@@ -358,7 +358,8 @@ void nr_256qam_llr(int32_t *rxdataF_comp, int32_t *ch_mag, int32_t *ch_mag2, int
 void freq2time(uint16_t ofdm_symbol_size, int16_t *freq_signal, int16_t *time_signal)
 {
   const idft_size_idx_t idft_size = get_idft(ofdm_symbol_size);
-  idft(idft_size, freq_signal, time_signal, 1);
+  uint32_t *scaling_sched = get_idft_scaling(ofdm_symbol_size,1); // lev_ind 1: second of the two per-size IDFT scaling schedules
+  idft(idft_size, freq_signal, time_signal, scaling_sched);
 }
 
 void nr_est_delay(int ofdm_symbol_size, const c16_t *ls_est, c16_t *ch_estimates_time, delay_t *delay)
diff --git a/openair1/SCHED_NR/nr_ru_procedures.c b/openair1/SCHED_NR/nr_ru_procedures.c
index a0b3a69ba8c24becbbc24e8a6aa5fd8d5abb3c9c..ab809cbc399bf52c0c15f09819483075fdfed569 100644
--- a/openair1/SCHED_NR/nr_ru_procedures.c
+++ b/openair1/SCHED_NR/nr_ru_procedures.c
@@ -363,7 +363,8 @@ void nr_fep(void* arg)
                      &ru->common.rxdataF[aid][offset],
                      l,
                      tti_rx,
-                     ru->N_TA_offset);
+                     ru->N_TA_offset,
+                     ru->dft_in_levdB);
   VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_PROCEDURES_RU_FEPRX+aid, 0);
 
   // Task completed in //
diff --git a/openair1/SCHED_NR_UE/phy_procedures_nr_ue.c b/openair1/SCHED_NR_UE/phy_procedures_nr_ue.c
index 809c534acc30975af0c897c3443d926dadb3dd9f..e01433ea998929abbcfd9a9e9991565e4f0f7a3c 100644
--- a/openair1/SCHED_NR_UE/phy_procedures_nr_ue.c
+++ b/openair1/SCHED_NR_UE/phy_procedures_nr_ue.c
@@ -877,6 +877,7 @@ int pbch_pdcch_processing(PHY_VARS_NR_UE *ue, const UE_nr_rxtx_proc_t *proc, nr_
           __attribute__ ((aligned(32))) struct complex16 dl_ch_estimates_time[fp->nb_antennas_rx][fp->ofdm_symbol_size];
 
           for (int i=1; i<4; i++) {
+            if (i==1) ue->dft_in_levdB=-1; // trigger recalculation of DFT scaling 
             nr_slot_fep(ue,
                         fp,
                         proc->nr_slot_rx,
diff --git a/openair1/SCHED_UE/phy_procedures_lte_ue.c b/openair1/SCHED_UE/phy_procedures_lte_ue.c
index 1f165380d4a7edcc2ddde19654df44fee548c2e9..6118bb35662cc8b09cb60371f88f50411797c560 100644
--- a/openair1/SCHED_UE/phy_procedures_lte_ue.c
+++ b/openair1/SCHED_UE/phy_procedures_lte_ue.c
@@ -4452,6 +4452,7 @@ int phy_procedures_UE_RX(PHY_VARS_UE *ue,
         }
 
         VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_UE_SLOT_FEP, VCD_FUNCTION_IN);
+	if (l==0) ue->dft_in_levdB = -1; //trigger dft scaling adjustment
         slot_fep(ue,
                  l,
                  (subframe_rx<<1),
diff --git a/openair1/SIMULATION/LTE_PHY/dlsim.c b/openair1/SIMULATION/LTE_PHY/dlsim.c
index 40d35c4a00891e0148d917f16830ec4042bc0be5..6ae5c5fa92ee15e2a42d7696d3dc4b19d6d0b162 100644
--- a/openair1/SIMULATION/LTE_PHY/dlsim.c
+++ b/openair1/SIMULATION/LTE_PHY/dlsim.c
@@ -256,6 +256,12 @@ void DL_channel(RU_t *ru,PHY_VARS_UE *UE,uint subframe,int awgn_flag,double SNR,
         (short) (r_im[aa][i] + (iqim*r_re[aa][i]) + sqrt(sigma2/2)*gaussdouble(0.0,1.0));
     }
   }
+ 
+  int sigenergy=0;
+  for (aa=0;aa<UE->frame_parms.nb_antennas_rx; aa++) {
+     sigenergy+=signal_energy((int32_t*)(UE->common_vars.rxdata[aa]+subframe*UE->frame_parms.samples_per_tti),UE->frame_parms.samples_per_tti);
+  }
+  UE->dft_in_levdB = dB_fixed(sigenergy);
 }
 
 uint16_t
diff --git a/openair1/SIMULATION/LTE_PHY/ulsim.c b/openair1/SIMULATION/LTE_PHY/ulsim.c
index 14e3c696965093522c24b7709050f445b6157827..cada8e5a89572baad3a34e67e2668768699fc9cf 100644
--- a/openair1/SIMULATION/LTE_PHY/ulsim.c
+++ b/openair1/SIMULATION/LTE_PHY/ulsim.c
@@ -1142,6 +1142,11 @@ int main(int argc, char **argv) {
                                   &ru->common.rxdata[0][(eNB->frame_parms.samples_per_tti<<1) -eNB->frame_parms.ofdm_symbol_size],
                                   OFDM_SYMBOL_SIZE_COMPLEX_SAMPLES/2)) - 1)+10*log10(eNB->frame_parms.N_RB_UL/nb_rb);
 
+          int sigenergy=0;
+          for (aa=0;aa<eNB->frame_parms.nb_antennas_rx; aa++) {
+             sigenergy+=signal_energy((int32_t*)(ru->common.rxdata[aa]+subframe*eNB->frame_parms.samples_per_tti),eNB->frame_parms.samples_per_tti);
+          }
+          ru->dft_in_levdB = dB_fixed(sigenergy);
           if (n_frames<=10) {
             printf("SNRmeas %f\n",SNRmeas);
             LOG_M("rxsig0UL.m","rxs0", &ru->common.rxdata[0][eNB->frame_parms.samples_per_tti*subframe],eNB->frame_parms.samples_per_tti,1,1);
diff --git a/openair1/SIMULATION/NR_PHY/dlsim.c b/openair1/SIMULATION/NR_PHY/dlsim.c
index 95a5b9ea647ea467bd9fc54e946cbe1a30b71a69..779e9d1a48f390b490d488c48a5f4d2cd29ba301 100644
--- a/openair1/SIMULATION/NR_PHY/dlsim.c
+++ b/openair1/SIMULATION/NR_PHY/dlsim.c
@@ -351,6 +351,7 @@ int main(int argc, char **argv)
   if ((uniqCfg = load_configmodule(argc, argv, CONFIG_ENABLECMDLINEONLY)) == 0) {
     exit_fun("[NR_DLSIM] Error, configuration module init failed\n");
   }
+  int tx_amp=36;
 
   randominit(0);
 
@@ -358,7 +359,7 @@ int main(int argc, char **argv)
 
   FILE *scg_fd=NULL;
 
-  while ((c = getopt(argc, argv, "--:O:f:hA:p:f:g:i:n:s:S:t:v:x:y:z:o:M:N:F:GR:d:PI:L:a:b:e:m:w:T:U:q:X:Y:Z:")) != -1) {
+  while ((c = getopt(argc, argv, "--:O:f:hA:p:f:g:i:n:s:S:t:v:x:y:z:o:M:N:F:GR:d:PI:L:a:b:e:m:w:T:U:q:X:Y:Z:cQ:")) != -1) {
 
     /* ignore long options starting with '--', option '-O' and their arguments that are handled by configmodule */
     /* with this opstring getopt returns 1 for non-option arguments, refer to 'man 3 getopt' */
@@ -551,7 +552,9 @@ int main(int argc, char **argv)
     case 'o':
       delay = atoi(optarg);
       break;
-
+    case 'Q':
+      tx_amp = atoi(optarg);
+      break;
     default:
     case 'h':
       printf("%s -h(elp) -p(extended_prefix) -N cell_id -f output_filename -F input_filename -g channel_model -n n_frames -s snr0 -S snr1 -x transmission_mode -y TXant -z RXant -i Intefrence0 -j Interference1 -A interpolation_file -C(alibration offset dB) -N CellId\n",
@@ -634,6 +637,7 @@ int main(int argc, char **argv)
   gNB = RC.gNB[0];
   gNB->ofdm_offset_divisor = UINT_MAX;
   gNB->phase_comp = true; // we need to perform phase compensation, otherwise everything will fail
+  gNB->TX_AMP = (int16_t)(32767.0 / pow(10.0, .05 * (double)(tx_amp)));
   frame_parms = &gNB->frame_parms; //to be initialized I suppose (maybe not necessary for PBCH)
   frame_parms->nb_antennas_tx = n_tx;
   frame_parms->nb_antennas_rx = n_rx;
@@ -1147,6 +1151,11 @@ int main(int argc, char **argv)
                   UE->frame_parms.nb_antennas_rx);
         dl_config.sfn = frame;
         dl_config.slot = slot;
+        int sigenergy=0;
+        for (int aarx=0;aarx<UE->frame_parms.nb_antennas_rx;aarx++) {
+            sigenergy += signal_energy((int32_t*)(UE->common_vars.rxdata[aarx]+slot_offset),slot_length)/UE->frame_parms.nb_antennas_rx;
+        }
+        UE->dft_in_levdB=dB_fixed(sigenergy);
         ue_dci_configuration(UE_mac, &dl_config, frame, slot);
         nr_ue_scheduled_response(&scheduled_response);
 
diff --git a/openair1/SIMULATION/NR_PHY/pbchsim.c b/openair1/SIMULATION/NR_PHY/pbchsim.c
index b6d92b9b8109d7d2a3bb6376b027d78cfb94b37f..b771e01e0cdb2c867ca06213f42b65a91a7f44dc 100644
--- a/openair1/SIMULATION/NR_PHY/pbchsim.c
+++ b/openair1/SIMULATION/NR_PHY/pbchsim.c
@@ -674,6 +674,17 @@ int main(int argc, char **argv)
           UE->common_vars.rxdata[aa][i].i = (short)(r_im[aa][i] + sqrt(sigma2 / 2) * gaussdouble(0.0, 1.0));
         }
       }
+      int sigenergy=0;
+
+      int start_symbol = nr_get_ssb_start_symbol(&UE->frame_parms,0);
+      int slot = start_symbol/14;
+
+      int off = UE->frame_parms.get_samples_slot_timestamp(slot, &UE->frame_parms, 0);
+      int slot_length = UE->frame_parms.get_samples_slot_timestamp(slot+1,&UE->frame_parms,0) - off;
+      for (int aarx=0;aarx<UE->frame_parms.nb_antennas_rx;aarx++) {
+          sigenergy += signal_energy((int32_t*)(UE->common_vars.rxdata[aarx]+off),slot_length)/UE->frame_parms.nb_antennas_rx;
+      }
+      UE->dft_in_levdB=dB_fixed(sigenergy);
 
       if (n_trials==1) {
         LOG_M("rxsig0.m", "rxs0", UE->common_vars.rxdata[0], frame_parms->samples_per_frame, 1, 1);
diff --git a/openair1/SIMULATION/NR_PHY/prachsim.c b/openair1/SIMULATION/NR_PHY/prachsim.c
index 5eab1c69748086522f58f0e6bd3b7df2d186900c..0eed5181fc9425bc94ed17bff0f97976010cdabf 100644
--- a/openair1/SIMULATION/NR_PHY/prachsim.c
+++ b/openair1/SIMULATION/NR_PHY/prachsim.c
@@ -761,6 +761,12 @@ int main(int argc, char **argv){
 	  }
 	}
 
+  int sigenergy=0;
+  // sum the received energy over all RX antennas first, then convert it to dB
+  for (int aarx = 0; aarx < frame_parms->nb_antennas_rx ; aarx++) {
+    sigenergy += signal_energy((int32_t *)ru->common.rxdata[aarx]+rx_prach_start,frame_parms->samples_per_subframe);
+  }
+  ru->dft_in_levdB=dB_fixed(sigenergy);
 	for (l = 0; l < frame_parms->symbols_per_slot; l++) {
 	  for (aa = 0; aa < frame_parms->nb_antennas_rx; aa++) {
 	    nr_slot_fep_ul(frame_parms,
@@ -768,7 +774,8 @@ int main(int argc, char **argv){
 			   (int32_t *)ru->common.rxdataF[aa],
 			   l,
 			   slot,
-			   ru->N_TA_offset);
+			   ru->N_TA_offset,
+         ru->dft_in_levdB);
 	  }
 	}
 	
diff --git a/openair1/SIMULATION/NR_PHY/ulsim.c b/openair1/SIMULATION/NR_PHY/ulsim.c
index ffc6690b42461213b13857cdeafbf651d6176e27..a72b5c5e3eedebc0f039b37127e2345e966a5e7b 100644
--- a/openair1/SIMULATION/NR_PHY/ulsim.c
+++ b/openair1/SIMULATION/NR_PHY/ulsim.c
@@ -1250,8 +1250,12 @@ int main(int argc, char *argv[])
 
           multipath_channel(UE2gNB, s_re, s_im, r_re, r_im, slot_length, 0, (n_trials == 1) ? 1 : 0);
           add_noise(rxdata, (const double **) r_re, (const double **) r_im, sigma, slot_length, slot_offset, ts, delay, pdu_bit_map, PUSCH_PDU_BITMAP_PUSCH_PTRS, frame_parms->nb_antennas_rx);
-
         } /*End input_fd */
+        int sigenergy=0;
+        for (int aarx=0;aarx<n_rx;aarx++) {
+            sigenergy += signal_energy((int32_t*)(rxdata[aarx]+slot_offset),slot_length)/n_rx;
+        }
+
 
         //----------------------------------------------------------
         //------------------- gNB phy procedures -------------------
@@ -1267,7 +1271,8 @@ int main(int argc, char *argv[])
                            (int32_t *)gNB->common_vars.rxdataF[0][aa],
                            symbol,
                            slot,
-                           0);
+                           0,
+                           dB_fixed(sigenergy)+9);
         }
         int offset = (slot & 3) * gNB->frame_parms.symbols_per_slot * gNB->frame_parms.ofdm_symbol_size;
         for (int aa = 0; aa < gNB->frame_parms.nb_antennas_rx; aa++)  {