diff --git a/openair1/PHY/TOOLS/dfts_load.c b/openair1/PHY/TOOLS/dfts_load.c
index c0cda9238c5e9f79ff67aec833425dfa1a96f046..2d1cf86f0f40e81c502b5788a462b59a4751de5e 100644
--- a/openair1/PHY/TOOLS/dfts_load.c
+++ b/openair1/PHY/TOOLS/dfts_load.c
@@ -100,8 +100,7 @@ uint32_t IDFT_SCALING_3072[2][4] = {{1, 1, 1, 3}, {1, 1, 1, 3}};
 #endif
 
 uint32_t DFT_SCALING_16[][1] = {{2}};
-uint32_t DFT_SCALING_64[5][2]   = {{3,0},{2,1},{1,2},{1,2},{1,2}};
-uint32_t DFT_SCALING_128[5][3]  = {{4,0,0},{3,1,0},{2,2,0},{1,3,0},{0,4,0}};
+uint32_t DFT_SCALING_64[5][2] = {{3, 0}, {2, 1}, {1, 2}, {1, 2}, {1, 2}};
 uint32_t DFT_SCALING_256[5][3]  = {{4,0,0},{3,1,0},{2,2,0},{1,3,0},{0,4,0}};
 int32_t DFT_SCALING_512_THRES[7] = {53, 57, 59, 63, 65, 69, 100};
 int32_t DFT_SCALING_1024_THRES[5] = {49,55,63,69,100};
diff --git a/openair1/PHY/TOOLS/oai_dfts.c b/openair1/PHY/TOOLS/oai_dfts.c
index d26fa6edafc8c4d0853ba1e7b4437faaf74f1385..46ef8dc9928223501014c1426092156e56196c47 100644
--- a/openair1/PHY/TOOLS/oai_dfts.c
+++ b/openair1/PHY/TOOLS/oai_dfts.c
@@ -66,6 +66,7 @@
 #include "tools_defs.h"
 
 #define print_shorts(s,x) printf("%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7])
+#define print_floats(s, x) printf("%s %f,%f,%f,%f,%f,%f,%f,%f\n", s, (x)[0], (x)[1], (x)[2], (x)[3], (x)[4], (x)[5], (x)[6], (x)[7])
 #define print_shorts256(s,x) printf("%s %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7],(x)[8],(x)[9],(x)[10],(x)[11],(x)[12],(x)[13],(x)[14],(x)[15])
 
 #define print_ints(s,x) printf("%s %d %d %d %d\n",s,(x)[0],(x)[1],(x)[2],(x)[3])
@@ -259,6 +260,14 @@ __attribute__((always_inline)) static inline void cmultc_256(simde__m256i a, sim
   *im32 = simde_mm256_madd_epi16(a, mmtmpb);
 }
 
+__attribute__((always_inline)) static inline simde__m128i cpack_noshift(simde__m128i xre, simde__m128i xim)
+{
+  register simde__m128i cpack_tmp1, cpack_tmp2;
+
+  cpack_tmp1 = simde_mm_unpacklo_epi32(xre, xim);
+  cpack_tmp2 = simde_mm_unpackhi_epi32(xre, xim);
+  return (simde_mm_packs_epi32(cpack_tmp1, cpack_tmp2));
+}
 
 #ifdef TWIDDLE_Q14
 #define MUL_SHIFT 14
@@ -275,6 +284,15 @@ __attribute__((always_inline)) static inline simde__m128i cpack(simde__m128i xre
   return (simde_mm_packs_epi32(simde_mm_srai_epi32(cpack_tmp1, MUL_SHIFT), simde_mm_srai_epi32(cpack_tmp2, MUL_SHIFT)));
 }
 
+__attribute__((always_inline)) static inline simde__m256i cpack_256_noshift(simde__m256i xre, simde__m256i xim)
+{
+  register simde__m256i cpack_tmp1, cpack_tmp2;
+
+  cpack_tmp1 = simde_mm256_unpacklo_epi32(xre, xim);
+  cpack_tmp2 = simde_mm256_unpackhi_epi32(xre, xim);
+  return (simde_mm256_packs_epi32(cpack_tmp1, cpack_tmp2));
+}
+
 __attribute__((always_inline)) static inline simde__m256i cpack_256(simde__m256i xre, simde__m256i xim)
 {
   register simde__m256i cpack_tmp1, cpack_tmp2;
@@ -666,6 +684,50 @@ __attribute__((always_inline)) static inline void bfly4(simde__m128i *x0,
   *(y3) = simde_mm_add_epi16(*(x0), cpack(dy3r, dy3i));
 }
 
+__attribute__((always_inline)) static inline simde__m256i cmult_float_128(simde__m128i x, simde__m256 a)
+{
+  simde__m256i x256_32 = simde_mm256_cvtepi16_epi32(x); // 3 2 1 0 im re im re im re im re
+  simde__m256 x256_f = simde_mm256_cvtepi32_ps(x256_32); // 3 2 1 0
+  float conj_mask[8] = {1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0};
+  simde__m256 y256_f_re = simde_mm256_mul_ps(x256_f, simde_mm256_mul_ps(a, *(simde__m256 *)conj_mask));
+  simde__m256 y256_f_im = simde_mm256_mul_ps(x256_f, simde_mm256_permute_ps(a, 0b10110001));
+  simde__m256 y256_f = simde_mm256_hadd_ps(y256_f_re, y256_f_im); // im im re re im im re re
+  simde__m256i y256_32 = simde_mm256_cvtps_epi32(simde_mm256_round_ps(y256_f, SIMDE_MM_FROUND_TO_NEAREST_INT));
+  return simde_mm256_permutevar8x32_epi32(y256_32, simde_mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0)); // im im im im re re re re
+}
+
+/*
+Floating point multiplication is done to all 8 complex samples.
+*/
+__attribute__((always_inline)) static inline simde__m256i pack_cmult_float_256(simde__m256i x,
+                                                                               const simde__m256i *ai,
+                                                                               const simde__m256 *a)
+{
+  simde__m256i y0tmp = cmult_float_128(*(simde__m128i *)&x, a[0]); // im im im im re re re re (3 ... 0)
+  simde__m128i y0 = cpack_noshift(*((simde__m128i *)&y0tmp), *(((simde__m128i *)&y0tmp) + 1)); // im re im re im re im re (3 ... 0)
+  simde__m256i y1tmp = cmult_float_128(*(((simde__m128i *)&x) + 1), a[1]); // im im im im re re re re (7 ... 4)
+  simde__m128i y1 = cpack_noshift(*((simde__m128i *)&y1tmp), *(((simde__m128i *)&y1tmp) + 1)); // im re im re im re im re (7 ... 4)
+  return (simde_mm256_loadu2_m128i(&y1, &y0));
+}
+
+/*
+Floating point multiplication is done to first 4 complex samples and fixed point for next 4 samples.
+*/
+__attribute__((always_inline)) static inline void cmult_float_256(simde__m256i x,
+                                                                  const simde__m256i *ai,
+                                                                  const simde__m256 *a,
+                                                                  simde__m256i *re,
+                                                                  simde__m256i *im)
+{
+  simde__m256i y0tmp = cmult_float_128(*(simde__m128i *)&x, a[0]); // im im im im re re re re (3 ... 0)
+  simde__m128i y1re, y1im;
+  cmult(*(((simde__m128i *)&x) + 1), *(((simde__m128i *)ai) + 1), &y1re, &y1im); // im im im im re re re re (7 ... 4)
+  *((simde__m128i *)re) = *((simde__m128i *)&y0tmp);
+  *(((simde__m128i *)re) + 1) = simde_mm_srai_epi32(y1re, MUL_SHIFT); // re re re re re re re re (7 ... 0)
+  *((simde__m128i *)im) = *(((simde__m128i *)&y0tmp) + 1);
+  *(((simde__m128i *)im) + 1) = simde_mm_srai_epi32(y1im, MUL_SHIFT); // im im im im im im im im (7 ... 0)
+}
+
 __attribute__((always_inline)) static inline void bfly4_256(simde__m256i *x0,
                                                             simde__m256i *x1,
                                                             simde__m256i *x2,
@@ -676,38 +738,46 @@ __attribute__((always_inline)) static inline void bfly4_256(simde__m256i *x0,
                                                             simde__m256i *y3,
                                                             simde__m256i *tw1,
                                                             simde__m256i *tw2,
-                                                            simde__m256i *tw3)
+                                                            simde__m256i *tw3,
+                                                            simde__m256 *twf)
 {
   simde__m256i x1r_2, x1i_2, x2r_2, x2i_2, x3r_2, x3i_2, dy0r, dy0i, dy1r, dy1i, dy2r, dy2i, dy3r, dy3i;
 
   //  cmult(*(x0),*(W0),&x0r_2,&x0i_2);
-  cmult_256(*(x1),*(tw1),&x1r_2,&x1i_2);
-  cmult_256(*(x2),*(tw2),&x2r_2,&x2i_2);
-  cmult_256(*(x3),*(tw3),&x3r_2,&x3i_2);
+  if (twf) {
+    cmult_float_256(*(x1), tw1, twf, &x1r_2, &x1i_2);
+    cmult_float_256(*(x2), tw2, twf + 1, &x2r_2, &x2i_2);
+    cmult_float_256(*(x3), tw3, twf + 2, &x3r_2, &x3i_2);
+  } else {
+    cmult_256(*(x1), *(tw1), &x1r_2, &x1i_2);
+    cmult_256(*(x2), *(tw2), &x2r_2, &x2i_2);
+    cmult_256(*(x3), *(tw3), &x3r_2, &x3i_2);
+  }
   //  dy0r = simde_mm_add_epi32(x0r_2,simde_mm_add_epi32(x1r_2,simde_mm_add_epi32(x2r_2,x3r_2)));
   //  dy0i = simde_mm_add_epi32(x0i_2,simde_mm_add_epi32(x1i_2,simde_mm_add_epi32(x2i_2,x3i_2)));
   //  *(y0)  = cpack(dy0r,dy0i);
   dy0r = simde_mm256_add_epi32(x1r_2,simde_mm256_add_epi32(x2r_2,x3r_2));
   dy0i = simde_mm256_add_epi32(x1i_2,simde_mm256_add_epi32(x2i_2,x3i_2));
-  *(y0)  = simde_mm256_add_epi16(*(x0),cpack_256(dy0r,dy0i));
+  *(y0) = simde_mm256_add_epi16(*(x0), twf ? cpack_256_noshift(dy0r, dy0i) : cpack_256(dy0r, dy0i));
+  //*(y0)  = simde_mm256_adds_epi16(*(y0), simde_mm256_set_epi16(0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0));
   //  dy1r = simde_mm_add_epi32(x0r_2,simde_mm_sub_epi32(x1i_2,simde_mm_add_epi32(x2r_2,x3i_2)));
   //  dy1i = simde_mm_sub_epi32(x0i_2,simde_mm_add_epi32(x1r_2,simde_mm_sub_epi32(x2i_2,x3r_2)));
   //  *(y1)  = cpack(dy1r,dy1i);
   dy1r = simde_mm256_sub_epi32(x1i_2,simde_mm256_add_epi32(x2r_2,x3i_2));
   dy1i = simde_mm256_sub_epi32(simde_mm256_sub_epi32(x3r_2,x2i_2),x1r_2);
-  *(y1)  = simde_mm256_add_epi16(*(x0),cpack_256(dy1r,dy1i));
+  *(y1) = simde_mm256_add_epi16(*(x0), twf ? cpack_256_noshift(dy1r, dy1i) : cpack_256(dy1r, dy1i));
   //  dy2r = simde_mm_sub_epi32(x0r_2,simde_mm_sub_epi32(x1r_2,simde_mm_sub_epi32(x2r_2,x3r_2)));
   //  dy2i = simde_mm_sub_epi32(x0i_2,simde_mm_sub_epi32(x1i_2,simde_mm_sub_epi32(x2i_2,x3i_2)));
   //  *(y2)  = cpack(dy2r,dy2i);
   dy2r = simde_mm256_sub_epi32(simde_mm256_sub_epi32(x2r_2,x3r_2),x1r_2);
   dy2i = simde_mm256_sub_epi32(simde_mm256_sub_epi32(x2i_2,x3i_2),x1i_2);
-  *(y2)  = simde_mm256_add_epi16(*(x0),cpack_256(dy2r,dy2i));
+  *(y2) = simde_mm256_add_epi16(*(x0), twf ? cpack_256_noshift(dy2r, dy2i) : cpack_256(dy2r, dy2i));
   //  dy3r = simde_mm_sub_epi32(x0r_2,simde_mm_add_epi32(x1i_2,simde_mm_sub_epi32(x2r_2,x3i_2)));
   //  dy3i = simde_mm_add_epi32(x0i_2,simde_mm_sub_epi32(x1r_2,simde_mm_add_epi32(x2i_2,x3r_2)));
   //  *(y3) = cpack(dy3r,dy3i);
   dy3r = simde_mm256_sub_epi32(simde_mm256_sub_epi32(x3i_2,x2r_2),x1i_2);
   dy3i = simde_mm256_sub_epi32(x1r_2,simde_mm256_add_epi32(x2i_2,x3r_2));
-  *(y3) = simde_mm256_add_epi16(*(x0),cpack_256(dy3r,dy3i));
+  *(y3) = simde_mm256_add_epi16(*(x0), twf ? cpack_256_noshift(dy3r, dy3i) : cpack_256(dy3r, dy3i));
 }
 
 __attribute__((always_inline)) static inline void ibfly4_256(simde__m256i *x0,
@@ -844,12 +914,13 @@ __attribute__((always_inline)) static inline void bfly4_16_256(simde__m256i *x0,
                                                                simde__m256i *y1,
                                                                simde__m256i *y2,
                                                                simde__m256i *y3,
-                                                               simde__m256i *tw1,
-                                                               simde__m256i *tw2,
-                                                               simde__m256i *tw3,
-                                                               simde__m256i *tw1b,
-                                                               simde__m256i *tw2b,
-                                                               simde__m256i *tw3b)
+                                                               void *tw1,
+                                                               void *tw2,
+                                                               void *tw3,
+                                                               void *tw1b,
+                                                               void *tw2b,
+                                                               void *tw3b,
+                                                               bool doFloat)
 {
   register simde__m256i x1t, x2t, x3t, x02t, x13t;
   register simde__m256i x1_flip, x3_flip;
@@ -890,9 +961,15 @@ __attribute__((always_inline)) static inline void bfly4_16_256(simde__m256i *x0,
   // [xi00 xi01 xi02 xi03 xi10 xi20 xi30 xi40]
   // each output yi is the same
 
-  x1t = packed_cmult2_256(*(x1),*(tw1),*(tw1b));
-  x2t = packed_cmult2_256(*(x2),*(tw2),*(tw2b));
-  x3t = packed_cmult2_256(*(x3),*(tw3),*(tw3b));
+  if (doFloat) {
+    x1t = pack_cmult_float_256(*(x1), (simde__m256i *)tw1, (simde__m256 *)tw1b);
+    x2t = pack_cmult_float_256(*(x2), ((simde__m256i *)tw2), ((simde__m256 *)tw2b));
+    x3t = pack_cmult_float_256(*(x3), ((simde__m256i *)tw3), ((simde__m256 *)tw3b));
+  } else {
+    x1t = packed_cmult2_256(*(x1), *((simde__m256i *)tw1), *((simde__m256i *)tw1b));
+    x2t = packed_cmult2_256(*(x2), *((simde__m256i *)tw2), *((simde__m256i *)tw2b));
+    x3t = packed_cmult2_256(*(x3), *((simde__m256i *)tw3), *((simde__m256i *)tw3b));
+  }
 
   x02t  = simde_mm256_adds_epi16(*(x0),x2t);
   x13t  = simde_mm256_adds_epi16(x1t,x3t);
@@ -1164,6 +1241,22 @@ const static int16_t tw16c[24]
                                     0,     32767,  23170,  23169, 32767, 0,     23170, -23170,
                                     0,     32767, 30273, 12539, 23170, -23170, -12539, -30273};
 
+const static float tw16repf[48] __attribute__((aligned(32))) = {
+  1.00000000e+00,  0.00000000e+00,  9.23879533e-01, -3.82683432e-01,
+  7.07106781e-01, -7.07106781e-01,  3.82683432e-01, -9.23879533e-01,
+  1.00000000e+00,  0.00000000e+00,  9.23879533e-01, -3.82683432e-01,
+  7.07106781e-01, -7.07106781e-01,  3.82683432e-01, -9.23879533e-01,
+
+  1.00000000e+00,  0.00000000e+00,  7.07106781e-01, -7.07106781e-01,
+  6.12323400e-17, -1.00000000e+00, -7.07106781e-01, -7.07106781e-01,
+  1.00000000e+00,  0.00000000e+00,  7.07106781e-01, -7.07106781e-01,
+  6.12323400e-17, -1.00000000e+00, -7.07106781e-01, -7.07106781e-01,
+
+  1.00000000e+00,  0.00000000e+00,  3.82683432e-01, -9.23879533e-01,
+ -7.07106781e-01, -7.07106781e-01, -9.23879533e-01,  3.82683432e-01,
+  1.00000000e+00,  0.00000000e+00,  3.82683432e-01, -9.23879533e-01,
+ -7.07106781e-01, -7.07106781e-01, -9.23879533e-01,  3.82683432e-01
+};
 #if defined(TWIDDLE_Q14)
 /*
 [16384.    +0.j 15137. -6270.j 11585.-11585.j  6270.-15137.j]
@@ -1289,10 +1382,10 @@ static inline void dft16(int16_t *x,int16_t *y) __attribute__((always_inline)
 //#define USE_DFT16_SHIFT
 
 // Does two 16-point DFTS (x[0 .. 15] is 128 LSBs of input vector, x[16..31] is in 128 MSBs)
-__attribute__((always_inline)) static inline void dft16_simd256(int16_t *x, int16_t *y,int scale)
+__attribute__((always_inline)) static inline void dft16_simd256(int16_t *x, int16_t *y, int scale, bool doFloat)
 {
-  simde__m256i *tw16a_256 = (simde__m256i *)tw16arep, *tw16b_256 = (simde__m256i *)tw16brep, *x256 = (simde__m256i *)x,
-               *y256 = (simde__m256i *)y;
+  simde__m256i *x256 = (simde__m256i *)x;
+  simde__m256i *y256 = (simde__m256i *)y;
 
   simde__m256i x1_flip, x3_flip, x02t, x13t;
   simde__m256i ytmp0, ytmp1, ytmp2, ytmp3, xtmp0, xtmp1, xtmp2, xtmp3;
@@ -1366,14 +1459,26 @@ __attribute__((always_inline)) static inline void dft16_simd256(int16_t *x, int1
   xtmp3   = simde_mm256_unpackhi_epi64(ytmp1,ytmp3); // x3 x11 x19 x27 x7 x15 x23 x31
 
   // Second stage : 4 Radix-4 butterflies with input twiddles
-  xtmp1 = packed_cmult2_256(xtmp1,tw16a_256[0],tw16b_256[0]);
-  xtmp2 = packed_cmult2_256(xtmp2,tw16a_256[1],tw16b_256[1]);
-  xtmp3 = packed_cmult2_256(xtmp3,tw16a_256[2],tw16b_256[2]);
+  if (doFloat) {
+    simde__m256i *tw16 = (simde__m256i *)tw16rep;
+    simde__m256 *tw16f = (simde__m256 *)tw16repf;
+    xtmp1 = pack_cmult_float_256(xtmp1, tw16, tw16f);
+    xtmp2 = pack_cmult_float_256(xtmp2, tw16 + 1, tw16f + 2);
+    xtmp3 = pack_cmult_float_256(xtmp3, tw16 + 2, tw16f + 4);
+  } else {
+    simde__m256i *tw16a_256 = (simde__m256i *)tw16arep;
+    simde__m256i *tw16b_256 = (simde__m256i *)tw16brep;
+    xtmp1 = packed_cmult2_256(xtmp1, tw16a_256[0], tw16b_256[0]);
+    xtmp2 = packed_cmult2_256(xtmp2, tw16a_256[1], tw16b_256[1]);
+    xtmp3 = packed_cmult2_256(xtmp3, tw16a_256[2], tw16b_256[2]);
+  }
 
-  /*  print_shorts256("xtmp0",(int16_t*)&xtmp0);
+  /*
+  print_shorts256("xtmp0",(int16_t*)&xtmp0);
   print_shorts256("xtmp1",(int16_t*)&xtmp1);
   print_shorts256("xtmp2",(int16_t*)&xtmp2);
-  print_shorts256("xtmp3",(int16_t*)&xtmp3);*/
+  print_shorts256("xtmp3",(int16_t*)&xtmp3);
+  */
 
   x02t    = simde_mm256_adds_epi16(xtmp0,xtmp2);
   x13t    = simde_mm256_adds_epi16(xtmp1,xtmp3);
@@ -1427,7 +1532,7 @@ void dft16(int16_t *x, int16_t *y, unsigned int *scale)
   */
   simde__m256i ytmp[4];
   const unsigned int scale16 = scale ? scale[0] : 0;
-  dft16_simd256((int16_t *)xtmp, (int16_t *)ytmp, scale16);
+  dft16_simd256((int16_t *)xtmp, (int16_t *)ytmp, scale16, true);
 
   simde__m256i *y256 = (simde__m256i *)y;
   y256[0] = ytmp[0];
@@ -1605,6 +1710,32 @@ __attribute__((always_inline)) static inline void idft16_simd256(int16_t *x, int
 }
 // 64-point optimized DFT
 
+const static float tw64f[96] __attribute__((aligned(32))) = {
+  1.00000000e+00,  0.00000000e+00,  9.95184727e-01, -9.80171403e-02,
+  9.80785280e-01, -1.95090322e-01,  9.56940336e-01, -2.90284677e-01,
+  9.23879533e-01, -3.82683432e-01,  8.81921264e-01, -4.71396737e-01,
+  8.31469612e-01, -5.55570233e-01,  7.73010453e-01, -6.34393284e-01,
+  7.07106781e-01, -7.07106781e-01,  6.34393284e-01, -7.73010453e-01,
+  5.55570233e-01, -8.31469612e-01,  4.71396737e-01, -8.81921264e-01,
+  3.82683432e-01, -9.23879533e-01,  2.90284677e-01, -9.56940336e-01,
+  1.95090322e-01, -9.80785280e-01,  9.80171403e-02, -9.95184727e-01,
+  1.00000000e+00,  0.00000000e+00,  9.80785280e-01, -1.95090322e-01,
+  9.23879533e-01, -3.82683432e-01,  8.31469612e-01, -5.55570233e-01,
+  7.07106781e-01, -7.07106781e-01,  5.55570233e-01, -8.31469612e-01,
+  3.82683432e-01, -9.23879533e-01,  1.95090322e-01, -9.80785280e-01,
+  6.12323400e-17, -1.00000000e+00, -1.95090322e-01, -9.80785280e-01,
+ -3.82683432e-01, -9.23879533e-01, -5.55570233e-01, -8.31469612e-01,
+ -7.07106781e-01, -7.07106781e-01, -8.31469612e-01, -5.55570233e-01,
+ -9.23879533e-01, -3.82683432e-01, -9.80785280e-01, -1.95090322e-01,
+  1.00000000e+00,  0.00000000e+00,  9.56940336e-01, -2.90284677e-01,
+  8.31469612e-01, -5.55570233e-01,  6.34393284e-01, -7.73010453e-01,
+  3.82683432e-01, -9.23879533e-01,  9.80171403e-02, -9.95184727e-01,
+ -1.95090322e-01, -9.80785280e-01, -4.71396737e-01, -8.81921264e-01,
+ -7.07106781e-01, -7.07106781e-01, -8.81921264e-01, -4.71396737e-01,
+ -9.80785280e-01, -1.95090322e-01, -9.95184727e-01,  9.80171403e-02,
+ -9.23879533e-01,  3.82683432e-01, -7.73010453e-01,  6.34393284e-01,
+ -5.55570233e-01,  8.31469612e-01, -2.90284677e-01,  9.56940336e-01
+};
 #ifdef TWIDDLE_Q14
 const static int16_t tw64[96] __attribute__((aligned(32))) = {
   16384,      0,  16305,  -1606,  16069,  -3196,  15679,  -4756,
@@ -1702,8 +1833,7 @@ const static int16_t tw64c[96] __attribute__((aligned(32))) = {
 
 void dft64(int16_t *x,int16_t *y,unsigned int *scale)
 {
-  simd256_q15_t xtmp[8], ytmp[8], *tw64a_256 = (simd256_q15_t *)tw64a, *tw64b_256 = (simd256_q15_t *)tw64b,
-                                  *x256 = (simd256_q15_t *)x, *y256 = (simd256_q15_t *)y;
+  simd256_q15_t xtmp[8], ytmp[8], *x256 = (simd256_q15_t *)x, *y256 = (simd256_q15_t *)y;
 
   int scale16=0;
   if (scale) scale16 = scale[1];
@@ -1751,7 +1881,7 @@ void dft64(int16_t *x,int16_t *y,unsigned int *scale)
   xtmp[6] = _mm256_permutex2var_epi32(x256[4],perm_mask2,x256[5]); // x33 x37 x41 x45 x35 x39 x43 x46 
   xtmp[7] = _mm256_permutex2var_epi32(x256[6],perm_mask2,x256[7]); // x49 x53 x57 x61 x51 x55 x59 x63 
 #endif
-  dft16_simd256((int16_t*)(xtmp),(int16_t*)ytmp,scale16);
+  dft16_simd256((int16_t *)(xtmp), (int16_t *)ytmp, scale16, true);
   // [y0  y1  y2  y3  y4  y5  y6  y7]
   // [y8  y9  y10 y11 y12 y13 y14 y15]
   // [y16 y17 y18 y19 y20 y21 y22 y23]
@@ -1762,7 +1892,7 @@ void dft64(int16_t *x,int16_t *y,unsigned int *scale)
   print_shorts256("ytmp2",(int16_t*)(ytmp+2));
   print_shorts256("ytmp3",(int16_t*)(ytmp+3));
   */
-  dft16_simd256((int16_t*)(xtmp+4),(int16_t*)(ytmp+4),scale16);
+  dft16_simd256((int16_t *)(xtmp + 4), (int16_t *)(ytmp + 4), scale16, true);
   // [y32 y33 y34 y35 y36 y37 y38 y39]
   // [y40 y41 y42 y43 y44 y45 y46 y47]
   // [y48 y49 y50 y51 y52 y53 y54 y55]
@@ -1778,20 +1908,43 @@ void dft64(int16_t *x,int16_t *y,unsigned int *scale)
   start_meas(&ts_b);
 #endif
 
-
-  bfly4_16_256(ytmp,ytmp+2,ytmp+4,ytmp+6,
-	       y256,y256+2,y256+4,y256+6,
-	       tw64a_256,tw64a_256+2,tw64a_256+4,
-	       tw64b_256,tw64b_256+2,tw64b_256+4);
+  simde__m256 *tw64f_256 = (simde__m256 *)tw64f;
+  simde__m256i *tw64_256 = (simde__m256i *)tw64;
+  bfly4_16_256(ytmp,
+               ytmp + 2,
+               ytmp + 4,
+               ytmp + 6,
+               y256,
+               y256 + 2,
+               y256 + 4,
+               y256 + 6,
+               tw64_256,
+               tw64_256 + 2,
+               tw64_256 + 4,
+               tw64f_256,
+               tw64f_256 + 4,
+               tw64f_256 + 8,
+               true);
   // [y0  y1  y2  y3  y4  y5  y6  y7]
   // [y16 y17 y18 y19 y20 y21 y22 y23]
   // [y32 y33 y34 y35 y36 y37 y38 y39]
   // [y48 y49 y50 y51 y52 y53 y54 y55]
 
-  bfly4_16_256(ytmp+1,ytmp+3,ytmp+5,ytmp+7,
-	       y256+1,y256+3,y256+5,y256+7,
-	       tw64a_256+1,tw64a_256+3,tw64a_256+5,
-	       tw64b_256+1,tw64b_256+3,tw64b_256+5);
+  bfly4_16_256(ytmp + 1,
+               ytmp + 3,
+               ytmp + 5,
+               ytmp + 7,
+               y256 + 1,
+               y256 + 3,
+               y256 + 5,
+               y256 + 7,
+               tw64_256 + 1,
+               tw64_256 + 3,
+               tw64_256 + 5,
+               tw64f_256 + 2,
+               tw64f_256 + 6,
+               tw64f_256 + 10,
+               true);
   // [y8  y9  y10 y11 y12 y13 y14 y15]
   // [y24 y25 y26 y27 y28 y29 y30 y31]
   // [y40 y41 y42 y43 y44 y45 y46 y47]
@@ -1936,11 +2089,68 @@ void idft64(int16_t *x,int16_t *y,unsigned int *scale)
 
 }
 
+#ifdef TWIDDLE_Q14
+static const int16_t tw128[128] __attribute__((aligned(32))) = {
+  16384,      0,  16364,   -804,  16305,  -1606,  16207,  -2404,
+  16069,  -3196,  15893,  -3981,  15679,  -4756,  15426,  -5520,
+  15137,  -6270,  14811,  -7005,  14449,  -7723,  14053,  -8423,
+  13623,  -9102,  13160,  -9760,  12665, -10394,  12140, -11003,
+  11585, -11585,  11003, -12140,  10394, -12665,   9760, -13160,
+   9102, -13623,   8423, -14053,   7723, -14449,   7005, -14811,
+   6270, -15137,   5520, -15426,   4756, -15679,   3981, -15893,
+   3196, -16069,   2404, -16207,   1606, -16305,    804, -16364,
+      0, -16384,   -804, -16364,  -1606, -16305,  -2404, -16207,
+  -3196, -16069,  -3981, -15893,  -4756, -15679,  -5520, -15426,
+  -6270, -15137,  -7005, -14811,  -7723, -14449,  -8423, -14053,
+  -9102, -13623,  -9760, -13160, -10394, -12665, -11003, -12140,
+ -11585, -11585, -12140, -11003, -12665, -10394, -13160,  -9760,
+ -13623,  -9102, -14053,  -8423, -14449,  -7723, -14811,  -7005,
+ -15137,  -6270, -15426,  -5520, -15679,  -4756, -15893,  -3981,
+ -16069,  -3196, -16207,  -2404, -16305,  -1606, -16364,   -804,
+};
+static const int16_t tw128a[128] __attribute__((aligned(32))) = {
+  16384,      0,  16364,    804,  16305,   1606,  16207,   2404,
+  16069,   3196,  15893,   3981,  15679,   4756,  15426,   5520,
+  15137,   6270,  14811,   7005,  14449,   7723,  14053,   8423,
+  13623,   9102,  13160,   9760,  12665,  10394,  12140,  11003,
+  11585,  11585,  11003,  12140,  10394,  12665,   9760,  13160,
+   9102,  13623,   8423,  14053,   7723,  14449,   7005,  14811,
+   6270,  15137,   5520,  15426,   4756,  15679,   3981,  15893,
+   3196,  16069,   2404,  16207,   1606,  16305,    804,  16364,
+      0,  16384,   -804,  16364,  -1606,  16305,  -2404,  16207,
+  -3196,  16069,  -3981,  15893,  -4756,  15679,  -5520,  15426,
+  -6270,  15137,  -7005,  14811,  -7723,  14449,  -8423,  14053,
+  -9102,  13623,  -9760,  13160, -10394,  12665, -11003,  12140,
+ -11585,  11585, -12140,  11003, -12665,  10394, -13160,   9760,
+ -13623,   9102, -14053,   8423, -14449,   7723, -14811,   7005,
+ -15137,   6270, -15426,   5520, -15679,   4756, -15893,   3981,
+ -16069,   3196, -16207,   2404, -16305,   1606, -16364,    804,
+};
+static const int16_t tw128b[128] __attribute__((aligned(32))) = {
+      0,  16384,   -804,  16364,  -1606,  16305,  -2404,  16207,
+  -3196,  16069,  -3981,  15893,  -4756,  15679,  -5520,  15426,
+  -6270,  15137,  -7005,  14811,  -7723,  14449,  -8423,  14053,
+  -9102,  13623,  -9760,  13160, -10394,  12665, -11003,  12140,
+ -11585,  11585, -12140,  11003, -12665,  10394, -13160,   9760,
+ -13623,   9102, -14053,   8423, -14449,   7723, -14811,   7005,
+ -15137,   6270, -15426,   5520, -15679,   4756, -15893,   3981,
+ -16069,   3196, -16207,   2404, -16305,   1606, -16364,    804,
+ -16384,      0, -16364,   -804, -16305,  -1606, -16207,  -2404,
+ -16069,  -3196, -15893,  -3981, -15679,  -4756, -15426,  -5520,
+ -15137,  -6270, -14811,  -7005, -14449,  -7723, -14053,  -8423,
+ -13623,  -9102, -13160,  -9760, -12665, -10394, -12140, -11003,
+ -11585, -11585, -11003, -12140, -10394, -12665,  -9760, -13160,
+  -9102, -13623,  -8423, -14053,  -7723, -14449,  -7005, -14811,
+  -6270, -15137,  -5520, -15426,  -4756, -15679,  -3981, -15893,
+  -3196, -16069,  -2404, -16207,  -1606, -16305,   -804, -16364,
+};
+#else
 static const int16_t tw128[128] __attribute__((aligned(32))) = {  32767,0,32727,-1608,32609,-3212,32412,-4808,32137,-6393,31785,-7962,31356,-9512,30851,-11039,30272,-12540,29621,-14010,28897,-15447,28105,-16846,27244,-18205,26318,-19520,25329,-20788,24278,-22005,23169,-23170,22004,-24279,20787,-25330,19519,-26319,18204,-27245,16845,-28106,15446,-28898,14009,-29622,12539,-30273,11038,-30852,9511,-31357,7961,-31786,6392,-32138,4807,-32413,3211,-32610,1607,-32728,0,-32767,-1608,-32728,-3212,-32610,-4808,-32413,-6393,-32138,-7962,-31786,-9512,-31357,-11039,-30852,-12540,-30273,-14010,-29622,-15447,-28898,-16846,-28106,-18205,-27245,-19520,-26319,-20788,-25330,-22005,-24279,-23170,-23170,-24279,-22005,-25330,-20788,-26319,-19520,-27245,-18205,-28106,-16846,-28898,-15447,-29622,-14010,-30273,-12540,-30852,-11039,-31357,-9512,-31786,-7962,-32138,-6393,-32413,-4808,-32610,-3212,-32728,-1608};
 
 static const int16_t tw128a[128] __attribute__((aligned(32))) = { 32767,0,32727,1608,32609,3212,32412,4808,32137,6393,31785,7962,31356,9512,30851,11039,30272,12540,29621,14010,28897,15447,28105,16846,27244,18205,26318,19520,25329,20788,24278,22005,23169,23170,22004,24279,20787,25330,19519,26319,18204,27245,16845,28106,15446,28898,14009,29622,12539,30273,11038,30852,9511,31357,7961,31786,6392,32138,4807,32413,3211,32610,1607,32728,0,32767,-1608,32728,-3212,32610,-4808,32413,-6393,32138,-7962,31786,-9512,31357,-11039,30852,-12540,30273,-14010,29622,-15447,28898,-16846,28106,-18205,27245,-19520,26319,-20788,25330,-22005,24279,-23170,23170,-24279,22005,-25330,20788,-26319,19520,-27245,18205,-28106,16846,-28898,15447,-29622,14010,-30273,12540,-30852,11039,-31357,9512,-31786,7962,-32138,6393,-32413,4808,-32610,3212,-32728,1608};
 
 static const int16_t tw128b[128] __attribute__((aligned(32))) = {0,32767,-1608,32727,-3212,32609,-4808,32412,-6393,32137,-7962,31785,-9512,31356,-11039,30851,-12540,30272,-14010,29621,-15447,28897,-16846,28105,-18205,27244,-19520,26318,-20788,25329,-22005,24278,-23170,23169,-24279,22004,-25330,20787,-26319,19519,-27245,18204,-28106,16845,-28898,15446,-29622,14009,-30273,12539,-30852,11038,-31357,9511,-31786,7961,-32138,6392,-32413,4807,-32610,3211,-32728,1607,-32767,0,-32728,-1608,-32610,-3212,-32413,-4808,-32138,-6393,-31786,-7962,-31357,-9512,-30852,-11039,-30273,-12540,-29622,-14010,-28898,-15447,-28106,-16846,-27245,-18205,-26319,-19520,-25330,-20788,-24279,-22005,-23170,-23170,-22005,-24279,-20788,-25330,-19520,-26319,-18205,-27245,-16846,-28106,-15447,-28898,-14010,-29622,-12540,-30273,-11039,-30852,-9512,-31357,-7962,-31786,-6393,-32138,-4808,-32413,-3212,-32610,-1608,-32728};
+#endif
 
 void dft128(int16_t *x,int16_t *y,unsigned int *scale)
 {
@@ -2113,6 +2323,21 @@ void idft128(int16_t *x,int16_t *y,unsigned int *scale)
 
 }
 
+static const float tw256f[48] __attribute__((aligned(32))) = {
+  1.00000000e+00,  0.00000000e+00,  9.99698819e-01, -2.45412285e-02,
+  9.98795456e-01, -4.90676743e-02,  9.97290457e-01, -7.35645636e-02,
+  9.95184727e-01, -9.80171403e-02,  9.92479535e-01, -1.22410675e-01,
+  9.89176510e-01, -1.46730474e-01,  9.85277642e-01, -1.70961889e-01,
+  1.00000000e+00,  0.00000000e+00,  9.98795456e-01, -4.90676743e-02,
+  9.95184727e-01, -9.80171403e-02,  9.89176510e-01, -1.46730474e-01,
+  9.80785280e-01, -1.95090322e-01,  9.70031253e-01, -2.42980180e-01,
+  9.56940336e-01, -2.90284677e-01,  9.41544065e-01, -3.36889853e-01,
+  1.00000000e+00,  0.00000000e+00,  9.97290457e-01, -7.35645636e-02,
+  9.89176510e-01, -1.46730474e-01,  9.75702130e-01, -2.19101240e-01,
+  9.56940336e-01, -2.90284677e-01,  9.32992799e-01, -3.59895037e-01,
+  9.03989293e-01, -4.27555093e-01,  8.70086991e-01, -4.92898192e-01
+};
+
 #ifdef TWIDDLE_Q14
 static const int16_t tw256[384] __attribute__((aligned(32))) = {
   16384,      0,  16379,   -402,  16364,   -804,  16340,  -1205,
@@ -2312,39 +2537,128 @@ void dft256(int16_t *x,int16_t *y,unsigned int *scale)
   dft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),scale64);
   dft64((int16_t*)(xtmp+24),(int16_t*)(ytmp+24),scale64);
 
-
-  bfly4_16_256(ytmpp,ytmpp+8,ytmpp+16,ytmpp+24,
-	       y256p,y256p+8,y256p+16,y256p+24,
-	       tw256a_256p,tw256a_256p+8,tw256a_256p+16,
-	       tw256b_256p,tw256b_256p+8,tw256b_256p+16);
-  bfly4_16_256(ytmpp+1,ytmpp+9,ytmpp+17,ytmpp+25,
-	       y256p+1,y256p+9,y256p+17,y256p+25,
-	       tw256a_256p+1,tw256a_256p+9,tw256a_256p+17,
-	       tw256b_256p+1,tw256b_256p+9,tw256b_256p+17);
-  bfly4_16_256(ytmpp+2,ytmpp+10,ytmpp+18,ytmpp+26,
-	       y256p+2,y256p+10,y256p+18,y256p+26,
-	       tw256a_256p+2,tw256a_256p+10,tw256a_256p+18,
-	       tw256b_256p+2,tw256b_256p+10,tw256b_256p+18);
-  bfly4_16_256(ytmpp+3,ytmpp+11,ytmpp+19,ytmpp+27,
-	       y256p+3,y256p+11,y256p+19,y256p+27,
-	       tw256a_256p+3,tw256a_256p+11,tw256a_256p+19,
-	       tw256b_256p+3,tw256b_256p+11,tw256b_256p+19);
-  bfly4_16_256(ytmpp+4,ytmpp+12,ytmpp+20,ytmpp+28,
-	       y256p+4,y256p+12,y256p+20,y256p+28,
-	       tw256a_256p+4,tw256a_256p+12,tw256a_256p+20,
-	       tw256b_256p+4,tw256b_256p+12,tw256b_256p+20);
-  bfly4_16_256(ytmpp+5,ytmpp+13,ytmpp+21,ytmpp+29,
-	       y256p+5,y256p+13,y256p+21,y256p+29,
-	       tw256a_256p+5,tw256a_256p+13,tw256a_256p+21,
-	       tw256b_256p+5,tw256b_256p+13,tw256b_256p+21);
-  bfly4_16_256(ytmpp+6,ytmpp+14,ytmpp+22,ytmpp+30,
-	       y256p+6,y256p+14,y256p+22,y256p+30,
-	       tw256a_256p+6,tw256a_256p+14,tw256a_256p+22,
-	       tw256b_256p+6,tw256b_256p+14,tw256b_256p+22);
-  bfly4_16_256(ytmpp+7,ytmpp+15,ytmpp+23,ytmpp+31,
-	       y256p+7,y256p+15,y256p+23,y256p+31,
-	       tw256a_256p+7,tw256a_256p+15,tw256a_256p+23,
-	       tw256b_256p+7,tw256b_256p+15,tw256b_256p+23);
+  simde__m256 *tw256f_256 = (simde__m256 *)tw256f;
+  simde__m256i *tw256_256 = (simde__m256i *)tw256;
+  bfly4_16_256(ytmpp,
+               ytmpp + 8,
+               ytmpp + 16,
+               ytmpp + 24,
+               y256p,
+               y256p + 8,
+               y256p + 16,
+               y256p + 24,
+               tw256_256,
+               tw256_256 + 8,
+               tw256_256 + 16,
+               tw256f_256,
+               tw256f_256 + 2,
+               tw256f_256 + 4,
+               true);
+  bfly4_16_256(ytmpp + 1,
+               ytmpp + 9,
+               ytmpp + 17,
+               ytmpp + 25,
+               y256p + 1,
+               y256p + 9,
+               y256p + 17,
+               y256p + 25,
+               tw256a_256p + 1,
+               tw256a_256p + 9,
+               tw256a_256p + 17,
+               tw256b_256p + 1,
+               tw256b_256p + 9,
+               tw256b_256p + 17,
+               false);
+  bfly4_16_256(ytmpp + 2,
+               ytmpp + 10,
+               ytmpp + 18,
+               ytmpp + 26,
+               y256p + 2,
+               y256p + 10,
+               y256p + 18,
+               y256p + 26,
+               tw256a_256p + 2,
+               tw256a_256p + 10,
+               tw256a_256p + 18,
+               tw256b_256p + 2,
+               tw256b_256p + 10,
+               tw256b_256p + 18,
+               false);
+  bfly4_16_256(ytmpp + 3,
+               ytmpp + 11,
+               ytmpp + 19,
+               ytmpp + 27,
+               y256p + 3,
+               y256p + 11,
+               y256p + 19,
+               y256p + 27,
+               tw256a_256p + 3,
+               tw256a_256p + 11,
+               tw256a_256p + 19,
+               tw256b_256p + 3,
+               tw256b_256p + 11,
+               tw256b_256p + 19,
+               false);
+  bfly4_16_256(ytmpp + 4,
+               ytmpp + 12,
+               ytmpp + 20,
+               ytmpp + 28,
+               y256p + 4,
+               y256p + 12,
+               y256p + 20,
+               y256p + 28,
+               tw256a_256p + 4,
+               tw256a_256p + 12,
+               tw256a_256p + 20,
+               tw256b_256p + 4,
+               tw256b_256p + 12,
+               tw256b_256p + 20,
+               false);
+  bfly4_16_256(ytmpp + 5,
+               ytmpp + 13,
+               ytmpp + 21,
+               ytmpp + 29,
+               y256p + 5,
+               y256p + 13,
+               y256p + 21,
+               y256p + 29,
+               tw256a_256p + 5,
+               tw256a_256p + 13,
+               tw256a_256p + 21,
+               tw256b_256p + 5,
+               tw256b_256p + 13,
+               tw256b_256p + 21,
+               false);
+  bfly4_16_256(ytmpp + 6,
+               ytmpp + 14,
+               ytmpp + 22,
+               ytmpp + 30,
+               y256p + 6,
+               y256p + 14,
+               y256p + 22,
+               y256p + 30,
+               tw256a_256p + 6,
+               tw256a_256p + 14,
+               tw256a_256p + 22,
+               tw256b_256p + 6,
+               tw256b_256p + 14,
+               tw256b_256p + 22,
+               false);
+  bfly4_16_256(ytmpp + 7,
+               ytmpp + 15,
+               ytmpp + 23,
+               ytmpp + 31,
+               y256p + 7,
+               y256p + 15,
+               y256p + 23,
+               y256p + 31,
+               tw256a_256p + 7,
+               tw256a_256p + 15,
+               tw256a_256p + 23,
+               tw256b_256p + 7,
+               tw256b_256p + 15,
+               tw256b_256p + 23,
+               false);
 
   if (scale && *scale>0) {
     unsigned int scalec=*scale;
@@ -2457,9 +2771,78 @@ void idft256(int16_t *x,int16_t *y,unsigned int *scale)
 
 }
 
+#ifdef TWIDDLE_Q14
+static const int16_t tw512[512] __attribute__((aligned(32))) = {
+  16384,      0,  16383,   -201,  16379,   -402,  16373,   -603,
+  16364,   -804,  16353,  -1005,  16340,  -1205,  16324,  -1406,
+  16305,  -1606,  16284,  -1806,  16261,  -2006,  16235,  -2205,
+  16207,  -2404,  16176,  -2603,  16143,  -2801,  16107,  -2999,
+  16069,  -3196,  16029,  -3393,  15986,  -3590,  15941,  -3786,
+  15893,  -3981,  15843,  -4176,  15791,  -4370,  15736,  -4563,
+  15679,  -4756,  15619,  -4948,  15557,  -5139,  15493,  -5330,
+  15426,  -5520,  15357,  -5708,  15286,  -5897,  15213,  -6084,
+  15137,  -6270,  15059,  -6455,  14978,  -6639,  14896,  -6823,
+  14811,  -7005,  14724,  -7186,  14635,  -7366,  14543,  -7545,
+  14449,  -7723,  14354,  -7900,  14256,  -8076,  14155,  -8250,
+  14053,  -8423,  13949,  -8595,  13842,  -8765,  13733,  -8935,
+  13623,  -9102,  13510,  -9269,  13395,  -9434,  13279,  -9598,
+  13160,  -9760,  13039,  -9921,  12916, -10080,  12792, -10238,
+  12665, -10394,  12537, -10549,  12406, -10702,  12274, -10853,
+  12140, -11003,  12004, -11151,  11866, -11297,  11727, -11442,
+  11585, -11585,  11442, -11727,  11297, -11866,  11151, -12004,
+  11003, -12140,  10853, -12274,  10702, -12406,  10549, -12537,
+  10394, -12665,  10238, -12792,  10080, -12916,   9921, -13039,
+   9760, -13160,   9598, -13279,   9434, -13395,   9269, -13510,
+   9102, -13623,   8935, -13733,   8765, -13842,   8595, -13949,
+   8423, -14053,   8250, -14155,   8076, -14256,   7900, -14354,
+   7723, -14449,   7545, -14543,   7366, -14635,   7186, -14724,
+   7005, -14811,   6823, -14896,   6639, -14978,   6455, -15059,
+   6270, -15137,   6084, -15213,   5897, -15286,   5708, -15357,
+   5520, -15426,   5330, -15493,   5139, -15557,   4948, -15619,
+   4756, -15679,   4563, -15736,   4370, -15791,   4176, -15843,
+   3981, -15893,   3786, -15941,   3590, -15986,   3393, -16029,
+   3196, -16069,   2999, -16107,   2801, -16143,   2603, -16176,
+   2404, -16207,   2205, -16235,   2006, -16261,   1806, -16284,
+   1606, -16305,   1406, -16324,   1205, -16340,   1005, -16353,
+    804, -16364,    603, -16373,    402, -16379,    201, -16383,
+      0, -16384,   -201, -16383,   -402, -16379,   -603, -16373,
+   -804, -16364,  -1005, -16353,  -1205, -16340,  -1406, -16324,
+  -1606, -16305,  -1806, -16284,  -2006, -16261,  -2205, -16235,
+  -2404, -16207,  -2603, -16176,  -2801, -16143,  -2999, -16107,
+  -3196, -16069,  -3393, -16029,  -3590, -15986,  -3786, -15941,
+  -3981, -15893,  -4176, -15843,  -4370, -15791,  -4563, -15736,
+  -4756, -15679,  -4948, -15619,  -5139, -15557,  -5330, -15493,
+  -5520, -15426,  -5708, -15357,  -5897, -15286,  -6084, -15213,
+  -6270, -15137,  -6455, -15059,  -6639, -14978,  -6823, -14896,
+  -7005, -14811,  -7186, -14724,  -7366, -14635,  -7545, -14543,
+  -7723, -14449,  -7900, -14354,  -8076, -14256,  -8250, -14155,
+  -8423, -14053,  -8595, -13949,  -8765, -13842,  -8935, -13733,
+  -9102, -13623,  -9269, -13510,  -9434, -13395,  -9598, -13279,
+  -9760, -13160,  -9921, -13039, -10080, -12916, -10238, -12792,
+ -10394, -12665, -10549, -12537, -10702, -12406, -10853, -12274,
+ -11003, -12140, -11151, -12004, -11297, -11866, -11442, -11727,
+ -11585, -11585, -11727, -11442, -11866, -11297, -12004, -11151,
+ -12140, -11003, -12274, -10853, -12406, -10702, -12537, -10549,
+ -12665, -10394, -12792, -10238, -12916, -10080, -13039,  -9921,
+ -13160,  -9760, -13279,  -9598, -13395,  -9434, -13510,  -9269,
+ -13623,  -9102, -13733,  -8935, -13842,  -8765, -13949,  -8595,
+ -14053,  -8423, -14155,  -8250, -14256,  -8076, -14354,  -7900,
+ -14449,  -7723, -14543,  -7545, -14635,  -7366, -14724,  -7186,
+ -14811,  -7005, -14896,  -6823, -14978,  -6639, -15059,  -6455,
+ -15137,  -6270, -15213,  -6084, -15286,  -5897, -15357,  -5708,
+ -15426,  -5520, -15493,  -5330, -15557,  -5139, -15619,  -4948,
+ -15679,  -4756, -15736,  -4563, -15791,  -4370, -15843,  -4176,
+ -15893,  -3981, -15941,  -3786, -15986,  -3590, -16029,  -3393,
+ -16069,  -3196, -16107,  -2999, -16143,  -2801, -16176,  -2603,
+ -16207,  -2404, -16235,  -2205, -16261,  -2006, -16284,  -1806,
+ -16305,  -1606, -16324,  -1406, -16340,  -1205, -16353,  -1005,
+ -16364,   -804, -16373,   -603, -16379,   -402, -16383,   -201,
+};
+#else
 static const int16_t tw512[512] __attribute__((aligned(32))) = {
  32767,0,32764,-403,32757,-805,32744,-1207,32727,-1608,32705,-2010,32678,-2411,32646,-2812,32609,-3212,32567,-3612,32520,-4012,32468,-4410,32412,-4808,32350,-5206,32284,-5602,32213,-5998,32137,-6393,32056,-6787,31970,-7180,31880,-7572,31785,-7962,31684,-8352,31580,-8740,31470,-9127,31356,-9512,31236,-9896,31113,-10279,30984,-10660,30851,-11039,30713,-11417,30571,-11793,30424,-12167,30272,-12540,30116,-12910,29955,-13279,29790,-13646,29621,-14010,29446,-14373,29268,-14733,29085,-15091,28897,-15447,28706,-15800,28510,-16151,28309,-16500,28105,-16846,27896,-17190,27683,-17531,27466,-17869,27244,-18205,27019,-18538,26789,-18868,26556,-19195,26318,-19520,26077,-19841,25831,-20160,25582,-20475,25329,-20788,25072,-21097,24811,-21403,24546,-21706,24278,-22005,24006,-22302,23731,-22595,23452,-22884,23169,-23170,22883,-23453,22594,-23732,22301,-24007,22004,-24279,21705,-24547,21402,-24812,21096,-25073,20787,-25330,20474,-25583,20159,-25832,19840,-26078,19519,-26319,19194,-26557,18867,-26790,18537,-27020,18204,-27245,17868,-27467,17530,-27684,17189,-27897,16845,-28106,16499,-28310,16150,-28511,15799,-28707,15446,-28898,15090,-29086,14732,-29269,14372,-29447,14009,-29622,13645,-29791,13278,-29956,12909,-30117,12539,-30273,12166,-30425,11792,-30572,11416,-30714,11038,-30852,10659,-30985,10278,-31114,9895,-31237,9511,-31357,9126,-31471,8739,-31581,8351,-31685,7961,-31786,7571,-31881,7179,-31971,6786,-32057,6392,-32138,5997,-32214,5601,-32285,5205,-32351,4807,-32413,4409,-32469,4011,-32521,3611,-32568,3211,-32610,2811,-32647,2410,-32679,2009,-32706,1607,-32728,1206,-32745,804,-32758,402,-32765,0,-32767,-403,-32765,-805,-32758,-1207,-32745,-1608,-32728,-2010,-32706,-2411,-32679,-2812,-32647,-3212,-32610,-3612,-32568,-4012,-32521,-4410,-32469,-4808,-32413,-5206,-32351,-5602,-32285,-5998,-32214,-6393,-32138,-6787,-32057,-7180,-31971,-7572,-31881,-7962,-31786,-8352,-31685,-8740,-31581,-9127,-31471,-9512,-31357,-9896,-31237,-10279,-31114,-10660,-30985,-11039,-30852,-11417,-30714,-11793,-30572,-12167,-30425,-12540,-30273,-12910,-30117,-13279,-29956,-13646,-29791,-14010,-29622,-14373,-29447,-14733,-29269,-15091,-29086,-15447,-28898,-15800,-28707,-16151,-28511,-16500,-28310,-16846,-28106,-17190,-27897,-17531,-27684,-17869,-27467,-18205,-27245,-18538,-27020,-18868,-26790,-19195,-26557,-19520,-26319,-19841,-26078,-20160,-25832,-20475,-25583,-20788,-25330,-21097,-25073,-21403,-24812,-21706,-24547,-22005,-24279,-22302,-24007,-22595,-23732,-22884,-23453,-23170,-23170,-23453,-22884,-23732,-22595,-24007,-22302,-24279,-22005,-24547,-21706,-24812,-21403,-25073,-21097,-25330,-20788,-25583,-20475,-25832,-20160,-26078,-19841,-26319,-19520,-26557,-19195,-26790,-18868,-27020,-18538,-27245,-18205,-27467,-17869,-27684,-17531,-27897,-17190,-28106,-16846,-28310,-16500,-28511,-16151,-28707,-15800,-28898,-15447,-29086,-15091,-29269,-14733,-29447,-14373,-29622,-14010,-29791,-13646,-29956,-13279,-30117,-12910,-30273,-12540,-30425,-12167,-30572,-11793,-30714,-11417,-30852,-11039,-30985,-10660,-31114,-10279,-31237,-9896,-31357,-9512,-31471,-9127,-31581,-8740,-31685,-8352,-31786,-7962,-31881,-7572,-31971,-7180,-32057,-6787,-32138,-6393,-32214,-5998,-32285,-5602,-32351,-5206,-32413,-4808,-32469,-4410,-32521,-4012,-32568,-3612,-32610,-3212,-32647,-2812,-32679,-2411,-32706,-2010,-32728,-1608,-32745,-1207,-32758,-805,-32765,-403
 };
+#endif
 
 void dft512(int16_t *x,int16_t *y,unsigned int *scale)
 {
@@ -2677,6 +3060,12 @@ void idft512(int16_t *x,int16_t *y,unsigned int *scale)
   }
 }
 
+const static float tw1024f[24] __attribute__((aligned(32))) = {
+  1.,          0.,          0.99998118, -0.00613588,  0.9999247 , -0.01227154, 0.99983058, -0.01840673,
+  1.,          0.,          0.9999247 , -0.01227154,  0.99969882, -0.02454123, 0.99932238, -0.03680722,
+  1.,          0.,          0.99983058, -0.01840673,  0.99932238, -0.03680722, 0.99847558, -0.05519524,
+};
+
 int16_t tw1024[1536] __attribute__((aligned(32)));
 
 void dft1024(int16_t *x,int16_t *y,unsigned int *scale)
@@ -2684,13 +3073,11 @@ void dft1024(int16_t *x,int16_t *y,unsigned int *scale)
 
   simd256_q15_t xtmp[128],ytmp[128],*tw1024_256p=(simd256_q15_t *)tw1024,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y;
   simd256_q15_t *ytmpp = &ytmp[0];
-  int i,j;
 
-  for (i=0,j=0; i<128; i+=4,j++) {
+  for (int_fast32_t i = 0, j = 0; i < 128; i += 4, j++) {
     transpose16_ooff_simd256(x256+i,xtmp+j,32);
   }
 
-
   unsigned int *scale256=NULL;
   if (scale) scale256=scale+1;
   dft256((int16_t*)(xtmp),(int16_t*)(ytmp),scale256);
@@ -2698,10 +3085,34 @@ void dft1024(int16_t *x,int16_t *y,unsigned int *scale)
   dft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),scale256);
   dft256((int16_t*)(xtmp+96),(int16_t*)(ytmp+96),scale256);
 
-  for (i=0; i<32; i++) {
-    bfly4_256(ytmpp,ytmpp+32,ytmpp+64,ytmpp+96,
-	      y256p,y256p+32,y256p+64,y256p+96,
-	      tw1024_256p,tw1024_256p+32,tw1024_256p+64);
+  bfly4_256(ytmpp,
+            ytmpp + 32,
+            ytmpp + 64,
+            ytmpp + 96,
+            y256p,
+            y256p + 32,
+            y256p + 64,
+            y256p + 96,
+            tw1024_256p,
+            tw1024_256p + 32,
+            tw1024_256p + 64,
+            (simde__m256 *)tw1024f);
+  tw1024_256p++;
+  y256p++;
+  ytmpp++;
+  for (int_fast8_t i = 0; i < 31; i++) {
+    bfly4_256(ytmpp,
+              ytmpp + 32,
+              ytmpp + 64,
+              ytmpp + 96,
+              y256p,
+              y256p + 32,
+              y256p + 64,
+              y256p + 96,
+              tw1024_256p,
+              tw1024_256p + 32,
+              tw1024_256p + 64,
+              NULL);
     tw1024_256p++;
     y256p++;
     ytmpp++;
@@ -2709,7 +3120,7 @@ void dft1024(int16_t *x,int16_t *y,unsigned int *scale)
 
   if (scale && *scale>0) {
     unsigned int scalec=*scale;
-    for (i=0; i<8; i++) {
+    for (int_fast8_t i = 0; i < 8; i++) {
       y256[0]  = shiftright_int16_simd256(y256[0],scalec);
       y256[1]  = shiftright_int16_simd256(y256[1],scalec);
       y256[2]  = shiftright_int16_simd256(y256[2],scalec);
@@ -2729,7 +3140,6 @@ void dft1024(int16_t *x,int16_t *y,unsigned int *scale)
 
       y256+=16;
     }
-
   }
 
 }
@@ -3014,6 +3424,12 @@ void idft2048(int16_t *x,int16_t *y,unsigned int *scale)
 
 }
 
+const static float tw4096f[24] __attribute__((aligned(32))) = {
+  1.,          0.,          0.99999882, -0.00153398,  0.99999529, -0.00306796, 0.99998941, -0.00460193,
+  1.,          0.,          0.99999529, -0.00306796,  0.99998118, -0.00613588, 0.99995764, -0.00920375,
+  1.,          0.,          0.99998941, -0.00460193,  0.99995764, -0.00920375, 0.9999047 , -0.01380539,
+};
+
 int16_t tw4096[3*2*1024];
 
 void dft4096(int16_t *x,int16_t *y,unsigned int *scale)
@@ -3021,13 +3437,11 @@ void dft4096(int16_t *x,int16_t *y,unsigned int *scale)
 
   simd256_q15_t xtmp[512],ytmp[512],*tw4096_256p=(simd256_q15_t *)tw4096,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y;
   simd256_q15_t *ytmpp = &ytmp[0];
-  int i,j;
 
-  for (i=0,j=0; i<512; i+=4,j++) {
+  for (int_fast16_t i = 0, j = 0; i < 512; i += 4, j++) {
     transpose16_ooff_simd256(x256+i,xtmp+j,128);
   }
 
-
   unsigned int *scale1024=NULL;
   if (scale) scale1024=scale+1;
   dft1024((int16_t*)(xtmp),(int16_t*)(ytmp),scale1024);
@@ -3035,10 +3449,34 @@ void dft4096(int16_t *x,int16_t *y,unsigned int *scale)
   dft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),scale1024);
   dft1024((int16_t*)(xtmp+384),(int16_t*)(ytmp+384),scale1024);
 
-  for (i=0; i<128; i++) {
-    bfly4_256(ytmpp,ytmpp+128,ytmpp+256,ytmpp+384,
-	      y256p,y256p+128,y256p+256,y256p+384,
-	      tw4096_256p,tw4096_256p+128,tw4096_256p+256);
+  bfly4_256(ytmpp,
+            ytmpp + 128,
+            ytmpp + 256,
+            ytmpp + 384,
+            y256p,
+            y256p + 128,
+            y256p + 256,
+            y256p + 384,
+            tw4096_256p,
+            tw4096_256p + 128,
+            tw4096_256p + 256,
+            (simde__m256 *)tw4096f);
+  tw4096_256p++;
+  y256p++;
+  ytmpp++;
+  for (int_fast16_t i = 1; i < 128; i++) {
+    bfly4_256(ytmpp,
+              ytmpp + 128,
+              ytmpp + 256,
+              ytmpp + 384,
+              y256p,
+              y256p + 128,
+              y256p + 256,
+              y256p + 384,
+              tw4096_256p,
+              tw4096_256p + 128,
+              tw4096_256p + 256,
+              NULL);
     tw4096_256p++;
     y256p++;
     ytmpp++;
@@ -3046,7 +3484,7 @@ void dft4096(int16_t *x,int16_t *y,unsigned int *scale)
 
   if (scale && *scale>0) {
     unsigned int scalec=*scale;
-    for (i=0; i<32; i++) {
+    for (int_fast8_t i = 0; i < 32; i++) {
       y256[0]  = shiftright_int16_simd256(y256[0],scalec);
       y256[1]  = shiftright_int16_simd256(y256[1],scalec);
       y256[2]  = shiftright_int16_simd256(y256[2],scalec);
@@ -3066,7 +3504,6 @@ void dft4096(int16_t *x,int16_t *y,unsigned int *scale)
 
       y256+=16;
     }
-
   }
 
 }
@@ -3375,9 +3812,18 @@ void dft16384(int16_t *x,int16_t *y,unsigned int *scale)
   dft4096((int16_t*)(xtmp+1536),(int16_t*)(ytmp+1536),scale4096);
 
   for (i=0; i<512; i++) {
-    bfly4_256(ytmpp,ytmpp+512,ytmpp+1024,ytmpp+1536,
-	      y256p,y256p+512,y256p+1024,y256p+1536,
-	      tw16384_256p,tw16384_256p+512,tw16384_256p+1024);
+    bfly4_256(ytmpp,
+              ytmpp + 512,
+              ytmpp + 1024,
+              ytmpp + 1536,
+              y256p,
+              y256p + 512,
+              y256p + 1024,
+              y256p + 1536,
+              tw16384_256p,
+              tw16384_256p + 512,
+              tw16384_256p + 1024,
+              NULL);
     tw16384_256p++;
     y256p++;
     ytmpp++;