diff --git a/openair1/PHY/TOOLS/dfts_load.c b/openair1/PHY/TOOLS/dfts_load.c index c0cda9238c5e9f79ff67aec833425dfa1a96f046..2d1cf86f0f40e81c502b5788a462b59a4751de5e 100644 --- a/openair1/PHY/TOOLS/dfts_load.c +++ b/openair1/PHY/TOOLS/dfts_load.c @@ -100,8 +100,7 @@ uint32_t IDFT_SCALING_3072[2][4] = {{1, 1, 1, 3}, {1, 1, 1, 3}}; #endif uint32_t DFT_SCALING_16[][1] = {{2}}; -uint32_t DFT_SCALING_64[5][2] = {{3,0},{2,1},{1,2},{1,2},{1,2}}; -uint32_t DFT_SCALING_128[5][3] = {{4,0,0},{3,1,0},{2,2,0},{1,3,0},{0,4,0}}; +uint32_t DFT_SCALING_64[5][2] = {{3, 0}, {2, 1}, {1, 2}, {1, 2}, {1, 2}}; uint32_t DFT_SCALING_256[5][3] = {{4,0,0},{3,1,0},{2,2,0},{1,3,0},{0,4,0}}; int32_t DFT_SCALING_512_THRES[7] = {53, 57, 59, 63, 65, 69, 100}; int32_t DFT_SCALING_1024_THRES[5] = {49,55,63,69,100}; diff --git a/openair1/PHY/TOOLS/oai_dfts.c b/openair1/PHY/TOOLS/oai_dfts.c index d26fa6edafc8c4d0853ba1e7b4437faaf74f1385..46ef8dc9928223501014c1426092156e56196c47 100644 --- a/openair1/PHY/TOOLS/oai_dfts.c +++ b/openair1/PHY/TOOLS/oai_dfts.c @@ -66,6 +66,7 @@ #include "tools_defs.h" #define print_shorts(s,x) printf("%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7]) +#define print_floats(s, x) printf("%s %f,%f,%f,%f,%f,%f,%f,%f\n", s, (x)[0], (x)[1], (x)[2], (x)[3], (x)[4], (x)[5], (x)[6], (x)[7]) #define print_shorts256(s,x) printf("%s %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7],(x)[8],(x)[9],(x)[10],(x)[11],(x)[12],(x)[13],(x)[14],(x)[15]) #define print_ints(s,x) printf("%s %d %d %d %d\n",s,(x)[0],(x)[1],(x)[2],(x)[3]) @@ -259,6 +260,14 @@ __attribute__((always_inline)) static inline void cmultc_256(simde__m256i a, sim *im32 = simde_mm256_madd_epi16(a, mmtmpb); } +__attribute__((always_inline)) static inline simde__m128i cpack_noshift(simde__m128i xre, simde__m128i xim) +{ + register simde__m128i cpack_tmp1, cpack_tmp2; + + cpack_tmp1 = simde_mm_unpacklo_epi32(xre, xim); + cpack_tmp2 
= simde_mm_unpackhi_epi32(xre, xim);
+  return (simde_mm_packs_epi32(cpack_tmp1, cpack_tmp2));
+}
 
 #ifdef TWIDDLE_Q14
 #define MUL_SHIFT 14
@@ -275,6 +284,15 @@ __attribute__((always_inline)) static inline simde__m128i cpack(simde__m128i xre
   return (simde_mm_packs_epi32(simde_mm_srai_epi32(cpack_tmp1, MUL_SHIFT), simde_mm_srai_epi32(cpack_tmp2, MUL_SHIFT)));
 }
 
+__attribute__((always_inline)) static inline simde__m256i cpack_256_noshift(simde__m256i xre, simde__m256i xim)
+{
+  register simde__m256i cpack_tmp1, cpack_tmp2;
+
+  cpack_tmp1 = simde_mm256_unpacklo_epi32(xre, xim);
+  cpack_tmp2 = simde_mm256_unpackhi_epi32(xre, xim);
+  return (simde_mm256_packs_epi32(cpack_tmp1, cpack_tmp2));
+}
+
 __attribute__((always_inline)) static inline simde__m256i cpack_256(simde__m256i xre, simde__m256i xim)
 {
   register simde__m256i cpack_tmp1, cpack_tmp2;
@@ -666,6 +684,50 @@ __attribute__((always_inline)) static inline void bfly4(simde__m128i *x0,
   *(y3) = simde_mm_add_epi16(*(x0), cpack(dy3r, dy3i));
 }
 
+/* Float complex multiply y = x * a: x holds 4 complex int16 samples (re,im interleaved), a holds 4 complex float twiddles (re,im interleaved); returns the rounded int32 products as [re0 re1 re2 re3 | im0 im1 im2 im3]. */
+__attribute__((always_inline)) static inline simde__m256i cmult_float_128(simde__m128i x, simde__m256 a)
+{
+  simde__m256 x256_f = simde_mm256_cvtepi32_ps(simde_mm256_cvtepi16_epi32(x)); // 3 2 1 0 im re im re im re im re, widened to int32 then converted to float
+  const simde__m256 conj_mask = simde_mm256_setr_ps(1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f); // built in a register: a stack float[8] is not guaranteed 32-byte aligned, so it must not be read through a simde__m256 pointer
+  simde__m256 y256_f_re = simde_mm256_mul_ps(x256_f, simde_mm256_mul_ps(a, conj_mask)); // per sample: xr*ar, -xi*ai
+  simde__m256 y256_f_im = simde_mm256_mul_ps(x256_f, simde_mm256_permute_ps(a, 0b10110001)); // twiddle re/im swapped, per sample: xr*ai, xi*ar
+  simde__m256 y256_f = simde_mm256_hadd_ps(y256_f_re, y256_f_im); // im im re re im im re re
+  simde__m256i y256_32 = simde_mm256_cvtps_epi32(simde_mm256_round_ps(y256_f, SIMDE_MM_FROUND_TO_NEAREST_INT));
+  return simde_mm256_permutevar8x32_epi32(y256_32, simde_mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0)); // im im im im re re re re
+}
+
+/*
+Floating point multiplication is done to all 8 complex samples. 
+ai is not read by this function; a[0] multiplies samples 3..0 and a[1] samples 7..4. */
+__attribute__((always_inline)) static inline simde__m256i pack_cmult_float_256(simde__m256i x,
+                                                                               const simde__m256i *ai,
+                                                                               const simde__m256 *a)
+{
+  const simde__m256i prod_lo = cmult_float_128(simde_mm256_castsi256_si128(x), a[0]); // im im im im re re re re (3 ... 0)
+  const simde__m256i prod_hi = cmult_float_128(simde_mm256_extracti128_si256(x, 1), a[1]); // im im im im re re re re (7 ... 4)
+  const simde__m128i iq_lo = cpack_noshift(simde_mm256_castsi256_si128(prod_lo), simde_mm256_extracti128_si256(prod_lo, 1)); // im re im re im re im re (3 ... 0)
+  const simde__m128i iq_hi = cpack_noshift(simde_mm256_castsi256_si128(prod_hi), simde_mm256_extracti128_si256(prod_hi, 1)); // im re im re im re im re (7 ... 4)
+  return simde_mm256_set_m128i(iq_hi, iq_lo); // high lane = samples 7..4, low lane = samples 3..0
+}
+
+/* Hybrid complex multiply of 8 samples: samples 3..0 use the float twiddles a[0] through cmult_float_128,
+ * samples 7..4 use the fixed-point twiddles in the upper 128 bits of *ai through cmult, then >> MUL_SHIFT.
+ * *re and *im each receive 8 int32 values (7 ... 0): the real parts and the imaginary parts respectively. */
+__attribute__((always_inline)) static inline void cmult_float_256(simde__m256i x,
+                                                                  const simde__m256i *ai,
+                                                                  const simde__m256 *a,
+                                                                  simde__m256i *re,
+                                                                  simde__m256i *im)
+{
+  simde__m128i fix_re, fix_im;
+  const simde__m256i flt = cmult_float_128(simde_mm256_castsi256_si128(x), a[0]); // im im im im re re re re (3 ... 0)
+  cmult(simde_mm256_extracti128_si256(x, 1), ((const simde__m128i *)ai)[1], &fix_re, &fix_im); // im im im im re re re re (7 ... 4)
+  fix_re = simde_mm_srai_epi32(fix_re, MUL_SHIFT); // bring the fixed-point half to the same scale as the float half
+  fix_im = simde_mm_srai_epi32(fix_im, MUL_SHIFT);
+  *re = simde_mm256_set_m128i(fix_re, simde_mm256_castsi256_si128(flt)); // re re re re re re re re (7 ... 0)
+  *im = simde_mm256_set_m128i(fix_im, simde_mm256_extracti128_si256(flt, 1)); // im im im im im im im im (7 ...
0) +} + __attribute__((always_inline)) static inline void bfly4_256(simde__m256i *x0, simde__m256i *x1, simde__m256i *x2, @@ -676,38 +738,46 @@ __attribute__((always_inline)) static inline void bfly4_256(simde__m256i *x0, simde__m256i *y3, simde__m256i *tw1, simde__m256i *tw2, - simde__m256i *tw3) + simde__m256i *tw3, + simde__m256 *twf) { simde__m256i x1r_2, x1i_2, x2r_2, x2i_2, x3r_2, x3i_2, dy0r, dy0i, dy1r, dy1i, dy2r, dy2i, dy3r, dy3i; // cmult(*(x0),*(W0),&x0r_2,&x0i_2); - cmult_256(*(x1),*(tw1),&x1r_2,&x1i_2); - cmult_256(*(x2),*(tw2),&x2r_2,&x2i_2); - cmult_256(*(x3),*(tw3),&x3r_2,&x3i_2); + if (twf) { + cmult_float_256(*(x1), tw1, twf, &x1r_2, &x1i_2); + cmult_float_256(*(x2), tw2, twf + 1, &x2r_2, &x2i_2); + cmult_float_256(*(x3), tw3, twf + 2, &x3r_2, &x3i_2); + } else { + cmult_256(*(x1), *(tw1), &x1r_2, &x1i_2); + cmult_256(*(x2), *(tw2), &x2r_2, &x2i_2); + cmult_256(*(x3), *(tw3), &x3r_2, &x3i_2); + } // dy0r = simde_mm_add_epi32(x0r_2,simde_mm_add_epi32(x1r_2,simde_mm_add_epi32(x2r_2,x3r_2))); // dy0i = simde_mm_add_epi32(x0i_2,simde_mm_add_epi32(x1i_2,simde_mm_add_epi32(x2i_2,x3i_2))); // *(y0) = cpack(dy0r,dy0i); dy0r = simde_mm256_add_epi32(x1r_2,simde_mm256_add_epi32(x2r_2,x3r_2)); dy0i = simde_mm256_add_epi32(x1i_2,simde_mm256_add_epi32(x2i_2,x3i_2)); - *(y0) = simde_mm256_add_epi16(*(x0),cpack_256(dy0r,dy0i)); + *(y0) = simde_mm256_add_epi16(*(x0), twf ? 
cpack_256_noshift(dy0r, dy0i) : cpack_256(dy0r, dy0i)); + //*(y0) = simde_mm256_adds_epi16(*(y0), simde_mm256_set_epi16(0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0)); // dy1r = simde_mm_add_epi32(x0r_2,simde_mm_sub_epi32(x1i_2,simde_mm_add_epi32(x2r_2,x3i_2))); // dy1i = simde_mm_sub_epi32(x0i_2,simde_mm_add_epi32(x1r_2,simde_mm_sub_epi32(x2i_2,x3r_2))); // *(y1) = cpack(dy1r,dy1i); dy1r = simde_mm256_sub_epi32(x1i_2,simde_mm256_add_epi32(x2r_2,x3i_2)); dy1i = simde_mm256_sub_epi32(simde_mm256_sub_epi32(x3r_2,x2i_2),x1r_2); - *(y1) = simde_mm256_add_epi16(*(x0),cpack_256(dy1r,dy1i)); + *(y1) = simde_mm256_add_epi16(*(x0), twf ? cpack_256_noshift(dy1r, dy1i) : cpack_256(dy1r, dy1i)); // dy2r = simde_mm_sub_epi32(x0r_2,simde_mm_sub_epi32(x1r_2,simde_mm_sub_epi32(x2r_2,x3r_2))); // dy2i = simde_mm_sub_epi32(x0i_2,simde_mm_sub_epi32(x1i_2,simde_mm_sub_epi32(x2i_2,x3i_2))); // *(y2) = cpack(dy2r,dy2i); dy2r = simde_mm256_sub_epi32(simde_mm256_sub_epi32(x2r_2,x3r_2),x1r_2); dy2i = simde_mm256_sub_epi32(simde_mm256_sub_epi32(x2i_2,x3i_2),x1i_2); - *(y2) = simde_mm256_add_epi16(*(x0),cpack_256(dy2r,dy2i)); + *(y2) = simde_mm256_add_epi16(*(x0), twf ? cpack_256_noshift(dy2r, dy2i) : cpack_256(dy2r, dy2i)); // dy3r = simde_mm_sub_epi32(x0r_2,simde_mm_add_epi32(x1i_2,simde_mm_sub_epi32(x2r_2,x3i_2))); // dy3i = simde_mm_add_epi32(x0i_2,simde_mm_sub_epi32(x1r_2,simde_mm_add_epi32(x2i_2,x3r_2))); // *(y3) = cpack(dy3r,dy3i); dy3r = simde_mm256_sub_epi32(simde_mm256_sub_epi32(x3i_2,x2r_2),x1i_2); dy3i = simde_mm256_sub_epi32(x1r_2,simde_mm256_add_epi32(x2i_2,x3r_2)); - *(y3) = simde_mm256_add_epi16(*(x0),cpack_256(dy3r,dy3i)); + *(y3) = simde_mm256_add_epi16(*(x0), twf ? 
cpack_256_noshift(dy3r, dy3i) : cpack_256(dy3r, dy3i)); } __attribute__((always_inline)) static inline void ibfly4_256(simde__m256i *x0, @@ -844,12 +914,13 @@ __attribute__((always_inline)) static inline void bfly4_16_256(simde__m256i *x0, simde__m256i *y1, simde__m256i *y2, simde__m256i *y3, - simde__m256i *tw1, - simde__m256i *tw2, - simde__m256i *tw3, - simde__m256i *tw1b, - simde__m256i *tw2b, - simde__m256i *tw3b) + void *tw1, + void *tw2, + void *tw3, + void *tw1b, + void *tw2b, + void *tw3b, + bool doFloat) { register simde__m256i x1t, x2t, x3t, x02t, x13t; register simde__m256i x1_flip, x3_flip; @@ -890,9 +961,15 @@ __attribute__((always_inline)) static inline void bfly4_16_256(simde__m256i *x0, // [xi00 xi01 xi02 xi03 xi10 xi20 xi30 xi40] // each output yi is the same - x1t = packed_cmult2_256(*(x1),*(tw1),*(tw1b)); - x2t = packed_cmult2_256(*(x2),*(tw2),*(tw2b)); - x3t = packed_cmult2_256(*(x3),*(tw3),*(tw3b)); + if (doFloat) { + x1t = pack_cmult_float_256(*(x1), (simde__m256i *)tw1, (simde__m256 *)tw1b); + x2t = pack_cmult_float_256(*(x2), ((simde__m256i *)tw2), ((simde__m256 *)tw2b)); + x3t = pack_cmult_float_256(*(x3), ((simde__m256i *)tw3), ((simde__m256 *)tw3b)); + } else { + x1t = packed_cmult2_256(*(x1), *((simde__m256i *)tw1), *((simde__m256i *)tw1b)); + x2t = packed_cmult2_256(*(x2), *((simde__m256i *)tw2), *((simde__m256i *)tw2b)); + x3t = packed_cmult2_256(*(x3), *((simde__m256i *)tw3), *((simde__m256i *)tw3b)); + } x02t = simde_mm256_adds_epi16(*(x0),x2t); x13t = simde_mm256_adds_epi16(x1t,x3t); @@ -1164,6 +1241,22 @@ const static int16_t tw16c[24] 0, 32767, 23170, 23169, 32767, 0, 23170, -23170, 0, 32767, 30273, 12539, 23170, -23170, -12539, -30273}; +const static float tw16repf[48] __attribute__((aligned(32))) = { + 1.00000000e+00, 0.00000000e+00, 9.23879533e-01, -3.82683432e-01, + 7.07106781e-01, -7.07106781e-01, 3.82683432e-01, -9.23879533e-01, + 1.00000000e+00, 0.00000000e+00, 9.23879533e-01, -3.82683432e-01, + 7.07106781e-01, 
-7.07106781e-01, 3.82683432e-01, -9.23879533e-01, + + 1.00000000e+00, 0.00000000e+00, 7.07106781e-01, -7.07106781e-01, + 6.12323400e-17, -1.00000000e+00, -7.07106781e-01, -7.07106781e-01, + 1.00000000e+00, 0.00000000e+00, 7.07106781e-01, -7.07106781e-01, + 6.12323400e-17, -1.00000000e+00, -7.07106781e-01, -7.07106781e-01, + + 1.00000000e+00, 0.00000000e+00, 3.82683432e-01, -9.23879533e-01, + -7.07106781e-01, -7.07106781e-01, -9.23879533e-01, 3.82683432e-01, + 1.00000000e+00, 0.00000000e+00, 3.82683432e-01, -9.23879533e-01, + -7.07106781e-01, -7.07106781e-01, -9.23879533e-01, 3.82683432e-01 +}; #if defined(TWIDDLE_Q14) /* [16384. +0.j 15137. -6270.j 11585.-11585.j 6270.-15137.j] @@ -1289,10 +1382,10 @@ static inline void dft16(int16_t *x,int16_t *y) __attribute__((always_inline) //#define USE_DFT16_SHIFT // Does two 16-point DFTS (x[0 .. 15] is 128 LSBs of input vector, x[16..31] is in 128 MSBs) -__attribute__((always_inline)) static inline void dft16_simd256(int16_t *x, int16_t *y,int scale) +__attribute__((always_inline)) static inline void dft16_simd256(int16_t *x, int16_t *y, int scale, bool doFloat) { - simde__m256i *tw16a_256 = (simde__m256i *)tw16arep, *tw16b_256 = (simde__m256i *)tw16brep, *x256 = (simde__m256i *)x, - *y256 = (simde__m256i *)y; + simde__m256i *x256 = (simde__m256i *)x; + simde__m256i *y256 = (simde__m256i *)y; simde__m256i x1_flip, x3_flip, x02t, x13t; simde__m256i ytmp0, ytmp1, ytmp2, ytmp3, xtmp0, xtmp1, xtmp2, xtmp3; @@ -1366,14 +1459,26 @@ __attribute__((always_inline)) static inline void dft16_simd256(int16_t *x, int1 xtmp3 = simde_mm256_unpackhi_epi64(ytmp1,ytmp3); // x3 x11 x19 x27 x7 x15 x23 x31 // Second stage : 4 Radix-4 butterflies with input twiddles - xtmp1 = packed_cmult2_256(xtmp1,tw16a_256[0],tw16b_256[0]); - xtmp2 = packed_cmult2_256(xtmp2,tw16a_256[1],tw16b_256[1]); - xtmp3 = packed_cmult2_256(xtmp3,tw16a_256[2],tw16b_256[2]); + if (doFloat) { + simde__m256i *tw16 = (simde__m256i *)tw16rep; + simde__m256 *tw16f = 
(simde__m256 *)tw16repf; + xtmp1 = pack_cmult_float_256(xtmp1, tw16, tw16f); + xtmp2 = pack_cmult_float_256(xtmp2, tw16 + 1, tw16f + 2); + xtmp3 = pack_cmult_float_256(xtmp3, tw16 + 2, tw16f + 4); + } else { + simde__m256i *tw16a_256 = (simde__m256i *)tw16arep; + simde__m256i *tw16b_256 = (simde__m256i *)tw16brep; + xtmp1 = packed_cmult2_256(xtmp1, tw16a_256[0], tw16b_256[0]); + xtmp2 = packed_cmult2_256(xtmp2, tw16a_256[1], tw16b_256[1]); + xtmp3 = packed_cmult2_256(xtmp3, tw16a_256[2], tw16b_256[2]); + } - /* print_shorts256("xtmp0",(int16_t*)&xtmp0); + /* + print_shorts256("xtmp0",(int16_t*)&xtmp0); print_shorts256("xtmp1",(int16_t*)&xtmp1); print_shorts256("xtmp2",(int16_t*)&xtmp2); - print_shorts256("xtmp3",(int16_t*)&xtmp3);*/ + print_shorts256("xtmp3",(int16_t*)&xtmp3); + */ x02t = simde_mm256_adds_epi16(xtmp0,xtmp2); x13t = simde_mm256_adds_epi16(xtmp1,xtmp3); @@ -1427,7 +1532,7 @@ void dft16(int16_t *x, int16_t *y, unsigned int *scale) */ simde__m256i ytmp[4]; const unsigned int scale16 = scale ? 
scale[0] : 0; - dft16_simd256((int16_t *)xtmp, (int16_t *)ytmp, scale16); + dft16_simd256((int16_t *)xtmp, (int16_t *)ytmp, scale16, true); simde__m256i *y256 = (simde__m256i *)y; y256[0] = ytmp[0]; @@ -1605,6 +1710,32 @@ __attribute__((always_inline)) static inline void idft16_simd256(int16_t *x, int } // 64-point optimized DFT +const static float tw64f[96] __attribute__((aligned(32))) = { + 1.00000000e+00, 0.00000000e+00, 9.95184727e-01, -9.80171403e-02, + 9.80785280e-01, -1.95090322e-01, 9.56940336e-01, -2.90284677e-01, + 9.23879533e-01, -3.82683432e-01, 8.81921264e-01, -4.71396737e-01, + 8.31469612e-01, -5.55570233e-01, 7.73010453e-01, -6.34393284e-01, + 7.07106781e-01, -7.07106781e-01, 6.34393284e-01, -7.73010453e-01, + 5.55570233e-01, -8.31469612e-01, 4.71396737e-01, -8.81921264e-01, + 3.82683432e-01, -9.23879533e-01, 2.90284677e-01, -9.56940336e-01, + 1.95090322e-01, -9.80785280e-01, 9.80171403e-02, -9.95184727e-01, + 1.00000000e+00, 0.00000000e+00, 9.80785280e-01, -1.95090322e-01, + 9.23879533e-01, -3.82683432e-01, 8.31469612e-01, -5.55570233e-01, + 7.07106781e-01, -7.07106781e-01, 5.55570233e-01, -8.31469612e-01, + 3.82683432e-01, -9.23879533e-01, 1.95090322e-01, -9.80785280e-01, + 6.12323400e-17, -1.00000000e+00, -1.95090322e-01, -9.80785280e-01, + -3.82683432e-01, -9.23879533e-01, -5.55570233e-01, -8.31469612e-01, + -7.07106781e-01, -7.07106781e-01, -8.31469612e-01, -5.55570233e-01, + -9.23879533e-01, -3.82683432e-01, -9.80785280e-01, -1.95090322e-01, + 1.00000000e+00, 0.00000000e+00, 9.56940336e-01, -2.90284677e-01, + 8.31469612e-01, -5.55570233e-01, 6.34393284e-01, -7.73010453e-01, + 3.82683432e-01, -9.23879533e-01, 9.80171403e-02, -9.95184727e-01, + -1.95090322e-01, -9.80785280e-01, -4.71396737e-01, -8.81921264e-01, + -7.07106781e-01, -7.07106781e-01, -8.81921264e-01, -4.71396737e-01, + -9.80785280e-01, -1.95090322e-01, -9.95184727e-01, 9.80171403e-02, + -9.23879533e-01, 3.82683432e-01, -7.73010453e-01, 6.34393284e-01, + -5.55570233e-01, 
8.31469612e-01, -2.90284677e-01, 9.56940336e-01 +}; #ifdef TWIDDLE_Q14 const static int16_t tw64[96] __attribute__((aligned(32))) = { 16384, 0, 16305, -1606, 16069, -3196, 15679, -4756, @@ -1702,8 +1833,7 @@ const static int16_t tw64c[96] __attribute__((aligned(32))) = { void dft64(int16_t *x,int16_t *y,unsigned int *scale) { - simd256_q15_t xtmp[8], ytmp[8], *tw64a_256 = (simd256_q15_t *)tw64a, *tw64b_256 = (simd256_q15_t *)tw64b, - *x256 = (simd256_q15_t *)x, *y256 = (simd256_q15_t *)y; + simd256_q15_t xtmp[8], ytmp[8], *x256 = (simd256_q15_t *)x, *y256 = (simd256_q15_t *)y; int scale16=0; if (scale) scale16 = scale[1]; @@ -1751,7 +1881,7 @@ void dft64(int16_t *x,int16_t *y,unsigned int *scale) xtmp[6] = _mm256_permutex2var_epi32(x256[4],perm_mask2,x256[5]); // x33 x37 x41 x45 x35 x39 x43 x46 xtmp[7] = _mm256_permutex2var_epi32(x256[6],perm_mask2,x256[7]); // x49 x53 x57 x61 x51 x55 x59 x63 #endif - dft16_simd256((int16_t*)(xtmp),(int16_t*)ytmp,scale16); + dft16_simd256((int16_t *)(xtmp), (int16_t *)ytmp, scale16, true); // [y0 y1 y2 y3 y4 y5 y6 y7] // [y8 y9 y10 y11 y12 y13 y14 y15] // [y16 y17 y18 y19 y20 y21 y22 y23] @@ -1762,7 +1892,7 @@ void dft64(int16_t *x,int16_t *y,unsigned int *scale) print_shorts256("ytmp2",(int16_t*)(ytmp+2)); print_shorts256("ytmp3",(int16_t*)(ytmp+3)); */ - dft16_simd256((int16_t*)(xtmp+4),(int16_t*)(ytmp+4),scale16); + dft16_simd256((int16_t *)(xtmp + 4), (int16_t *)(ytmp + 4), scale16, true); // [y32 y33 y34 y35 y36 y37 y38 y39] // [y40 y41 y42 y43 y44 y45 y46 y47] // [y48 y49 y50 y51 y52 y53 y54 y55] @@ -1778,20 +1908,43 @@ void dft64(int16_t *x,int16_t *y,unsigned int *scale) start_meas(&ts_b); #endif - - bfly4_16_256(ytmp,ytmp+2,ytmp+4,ytmp+6, - y256,y256+2,y256+4,y256+6, - tw64a_256,tw64a_256+2,tw64a_256+4, - tw64b_256,tw64b_256+2,tw64b_256+4); + simde__m256 *tw64f_256 = (simde__m256 *)tw64f; + simde__m256i *tw64_256 = (simde__m256i *)tw64; + bfly4_16_256(ytmp, + ytmp + 2, + ytmp + 4, + ytmp + 6, + y256, + y256 + 2, + y256 + 
4, + y256 + 6, + tw64_256, + tw64_256 + 2, + tw64_256 + 4, + tw64f_256, + tw64f_256 + 4, + tw64f_256 + 8, + true); // [y0 y1 y2 y3 y4 y5 y6 y7] // [y16 y17 y18 y19 y20 y21 y22 y23] // [y32 y33 y34 y35 y36 y37 y38 y39] // [y48 y49 y50 y51 y52 y53 y54 y55] - bfly4_16_256(ytmp+1,ytmp+3,ytmp+5,ytmp+7, - y256+1,y256+3,y256+5,y256+7, - tw64a_256+1,tw64a_256+3,tw64a_256+5, - tw64b_256+1,tw64b_256+3,tw64b_256+5); + bfly4_16_256(ytmp + 1, + ytmp + 3, + ytmp + 5, + ytmp + 7, + y256 + 1, + y256 + 3, + y256 + 5, + y256 + 7, + tw64_256 + 1, + tw64_256 + 3, + tw64_256 + 5, + tw64f_256 + 2, + tw64f_256 + 6, + tw64f_256 + 10, + true); // [y8 y9 y10 y11 y12 y13 y14 y15] // [y24 y25 y26 y27 y28 y29 y30 y31] // [y40 y41 y42 y43 y44 y45 y46 y47] @@ -1936,11 +2089,68 @@ void idft64(int16_t *x,int16_t *y,unsigned int *scale) } +#ifdef TWIDDLE_Q14 +static const int16_t tw128[128] __attribute__((aligned(32))) = { + 16384, 0, 16364, -804, 16305, -1606, 16207, -2404, + 16069, -3196, 15893, -3981, 15679, -4756, 15426, -5520, + 15137, -6270, 14811, -7005, 14449, -7723, 14053, -8423, + 13623, -9102, 13160, -9760, 12665, -10394, 12140, -11003, + 11585, -11585, 11003, -12140, 10394, -12665, 9760, -13160, + 9102, -13623, 8423, -14053, 7723, -14449, 7005, -14811, + 6270, -15137, 5520, -15426, 4756, -15679, 3981, -15893, + 3196, -16069, 2404, -16207, 1606, -16305, 804, -16364, + 0, -16384, -804, -16364, -1606, -16305, -2404, -16207, + -3196, -16069, -3981, -15893, -4756, -15679, -5520, -15426, + -6270, -15137, -7005, -14811, -7723, -14449, -8423, -14053, + -9102, -13623, -9760, -13160, -10394, -12665, -11003, -12140, + -11585, -11585, -12140, -11003, -12665, -10394, -13160, -9760, + -13623, -9102, -14053, -8423, -14449, -7723, -14811, -7005, + -15137, -6270, -15426, -5520, -15679, -4756, -15893, -3981, + -16069, -3196, -16207, -2404, -16305, -1606, -16364, -804, +}; +static const int16_t tw128a[128] __attribute__((aligned(32))) = { + 16384, 0, 16364, 804, 16305, 1606, 16207, 2404, + 16069, 3196, 
15893, 3981, 15679, 4756, 15426, 5520, + 15137, 6270, 14811, 7005, 14449, 7723, 14053, 8423, + 13623, 9102, 13160, 9760, 12665, 10394, 12140, 11003, + 11585, 11585, 11003, 12140, 10394, 12665, 9760, 13160, + 9102, 13623, 8423, 14053, 7723, 14449, 7005, 14811, + 6270, 15137, 5520, 15426, 4756, 15679, 3981, 15893, + 3196, 16069, 2404, 16207, 1606, 16305, 804, 16364, + 0, 16384, -804, 16364, -1606, 16305, -2404, 16207, + -3196, 16069, -3981, 15893, -4756, 15679, -5520, 15426, + -6270, 15137, -7005, 14811, -7723, 14449, -8423, 14053, + -9102, 13623, -9760, 13160, -10394, 12665, -11003, 12140, + -11585, 11585, -12140, 11003, -12665, 10394, -13160, 9760, + -13623, 9102, -14053, 8423, -14449, 7723, -14811, 7005, + -15137, 6270, -15426, 5520, -15679, 4756, -15893, 3981, + -16069, 3196, -16207, 2404, -16305, 1606, -16364, 804, +}; +static const int16_t tw128b[128] __attribute__((aligned(32))) = { + 0, 16384, -804, 16364, -1606, 16305, -2404, 16207, + -3196, 16069, -3981, 15893, -4756, 15679, -5520, 15426, + -6270, 15137, -7005, 14811, -7723, 14449, -8423, 14053, + -9102, 13623, -9760, 13160, -10394, 12665, -11003, 12140, + -11585, 11585, -12140, 11003, -12665, 10394, -13160, 9760, + -13623, 9102, -14053, 8423, -14449, 7723, -14811, 7005, + -15137, 6270, -15426, 5520, -15679, 4756, -15893, 3981, + -16069, 3196, -16207, 2404, -16305, 1606, -16364, 804, + -16384, 0, -16364, -804, -16305, -1606, -16207, -2404, + -16069, -3196, -15893, -3981, -15679, -4756, -15426, -5520, + -15137, -6270, -14811, -7005, -14449, -7723, -14053, -8423, + -13623, -9102, -13160, -9760, -12665, -10394, -12140, -11003, + -11585, -11585, -11003, -12140, -10394, -12665, -9760, -13160, + -9102, -13623, -8423, -14053, -7723, -14449, -7005, -14811, + -6270, -15137, -5520, -15426, -4756, -15679, -3981, -15893, + -3196, -16069, -2404, -16207, -1606, -16305, -804, -16364, +}; +#else static const int16_t tw128[128] __attribute__((aligned(32))) = { 
32767,0,32727,-1608,32609,-3212,32412,-4808,32137,-6393,31785,-7962,31356,-9512,30851,-11039,30272,-12540,29621,-14010,28897,-15447,28105,-16846,27244,-18205,26318,-19520,25329,-20788,24278,-22005,23169,-23170,22004,-24279,20787,-25330,19519,-26319,18204,-27245,16845,-28106,15446,-28898,14009,-29622,12539,-30273,11038,-30852,9511,-31357,7961,-31786,6392,-32138,4807,-32413,3211,-32610,1607,-32728,0,-32767,-1608,-32728,-3212,-32610,-4808,-32413,-6393,-32138,-7962,-31786,-9512,-31357,-11039,-30852,-12540,-30273,-14010,-29622,-15447,-28898,-16846,-28106,-18205,-27245,-19520,-26319,-20788,-25330,-22005,-24279,-23170,-23170,-24279,-22005,-25330,-20788,-26319,-19520,-27245,-18205,-28106,-16846,-28898,-15447,-29622,-14010,-30273,-12540,-30852,-11039,-31357,-9512,-31786,-7962,-32138,-6393,-32413,-4808,-32610,-3212,-32728,-1608}; static const int16_t tw128a[128] __attribute__((aligned(32))) = { 32767,0,32727,1608,32609,3212,32412,4808,32137,6393,31785,7962,31356,9512,30851,11039,30272,12540,29621,14010,28897,15447,28105,16846,27244,18205,26318,19520,25329,20788,24278,22005,23169,23170,22004,24279,20787,25330,19519,26319,18204,27245,16845,28106,15446,28898,14009,29622,12539,30273,11038,30852,9511,31357,7961,31786,6392,32138,4807,32413,3211,32610,1607,32728,0,32767,-1608,32728,-3212,32610,-4808,32413,-6393,32138,-7962,31786,-9512,31357,-11039,30852,-12540,30273,-14010,29622,-15447,28898,-16846,28106,-18205,27245,-19520,26319,-20788,25330,-22005,24279,-23170,23170,-24279,22005,-25330,20788,-26319,19520,-27245,18205,-28106,16846,-28898,15447,-29622,14010,-30273,12540,-30852,11039,-31357,9512,-31786,7962,-32138,6393,-32413,4808,-32610,3212,-32728,1608}; static const int16_t tw128b[128] __attribute__((aligned(32))) = 
{0,32767,-1608,32727,-3212,32609,-4808,32412,-6393,32137,-7962,31785,-9512,31356,-11039,30851,-12540,30272,-14010,29621,-15447,28897,-16846,28105,-18205,27244,-19520,26318,-20788,25329,-22005,24278,-23170,23169,-24279,22004,-25330,20787,-26319,19519,-27245,18204,-28106,16845,-28898,15446,-29622,14009,-30273,12539,-30852,11038,-31357,9511,-31786,7961,-32138,6392,-32413,4807,-32610,3211,-32728,1607,-32767,0,-32728,-1608,-32610,-3212,-32413,-4808,-32138,-6393,-31786,-7962,-31357,-9512,-30852,-11039,-30273,-12540,-29622,-14010,-28898,-15447,-28106,-16846,-27245,-18205,-26319,-19520,-25330,-20788,-24279,-22005,-23170,-23170,-22005,-24279,-20788,-25330,-19520,-26319,-18205,-27245,-16846,-28106,-15447,-28898,-14010,-29622,-12540,-30273,-11039,-30852,-9512,-31357,-7962,-31786,-6393,-32138,-4808,-32413,-3212,-32610,-1608,-32728}; +#endif void dft128(int16_t *x,int16_t *y,unsigned int *scale) { @@ -2113,6 +2323,21 @@ void idft128(int16_t *x,int16_t *y,unsigned int *scale) } +static const float tw256f[48] __attribute__((aligned(32))) = { + 1.00000000e+00, 0.00000000e+00, 9.99698819e-01, -2.45412285e-02, + 9.98795456e-01, -4.90676743e-02, 9.97290457e-01, -7.35645636e-02, + 9.95184727e-01, -9.80171403e-02, 9.92479535e-01, -1.22410675e-01, + 9.89176510e-01, -1.46730474e-01, 9.85277642e-01, -1.70961889e-01, + 1.00000000e+00, 0.00000000e+00, 9.98795456e-01, -4.90676743e-02, + 9.95184727e-01, -9.80171403e-02, 9.89176510e-01, -1.46730474e-01, + 9.80785280e-01, -1.95090322e-01, 9.70031253e-01, -2.42980180e-01, + 9.56940336e-01, -2.90284677e-01, 9.41544065e-01, -3.36889853e-01, + 1.00000000e+00, 0.00000000e+00, 9.97290457e-01, -7.35645636e-02, + 9.89176510e-01, -1.46730474e-01, 9.75702130e-01, -2.19101240e-01, + 9.56940336e-01, -2.90284677e-01, 9.32992799e-01, -3.59895037e-01, + 9.03989293e-01, -4.27555093e-01, 8.70086991e-01, -4.92898192e-01 +}; + #ifdef TWIDDLE_Q14 static const int16_t tw256[384] __attribute__((aligned(32))) = { 16384, 0, 16379, -402, 16364, -804, 16340, -1205, @@ 
-2312,39 +2537,128 @@ void dft256(int16_t *x,int16_t *y,unsigned int *scale) dft64((int16_t*)(xtmp+16),(int16_t*)(ytmp+16),scale64); dft64((int16_t*)(xtmp+24),(int16_t*)(ytmp+24),scale64); - - bfly4_16_256(ytmpp,ytmpp+8,ytmpp+16,ytmpp+24, - y256p,y256p+8,y256p+16,y256p+24, - tw256a_256p,tw256a_256p+8,tw256a_256p+16, - tw256b_256p,tw256b_256p+8,tw256b_256p+16); - bfly4_16_256(ytmpp+1,ytmpp+9,ytmpp+17,ytmpp+25, - y256p+1,y256p+9,y256p+17,y256p+25, - tw256a_256p+1,tw256a_256p+9,tw256a_256p+17, - tw256b_256p+1,tw256b_256p+9,tw256b_256p+17); - bfly4_16_256(ytmpp+2,ytmpp+10,ytmpp+18,ytmpp+26, - y256p+2,y256p+10,y256p+18,y256p+26, - tw256a_256p+2,tw256a_256p+10,tw256a_256p+18, - tw256b_256p+2,tw256b_256p+10,tw256b_256p+18); - bfly4_16_256(ytmpp+3,ytmpp+11,ytmpp+19,ytmpp+27, - y256p+3,y256p+11,y256p+19,y256p+27, - tw256a_256p+3,tw256a_256p+11,tw256a_256p+19, - tw256b_256p+3,tw256b_256p+11,tw256b_256p+19); - bfly4_16_256(ytmpp+4,ytmpp+12,ytmpp+20,ytmpp+28, - y256p+4,y256p+12,y256p+20,y256p+28, - tw256a_256p+4,tw256a_256p+12,tw256a_256p+20, - tw256b_256p+4,tw256b_256p+12,tw256b_256p+20); - bfly4_16_256(ytmpp+5,ytmpp+13,ytmpp+21,ytmpp+29, - y256p+5,y256p+13,y256p+21,y256p+29, - tw256a_256p+5,tw256a_256p+13,tw256a_256p+21, - tw256b_256p+5,tw256b_256p+13,tw256b_256p+21); - bfly4_16_256(ytmpp+6,ytmpp+14,ytmpp+22,ytmpp+30, - y256p+6,y256p+14,y256p+22,y256p+30, - tw256a_256p+6,tw256a_256p+14,tw256a_256p+22, - tw256b_256p+6,tw256b_256p+14,tw256b_256p+22); - bfly4_16_256(ytmpp+7,ytmpp+15,ytmpp+23,ytmpp+31, - y256p+7,y256p+15,y256p+23,y256p+31, - tw256a_256p+7,tw256a_256p+15,tw256a_256p+23, - tw256b_256p+7,tw256b_256p+15,tw256b_256p+23); + simde__m256 *tw256f_256 = (simde__m256 *)tw256f; + simde__m256i *tw256_256 = (simde__m256i *)tw256; + bfly4_16_256(ytmpp, + ytmpp + 8, + ytmpp + 16, + ytmpp + 24, + y256p, + y256p + 8, + y256p + 16, + y256p + 24, + tw256_256, + tw256_256 + 8, + tw256_256 + 16, + tw256f_256, + tw256f_256 + 2, + tw256f_256 + 4, + true); + bfly4_16_256(ytmpp + 1, + 
ytmpp + 9, + ytmpp + 17, + ytmpp + 25, + y256p + 1, + y256p + 9, + y256p + 17, + y256p + 25, + tw256a_256p + 1, + tw256a_256p + 9, + tw256a_256p + 17, + tw256b_256p + 1, + tw256b_256p + 9, + tw256b_256p + 17, + false); + bfly4_16_256(ytmpp + 2, + ytmpp + 10, + ytmpp + 18, + ytmpp + 26, + y256p + 2, + y256p + 10, + y256p + 18, + y256p + 26, + tw256a_256p + 2, + tw256a_256p + 10, + tw256a_256p + 18, + tw256b_256p + 2, + tw256b_256p + 10, + tw256b_256p + 18, + false); + bfly4_16_256(ytmpp + 3, + ytmpp + 11, + ytmpp + 19, + ytmpp + 27, + y256p + 3, + y256p + 11, + y256p + 19, + y256p + 27, + tw256a_256p + 3, + tw256a_256p + 11, + tw256a_256p + 19, + tw256b_256p + 3, + tw256b_256p + 11, + tw256b_256p + 19, + false); + bfly4_16_256(ytmpp + 4, + ytmpp + 12, + ytmpp + 20, + ytmpp + 28, + y256p + 4, + y256p + 12, + y256p + 20, + y256p + 28, + tw256a_256p + 4, + tw256a_256p + 12, + tw256a_256p + 20, + tw256b_256p + 4, + tw256b_256p + 12, + tw256b_256p + 20, + false); + bfly4_16_256(ytmpp + 5, + ytmpp + 13, + ytmpp + 21, + ytmpp + 29, + y256p + 5, + y256p + 13, + y256p + 21, + y256p + 29, + tw256a_256p + 5, + tw256a_256p + 13, + tw256a_256p + 21, + tw256b_256p + 5, + tw256b_256p + 13, + tw256b_256p + 21, + false); + bfly4_16_256(ytmpp + 6, + ytmpp + 14, + ytmpp + 22, + ytmpp + 30, + y256p + 6, + y256p + 14, + y256p + 22, + y256p + 30, + tw256a_256p + 6, + tw256a_256p + 14, + tw256a_256p + 22, + tw256b_256p + 6, + tw256b_256p + 14, + tw256b_256p + 22, + false); + bfly4_16_256(ytmpp + 7, + ytmpp + 15, + ytmpp + 23, + ytmpp + 31, + y256p + 7, + y256p + 15, + y256p + 23, + y256p + 31, + tw256a_256p + 7, + tw256a_256p + 15, + tw256a_256p + 23, + tw256b_256p + 7, + tw256b_256p + 15, + tw256b_256p + 23, + false); if (scale && *scale>0) { unsigned int scalec=*scale; @@ -2457,9 +2771,78 @@ void idft256(int16_t *x,int16_t *y,unsigned int *scale) } +#ifdef TWIDDLE_Q14 +static const int16_t tw512[512] __attribute__((aligned(32))) = { + 16384, 0, 16383, -201, 16379, -402, 16373, -603, + 
16364, -804, 16353, -1005, 16340, -1205, 16324, -1406, + 16305, -1606, 16284, -1806, 16261, -2006, 16235, -2205, + 16207, -2404, 16176, -2603, 16143, -2801, 16107, -2999, + 16069, -3196, 16029, -3393, 15986, -3590, 15941, -3786, + 15893, -3981, 15843, -4176, 15791, -4370, 15736, -4563, + 15679, -4756, 15619, -4948, 15557, -5139, 15493, -5330, + 15426, -5520, 15357, -5708, 15286, -5897, 15213, -6084, + 15137, -6270, 15059, -6455, 14978, -6639, 14896, -6823, + 14811, -7005, 14724, -7186, 14635, -7366, 14543, -7545, + 14449, -7723, 14354, -7900, 14256, -8076, 14155, -8250, + 14053, -8423, 13949, -8595, 13842, -8765, 13733, -8935, + 13623, -9102, 13510, -9269, 13395, -9434, 13279, -9598, + 13160, -9760, 13039, -9921, 12916, -10080, 12792, -10238, + 12665, -10394, 12537, -10549, 12406, -10702, 12274, -10853, + 12140, -11003, 12004, -11151, 11866, -11297, 11727, -11442, + 11585, -11585, 11442, -11727, 11297, -11866, 11151, -12004, + 11003, -12140, 10853, -12274, 10702, -12406, 10549, -12537, + 10394, -12665, 10238, -12792, 10080, -12916, 9921, -13039, + 9760, -13160, 9598, -13279, 9434, -13395, 9269, -13510, + 9102, -13623, 8935, -13733, 8765, -13842, 8595, -13949, + 8423, -14053, 8250, -14155, 8076, -14256, 7900, -14354, + 7723, -14449, 7545, -14543, 7366, -14635, 7186, -14724, + 7005, -14811, 6823, -14896, 6639, -14978, 6455, -15059, + 6270, -15137, 6084, -15213, 5897, -15286, 5708, -15357, + 5520, -15426, 5330, -15493, 5139, -15557, 4948, -15619, + 4756, -15679, 4563, -15736, 4370, -15791, 4176, -15843, + 3981, -15893, 3786, -15941, 3590, -15986, 3393, -16029, + 3196, -16069, 2999, -16107, 2801, -16143, 2603, -16176, + 2404, -16207, 2205, -16235, 2006, -16261, 1806, -16284, + 1606, -16305, 1406, -16324, 1205, -16340, 1005, -16353, + 804, -16364, 603, -16373, 402, -16379, 201, -16383, + 0, -16384, -201, -16383, -402, -16379, -603, -16373, + -804, -16364, -1005, -16353, -1205, -16340, -1406, -16324, + -1606, -16305, -1806, -16284, -2006, -16261, -2205, -16235, + -2404, 
-16207, -2603, -16176, -2801, -16143, -2999, -16107, + -3196, -16069, -3393, -16029, -3590, -15986, -3786, -15941, + -3981, -15893, -4176, -15843, -4370, -15791, -4563, -15736, + -4756, -15679, -4948, -15619, -5139, -15557, -5330, -15493, + -5520, -15426, -5708, -15357, -5897, -15286, -6084, -15213, + -6270, -15137, -6455, -15059, -6639, -14978, -6823, -14896, + -7005, -14811, -7186, -14724, -7366, -14635, -7545, -14543, + -7723, -14449, -7900, -14354, -8076, -14256, -8250, -14155, + -8423, -14053, -8595, -13949, -8765, -13842, -8935, -13733, + -9102, -13623, -9269, -13510, -9434, -13395, -9598, -13279, + -9760, -13160, -9921, -13039, -10080, -12916, -10238, -12792, + -10394, -12665, -10549, -12537, -10702, -12406, -10853, -12274, + -11003, -12140, -11151, -12004, -11297, -11866, -11442, -11727, + -11585, -11585, -11727, -11442, -11866, -11297, -12004, -11151, + -12140, -11003, -12274, -10853, -12406, -10702, -12537, -10549, + -12665, -10394, -12792, -10238, -12916, -10080, -13039, -9921, + -13160, -9760, -13279, -9598, -13395, -9434, -13510, -9269, + -13623, -9102, -13733, -8935, -13842, -8765, -13949, -8595, + -14053, -8423, -14155, -8250, -14256, -8076, -14354, -7900, + -14449, -7723, -14543, -7545, -14635, -7366, -14724, -7186, + -14811, -7005, -14896, -6823, -14978, -6639, -15059, -6455, + -15137, -6270, -15213, -6084, -15286, -5897, -15357, -5708, + -15426, -5520, -15493, -5330, -15557, -5139, -15619, -4948, + -15679, -4756, -15736, -4563, -15791, -4370, -15843, -4176, + -15893, -3981, -15941, -3786, -15986, -3590, -16029, -3393, + -16069, -3196, -16107, -2999, -16143, -2801, -16176, -2603, + -16207, -2404, -16235, -2205, -16261, -2006, -16284, -1806, + -16305, -1606, -16324, -1406, -16340, -1205, -16353, -1005, + -16364, -804, -16373, -603, -16379, -402, -16383, -201, +}; +#else static const int16_t tw512[512] __attribute__((aligned(32))) = { 
32767,0,32764,-403,32757,-805,32744,-1207,32727,-1608,32705,-2010,32678,-2411,32646,-2812,32609,-3212,32567,-3612,32520,-4012,32468,-4410,32412,-4808,32350,-5206,32284,-5602,32213,-5998,32137,-6393,32056,-6787,31970,-7180,31880,-7572,31785,-7962,31684,-8352,31580,-8740,31470,-9127,31356,-9512,31236,-9896,31113,-10279,30984,-10660,30851,-11039,30713,-11417,30571,-11793,30424,-12167,30272,-12540,30116,-12910,29955,-13279,29790,-13646,29621,-14010,29446,-14373,29268,-14733,29085,-15091,28897,-15447,28706,-15800,28510,-16151,28309,-16500,28105,-16846,27896,-17190,27683,-17531,27466,-17869,27244,-18205,27019,-18538,26789,-18868,26556,-19195,26318,-19520,26077,-19841,25831,-20160,25582,-20475,25329,-20788,25072,-21097,24811,-21403,24546,-21706,24278,-22005,24006,-22302,23731,-22595,23452,-22884,23169,-23170,22883,-23453,22594,-23732,22301,-24007,22004,-24279,21705,-24547,21402,-24812,21096,-25073,20787,-25330,20474,-25583,20159,-25832,19840,-26078,19519,-26319,19194,-26557,18867,-26790,18537,-27020,18204,-27245,17868,-27467,17530,-27684,17189,-27897,16845,-28106,16499,-28310,16150,-28511,15799,-28707,15446,-28898,15090,-29086,14732,-29269,14372,-29447,14009,-29622,13645,-29791,13278,-29956,12909,-30117,12539,-30273,12166,-30425,11792,-30572,11416,-30714,11038,-30852,10659,-30985,10278,-31114,9895,-31237,9511,-31357,9126,-31471,8739,-31581,8351,-31685,7961,-31786,7571,-31881,7179,-31971,6786,-32057,6392,-32138,5997,-32214,5601,-32285,5205,-32351,4807,-32413,4409,-32469,4011,-32521,3611,-32568,3211,-32610,2811,-32647,2410,-32679,2009,-32706,1607,-32728,1206,-32745,804,-32758,402,-32765,0,-32767,-403,-32765,-805,-32758,-1207,-32745,-1608,-32728,-2010,-32706,-2411,-32679,-2812,-32647,-3212,-32610,-3612,-32568,-4012,-32521,-4410,-32469,-4808,-32413,-5206,-32351,-5602,-32285,-5998,-32214,-6393,-32138,-6787,-32057,-7180,-31971,-7572,-31881,-7962,-31786,-8352,-31685,-8740,-31581,-9127,-31471,-9512,-31357,-9896,-31237,-10279,-31114,-10660,-30985,-11039,-30852,-11417,-30714,-11793,
-30572,-12167,-30425,-12540,-30273,-12910,-30117,-13279,-29956,-13646,-29791,-14010,-29622,-14373,-29447,-14733,-29269,-15091,-29086,-15447,-28898,-15800,-28707,-16151,-28511,-16500,-28310,-16846,-28106,-17190,-27897,-17531,-27684,-17869,-27467,-18205,-27245,-18538,-27020,-18868,-26790,-19195,-26557,-19520,-26319,-19841,-26078,-20160,-25832,-20475,-25583,-20788,-25330,-21097,-25073,-21403,-24812,-21706,-24547,-22005,-24279,-22302,-24007,-22595,-23732,-22884,-23453,-23170,-23170,-23453,-22884,-23732,-22595,-24007,-22302,-24279,-22005,-24547,-21706,-24812,-21403,-25073,-21097,-25330,-20788,-25583,-20475,-25832,-20160,-26078,-19841,-26319,-19520,-26557,-19195,-26790,-18868,-27020,-18538,-27245,-18205,-27467,-17869,-27684,-17531,-27897,-17190,-28106,-16846,-28310,-16500,-28511,-16151,-28707,-15800,-28898,-15447,-29086,-15091,-29269,-14733,-29447,-14373,-29622,-14010,-29791,-13646,-29956,-13279,-30117,-12910,-30273,-12540,-30425,-12167,-30572,-11793,-30714,-11417,-30852,-11039,-30985,-10660,-31114,-10279,-31237,-9896,-31357,-9512,-31471,-9127,-31581,-8740,-31685,-8352,-31786,-7962,-31881,-7572,-31971,-7180,-32057,-6787,-32138,-6393,-32214,-5998,-32285,-5602,-32351,-5206,-32413,-4808,-32469,-4410,-32521,-4012,-32568,-3612,-32610,-3212,-32647,-2812,-32679,-2411,-32706,-2010,-32728,-1608,-32745,-1207,-32758,-805,-32765,-403 }; +#endif void dft512(int16_t *x,int16_t *y,unsigned int *scale) { @@ -2677,6 +3060,12 @@ void idft512(int16_t *x,int16_t *y,unsigned int *scale) } } +const static float tw1024f[24] __attribute__((aligned(32))) = { + 1., 0., 0.99998118, -0.00613588, 0.9999247 , -0.01227154, 0.99983058, -0.01840673, + 1., 0., 0.9999247 , -0.01227154, 0.99969882, -0.02454123, 0.99932238, -0.03680722, + 1., 0., 0.99983058, -0.01840673, 0.99932238, -0.03680722, 0.99847558, -0.05519524, +}; + int16_t tw1024[1536] __attribute__((aligned(32))); void dft1024(int16_t *x,int16_t *y,unsigned int *scale) @@ -2684,13 +3073,11 @@ void dft1024(int16_t *x,int16_t *y,unsigned int 
*scale) simd256_q15_t xtmp[128],ytmp[128],*tw1024_256p=(simd256_q15_t *)tw1024,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y; simd256_q15_t *ytmpp = &ytmp[0]; - int i,j; - for (i=0,j=0; i<128; i+=4,j++) { + for (int_fast32_t i = 0, j = 0; i < 128; i += 4, j++) { transpose16_ooff_simd256(x256+i,xtmp+j,32); } - unsigned int *scale256=NULL; if (scale) scale256=scale+1; dft256((int16_t*)(xtmp),(int16_t*)(ytmp),scale256); @@ -2698,10 +3085,34 @@ void dft1024(int16_t *x,int16_t *y,unsigned int *scale) dft256((int16_t*)(xtmp+64),(int16_t*)(ytmp+64),scale256); dft256((int16_t*)(xtmp+96),(int16_t*)(ytmp+96),scale256); - for (i=0; i<32; i++) { - bfly4_256(ytmpp,ytmpp+32,ytmpp+64,ytmpp+96, - y256p,y256p+32,y256p+64,y256p+96, - tw1024_256p,tw1024_256p+32,tw1024_256p+64); + bfly4_256(ytmpp, + ytmpp + 32, + ytmpp + 64, + ytmpp + 96, + y256p, + y256p + 32, + y256p + 64, + y256p + 96, + tw1024_256p, + tw1024_256p + 32, + tw1024_256p + 64, + (simde__m256 *)tw1024f); + tw1024_256p++; + y256p++; + ytmpp++; + for (int_fast8_t i = 0; i < 31; i++) { + bfly4_256(ytmpp, + ytmpp + 32, + ytmpp + 64, + ytmpp + 96, + y256p, + y256p + 32, + y256p + 64, + y256p + 96, + tw1024_256p, + tw1024_256p + 32, + tw1024_256p + 64, + NULL); tw1024_256p++; y256p++; ytmpp++; @@ -2709,7 +3120,7 @@ void dft1024(int16_t *x,int16_t *y,unsigned int *scale) if (scale && *scale>0) { unsigned int scalec=*scale; - for (i=0; i<8; i++) { + for (int_fast8_t i = 0; i < 8; i++) { y256[0] = shiftright_int16_simd256(y256[0],scalec); y256[1] = shiftright_int16_simd256(y256[1],scalec); y256[2] = shiftright_int16_simd256(y256[2],scalec); @@ -2729,7 +3140,6 @@ void dft1024(int16_t *x,int16_t *y,unsigned int *scale) y256+=16; } - } } @@ -3014,6 +3424,12 @@ void idft2048(int16_t *x,int16_t *y,unsigned int *scale) } +const static float tw4096f[24] __attribute__((aligned(32))) = { + 1., 0., 0.99999882, -0.00153398, 0.99999529, -0.00306796, 0.99998941, -0.00460193, + 1., 0., 0.99999529, -0.00306796, 
0.99998118, -0.00613588, 0.99995764, -0.00920375, + 1., 0., 0.99998941, -0.00460193, 0.99995764, -0.00920375, 0.9999047 , -0.01380539, +}; + int16_t tw4096[3*2*1024]; void dft4096(int16_t *x,int16_t *y,unsigned int *scale) @@ -3021,13 +3437,11 @@ void dft4096(int16_t *x,int16_t *y,unsigned int *scale) simd256_q15_t xtmp[512],ytmp[512],*tw4096_256p=(simd256_q15_t *)tw4096,*x256=(simd256_q15_t *)x,*y256=(simd256_q15_t *)y,*y256p=(simd256_q15_t *)y; simd256_q15_t *ytmpp = &ytmp[0]; - int i,j; - for (i=0,j=0; i<512; i+=4,j++) { + for (int_fast16_t i = 0, j = 0; i < 512; i += 4, j++) { transpose16_ooff_simd256(x256+i,xtmp+j,128); } - unsigned int *scale1024=NULL; if (scale) scale1024=scale+1; dft1024((int16_t*)(xtmp),(int16_t*)(ytmp),scale1024); @@ -3035,10 +3449,34 @@ void dft4096(int16_t *x,int16_t *y,unsigned int *scale) dft1024((int16_t*)(xtmp+256),(int16_t*)(ytmp+256),scale1024); dft1024((int16_t*)(xtmp+384),(int16_t*)(ytmp+384),scale1024); - for (i=0; i<128; i++) { - bfly4_256(ytmpp,ytmpp+128,ytmpp+256,ytmpp+384, - y256p,y256p+128,y256p+256,y256p+384, - tw4096_256p,tw4096_256p+128,tw4096_256p+256); + bfly4_256(ytmpp, + ytmpp + 128, + ytmpp + 256, + ytmpp + 384, + y256p, + y256p + 128, + y256p + 256, + y256p + 384, + tw4096_256p, + tw4096_256p + 128, + tw4096_256p + 256, + (simde__m256 *)tw4096f); + tw4096_256p++; + y256p++; + ytmpp++; + for (int_fast16_t i = 1; i < 128; i++) { + bfly4_256(ytmpp, + ytmpp + 128, + ytmpp + 256, + ytmpp + 384, + y256p, + y256p + 128, + y256p + 256, + y256p + 384, + tw4096_256p, + tw4096_256p + 128, + tw4096_256p + 256, + NULL); tw4096_256p++; y256p++; ytmpp++; @@ -3046,7 +3484,7 @@ void dft4096(int16_t *x,int16_t *y,unsigned int *scale) if (scale && *scale>0) { unsigned int scalec=*scale; - for (i=0; i<32; i++) { + for (int_fast8_t i = 0; i < 32; i++) { y256[0] = shiftright_int16_simd256(y256[0],scalec); y256[1] = shiftright_int16_simd256(y256[1],scalec); y256[2] = shiftright_int16_simd256(y256[2],scalec); @@ -3066,7 +3504,6 @@ void 
dft4096(int16_t *x,int16_t *y,unsigned int *scale) y256+=16; } - } } @@ -3375,9 +3812,18 @@ void dft16384(int16_t *x,int16_t *y,unsigned int *scale) dft4096((int16_t*)(xtmp+1536),(int16_t*)(ytmp+1536),scale4096); for (i=0; i<512; i++) { - bfly4_256(ytmpp,ytmpp+512,ytmpp+1024,ytmpp+1536, - y256p,y256p+512,y256p+1024,y256p+1536, - tw16384_256p,tw16384_256p+512,tw16384_256p+1024); + bfly4_256(ytmpp, + ytmpp + 512, + ytmpp + 1024, + ytmpp + 1536, + y256p, + y256p + 512, + y256p + 1024, + y256p + 1536, + tw16384_256p, + tw16384_256p + 512, + tw16384_256p + 1024, + NULL); tw16384_256p++; y256p++; ytmpp++;