diff --git a/openair1/PHY/LTE_UE_TRANSPORT/transport_ue.h b/openair1/PHY/LTE_UE_TRANSPORT/transport_ue.h
index 6b84a5f9693981e816767a68a8e09704d83895c3..b4ae44f6eab8b663a98de22b151338385c76bf33 100644
--- a/openair1/PHY/LTE_UE_TRANSPORT/transport_ue.h
+++ b/openair1/PHY/LTE_UE_TRANSPORT/transport_ue.h
@@ -155,8 +155,6 @@ typedef struct {
   uint8_t h[MAX_NUM_CHANNEL_BITS];
   /// Scrambled "b"-sequences (for definition see 36-211 V8.6 2009-03, p.14)
   uint8_t b_tilde[MAX_NUM_CHANNEL_BITS];
-  /// Modulated "d"-sequences (for definition see 36-211 V8.6 2009-03, p.14)
-  int32_t d[MAX_NUM_RE];
   /// Transform-coded "z"-sequences (for definition see 36-211 V8.6 2009-03, p.14-15)
   int32_t z[MAX_NUM_RE];
   /// "q" sequences for CQI/PMI (for definition see 36-212 V8.6 2009-03, p.27)
diff --git a/openair1/PHY/LTE_UE_TRANSPORT/ulsch_modulation.c b/openair1/PHY/LTE_UE_TRANSPORT/ulsch_modulation.c
index 7ec3e7bfb684dac869dc84bc04e4bb924995117b..6a663e89404e5a91a0187a20f52c4bf152863d16 100644
--- a/openair1/PHY/LTE_UE_TRANSPORT/ulsch_modulation.c
+++ b/openair1/PHY/LTE_UE_TRANSPORT/ulsch_modulation.c
@@ -42,7 +42,7 @@
 
 //#define DEBUG_ULSCH_MODULATION
 
-void dft_lte(int32_t *z,int32_t *d, int32_t Msc_PUSCH, uint8_t Nsymb)
+void dft_lte(int32_t *z,struct complex16 *input, int32_t Msc_PUSCH, uint8_t Nsymb)
 {
 
 #if defined(__x86_64__) || defined(__i386__)
@@ -66,7 +66,7 @@ void dft_lte(int32_t *z,int32_t *d, int32_t Msc_PUSCH, uint8_t Nsymb)
 #endif
   //  printf("Doing lte_dft for Msc_PUSCH %d\n",Msc_PUSCH);
 
-  d0 = (uint32_t *)d;
+  d0 = (uint32_t *)input;
   d1 = d0+Msc_PUSCH;
   d2 = d1+Msc_PUSCH;
   d3 = d2+Msc_PUSCH;
@@ -476,7 +476,8 @@ void ulsch_modulation(int32_t **txdataF,
   // Modulation
 
   ulsch_Msymb = G/Q_m;
-
+  /// Modulated "d"-sequences (for definition see 36-211 V8.6 2009-03, p.14)
+  struct complex16 d[MAX_NUM_RE] __attribute__((aligned(32)));
   if(ulsch->cooperation_flag == 2)
     // For Distributed Alamouti Scheme in Collabrative Communication
   {
@@ -488,14 +489,14 @@ void ulsch_modulation(int32_t **txdataF,
 
 
         //UE1, -x1*
-        ((int16_t*)&ulsch->d[i])[0] = (ulsch->b_tilde[j] == 1)  ? (gain_lin_QPSK) : -gain_lin_QPSK;
-        ((int16_t*)&ulsch->d[i])[1] = (ulsch->b_tilde[j+1] == 1)? (-gain_lin_QPSK) : gain_lin_QPSK;
+        d[i].r = (ulsch->b_tilde[j] == 1)  ? (gain_lin_QPSK) : -gain_lin_QPSK;
+        d[i].i = (ulsch->b_tilde[j+1] == 1)? (-gain_lin_QPSK) : gain_lin_QPSK;
         //      if (i<Msc_PUSCH)
         //  printf("input %d (%p): %d,%d\n", i,&ulsch->d[i],((int16_t*)&ulsch->d[i])[0],((int16_t*)&ulsch->d[i])[1]);
 
         // UE1, x0*
-        ((int16_t*)&ulsch->d[i+1])[0] = (ulsch->b_tilde[j-2] == 1)  ? (-gain_lin_QPSK) : gain_lin_QPSK;
-        ((int16_t*)&ulsch->d[i+1])[1] = (ulsch->b_tilde[j-1] == 1)? (gain_lin_QPSK) : -gain_lin_QPSK;
+        d[i+1].r = (ulsch->b_tilde[j-2] == 1)  ? (-gain_lin_QPSK) : gain_lin_QPSK;
+        d[i+1].i = (ulsch->b_tilde[j-1] == 1)? (gain_lin_QPSK) : -gain_lin_QPSK;
 
         break;
 
@@ -521,8 +522,8 @@ void ulsch_modulation(int32_t **txdataF,
           qam16_table_offset_im+=1;
 
 
-        ((int16_t*)&ulsch->d[i])[0]=-(int16_t)(((int32_t)amp*qam16_table[qam16_table_offset_re])>>15);
-        ((int16_t*)&ulsch->d[i])[1]=(int16_t)(((int32_t)amp*qam16_table[qam16_table_offset_im])>>15);
+        d[i].r =-(int16_t)(((int32_t)amp*qam16_table[qam16_table_offset_re])>>15);
+        d[i].i =(int16_t)(((int32_t)amp*qam16_table[qam16_table_offset_im])>>15);
 
         //UE1,x0*
         qam16_table_offset_re = 0;
@@ -544,8 +545,8 @@ void ulsch_modulation(int32_t **txdataF,
 
         //    ((int16_t*)&ulsch->d[i+1])[0]=-(int16_t)(((int32_t)amp*qam16_table[qam16_table_offset_re])>>15);
         //    ((int16_t*)&ulsch->d[i+1])[1]=(int16_t)(((int32_t)amp*qam16_table[qam16_table_offset_im])>>15);
-        ((int16_t*)&ulsch->d[i+1])[0]=(int16_t)(((int32_t)amp*qam16_table[qam16_table_offset_re])>>15);
-        ((int16_t*)&ulsch->d[i+1])[1]=-(int16_t)(((int32_t)amp*qam16_table[qam16_table_offset_im])>>15);
+        d[i+1].r=(int16_t)(((int32_t)amp*qam16_table[qam16_table_offset_re])>>15);
+        d[i+1].i=-(int16_t)(((int32_t)amp*qam16_table[qam16_table_offset_im])>>15);
 
 
         break;
@@ -578,8 +579,8 @@ void ulsch_modulation(int32_t **txdataF,
           qam64_table_offset_im+=1;
 
 
-        ((int16_t*)&ulsch->d[i])[0]=-(int16_t)(((int32_t)amp*qam64_table[qam64_table_offset_re])>>15);
-        ((int16_t*)&ulsch->d[i])[1]=(int16_t)(((int32_t)amp*qam64_table[qam64_table_offset_im])>>15);
+        d[i].r=-(int16_t)(((int32_t)amp*qam64_table[qam64_table_offset_re])>>15);
+        d[i].i=(int16_t)(((int32_t)amp*qam64_table[qam64_table_offset_im])>>15);
 
         //UE1,x0*
         qam64_table_offset_re = 0;
@@ -605,8 +606,8 @@ void ulsch_modulation(int32_t **txdataF,
           qam64_table_offset_im+=1;
 
 
-        ((int16_t*)&ulsch->d[i+1])[0]=(int16_t)(((int32_t)amp*qam64_table[qam64_table_offset_re])>>15);
-        ((int16_t*)&ulsch->d[i+1])[1]=-(int16_t)(((int32_t)amp*qam64_table[qam64_table_offset_im])>>15);
+        d[i+1].r=(int16_t)(((int32_t)amp*qam64_table[qam64_table_offset_re])>>15);
+        d[i+1].i=-(int16_t)(((int32_t)amp*qam64_table[qam64_table_offset_im])>>15);
 
         break;
 
@@ -621,8 +622,8 @@ void ulsch_modulation(int32_t **txdataF,
       case 2:
         // TODO: this has to be updated!!!
 
-        ((int16_t*)&ulsch->d[i])[0] = (ulsch->b_tilde[j] == 1)  ? (-gain_lin_QPSK) : gain_lin_QPSK;
-        ((int16_t*)&ulsch->d[i])[1] = (ulsch->b_tilde[j+1] == 1)? (-gain_lin_QPSK) : gain_lin_QPSK;
+        d[i].r = (ulsch->b_tilde[j] == 1)  ? (-gain_lin_QPSK) : gain_lin_QPSK;
+        d[i].i = (ulsch->b_tilde[j+1] == 1)? (-gain_lin_QPSK) : gain_lin_QPSK;
         //        if (i<Msc_PUSCH)
         //    printf("input %d/%d Msc_PUSCH %d (%p): %d,%d\n", i,Msymb,Msc_PUSCH,&ulsch->d[i],((int16_t*)&ulsch->d[i])[0],((int16_t*)&ulsch->d[i])[1]);
 
@@ -646,8 +647,8 @@ void ulsch_modulation(int32_t **txdataF,
           qam16_table_offset_im+=1;
 
 
-        ((int16_t*)&ulsch->d[i])[0]=(int16_t)(((int32_t)amp*qam16_table[qam16_table_offset_re])>>15);
-        ((int16_t*)&ulsch->d[i])[1]=(int16_t)(((int32_t)amp*qam16_table[qam16_table_offset_im])>>15);
+        d[i].r=(int16_t)(((int32_t)amp*qam16_table[qam16_table_offset_re])>>15);
+        d[i].i=(int16_t)(((int32_t)amp*qam16_table[qam16_table_offset_im])>>15);
         //      printf("input(16qam) %d (%p): %d,%d\n", i,&ulsch->d[i],((int16_t*)&ulsch->d[i])[0],((int16_t*)&ulsch->d[i])[1]);
         break;
 
@@ -676,8 +677,8 @@ void ulsch_modulation(int32_t **txdataF,
           qam64_table_offset_im+=1;
 
 
-        ((int16_t*)&ulsch->d[i])[0]=(int16_t)(((int32_t)amp*qam64_table[qam64_table_offset_re])>>15);
-        ((int16_t*)&ulsch->d[i])[1]=(int16_t)(((int32_t)amp*qam64_table[qam64_table_offset_im])>>15);
+        d[i].r=(int16_t)(((int32_t)amp*qam64_table[qam64_table_offset_re])>>15);
+        d[i].i=(int16_t)(((int32_t)amp*qam64_table[qam64_table_offset_im])>>15);
 
         break;
 
@@ -688,7 +689,7 @@ void ulsch_modulation(int32_t **txdataF,
 
   // Transform Precoding
 
-  dft_lte(ulsch->z,ulsch->d,Msc_PUSCH,ulsch->Nsymb_pusch);
+  dft_lte(ulsch->z,d,Msc_PUSCH,ulsch->Nsymb_pusch);
 
   DevAssert(txdataF);
 
diff --git a/openair1/PHY/MODULATION/slot_fep.c b/openair1/PHY/MODULATION/slot_fep.c
index 8f41735f10cfa5fd4a915a7d8cb825b2305c03ba..b2c7d934c6583c88acaf82f38a8f61d543b76b02 100644
--- a/openair1/PHY/MODULATION/slot_fep.c
+++ b/openair1/PHY/MODULATION/slot_fep.c
@@ -91,7 +91,7 @@ int slot_fep(PHY_VARS_UE *ue,
   }
 
   //  subframe_offset_F = frame_parms->ofdm_symbol_size * frame_parms->symbols_per_tti * (Ns>>1);
-
+ 
   if (l<0 || l>=7-frame_parms->Ncp) {
     printf("slot_fep: l must be between 0 and %d\n",7-frame_parms->Ncp);
     return(-1);
diff --git a/openair1/PHY/TOOLS/oai_dfts.c b/openair1/PHY/TOOLS/oai_dfts.c
index c266a7f82bc41dfae2f9f1dee5b276709c842a94..c2b6a55210822fd4e066adf7888d4463309f164d 100644
--- a/openair1/PHY/TOOLS/oai_dfts.c
+++ b/openair1/PHY/TOOLS/oai_dfts.c
@@ -7082,43 +7082,40 @@ static inline void dft12f(simd_q15_t *x0,
 
   simd_q15_t tmp_dft12[12];
 
-  simd_q15_t *tmp_dft12_ptr = &tmp_dft12[0];
-
   // msg("dft12\n");
 
   bfly4_tw1(x0,
             x3,
             x6,
             x9,
-            tmp_dft12_ptr,
-            tmp_dft12_ptr+3,
-            tmp_dft12_ptr+6,
-            tmp_dft12_ptr+9);
-
+            tmp_dft12,
+            tmp_dft12+3,
+            tmp_dft12+6,
+            tmp_dft12+9);
 
   bfly4_tw1(x1,
             x4,
             x7,
             x10,
-            tmp_dft12_ptr+1,
-            tmp_dft12_ptr+4,
-            tmp_dft12_ptr+7,
-            tmp_dft12_ptr+10);
+            tmp_dft12+1,
+            tmp_dft12+4,
+            tmp_dft12+7,
+            tmp_dft12+10);
 
 
   bfly4_tw1(x2,
             x5,
             x8,
             x11,
-            tmp_dft12_ptr+2,
-            tmp_dft12_ptr+5,
-            tmp_dft12_ptr+8,
-            tmp_dft12_ptr+11);
+            tmp_dft12+2,
+            tmp_dft12+5,
+            tmp_dft12+8,
+            tmp_dft12+11);
 
   //  k2=0;
-  bfly3_tw1(tmp_dft12_ptr,
-            tmp_dft12_ptr+1,
-            tmp_dft12_ptr+2,
+  bfly3_tw1(tmp_dft12,
+            tmp_dft12+1,
+            tmp_dft12+2,
             y0,
             y4,
             y8);
@@ -7126,9 +7123,9 @@ static inline void dft12f(simd_q15_t *x0,
 
 
   //  k2=1;
-  bfly3(tmp_dft12_ptr+3,
-        tmp_dft12_ptr+4,
-        tmp_dft12_ptr+5,
+  bfly3(tmp_dft12+3,
+        tmp_dft12+4,
+        tmp_dft12+5,
         y1,
         y5,
         y9,
@@ -7138,9 +7135,9 @@ static inline void dft12f(simd_q15_t *x0,
 
 
   //  k2=2;
-  bfly3(tmp_dft12_ptr+6,
-        tmp_dft12_ptr+7,
-        tmp_dft12_ptr+8,
+  bfly3(tmp_dft12+6,
+        tmp_dft12+7,
+        tmp_dft12+8,
         y2,
         y6,
         y10,
@@ -7148,9 +7145,9 @@ static inline void dft12f(simd_q15_t *x0,
         W4_12);
 
   //  k2=3;
-  bfly3(tmp_dft12_ptr+9,
-        tmp_dft12_ptr+10,
-        tmp_dft12_ptr+11,
+  bfly3(tmp_dft12+9,
+        tmp_dft12+10,
+        tmp_dft12+11,
         y3,
         y7,
         y11,
@@ -10607,11 +10604,19 @@ int dfts_autoinit(void)
 #ifndef MR_MAIN
 
 void dft(uint8_t sizeidx, int16_t *input,int16_t *output,unsigned char scale_flag){
-	AssertFatal((sizeidx>=0 && sizeidx<DFT_SIZE_IDXTABLESIZE),"Invalid dft size index %i\n",sizeidx);
-        AssertFatal( ((intptr_t)output&0x1F)==0,"Buffers should be 32 bytes aligned %p",output);
-        if ((intptr_t)input&0x1F) {
+	AssertFatal((sizeidx >= 0 && sizeidx<DFT_SIZE_IDXTABLESIZE),"Invalid dft size index %i\n",sizeidx);
+        int algn=0xF;
+        #ifdef __AVX2__
+        if ( (dft_ftab[sizeidx].size%3) != 0 ) // there is no AVX2 implementation for multiples of 3 DFTs
+          algn=0x1F;
+        #endif 
+        AssertFatal(((intptr_t)output&algn)==0,"Buffers should be aligned %p",output);
+        if (((intptr_t)input)&algn) {
           LOG_D(PHY, "DFT called with input not aligned, add a memcpy, size %d\n", sizeidx);
-          int16_t tmp[dft_ftab[sizeidx].size*2] __attribute__ ((aligned(32))); // input and output are not in right type (int16_t instead of c16_t)
+          int sz=dft_ftab[sizeidx].size;
+          if (sizeidx==DFT_12) // This case does 8 DFTs in //
+            sz*=8;
+          int16_t tmp[sz*2] __attribute__ ((aligned(32))); // input and output are not in right type (int16_t instead of c16_t)
           memcpy(tmp, input, sizeof tmp);
           dft_ftab[sizeidx].func(tmp,output,scale_flag);
         } else
@@ -10620,10 +10625,15 @@ void dft(uint8_t sizeidx, int16_t *input,int16_t *output,unsigned char scale_fla
 
 void idft(uint8_t sizeidx, int16_t *input,int16_t *output,unsigned char scale_flag){
 	AssertFatal((sizeidx>=0 && sizeidx<DFT_SIZE_IDXTABLESIZE),"Invalid idft size index %i\n",sizeidx);
-        AssertFatal( ((intptr_t)output&0x1F)==0,"Buffers should be 32 bytes aligned %p",output);
-        if ((intptr_t)input&0x1F) {
+        int algn=0xF;
+        #ifdef __AVX2__
+          algn=0x1F;
+        #endif
+        AssertFatal( ((intptr_t)output&algn)==0,"Buffers should be 16 bytes aligned %p",output);
+        if (((intptr_t)input)&algn ) {  
           LOG_D(PHY, "DFT called with input not aligned, add a memcpy\n");
-          int16_t tmp[idft_ftab[sizeidx].size*2] __attribute__ ((aligned(32))); // input and output are not in right type (int16_t instead of c16_t)
+          int sz=idft_ftab[sizeidx].size;
+          int16_t tmp[sz*2] __attribute__ ((aligned(32))); // input and output are not in right type (int16_t instead of c16_t)
           memcpy(tmp, input, sizeof tmp);
           dft_ftab[sizeidx].func(tmp,output,scale_flag);
         } else
diff --git a/openair1/PHY/TOOLS/tools_defs.h b/openair1/PHY/TOOLS/tools_defs.h
index 7cee04fc59b78659c8de7edbd4bf104dd9b8d152..86bce9614fd442ef47ad3670901c477c17d21378 100644
--- a/openair1/PHY/TOOLS/tools_defs.h
+++ b/openair1/PHY/TOOLS/tools_defs.h
@@ -253,7 +253,7 @@ This function performs optimized fixed-point radix-2 FFT/IFFT.
   SZ_DEF(73728) \
   SZ_DEF(98304)
 
-#define FOREACH_IDFTSZ(SZ_DEF)                  \
+#define FOREACH_IDFTSZ(SZ_DEF)\
   SZ_DEF(64) \
   SZ_DEF(128) \
   SZ_DEF(256) \