diff --git a/common/utils/LOG/log.c b/common/utils/LOG/log.c
index 4605df06e00a690895b90d6acc6c13cd62ff3e8b..043631d706a710f7384ba5897f5dc7160d6c68ec 100644
--- a/common/utils/LOG/log.c
+++ b/common/utils/LOG/log.c
@@ -48,6 +48,10 @@
 
 // main log variables
 
+// Fixme: a better place to be shure it is called 
+void read_cpu_hardware (void) __attribute__ ((constructor));
+void read_cpu_hardware (void) {__builtin_cpu_init(); }
+
 log_mem_cnt_t log_mem_d[2];
 int log_mem_flag=0;
 int log_mem_multi=1;
diff --git a/common/utils/LOG/log.h b/common/utils/LOG/log.h
index a1b70c7503f3aaed6f527cd79a21827da45adb6c..509c4581a92f3475a0f46f3324acb13a5fa3b1a3 100644
--- a/common/utils/LOG/log.h
+++ b/common/utils/LOG/log.h
@@ -366,6 +366,7 @@ typedef struct {
 #define MATLAB_CSHORT_BRACKET3 15
   
 int32_t write_file_matlab(const char *fname, const char *vname, void *data, int length, int dec, unsigned int format, int multiVec);
+#define write_output(a, b, c, d, e, f) write_file_matlab(a, b, c, d, e, f, 0)
 
 /*----------------macro definitions for reading log configuration from the config module */
 #define CONFIG_STRING_LOG_PREFIX                           "log_config"
diff --git a/common/utils/time_meas.c b/common/utils/time_meas.c
index 9d1215b5e5b255e979401dc2fa8089c2bd903bcb..15766b5a3ff48a70b1c36b1746ce90143386f2e5 100644
--- a/common/utils/time_meas.c
+++ b/common/utils/time_meas.c
@@ -39,14 +39,14 @@ notifiedFIFO_t measur_fifo;
 double get_cpu_freq_GHz(void)
 {
   if (cpu_freq_GHz <1 ) {
-  time_stats_t ts = {0};
-  reset_meas(&ts);
-  ts.trials++;
-  ts.in = rdtsc_oai();
-  sleep(1);
-  ts.diff = (rdtsc_oai()-ts.in);
-  cpu_freq_GHz = (double)ts.diff/1000000000;
-  printf("CPU Freq is %f \n", cpu_freq_GHz);
+    time_stats_t ts = {0};
+    reset_meas(&ts);
+    ts.trials++;
+    ts.in = rdtsc_oai();
+    sleep(1);
+    ts.diff = (rdtsc_oai()-ts.in);
+    cpu_freq_GHz = (double)ts.diff/1000000000;
+    printf("CPU Freq is %f \n", cpu_freq_GHz);
   }
   return cpu_freq_GHz;
 }
diff --git a/openair1/PHY/INIT/lte_init.c b/openair1/PHY/INIT/lte_init.c
index c92d926a8f321f18037392a5659ab60dc9eb605c..8e351a15f20edd1c3bf5294deb2e461ed3b84a55 100644
--- a/openair1/PHY/INIT/lte_init.c
+++ b/openair1/PHY/INIT/lte_init.c
@@ -475,9 +475,7 @@ int phy_init_lte_eNB(PHY_VARS_eNB *eNB,
       srs_vars[srs_id].srs = (int32_t *) malloc16_clear (2 * fp->ofdm_symbol_size * sizeof (int32_t));
     }
 
-    LOG_I(PHY,"PRACH allocation\n");
     // PRACH
-    prach_vars->prachF = (int16_t *) malloc16_clear (1024 * 2 * sizeof (int16_t));
     // assume maximum of 64 RX antennas for PRACH receiver
     prach_vars->prach_ifft[0] = (int32_t **) malloc16_clear (64 * sizeof (int32_t *));
 
@@ -485,8 +483,6 @@ int phy_init_lte_eNB(PHY_VARS_eNB *eNB,
       prach_vars->prach_ifft[0][i] = (int32_t *) malloc16_clear (1024 * 2 * sizeof (int32_t));
 
     prach_vars->rxsigF[0] = (int16_t **) malloc16_clear (64 * sizeof (int16_t *));
-    // PRACH BR
-    prach_vars_br->prachF = (int16_t *)malloc16_clear( 1024*2*sizeof(int32_t) );
 
     // assume maximum of 64 RX antennas for PRACH receiver
     for (int ce_level = 0; ce_level < 4; ce_level++) {
@@ -576,8 +572,6 @@ void phy_free_lte_eNB(PHY_VARS_eNB *eNB) {
 
   for (UE_id=0; UE_id<NUMBER_OF_SRS_MAX; UE_id++) free_and_zero(srs_vars[UE_id].srs);
 
-  free_and_zero(prach_vars->prachF);
-
   for (i = 0; i < 64; i++) free_and_zero(prach_vars->prach_ifft[0][i]);
 
   free_and_zero(prach_vars->prach_ifft[0]);
@@ -589,7 +583,6 @@ void phy_free_lte_eNB(PHY_VARS_eNB *eNB) {
     free_and_zero(prach_vars->rxsigF[ce_level]);
   }
 
-  free_and_zero(prach_vars_br->prachF);
   free_and_zero(prach_vars->rxsigF[0]);
 
   for (int ULSCH_id=0; ULSCH_id<NUMBER_OF_ULSCH_MAX; ULSCH_id++) {
diff --git a/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c b/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c
index 9327fe9fe89a255249794470e4160ec440732406..45c25b59ea11187be4b57d4b08aca88a3da810c2 100644
--- a/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c
+++ b/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c
@@ -106,9 +106,9 @@ int32_t lte_ul_channel_estimation(LTE_DL_FRAME_PARMS *frame_parms,
 #ifdef DEBUG_CH
 
   if (Ns==0)
-    LOG_M("drs_seq0.m","drsseq0",ul_ref_sigs_rx[u][v][Msc_RS_idx],2*Msc_RS,2,1);
+    LOG_M("drs_seq0.m","drsseq0",ul_ref_sigs_rx[u][v][Msc_RS_idx],2*Msc_RS,2,0);
   else
-    LOG_M("drs_seq1.m","drsseq1",ul_ref_sigs_rx[u][v][Msc_RS_idx],2*Msc_RS,2,1);
+    LOG_M("drs_seq1.m","drsseq1",ul_ref_sigs_rx[u][v][Msc_RS_idx],2*Msc_RS,2,0);
 
 #endif
 
@@ -311,7 +311,7 @@ int32_t lte_ul_channel_estimation(LTE_DL_FRAME_PARMS *frame_parms,
           //      alpha = (int16_t) (((int32_t) SCALE * (int32_t) (pilot_pos2-k))/(pilot_pos2-pilot_pos1));
           //      beta  = (int16_t) (((int32_t) SCALE * (int32_t) (k-pilot_pos1))/(pilot_pos2-pilot_pos1));
 #ifdef DEBUG_CH
-          LOG_D(PHY,"lte_ul_channel_estimation: k=%d, alpha = %d, beta = %d\n",k,alpha,beta);
+          LOG_D(PHY,"lte_ul_channel_estimation: k=%d\n",k);
 #endif
           //symbol_offset_subframe = frame_parms->N_RB_UL*12*k;
 
@@ -436,9 +436,9 @@ int32_t lte_ul_channel_estimation_RRU(LTE_DL_FRAME_PARMS *frame_parms,
 #ifdef DEBUG_CH
 
   if (l==pilot_pos1)
-    write_output("drs_seq0.m","drsseq0",ul_ref_sigs_rx[u][v][Msc_RS_idx],Msc_RS,1,1);
+    write_output("drs_seq0.m","drsseq0",ul_ref_sigs_rx[u][v][Msc_RS_idx],Msc_RS,1,0);
   else
-    write_output("drs_seq1.m","drsseq1",ul_ref_sigs_rx[u][v][Msc_RS_idx],Msc_RS,1,1);
+    write_output("drs_seq1.m","drsseq1",ul_ref_sigs_rx[u][v][Msc_RS_idx],Msc_RS,1,0);
 
 #endif
   symbol_offset = frame_parms->N_RB_UL*12*l;
diff --git a/openair1/PHY/LTE_TRANSPORT/phich.c b/openair1/PHY/LTE_TRANSPORT/phich.c
index 1725588f4c228f0ebdabb87926afeff7efce8969..944e8fe18f136fda9ba2007d073bb712547c527d 100644
--- a/openair1/PHY/LTE_TRANSPORT/phich.c
+++ b/openair1/PHY/LTE_TRANSPORT/phich.c
@@ -74,7 +74,8 @@ void generate_phich(LTE_DL_FRAME_PARMS *frame_parms,
   uint32_t subframe_offset=((frame_parms->Ncp==0)?14:12)*frame_parms->ofdm_symbol_size*subframe;
 
   memset(d,0,24*sizeof(int16_t));
-
+  const int nushift=frame_parms->nushift;
+  
   if (frame_parms->nb_antenna_ports_eNB==1)
     gain_lin_QPSK = (int16_t)(((int32_t)amp*ONE_OVER_SQRT2_Q15)>>15);
   else
@@ -245,7 +246,7 @@ void generate_phich(LTE_DL_FRAME_PARMS *frame_parms,
       y1_16[7] = -y0_16[5];
 
       for (i=0,j=0,m=0; i<6; i++,j+=2) {
-        if ((i!=(frame_parms->nushift))&&(i!=(frame_parms->nushift+3))) {
+        if ((i!=(nushift))&&(i!=((nushift+3)%6))) {
           y0[j]   += y0_16[m];
           y1[j]   += y1_16[m++];
           y0[j+1] += y0_16[m];
@@ -287,7 +288,7 @@ void generate_phich(LTE_DL_FRAME_PARMS *frame_parms,
       y1_16[7] = -y0_16[5];
 
       for (i=0,j=0,m=0; i<6; i++,j+=2) {
-        if ((i!=(frame_parms->nushift))&&(i!=(frame_parms->nushift+3))) {
+        if ((i!=(nushift))&&(i!=((nushift+3)%6))) {
           y0[j]   += y0_16[m];
           y1[j]   += y1_16[m++];
           y0[j+1] += y0_16[m];
@@ -329,7 +330,7 @@ void generate_phich(LTE_DL_FRAME_PARMS *frame_parms,
       y1_16[7] = -y0_16[5];
 
       for (i=0,j=0,m=0; i<6; i++,j+=2) {
-        if ((i!=(frame_parms->nushift))&&(i!=(frame_parms->nushift+3))) {
+        if ((i!=(nushift))&&(i!=((nushift+3)%6))) {
           y0[j]   += y0_16[m];
           y1[j]   += y1_16[m++];
           y0[j+1] += y0_16[m];
@@ -360,7 +361,7 @@ void generate_phich(LTE_DL_FRAME_PARMS *frame_parms,
       y0_16[7]   = d[7]*gain_lin_QPSK;
 
       for (i=0,j=0,m=0; i<6; i++,j+=2) {
-        if ((i!=(frame_parms->nushift))&&(i!=(frame_parms->nushift+3))) {
+        if ((i!=(nushift))&&(i!=((nushift+3)%6))) {
           y0[j]   += y0_16[m++];
           y0[j+1] += y0_16[m++];
         }
@@ -385,7 +386,7 @@ void generate_phich(LTE_DL_FRAME_PARMS *frame_parms,
       y0_16[7]   = d[15]*gain_lin_QPSK;
 
       for (i=0,j=0,m=0; i<6; i++,j+=2) {
-        if ((i!=(frame_parms->nushift))&&(i!=(frame_parms->nushift+3))) {
+        if ((i!=(nushift))&&(i!=((nushift+3)%6))) {
           y0[j]   += y0_16[m++];
           y0[j+1] += y0_16[m++];
         }
@@ -410,7 +411,7 @@ void generate_phich(LTE_DL_FRAME_PARMS *frame_parms,
       y0_16[7]   = d[23]*gain_lin_QPSK;
 
       for (i=0,j=0,m=0; i<6; i++,j+=2) {
-        if ((i!=(frame_parms->nushift))&&(i!=(frame_parms->nushift+3))) {
+        if ((i!=(nushift))&&(i!=((nushift+3)%6))) {
           y0[j]   += y0_16[m++];
           y0[j+1] += y0_16[m++];
         }
@@ -533,7 +534,7 @@ void generate_phich(LTE_DL_FRAME_PARMS *frame_parms,
       y1_16[7] = -y0_16[5];
 
       for (i=0,j=0,m=0; i<6; i++,j+=2) {
-        if ((i!=(frame_parms->nushift))&&(i!=(frame_parms->nushift+3))) {
+        if ((i!=(nushift))&&(i!=((nushift+3)%6))) {
           y0[j] += y0_16[m];
           y1[j] += y1_16[m++];
           y0[j+1] += y0_16[m];
@@ -644,7 +645,7 @@ void generate_phich(LTE_DL_FRAME_PARMS *frame_parms,
       y0_16[7]   = d[7]*gain_lin_QPSK;
 
       for (i=0,j=0,m=0; i<6; i++,j+=2) {
-        if ((i!=(frame_parms->nushift))&&(i!=(frame_parms->nushift+3))) {
+        if ((i!=(nushift))&&(i!=((nushift+3)%6))) {
           y0[j] += y0_16[m++];
           y0[j+1] += y0_16[m++];
         }
diff --git a/openair1/PHY/LTE_TRANSPORT/prach.c b/openair1/PHY/LTE_TRANSPORT/prach.c
index 91c419ebe39495931098001711d00ef45af97b28..6a989edc4d23a01e34fce301e11a9d55139e8ea6 100644
--- a/openair1/PHY/LTE_TRANSPORT/prach.c
+++ b/openair1/PHY/LTE_TRANSPORT/prach.c
@@ -62,9 +62,7 @@ void rx_prach0(PHY_VARS_eNB *eNB,
   uint8_t            Ncs_config;
   uint8_t            restricted_set;
   uint8_t            n_ra_prb;
-  int16_t            *prachF=NULL;
   int16_t            **rxsigF=NULL;
-  int16_t *prach2;
   uint8_t preamble_index;
   uint16_t NCS,NCS2;
   uint16_t preamble_offset=0,preamble_offset_old;
@@ -77,12 +75,10 @@ void rx_prach0(PHY_VARS_eNB *eNB,
   uint8_t not_found;
   int k=0;
   uint16_t u;
-  int16_t *Xu=0;
-  uint16_t offset;
+  c16_t *Xu=0;
   int16_t Ncp;
   uint16_t first_nonzero_root_idx=0;
   uint8_t new_dft=0;
-  uint8_t aa;
   int32_t lev;
   int16_t levdB;
   int fft_size=0,log2_ifft_size;
@@ -130,14 +126,11 @@ void rx_prach0(PHY_VARS_eNB *eNB,
                           tdd_mapindex,Nf);
   }
 
-  int16_t *prach[nb_rx];
   uint8_t prach_fmt = get_prach_fmt(prach_ConfigIndex,frame_type);
   uint16_t N_ZC = (prach_fmt <4)?839:139;
-
   if (eNB) {
     if (br_flag == 1) {
       prach_ifftp         = eNB->prach_vars_br.prach_ifft[ce_level];
-      prachF              = eNB->prach_vars_br.prachF;
       rxsigF              = eNB->prach_vars_br.rxsigF[ce_level];
 
       if (LOG_DEBUGFLAG(PRACH)) {
@@ -151,7 +144,6 @@ void rx_prach0(PHY_VARS_eNB *eNB,
       }
     } else {
       prach_ifftp       = eNB->prach_vars.prach_ifft[0];
-      prachF            = eNB->prach_vars.prachF;
       rxsigF            = eNB->prach_vars.rxsigF[0];
 
       //if (LOG_DEBUGFLAG(PRACH)) {
@@ -179,7 +171,8 @@ void rx_prach0(PHY_VARS_eNB *eNB,
 
   AssertFatal(ru!=NULL,"ru is null\n");
   int8_t dBEn0=0;
-  for (aa=0; aa<nb_rx; aa++) {
+  int16_t *prach[nb_rx];
+  for (int aa=0; aa<nb_rx; aa++) {
     if (ru->if_south == LOCAL_RF || ru->function == NGFI_RAU_IF5) { // set the time-domain signal if we have to use it in this node
       // DJP - indexing below in subframe zero takes us off the beginning of the array???
       prach[aa] = (int16_t *)&ru->common.rxdata[aa][(subframe*fp->samples_per_tti)-ru->N_TA_offset];
@@ -291,9 +284,9 @@ void rx_prach0(PHY_VARS_eNB *eNB,
       LOG_D(PHY,"rx_prach: Doing FFT for N_RB_UL %d nb_rx:%d Ncp:%d\n",fp->N_RB_UL, nb_rx, Ncp);
     }
 
-    for (aa=0; aa<nb_rx; aa++) {
+    for (int aa=0; aa<nb_rx; aa++) {
       AssertFatal(prach[aa]!=NULL,"prach[%d] is null\n",aa);
-      prach2 = prach[aa] + (Ncp<<1);
+      int16_t *prach2 = prach[aa] + (Ncp<<1); // <<1 because type int16 but input is c16
 
       // do DFT
       switch (fp->N_RB_UL) {
@@ -562,37 +555,38 @@ void rx_prach0(PHY_VARS_eNB *eNB,
 
     if (new_dft == 1) {
       new_dft = 0;
-
+      
       if (br_flag == 1) {
-        Xu=(int16_t *)eNB->X_u_br[ce_level][preamble_offset-first_nonzero_root_idx];
+        Xu=(c16_t *)eNB->X_u_br[ce_level][preamble_offset-first_nonzero_root_idx];
         prach_ifft = prach_ifftp[prach_ifft_cnt++];
-
+        
         if (eNB->prach_vars_br.repetition_number[ce_level]==1) memset(prach_ifft,0,((N_ZC==839)?2048:256)*sizeof(int32_t));
       } else {
-        Xu=(int16_t *)eNB->X_u[preamble_offset-first_nonzero_root_idx];
+        Xu=(c16_t *)eNB->X_u[preamble_offset-first_nonzero_root_idx];
         prach_ifft = prach_ifftp[0];
         memset(prach_ifft,0,((N_ZC==839) ? 2048 : 256)*sizeof(int32_t));
       }
-
-      memset(prachF, 0, sizeof(int16_t)*2*1024 );
-
-    if (LOG_DUMPFLAG(PRACH)) {
-        if (prach[0]!= NULL) LOG_M("prach_rx0.m","prach_rx0",prach[0],6144+792,1,1);
-
-        LOG_M("prach_rx1.m","prach_rx1",prach[1],6144+792,1,1);
-        LOG_M("prach_rxF0.m","prach_rxF0",rxsigF[0],12288,1,1);
-        LOG_M("prach_rxF1.m","prach_rxF1",rxsigF[1],12288,1,1);
-    }
-
-      for (aa=0; aa<nb_rx; aa++) {
+      c16_t prachF[1024] __attribute__((aligned(32)))={0};
+      
+      if (LOG_DUMPFLAG(PRACH)) 
+        for (int z=0; z<nb_rx; z++) 
+          if( prach[z] ) {
+            char tmp[128];
+            sprintf(tmp,"prach_rx%d.m", z);
+            LOG_M(tmp,tmp,prach[z],6144+792,1,1);
+            sprintf(tmp,"prach_rxF%d.m", z);
+            LOG_M(tmp,tmp,rxsigF[z],12288,1,1);
+          }
+      
+      for (int aa=0; aa<nb_rx; aa++) {
         // Do componentwise product with Xu* on each antenna
         k=0;
-
-        for (offset=0; offset<(N_ZC<<1); offset+=2) {
-          prachF[offset]   = (int16_t)(((int32_t)Xu[offset]*rxsigF[aa][k]   + (int32_t)Xu[offset+1]*rxsigF[aa][k+1])>>15);
-          prachF[offset+1] = (int16_t)(((int32_t)Xu[offset]*rxsigF[aa][k+1] - (int32_t)Xu[offset+1]*rxsigF[aa][k])>>15);
+        
+        for (int offset=0; offset<N_ZC; offset++) {
+          prachF[offset].r = (int16_t)(((int32_t)Xu[offset].r*rxsigF[aa][k]   + (int32_t)Xu[offset].i*rxsigF[aa][k+1])>>15);
+          prachF[offset].i = (int16_t)(((int32_t)Xu[offset].r*rxsigF[aa][k+1] - (int32_t)Xu[offset].i*rxsigF[aa][k])>>15);
           k+=2;
-
+          
           if (k==(12*2*fp->ofdm_symbol_size))
             k=0;
         }
@@ -600,13 +594,13 @@ void rx_prach0(PHY_VARS_eNB *eNB,
         // Now do IFFT of size 1024 (N_ZC=839) or 256 (N_ZC=139)
         if (N_ZC == 839) {
           log2_ifft_size = 10;
-          idft(IDFT_1024,prachF,prach_ifft_tmp,1);
+          idft(IDFT_1024,(int16_t*)prachF,prach_ifft_tmp,1);
 
           // compute energy and accumulate over receive antennas and repetitions for BR
           for (i=0; i<2048; i++)
             prach_ifft[i] += (prach_ifft_tmp[i<<1]*prach_ifft_tmp[i<<1] + prach_ifft_tmp[1+(i<<1)]*prach_ifft_tmp[1+(i<<1)])>>9;
         } else {
-          idft(IDFT_256,prachF,prach_ifft_tmp,1);
+          idft(IDFT_256,(int16_t*)prachF,prach_ifft_tmp,1);
           log2_ifft_size = 8;
 
           // compute energy and accumulate over receive antennas and repetitions for BR
@@ -674,14 +668,12 @@ void rx_prach0(PHY_VARS_eNB *eNB,
 
       if (br_flag == 0) {
         LOG_M("rxsigF.m","prach_rxF",&rxsigF[0][0],12288,1,1);
-        LOG_M("prach_rxF_comp0.m","prach_rxF_comp0",prachF,1024,1,1);
         LOG_M("Xu.m","xu",Xu,N_ZC,1,1);
         LOG_M("prach_ifft0.m","prach_t0",prach_ifft,1024,1,1);
         LOG_M("SF2_3.m","sf2_3",&ru->common.rxdata[0][2*fp->samples_per_tti],2*fp->samples_per_tti,1,1);
       } else {
         LOG_E(PHY,"Dumping prach (br_flag %d), k = %d (n_ra_prb %d)\n",br_flag,k,n_ra_prb);
         LOG_M("rxsigF_br.m","prach_rxF_br",&rxsigF[0][0],12288,1,1);
-        LOG_M("prach_rxF_comp0_br.m","prach_rxF_comp0_br",prachF,1024,1,1);
         LOG_M("Xu_br.m","xu_br",Xu,N_ZC,1,1);
         LOG_M("prach_ifft0_br.m","prach_t0_br",prach_ifft,1024,1,1);
       }
diff --git a/openair1/PHY/NR_ESTIMATION/nr_measurements_gNB.c b/openair1/PHY/NR_ESTIMATION/nr_measurements_gNB.c
index c527c3fc9df083c115f095d1d655e1c35a81c98f..d8cf2fdc1b9f006a07c1faf0540bc0d76606f51b 100644
--- a/openair1/PHY/NR_ESTIMATION/nr_measurements_gNB.c
+++ b/openair1/PHY/NR_ESTIMATION/nr_measurements_gNB.c
@@ -67,7 +67,7 @@ int nr_est_timing_advance_pusch(PHY_VARS_gNB* gNB, int UE_id)
   return max_pos - sync_pos;
 }
 
-int nr_est_timing_advance_srs(const NR_DL_FRAME_PARMS *frame_parms,
+int nr_est_timing_advance_srs(const NR_DL_FRAME_PARMS *frame_parms, 
                               const int32_t srs_estimated_channel_time[][frame_parms->ofdm_symbol_size]) {
   int timing_advance = 0;
   int max_val = 0;
diff --git a/openair1/PHY/NR_ESTIMATION/nr_ul_channel_estimation.c b/openair1/PHY/NR_ESTIMATION/nr_ul_channel_estimation.c
index 6093116ec903fe474af62405e6f9a8edb25987eb..1d6ff4603f8f5a0b9edec915011db31f4c13cfed 100644
--- a/openair1/PHY/NR_ESTIMATION/nr_ul_channel_estimation.c
+++ b/openair1/PHY/NR_ESTIMATION/nr_ul_channel_estimation.c
@@ -77,6 +77,29 @@ void freq2time(uint16_t ofdm_symbol_size,
   }
 }
 
+
+__attribute__((always_inline)) inline c16_t c32x16cumulVectVectWithSteps(c16_t *in1,
+                                                                         int *offset1,
+                                                                         const int step1,
+                                                                         c16_t *in2,
+                                                                         int *offset2,
+                                                                         const int step2,
+                                                                         const int modulo2,
+                                                                         const int N) {
+
+  int localOffset1=*offset1;
+  int localOffset2=*offset2;
+  c32_t cumul={0}; 
+  for (int i=0; i<N; i++) {
+    cumul=c32x16maddShift(in1[localOffset1], in2[localOffset2], cumul, 15);
+    localOffset1+=step1;
+    localOffset2= (localOffset2 + step2) % modulo2;
+  }
+  *offset1=localOffset1;
+  *offset2=localOffset2;
+  return c16x32div(cumul, N);
+}
+
 int nr_pusch_channel_estimation(PHY_VARS_gNB *gNB,
                                 unsigned char Ns,
                                 unsigned short p,
@@ -84,86 +107,72 @@ int nr_pusch_channel_estimation(PHY_VARS_gNB *gNB,
                                 int ul_id,
                                 unsigned short bwp_start_subcarrier,
                                 nfapi_nr_pusch_pdu_t *pusch_pdu) {
-
-  int pilot[3280] __attribute__((aligned(16)));
-  unsigned char aarx;
-  unsigned short k0;
-  unsigned int pilot_cnt,re_cnt;
-  int16_t ch[2],ch_r[2],ch_l[2],*pil,*rxF,*ul_ch;
+  c16_t pilot[3280] __attribute__((aligned(16)));
   int16_t *fl,*fm,*fr,*fml,*fmr,*fmm,*fdcl,*fdcr,*fdclh,*fdcrh;
-  int ch_offset,symbol_offset ;
-  int32_t **ul_ch_estimates_time =  gNB->pusch_vars[ul_id]->ul_ch_estimates_time;
-  int chest_freq = gNB->chest_freq;
-  __m128i *ul_ch_128;
+
+  const int chest_freq = gNB->chest_freq;
 
 #ifdef DEBUG_CH
   FILE *debug_ch_est;
   debug_ch_est = fopen("debug_ch_est.txt","w");
 #endif
-
   //uint16_t Nid_cell = (eNB_offset == 0) ? gNB->frame_parms.Nid_cell : gNB->measurements.adj_cell_id[eNB_offset-1];
-
-  uint8_t nushift;
-  int **ul_ch_estimates  = gNB->pusch_vars[ul_id]->ul_ch_estimates;
-  int **rxdataF = gNB->common_vars.rxdataF;
-  int soffset = (Ns&3)*gNB->frame_parms.symbols_per_slot*gNB->frame_parms.ofdm_symbol_size;
-  nushift = (p>>1)&1;
+  c16_t **ul_ch_estimates  = (c16_t **) gNB->pusch_vars[ul_id]->ul_ch_estimates;
+  const int symbolSize = gNB->frame_parms.ofdm_symbol_size;
+  const int soffset = (Ns&3)*gNB->frame_parms.symbols_per_slot*symbolSize;
+  const int nushift = (p>>1)&1;
   gNB->frame_parms.nushift = nushift;
+  int ch_offset     = symbolSize*symbol;
+  const int symbol_offset = symbolSize*symbol;
 
-  ch_offset     = gNB->frame_parms.ofdm_symbol_size*symbol;
-
-  symbol_offset = gNB->frame_parms.ofdm_symbol_size*symbol;
-
-  k0 = bwp_start_subcarrier;
-  int re_offset;
-
-  uint16_t nb_rb_pusch = pusch_pdu->rb_size;
+  const int k0 = bwp_start_subcarrier;
+  const int nb_rb_pusch = pusch_pdu->rb_size;
 
   LOG_D(PHY, "In %s: ch_offset %d, soffset %d, symbol_offset %d, OFDM size %d, Ns = %d, k0 = %d, symbol %d\n",
         __FUNCTION__,
         ch_offset, soffset,
         symbol_offset,
-        gNB->frame_parms.ofdm_symbol_size,
+        symbolSize,
         Ns,
         k0,
         symbol);
 
   switch (nushift) {
-   case 0:
-         fl = filt8_l0;
-         fm = filt8_m0;
-         fr = filt8_r0;
-         fmm = filt8_mm0;
-         fml = filt8_m0;
-         fmr = filt8_mr0;
-         fdcl = filt8_dcl0;
-         fdcr = filt8_dcr0;
-         fdclh = filt8_dcl0_h;
-         fdcrh = filt8_dcr0_h;
-         break;
-
-   case 1:
-         fl = filt8_l1;
-         fm = filt8_m1;
-         fr = filt8_r1;
-         fmm = filt8_mm1;
-         fml = filt8_ml1;
-         fmr = filt8_mm1;
-         fdcl = filt8_dcl1;
-         fdcr = filt8_dcr1;
-         fdclh = filt8_dcl1_h;
-         fdcrh = filt8_dcr1_h;
-         break;
-
-   default:
+    case 0:
+      fl = filt8_l0;
+      fm = filt8_m0;
+      fr = filt8_r0;
+      fmm = filt8_mm0;
+      fml = filt8_m0;
+      fmr = filt8_mr0;
+      fdcl = filt8_dcl0;
+      fdcr = filt8_dcr0;
+      fdclh = filt8_dcl0_h;
+      fdcrh = filt8_dcr0_h;
+      break;
+
+    case 1:
+      fl = filt8_l1;
+      fm = filt8_m1;
+      fr = filt8_r1;
+      fmm = filt8_mm1;
+      fml = filt8_ml1;
+      fmr = filt8_mm1;
+      fdcl = filt8_dcl1;
+      fdcr = filt8_dcr1;
+      fdclh = filt8_dcl1_h;
+      fdcrh = filt8_dcr1_h;
+      break;
+
+    default:
 #ifdef DEBUG_CH
       if (debug_ch_est)
         fclose(debug_ch_est);
 
 #endif
-     return(-1);
-     break;
-   }
+      return(-1);
+      break;
+  }
 
   //------------------generate DMRS------------------//
 
@@ -171,656 +180,402 @@ int nr_pusch_channel_estimation(PHY_VARS_gNB *gNB,
     gNB->pusch_gold_init[pusch_pdu->scid] = pusch_pdu->ul_dmrs_scrambling_id;
     nr_gold_pusch(gNB, pusch_pdu->scid, pusch_pdu->ul_dmrs_scrambling_id);
   }
+
   if (pusch_pdu->transform_precoding == transformPrecoder_disabled) {
-    nr_pusch_dmrs_rx(gNB, Ns, gNB->nr_gold_pusch_dmrs[pusch_pdu->scid][Ns][symbol], &pilot[0], (1000+p), 0, nb_rb_pusch,
+    nr_pusch_dmrs_rx(gNB, Ns, gNB->nr_gold_pusch_dmrs[pusch_pdu->scid][Ns][symbol], (int32_t *)pilot, (1000+p), 0, nb_rb_pusch,
                      (pusch_pdu->bwp_start + pusch_pdu->rb_start)*NR_NB_SC_PER_RB, pusch_pdu->dmrs_config_type);
-  }
-  else {  // if transform precoding or SC-FDMA is enabled in Uplink
-
+  } else { // if transform precoding or SC-FDMA is enabled in Uplink
     // NR_SC_FDMA supports type1 DMRS so only 6 DMRS REs per RB possible
-    uint16_t index = get_index_for_dmrs_lowpapr_seq(nb_rb_pusch * (NR_NB_SC_PER_RB/2));
-    uint8_t u = pusch_pdu->dfts_ofdm.low_papr_group_number; 
-    uint8_t v = pusch_pdu->dfts_ofdm.low_papr_sequence_number;
+    const uint16_t index = get_index_for_dmrs_lowpapr_seq(nb_rb_pusch * (NR_NB_SC_PER_RB/2));
+    const uint8_t u = pusch_pdu->dfts_ofdm.low_papr_group_number;
+    const uint8_t v = pusch_pdu->dfts_ofdm.low_papr_sequence_number;
     int16_t *dmrs_seq = gNB_dmrs_lowpaprtype1_sequence[u][v][index];
-
     AssertFatal(index >= 0, "Num RBs not configured according to 3GPP 38.211 section 6.3.1.4. For PUSCH with transform precoding, num RBs cannot be multiple of any other primenumber other than 2,3,5\n");
     AssertFatal(dmrs_seq != NULL, "DMRS low PAPR seq not found, check if DMRS sequences are generated");
-
     LOG_D(PHY,"Transform Precoding params. u: %d, v: %d, index for dmrsseq: %d\n", u, v, index);
-    
-    nr_pusch_lowpaprtype1_dmrs_rx(gNB, Ns, dmrs_seq, &pilot[0], 1000, 0, nb_rb_pusch, 0, pusch_pdu->dmrs_config_type);    
-
-    #ifdef DEBUG_PUSCH
-      printf ("NR_UL_CHANNEL_EST: index %d, u %d,v %d\n", index, u, v);
-      LOG_M("gNb_DMRS_SEQ.m","gNb_DMRS_SEQ", dmrs_seq,6*nb_rb_pusch,1,1);
-    #endif
-
+    nr_pusch_lowpaprtype1_dmrs_rx(gNB, Ns, dmrs_seq, (int32_t *)pilot, 1000, 0, nb_rb_pusch, 0, pusch_pdu->dmrs_config_type);
+#ifdef DEBUG_PUSCH
+    printf ("NR_UL_CHANNEL_EST: index %d, u %d,v %d\n", index, u, v);
+    LOG_M("gNb_DMRS_SEQ.m","gNb_DMRS_SEQ", dmrs_seq,6*nb_rb_pusch,1,1);
+#endif
   }
   //------------------------------------------------//
 
-
 #ifdef DEBUG_PUSCH
+  
   for (int i = 0; i < (6 * nb_rb_pusch); i++) {
-    LOG_I(PHY, "In %s: %d + j*(%d)\n",
-      __FUNCTION__,
-      ((int16_t*)pilot)[2 * i],
-      ((int16_t*)pilot)[1 + (2 * i)]);
+    LOG_I(PHY, "In %s: %d + j*(%d)\n", __FUNCTION__, pilot[i].r,pilot[i].i);
   }
+  
 #endif
+  const uint8_t b_shift = pusch_pdu->nrOfLayers == 1;
+  
+  for (int aarx=0; aarx<gNB->frame_parms.nb_antennas_rx; aarx++) {
+    c16_t *rxdataF = (c16_t *)&gNB->common_vars.rxdataF[aarx][symbol_offset];
+    c16_t *ul_ch = &ul_ch_estimates[p*gNB->frame_parms.nb_antennas_rx+aarx][ch_offset];
 
-  uint8_t b_shift = pusch_pdu->nrOfLayers == 1;
-
-  for (aarx=0; aarx<gNB->frame_parms.nb_antennas_rx; aarx++) {
-
-    pil   = (int16_t *)&pilot[0];
-    rxF   = (int16_t *)&rxdataF[aarx][(soffset + symbol_offset + k0 + nushift)];
-    ul_ch = (int16_t *)&ul_ch_estimates[p*gNB->frame_parms.nb_antennas_rx+aarx][ch_offset];
-    re_offset = k0;
-
-    memset(ul_ch,0,4*(gNB->frame_parms.ofdm_symbol_size));
-
+    memset(ul_ch,0,sizeof(*ul_ch)*symbolSize);
 #ifdef DEBUG_PUSCH
     LOG_I(PHY, "In %s symbol_offset %d, nushift %d\n", __FUNCTION__, symbol_offset, nushift);
-    LOG_I(PHY, "In %s ch est pilot addr %p, N_RB_UL %d\n", __FUNCTION__, &pilot[0], gNB->frame_parms.N_RB_UL);
+    LOG_I(PHY, "In %s ch est pilot, N_RB_UL %d\n", __FUNCTION__, gNB->frame_parms.N_RB_UL);
     LOG_I(PHY, "In %s bwp_start_subcarrier %d, k0 %d, first_carrier %d, nb_rb_pusch %d\n", __FUNCTION__, bwp_start_subcarrier, k0, gNB->frame_parms.first_carrier_offset, nb_rb_pusch);
-    LOG_I(PHY, "In %s rxF addr %p p %d\n", __FUNCTION__, rxF, p);
     LOG_I(PHY, "In %s ul_ch addr %p nushift %d\n", __FUNCTION__, ul_ch, nushift);
 #endif
-
-    if (pusch_pdu->dmrs_config_type == pusch_dmrs_type1 && chest_freq == 0){
+    
+    if (pusch_pdu->dmrs_config_type == pusch_dmrs_type1 && chest_freq == 0) {
+      c16_t *pil   = pilot;    
+      int re_offset = k0;
       LOG_D(PHY,"PUSCH estimation DMRS type 1, Freq-domain interpolation");
-
       // For configuration type 1: k = 4*n + 2*k' + delta,
       // where k' is 0 or 1, and delta is in Table 6.4.1.1.3-1 from TS 38.211
-
-      pilot_cnt = 0;
+      int pilot_cnt = 0;
       int delta = nr_pusch_dmrs_delta(pusch_dmrs_type1, p);
-
+      
       for (int n = 0; n < 3*nb_rb_pusch; n++) {
-
         // LS estimation
-        ch[0] = 0;
-        ch[1] = 0;
+        c32_t ch = {0};
+        
         for (int k_line = 0; k_line <= 1; k_line++) {
-          re_offset = (k0 + (n << 2) + (k_line << 1) + delta) % gNB->frame_parms.ofdm_symbol_size;
-          rxF = (int16_t *) &rxdataF[aarx][(soffset + symbol_offset + re_offset)];
-          ch[0] += (int16_t) (((int32_t) pil[0] * rxF[0] - (int32_t) pil[1] * rxF[1]) >> (15+b_shift));
-          ch[1] += (int16_t) (((int32_t) pil[0] * rxF[1] + (int32_t) pil[1] * rxF[0]) >> (15+b_shift));
-          pil += 2;
+          re_offset = (k0 + (n << 2) + (k_line << 1) + delta) % symbolSize;
+          ch=c32x16maddShift(*pil,
+                             rxdataF[soffset + re_offset],
+                             ch,
+                             15+b_shift);
+          pil++;
         }
-
+        
+        c16_t ch16= {.r=(int16_t)ch.r, .i=(int16_t)ch.i};
+        
         // Channel interpolation
         for (int k_line = 0; k_line <= 1; k_line++) {
 #ifdef DEBUG_PUSCH
-          re_offset = (k0 + (n << 2) + (k_line << 1)) % gNB->frame_parms.ofdm_symbol_size;
-          rxF = (int16_t *) &rxdataF[aarx][(soffset + symbol_offset + re_offset)];
+          re_offset = (k0 + (n << 2) + (k_line << 1)) % symbolSize;
+          c16_t *rxF = &rxdataF[soffset + re_offset];
           printf("pilot %4u: pil -> (%6d,%6d), rxF -> (%4d,%4d), ch -> (%4d,%4d)\n",
-                 pilot_cnt, pil[0], pil[1], rxF[0], rxF[1], ch[0], ch[1]);
-          //printf("data %4u: rxF -> (%4d,%4d) (%2d)\n",pilot_cnt, rxF[2], rxF[3], dBc(rxF[2], rxF[3]));
+                 pilot_cnt, pil->r, pil->i, rxF->r, rxF->i, ch.r, ch.i);
 #endif
+          
           if (pilot_cnt == 0) {
-            multadd_real_vector_complex_scalar(fl, ch, ul_ch, 8);
+            c16multaddVectRealComplex(fl, &ch16, ul_ch, 8);
           } else if (pilot_cnt == 1) {
-            multadd_real_vector_complex_scalar(fml, ch, ul_ch, 8);
+            c16multaddVectRealComplex(fml, &ch16, ul_ch, 8);
           } else if (pilot_cnt == (6*nb_rb_pusch-2)) {
-            multadd_real_vector_complex_scalar(fmr, ch, ul_ch, 8);
-            ul_ch+=8;
+            c16multaddVectRealComplex(fmr, &ch16, ul_ch, 8);
+            ul_ch+=4;
           } else if (pilot_cnt == (6*nb_rb_pusch-1)) {
-            multadd_real_vector_complex_scalar(fr, ch, ul_ch, 8);
+            c16multaddVectRealComplex(fr, &ch16, ul_ch, 8);
           } else if (pilot_cnt%2 == 0) {
-            multadd_real_vector_complex_scalar(fmm, ch, ul_ch, 8);
-            ul_ch+=8;
+            c16multaddVectRealComplex(fmm, &ch16, ul_ch, 8);
+            ul_ch+=4;
           } else {
-            multadd_real_vector_complex_scalar(fm, ch, ul_ch, 8);
+            c16multaddVectRealComplex(fm, &ch16, ul_ch, 8);
           }
+          
           pilot_cnt++;
         }
       }
 
       // check if PRB crosses DC and improve estimates around DC
-      if ((bwp_start_subcarrier < gNB->frame_parms.ofdm_symbol_size) && (bwp_start_subcarrier+nb_rb_pusch*12 >= gNB->frame_parms.ofdm_symbol_size)) {
-        ul_ch = (int16_t *)&ul_ch_estimates[p*gNB->frame_parms.nb_antennas_rx+aarx][ch_offset];
-        uint16_t idxDC = 2*(gNB->frame_parms.ofdm_symbol_size - bwp_start_subcarrier);
-        uint16_t idxPil = idxDC/2;
+      if ((bwp_start_subcarrier < symbolSize) && (bwp_start_subcarrier+nb_rb_pusch*12 >= symbolSize)) {
+        ul_ch = &ul_ch_estimates[p*gNB->frame_parms.nb_antennas_rx+aarx][ch_offset];
+        const uint16_t idxDC = symbolSize - bwp_start_subcarrier;
         re_offset = k0;
-        pil = (int16_t *)&pilot[0];
-        pil += (idxPil-2);
-        ul_ch += (idxDC-4);
-        ul_ch = memset(ul_ch, 0, sizeof(int16_t)*10);
-        re_offset = (re_offset+idxDC/2-2) % gNB->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][soffset+(symbol_offset+nushift+re_offset)];
-        ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
-        ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
-
-        // for proper allignment of SIMD vectors
+        pil = pilot + idxDC / 2 - 1;
+        ul_ch += idxDC - 2 ;
+        ul_ch = memset(ul_ch, 0, sizeof(*ul_ch)*5);
+        re_offset = (re_offset+idxDC-1) % symbolSize;
+        const c16_t ch=c16mulShift(*pil,
+                                   rxdataF[soffset+nushift+re_offset],
+                                   15);
+
+        // for proper alignment of SIMD vectors
         if((gNB->frame_parms.N_RB_UL&1)==0) {
-
-          multadd_real_vector_complex_scalar(fdcl, ch, ul_ch-4, 8);
-        
-          pil += 4;
-          re_offset = (re_offset+4) % gNB->frame_parms.ofdm_symbol_size;
-          rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-          ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
-          ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
-        
-          multadd_real_vector_complex_scalar(fdcr, ch, ul_ch-4, 8);
-
+          c16multaddVectRealComplex(fdcl, &ch, ul_ch-2, 8);
+          pil += 2;
+          re_offset = (re_offset+4) % symbolSize;
+          const c16_t ch_tmp=c16mulShift(*pil,
+                                         rxdataF[nushift+re_offset],
+                                         15);
+          c16multaddVectRealComplex(fdcr, &ch_tmp, ul_ch-2, 8);
         } else {
-
-          multadd_real_vector_complex_scalar(fdclh, ch, ul_ch, 8);
-        
-          pil += 4;
-          re_offset = (re_offset+4) % gNB->frame_parms.ofdm_symbol_size;
-          rxF   = (int16_t *)&rxdataF[aarx][soffset+(symbol_offset+nushift+re_offset)];
-          ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
-          ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
-        
-          multadd_real_vector_complex_scalar(fdcrh, ch, ul_ch, 8);
+          c16multaddVectRealComplex(fdclh, &ch, ul_ch, 8);
+          pil += 2;
+          re_offset = (re_offset+4) % symbolSize;
+          const c16_t ch_tmp=c16mulShift(*pil,
+                                         rxdataF[soffset+nushift+re_offset],
+                                         15);
+          c16multaddVectRealComplex(fdcrh, &ch_tmp, ul_ch, 8);
         }
       }
 
 #ifdef DEBUG_PUSCH
-      ul_ch = (int16_t *)&ul_ch_estimates[p*gNB->frame_parms.nb_antennas_rx+aarx][ch_offset];
+      ul_ch = &ul_ch_estimates[p*gNB->frame_parms.nb_antennas_rx+aarx][ch_offset];
+
       for(uint16_t idxP=0; idxP<ceil((float)nb_rb_pusch*12/8); idxP++) {
         printf("(%3d)\t",idxP);
-        for(uint8_t idxI=0; idxI<16; idxI += 2) {
-          printf("%d\t%d\t",ul_ch[idxP*16+idxI],ul_ch[idxP*16+idxI+1]);
+
+        for(uint8_t idxI=0; idxI<8; idxI++) {
+          printf("%d\t%d\t",ul_ch[idxP*8+idxI].r,ul_ch[idxP*8+idxI].i);
         }
+
         printf("\n");
       }
-#endif    
-    }
-    else if (pusch_pdu->dmrs_config_type == pusch_dmrs_type2 && chest_freq == 0) { //pusch_dmrs_type2  |p_r,p_l,d,d,d,d,p_r,p_l,d,d,d,d|
+
+#endif
+    } else if (pusch_pdu->dmrs_config_type == pusch_dmrs_type2 && chest_freq == 0) { //pusch_dmrs_type2  |p_r,p_l,d,d,d,d,p_r,p_l,d,d,d,d|
       LOG_D(PHY,"PUSCH estimation DMRS type 2, Freq-domain interpolation");
+      c16_t *pil   = pilot;
+      int re_offset = k0;
       // Treat first DMRS specially (left edge)
-
-        rxF   = (int16_t *)&rxdataF[aarx][soffset+(symbol_offset+nushift+re_offset)];
-
-        ul_ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
-        ul_ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
-
-        pil += 2;
-        ul_ch += 2;
-        re_offset = (re_offset + 1)%gNB->frame_parms.ofdm_symbol_size;
+      *ul_ch=c16mulShift(*pil,
+                         rxdataF[soffset+nushift+re_offset],
+                         15);
+      pil++;
+      ul_ch++;
+      re_offset = (re_offset + 1)%symbolSize;
+      ch_offset++;
+
+      // TO verify: used after the loop, likely a piece of code is missing for ch_r
+      c16_t ch_r;
+      for (int re_cnt = 1; re_cnt < (nb_rb_pusch*NR_NB_SC_PER_RB) - 5; re_cnt += 6) {
+        c16_t ch_l=c16mulShift(*pil,
+                               rxdataF[soffset+nushift+re_offset],
+                               15);
+        *ul_ch = ch_l;
+        pil++;
+        ul_ch++;
         ch_offset++;
-
-        for (re_cnt = 1; re_cnt < (nb_rb_pusch*NR_NB_SC_PER_RB) - 5; re_cnt += 6){
-
-          rxF   = (int16_t *)&rxdataF[aarx][soffset+(symbol_offset+nushift+re_offset)];
-
-          ch_l[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
-          ch_l[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
-
-          ul_ch[0] = ch_l[0];
-          ul_ch[1] = ch_l[1];
-
-          pil += 2;
-          ul_ch += 2;
-          ch_offset++;
-
-          multadd_real_four_symbols_vector_complex_scalar(filt8_ml2,
-                                                          ch_l,
-                                                          ul_ch);
-
-          re_offset = (re_offset+5)%gNB->frame_parms.ofdm_symbol_size;
-
-          rxF   = (int16_t *)&rxdataF[aarx][soffset+symbol_offset+nushift+re_offset];
-
-          ch_r[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
-          ch_r[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
-
-
-          multadd_real_four_symbols_vector_complex_scalar(filt8_mr2,
-                                                          ch_r,
-                                                          ul_ch);
-
-          //for (int re_idx = 0; re_idx < 8; re_idx += 2)
-            //printf("ul_ch = %d + j*%d\n", ul_ch[re_idx], ul_ch[re_idx+1]);
-
-          ul_ch += 8;
-          ch_offset += 4;
-
-          ul_ch[0] = ch_r[0];
-          ul_ch[1] = ch_r[1];
-
-          pil += 2;
-          ul_ch += 2;
-          ch_offset++;
-          re_offset = (re_offset + 1)%gNB->frame_parms.ofdm_symbol_size;
-
-        }
-
-        // Treat last pilot specially (right edge)
-
-        rxF   = (int16_t *)&rxdataF[aarx][soffset+(symbol_offset+nushift+re_offset)];
-
-        ch_l[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
-        ch_l[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
-
-        ul_ch[0] = ch_l[0];
-        ul_ch[1] = ch_l[1];
-
-        ul_ch += 2;
-        ch_offset++;
-
-        multadd_real_four_symbols_vector_complex_scalar(filt8_rr1,
-                                                        ch_l,
-                                                        ul_ch);
-
-        multadd_real_four_symbols_vector_complex_scalar(filt8_rr2,
-                                                        ch_r,
+        multadd_real_four_symbols_vector_complex_scalar(filt8_ml2,
+                                                      &ch_l,
+                                                      ul_ch);
+        re_offset = (re_offset+5)%symbolSize;
+        ch_r=c16mulShift(*pil,
+                         rxdataF[soffset+nushift+re_offset],
+                         15);
+        multadd_real_four_symbols_vector_complex_scalar(filt8_mr2,
+                                                        &ch_r,
                                                         ul_ch);
-
-        ul_ch_128 = (__m128i *)&ul_ch_estimates[p*gNB->frame_parms.nb_antennas_rx+aarx][ch_offset];
-
-        ul_ch_128[0] = _mm_slli_epi16 (ul_ch_128[0], 2);
-    }
-
-    else if (pusch_pdu->dmrs_config_type == pusch_dmrs_type1) {// this is case without frequency-domain linear interpolation, just take average of LS channel estimates of 6 DMRS REs and use a common value for the whole PRB
-      LOG_D(PHY,"PUSCH estimation DMRS type 1, no Freq-domain interpolation");
-      int32_t ch_0, ch_1;
+        //for (int re_idx = 0; re_idx < 8; re_idx += 2)
+        //printf("ul_ch = %d + j*%d\n", ul_ch[re_idx], ul_ch[re_idx+1]);
+        ul_ch += 4;
+        ch_offset += 4;
+        *ul_ch = ch_r;
+        pil++;
+        ul_ch++;
+        ch_offset++;
+        re_offset = (re_offset + 1)%symbolSize;
+      }
+      
+      // Treat last pilot specially (right edge)
+      c16_t ch_l=c16mulShift(*pil,
+                             rxdataF[soffset+nushift+re_offset],
+                             15);
+      *ul_ch = ch_l;
+      ul_ch++;
+      ch_offset++;
+      multadd_real_four_symbols_vector_complex_scalar(filt8_rr1,
+                                                      &ch_l,
+                                                      ul_ch);
+      multadd_real_four_symbols_vector_complex_scalar(filt8_rr2,
+                                                      &ch_r,
+                                                      ul_ch);
+      __m128i *ul_ch_128 = (__m128i *)&ul_ch_estimates[p*gNB->frame_parms.nb_antennas_rx+aarx][ch_offset];
+      ul_ch_128[0] = _mm_slli_epi16 (ul_ch_128[0], 2);
+    } 
+
+    else if (pusch_pdu->dmrs_config_type == pusch_dmrs_type1) { // this is case without frequency-domain linear interpolation, just take average of LS channel estimates of 6 DMRS REs and use a common value for the whole PRB
+      LOG_D(PHY,"PUSCH estimation DMRS type 1, no Freq-domain interpolation\n");
+      c16_t *rxF   =  &rxdataF[soffset + nushift];
+      int pil_offset = 0;
+      int re_offset = k0;
+      c16_t ch;
+      
       // First PRB
-      ch_0 = ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 = ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch[0] = ch_0 / 6;
-      ch[1] = ch_1 / 6;
-
-
-
+      ch=c32x16cumulVectVectWithSteps(pilot, &pil_offset, 1, rxF, &re_offset, 2, symbolSize, 6);
 
 #if NO_INTERP
-      for (int i=0;i<12;i++) ((int32_t*)ul_ch)[i] = *(int32_t*)ch;
-      ul_ch+=24;
+      for (c16_t *end=ul_ch+12; ul_ch<end; ul_ch++)
+        *ul_ch=ch;
 #else
-      multadd_real_vector_complex_scalar(filt8_avlip0,
-                                         ch,
-                                         ul_ch,
-                                         8);
-
-      ul_ch += 16;
-      multadd_real_vector_complex_scalar(filt8_avlip1,
-                                         ch,
-                                         ul_ch,
-                                         8);
-
-      ul_ch += 16;
-      multadd_real_vector_complex_scalar(filt8_avlip2,
-                                         ch,
-                                         ul_ch,
-                                         8);
-      ul_ch -= 24;
+      c16multaddVectRealComplex(filt8_avlip0, ch, ul_ch, 8);
+      ul_ch += 8;
+      c16multaddVectRealComplex(filt8_avlip1, ch, ul_ch, 8);
+      ul_ch += 8;
+      c16multaddVectRealComplex(filt8_avlip2, ch, ul_ch, 8);
+      ul_ch -= 12;
 #endif
 
-      for (pilot_cnt=6; pilot_cnt<6*(nb_rb_pusch-1); pilot_cnt += 6) {
-
-        ch_0 = ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 = ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-        pil += 2;
-        re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-        ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-        pil += 2;
-        re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-        ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-        pil += 2;
-        re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-        ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-        pil += 2;
-        re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-        ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-        pil += 2;
-        re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-        ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-        pil += 2;
-        re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-        ch[0] = ch_0 / 6;
-        ch[1] = ch_1 / 6;
+      for (int pilot_cnt=6; pilot_cnt<6*(nb_rb_pusch-1); pilot_cnt += 6) {
+        ch=c32x16cumulVectVectWithSteps(pilot, &pil_offset, 1, rxF, &re_offset, 2, symbolSize, 6);
 
 #if NO_INTERP
-      for (int i=0;i<12;i++) ((int32_t*)ul_ch)[i] = *(int32_t*)ch;
-      ul_ch+=24;
+      for (c16_t *end=ul_ch+12; ul_ch<end; ul_ch++)
+          *ul_ch=ch;
 #else
-        ul_ch[6] += (ch[0] * 1365)>>15; // 1/12*16384
-        ul_ch[7] += (ch[1] * 1365)>>15; // 1/12*16384
+        ul_ch[3].r += (ch.r * 1365)>>15; // 1/12*16384
+        ul_ch[3].i += (ch.i * 1365)>>15; // 1/12*16384
+
+        ul_ch += 4;
+        multadd_real_vector_complex_scalar(filt8_avlip3, ch, ul_ch, 8);
 
         ul_ch += 8;
-        multadd_real_vector_complex_scalar(filt8_avlip3,
-                                           ch,
-                                           ul_ch,
-                                           8);
-
-        ul_ch += 16;
-        multadd_real_vector_complex_scalar(filt8_avlip4,
-                                           ch,
-                                           ul_ch,
-                                           8);
-
-        ul_ch += 16;
-        multadd_real_vector_complex_scalar(filt8_avlip5,
-                                           ch,
-                                           ul_ch,
-                                           8);
-        ul_ch -= 16;
+        multadd_real_vector_complex_scalar(filt8_avlip4, ch, ul_ch, 8);
+
+        ul_ch += 8;
+        multadd_real_vector_complex_scalar(filt8_avlip5, ch, ul_ch, 8);
+        ul_ch -= 8;
 #endif
       }
       // Last PRB
-      ch_0 = ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 = ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+2) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch[0] = ch_0 / 6;
-      ch[1] = ch_1 / 6;
-
+      ch=c32x16cumulVectVectWithSteps(pilot, &pil_offset, 1, rxF, &re_offset, 2, symbolSize, 6);
+      
 #if NO_INTERP
-      for (int i=0;i<12;i++) ((int32_t*)ul_ch)[i] = *(int32_t*)ch;
-      ul_ch+=24;
+      for (c16_t *end=ul_ch+12; ul_ch<end; ul_ch++)
+        *ul_ch=ch;
 #else
-      ul_ch[6] += (ch[0] * 1365)>>15; // 1/12*16384
-      ul_ch[7] += (ch[1] * 1365)>>15; // 1/12*16384
-
-      ul_ch += 8;
-      multadd_real_vector_complex_scalar(filt8_avlip3,
+      ul_ch[3].r += (ch.r * 1365)>>15; // 1/12*16384
+      ul_ch[3].i += (ch.i * 1365)>>15; // 1/12*16384
+      
+      ul_ch += 4;
+      c16multaddVectRealComplex(filt8_avlip3,
                                          ch,
                                          ul_ch,
                                          8);
-
-      ul_ch += 16;
-      multadd_real_vector_complex_scalar(filt8_avlip6,
+      
+      ul_ch += 8;
+      c16multaddVectRealComplex(filt8_avlip6,
                                          ch,
                                          ul_ch,
                                          8);
 #endif
-    }
-    else  { // this is case without frequency-domain linear interpolation, just take average of LS channel estimates of 4 DMRS REs and use a common value for the whole PRB
+    } else  { // this is case without frequency-domain linear interpolation, just take average of LS channel estimates of 4 DMRS REs and use a common value for the whole PRB
       LOG_D(PHY,"PUSCH estimation DMRS type 2, no Freq-domain interpolation");
-      int32_t ch_0, ch_1;
+      c16_t *pil   = pilot;
+      int re_offset = k0;
+      c32_t ch0={0};
       //First PRB
-      ch_0 = ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 = ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+1) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+5) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+1) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+5) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch[0] = ch_0 / 4;
-      ch[1] = ch_1 / 4;
-
+      ch0=c32x16mulShift(*pil,
+                         rxdataF[soffset + nushift + re_offset],
+                         15);
+      pil++;
+      re_offset = (re_offset+1) % symbolSize;
+      ch0=c32x16maddShift(*pil,
+                          rxdataF[nushift+re_offset],
+                          ch0,
+                          15);
+      pil++;
+      re_offset = (re_offset+5) % symbolSize;
+      ch0=c32x16maddShift(*pil,
+                          rxdataF[nushift+re_offset],
+                          ch0,
+                          15);
+      re_offset = (re_offset+1) % symbolSize;
+      ch0=c32x16maddShift(*pil,
+                          rxdataF[nushift+re_offset],
+                          ch0,
+                          15);
+      pil++;
+      re_offset = (re_offset+5) % symbolSize;
+
+      c16_t ch=c16x32div(ch0, 4);
 #if NO_INTERP
-      for (int i=0;i<12;i++) ((int32_t*)ul_ch)[i] = *(int32_t*)ch;
-      ul_ch+=24;
+      for (c16_t *end=ul_ch+12; ul_ch<end; ul_ch++)
+        *ul_ch=ch;
 #else
-      multadd_real_vector_complex_scalar(filt8_avlip0,
-                                         ch,
-                                         ul_ch,
-                                         8);
-
-      ul_ch += 16;
-      multadd_real_vector_complex_scalar(filt8_avlip1,
-                                         ch,
-                                         ul_ch,
-                                         8);
-
-      ul_ch += 16;
-      multadd_real_vector_complex_scalar(filt8_avlip2,
-                                         ch,
-                                         ul_ch,
-                                         8);
-      ul_ch -= 24;
+      c16multaddVectRealComplex(filt8_avlip0, &ch, ul_ch, 8);
+      ul_ch += 8;
+      c16multaddVectRealComplex(filt8_avlip1, &ch, ul_ch, 8);
+      ul_ch += 8;
+      c16multaddVectRealComplex(filt8_avlip2, &ch, ul_ch, 8);
+      ul_ch -= 12;
 #endif
 
-      for (pilot_cnt=4; pilot_cnt<4*(nb_rb_pusch-1); pilot_cnt += 4) {
-
-        ch_0 = ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 = ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-        pil += 2;
-        re_offset = (re_offset+1) % gNB->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-        ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
+      for (int pilot_cnt=4; pilot_cnt<4*(nb_rb_pusch-1); pilot_cnt += 4) {
+        c32_t ch0;
+        ch0=c32x16mulShift(*pil, rxdataF[nushift+re_offset], 15);
+        pil++;
+        re_offset = (re_offset+1) % symbolSize;
 
-        pil += 2;
-        re_offset = (re_offset+5) % gNB->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
+        ch0=c32x16maddShift(*pil, rxdataF[nushift+re_offset], ch0, 15);
+        pil++;
+        re_offset = (re_offset+5) % symbolSize;
 
-        ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
+        ch0=c32x16maddShift(*pil, rxdataF[nushift+re_offset], ch0, 15);
+        pil++;
+        re_offset = (re_offset+1) % symbolSize;
 
-        pil += 2;
-        re_offset = (re_offset+1) % gNB->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
+        ch0=c32x16maddShift(*pil, rxdataF[nushift+re_offset], ch0, 15);
+        pil++;
+        re_offset = (re_offset+5) % symbolSize;
 
-        ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-        pil += 2;
-        re_offset = (re_offset+5) % gNB->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-        ch[0] = ch_0 / 4;
-        ch[1] = ch_1 / 4;
+        ch=c16x32div(ch0, 4);
 
 #if NO_INTERP
-        for (int i=0;i<12;i++) ((int32_t*)ul_ch)[i] = *(int32_t*)ch;
-        ul_ch+=24;
+        for (c16_t *end=ul_ch+12; ul_ch<end; ul_ch++)
+          *ul_ch=ch;
 #else
-        ul_ch[6] += (ch[0] * 1365)>>15; // 1/12*16384
-        ul_ch[7] += (ch[1] * 1365)>>15; // 1/12*16384
-
+        ul_ch[3]=c16maddShift(ch,(c16_t) {1365,1365},15); // 1365 = 1/12*16384 (full range is +/- 32768)
+        ul_ch += 4;
+        c16multaddVectRealComplex(filt8_avlip3, &ch, ul_ch, 8);
         ul_ch += 8;
-        multadd_real_vector_complex_scalar(filt8_avlip3,
-                                           ch,
-                                           ul_ch,
-                                           8);
-
-        ul_ch += 16;
-        multadd_real_vector_complex_scalar(filt8_avlip4,
-                                           ch,
-                                           ul_ch,
-                                           8);
-
-        ul_ch += 16;
-        multadd_real_vector_complex_scalar(filt8_avlip5,
-                                           ch,
-                                           ul_ch,
-                                           8);
-        ul_ch -= 16;
+        c16multaddVectRealComplex(filt8_avlip4, &ch, ul_ch, 8);
+        ul_ch += 8;
+        c16multaddVectRealComplex(filt8_avlip5, &ch, ul_ch, 8);
+        ul_ch -= 8;
 #endif
       }
-      // Last PRB
-      ch_0 = ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 = ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+1) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+5) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+1) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-
-      pil += 2;
-      re_offset = (re_offset+5) % gNB->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-
-      ch[0] = ch_0 / 4;
-      ch[1] = ch_1 / 4;
 
+      // Last PRB
+      ch0=c32x16mulShift(*pil, rxdataF[nushift+re_offset], 15);
+      pil++;
+      re_offset = (re_offset+1) % symbolSize;
+      
+      ch0=c32x16maddShift(*pil, rxdataF[nushift+re_offset], ch0, 15);
+      pil++;
+      re_offset = (re_offset+5) % symbolSize;
+      
+      ch0=c32x16maddShift(*pil, rxdataF[nushift+re_offset], ch0, 15);
+      pil++;
+      re_offset = (re_offset+1) % symbolSize;
+      
+      ch0=c32x16maddShift(*pil, rxdataF[nushift+re_offset], ch0, 15);
+      pil++;
+      re_offset = (re_offset+5) % symbolSize;
+      
+      ch=c16x32div(ch0, 4);
 #if NO_INTERP
-      for (int i=0;i<12;i++) ((int32_t*)ul_ch)[i] = *(int32_t*)ch;
-      ul_ch+=24;
+      for (c16_t *end=ul_ch+12; ul_ch<end; ul_ch++)
+          *ul_ch=ch;
 #else
-      ul_ch[6] += (ch[0] * 1365)>>15; // 1/12*16384
-      ul_ch[7] += (ch[1] * 1365)>>15; // 1/12*16384
-
+      ul_ch[3]=c16maddShift(ch, c16_t {1365,1365},15);// 1365 = 1/12*16384 (full range is +/- 32768)
+      ul_ch += 4;
+      c16multaddVectRealComplex(filt8_avlip3, &ch, ul_ch, 8);
       ul_ch += 8;
-      multadd_real_vector_complex_scalar(filt8_avlip3,
-                                         ch,
-                                         ul_ch,
-                                         8);
-
-      ul_ch += 16;
-      multadd_real_vector_complex_scalar(filt8_avlip6,
-                                         ch,
-                                         ul_ch,
-                                         8);
+      c16multaddVectRealComplex(filt8_avlip6, &ch, ul_ch, 8);
 #endif
     }
+    
 #ifdef DEBUG_PUSCH
-    ul_ch = (int16_t *)&ul_ch_estimates[p*gNB->frame_parms.nb_antennas_rx+aarx][ch_offset];
-    for(uint16_t idxP=0; idxP<ceil((float)nb_rb_pusch*12/8); idxP++) {
-      for(uint8_t idxI=0; idxI<16; idxI += 2) {
-        printf("%d\t%d\t",ul_ch[idxP*16+idxI],ul_ch[idxP*16+idxI+1]);
+    ul_ch = &ul_ch_estimates[p*gNB->frame_parms.nb_antennas_rx+aarx][ch_offset];
+
+    for(int idxP=0; idxP<ceil((float)nb_rb_pusch*12/8); idxP++) {
+      for(int idxI=0; idxI<8; idxI++) {
+        printf("%d\t%d\t",ul_ch[idxP*8+idxI].r,ul_ch[idxP*8+idxI].i);
       }
+
       printf("%d\n",idxP);
     }
-#endif
 
+#endif
     // Convert to time domain
-    freq2time(gNB->frame_parms.ofdm_symbol_size,
-              (int16_t*) &ul_ch_estimates[aarx][symbol_offset],
-              (int16_t*) ul_ch_estimates_time[aarx]);
+    freq2time(symbolSize,
+              (int16_t *) &ul_ch_estimates[aarx][symbol_offset],
+              (int16_t *) gNB->pusch_vars[ul_id]->ul_ch_estimates_time[aarx]);
   }
 
 #ifdef DEBUG_CH
   fclose(debug_ch_est);
 #endif
-
   return(0);
 }
 
@@ -852,12 +607,10 @@ void nr_pusch_ptrs_processing(PHY_VARS_gNB *gNB,
                               uint8_t ulsch_id,
                               uint8_t nr_tti_rx,
                               unsigned char symbol,
-                              uint32_t nb_re_pusch)
-{
+                              uint32_t nb_re_pusch) {
   //#define DEBUG_UL_PTRS 1
   int32_t *ptrs_re_symbol   = NULL;
   int8_t   ret = 0;
-
   uint8_t  symbInSlot       = rel15_ul->start_symbol_index + rel15_ul->nr_of_symbols;
   uint8_t *startSymbIndex   = &rel15_ul->start_symbol_index;
   uint8_t *nbSymb           = &rel15_ul->nr_of_symbols;
@@ -869,6 +622,7 @@ void nr_pusch_ptrs_processing(PHY_VARS_gNB *gNB,
   uint8_t  *dmrsConfigType  = &rel15_ul->dmrs_config_type;
   uint16_t *nb_rb           = &rel15_ul->rb_size;
   uint8_t  *ptrsReOffset    = &rel15_ul->pusch_ptrs.ptrs_ports_list[0].ptrs_re_offset;
+
   /* loop over antennas */
   for (int aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {
     c16_t *phase_per_symbol = (c16_t*)gNB->pusch_vars[ulsch_id]->ptrs_phase_per_slot[aarx];
@@ -895,8 +649,10 @@ void nr_pusch_ptrs_processing(PHY_VARS_gNB *gNB,
                         1<< *L_ptrs,
                         *dmrsSymbPos);
     }
+
     /* if not PTRS symbol set current ptrs symbol index to zero*/
     *ptrsSymbIdx = 0;
+
     /* Check if current symbol contains PTRS */
     if(is_ptrs_symbol(symbol, *ptrsSymbPos)) {
       *ptrsSymbIdx = symbol;
@@ -907,11 +663,12 @@ void nr_pusch_ptrs_processing(PHY_VARS_gNB *gNB,
                              rel15_ul->rnti,
                              nr_tti_rx,
                              symbol,frame_parms->ofdm_symbol_size,
-                             (int16_t*)&gNB->pusch_vars[ulsch_id]->rxdataF_comp[aarx][(symbol * nb_re_pusch)],
+                             (int16_t *)&gNB->pusch_vars[ulsch_id]->rxdataF_comp[aarx][(symbol * nb_re_pusch)],
                              gNB->nr_gold_pusch_dmrs[rel15_ul->scid][nr_tti_rx][symbol],
                              (int16_t*)&phase_per_symbol[symbol],
                              ptrs_re_symbol);
     }
+
     /* For last OFDM symbol at each antenna perform interpolation and compensation for the slot*/
     if(symbol == (symbInSlot -1)) {
       /*------------------------------------------------------------------------------------------------------- */
@@ -924,16 +681,18 @@ void nr_pusch_ptrs_processing(PHY_VARS_gNB *gNB,
           LOG_W(PHY,"[PTRS] Compensation is skipped due to error in PTRS slot processing !!\n");
         }
       }
+
 #ifdef DEBUG_UL_PTRS
       LOG_M("ptrsEstUl.m","est",gNB->pusch_vars[ulsch_id]->ptrs_phase_per_slot[aarx],frame_parms->symbols_per_slot,1,1 );
       LOG_M("rxdataF_bf_ptrs_comp_ul.m","bf_ptrs_cmp",
             &gNB->pusch_vars[0]->rxdataF_comp[aarx][rel15_ul->start_symbol_index * NR_NB_SC_PER_RB * rel15_ul->rb_size],
             rel15_ul->nr_of_symbols * NR_NB_SC_PER_RB * rel15_ul->rb_size,1,1);
 #endif
+
       /*------------------------------------------------------------------------------------------------------- */
       /* 3) Compensated DMRS based estimated signal with PTRS estimation                                        */
       /*--------------------------------------------------------------------------------------------------------*/
-      for(uint8_t i = *startSymbIndex; i< symbInSlot ;i++) {
+      for(uint8_t i = *startSymbIndex; i< symbInSlot ; i++) {
         /* DMRS Symbol has 0 phase so no need to rotate the respective symbol */
         /* Skip rotation if the slot processing is wrong */
         if((!is_dmrs_symbol(i,*dmrsSymbPos)) && (ret == 0)) {
@@ -957,6 +716,7 @@ uint32_t calc_power(const int16_t *x, const uint32_t size) {
     sum_x = sum_x + x[k];
     sum_x2 = sum_x2 + x[k]*x[k];
   }
+
   return sum_x2/size - (sum_x/size)*(sum_x/size);
 }
 
@@ -1252,7 +1012,7 @@ int nr_srs_channel_estimation(const PHY_VARS_gNB *gNB,
   *noise_power = max(calc_power(noise_real,frame_parms->nb_antennas_rx*N_ap*M_sc_b_SRS)
                      + calc_power(noise_imag,frame_parms->nb_antennas_rx*N_ap*M_sc_b_SRS), 1);
 
-  *snr = dB_fixed((int32_t)((*signal_power<<factor_bits)/(*noise_power))) - factor_dB;
+    *snr = dB_fixed((int32_t)((*signal_power<<factor_bits)/(*noise_power))) - factor_dB;
 
 #ifdef SRS_DEBUG
   LOG_I(NR_PHY,"noise_power = %u, SNR = %i dB\n", *noise_power, *snr);
diff --git a/openair1/PHY/NR_TRANSPORT/srs_rx.c b/openair1/PHY/NR_TRANSPORT/srs_rx.c
index 54264b70e0a07599df7646da16d6a015fa371d41..ba95b9ac4499bb69610ef26c2a76b946f8110df6 100644
--- a/openair1/PHY/NR_TRANSPORT/srs_rx.c
+++ b/openair1/PHY/NR_TRANSPORT/srs_rx.c
@@ -183,4 +183,4 @@ int nr_get_srs_signal(PHY_VARS_gNB *gNB,
   } else {
     return 0;
   }
-}
\ No newline at end of file
+}
diff --git a/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_demodulation.c b/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_demodulation.c
index 6728f0260c093cdaacee71d757bbcee9c5c0e5d7..88cc419dc166a555752c819cae73512c94e098bb 100644
--- a/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_demodulation.c
+++ b/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_demodulation.c
@@ -480,45 +480,45 @@ int nr_rx_pdsch(PHY_VARS_NR_UE *ue,
     nbSymb = dlsch1_harq->nb_symbols;
     pduBitmap = dlsch1_harq->pduBitmap;
   }
-
+  
   /* Check for PTRS bitmap and process it respectively */
   if((pduBitmap & 0x1) && (type == PDSCH)) {
     nr_pdsch_ptrs_processing(ue,
-			     pdsch_vars,
-			     frame_parms,
-			     dlsch0_harq,
-			     dlsch1_harq,
-			     gNB_id,
-			     nr_slot_rx,
-			     symbol,
-			     (nb_rb_pdsch*12),
-			     dlsch[0]->rnti,rx_type);
+                             pdsch_vars,
+                             frame_parms,
+                             dlsch0_harq,
+                             dlsch1_harq,
+                             gNB_id,
+                             nr_slot_rx,
+                             symbol,
+                             (nb_rb_pdsch*12),
+                             dlsch[0]->rnti,rx_type);
     pdsch_vars[gNB_id]->dl_valid_re[symbol-1] -= pdsch_vars[gNB_id]->ptrs_re_per_slot[0][symbol];
   }
-
+  
   /* at last symbol in a slot calculate LLR's for whole slot */
   if(symbol == (startSymbIdx + nbSymb -1)) {
     for(uint8_t i =startSymbIdx; i < (startSymbIdx+nbSymb);i++) {
       /* re evaluating the first symbol flag as LLR's are done in symbol loop  */
       if(i == startSymbIdx && i < 3) {
-	first_symbol_flag =1;
+        first_symbol_flag =1;
       }
       else {
-	first_symbol_flag=0;
+        first_symbol_flag=0;
       }
       /* Calculate LLR's for each symbol */
       nr_dlsch_llr(pdsch_vars, frame_parms,
-		   rxdataF_comp_ptr, dl_ch_mag_ptr,
-		   dlsch0_harq, dlsch1_harq,
-		   rx_type, harq_pid,
-		   gNB_id, gNB_id_i,
-		   first_symbol_flag,
-		   i, nb_rb_pdsch,
-		   codeword_TB0, codeword_TB1,
-		   pdsch_vars[gNB_id]->dl_valid_re[i-1],
-		   nr_slot_rx, beamforming_mode);
+                   rxdataF_comp_ptr, dl_ch_mag_ptr,
+                   dlsch0_harq, dlsch1_harq,
+                   rx_type, harq_pid,
+                   gNB_id, gNB_id_i,
+                   first_symbol_flag,
+                   i, nb_rb_pdsch,
+                   codeword_TB0, codeword_TB1,
+                   pdsch_vars[gNB_id]->dl_valid_re[i-1],
+                   nr_slot_rx, beamforming_mode);
     }
-
+    
     int dmrs_type = dlsch[0]->harq_processes[harq_pid]->dmrsConfigType;
     uint8_t nb_re_dmrs;
     uint16_t dmrs_len = get_num_dmrs(dlsch[0]->harq_processes[harq_pid]->dlDmrsSymbPos);
@@ -528,29 +528,29 @@ int nr_rx_pdsch(PHY_VARS_NR_UE *ue,
       nb_re_dmrs = 4*dlsch[0]->harq_processes[harq_pid]->n_dmrs_cdm_groups;
     }
     dlsch[0]->harq_processes[harq_pid]->G = nr_get_G(dlsch[0]->harq_processes[harq_pid]->nb_rb,
-						     dlsch[0]->harq_processes[harq_pid]->nb_symbols,
-						     nb_re_dmrs,
-						     dmrs_len,
-						     dlsch[0]->harq_processes[harq_pid]->Qm,
-						     dlsch[0]->harq_processes[harq_pid]->Nl);
+                                                     dlsch[0]->harq_processes[harq_pid]->nb_symbols,
+                                                     nb_re_dmrs,
+                                                     dmrs_len,
+                                                     dlsch[0]->harq_processes[harq_pid]->Qm,
+                                                     dlsch[0]->harq_processes[harq_pid]->Nl);
     nr_dlsch_layer_demapping(pdsch_vars[gNB_id]->llr,
-			     dlsch[0]->harq_processes[harq_pid]->Nl,
-			     dlsch[0]->harq_processes[harq_pid]->Qm,
-			     dlsch[0]->harq_processes[harq_pid]->G,
-			     codeword_TB0,
-			     codeword_TB1,
-			     pdsch_vars[gNB_id]->layer_llr);
+                             dlsch[0]->harq_processes[harq_pid]->Nl,
+                             dlsch[0]->harq_processes[harq_pid]->Qm,
+                             dlsch[0]->harq_processes[harq_pid]->G,
+                             codeword_TB0,
+                             codeword_TB1,
+                             pdsch_vars[gNB_id]->layer_llr);    
   }
-
+  
   stop_meas(&ue->generic_stat_bis[proc->thread_id][slot]);
   if (cpumeas(CPUMEAS_GETSTATE))
     LOG_D(PHY, "[AbsSFN %u.%d] Slot%d Symbol %d: LLR Computation  %5.2f \n",frame,nr_slot_rx,slot,symbol,ue->generic_stat_bis[proc->thread_id][slot].p_time/(cpuf*1000.0));
-
+  
   // Please keep it: useful for debugging
 #ifdef DEBUG_PDSCH_RX
   char filename[50];
   uint8_t aa = 0;
-
+  
   snprintf(filename, 50, "rxdataF0_symb_%d_nr_slot_rx_%d.m", symbol, nr_slot_rx);
   write_output(filename, "rxdataF0", &common_vars->common_vars_rx_data_per_thread[proc->thread_id].rxdataF[0][0], NR_SYMBOLS_PER_SLOT*frame_parms->ofdm_symbol_size, 1, 1);
 
diff --git a/openair1/PHY/TOOLS/8bit_rxdemux.c b/openair1/PHY/TOOLS/8bit_rxdemux.c
deleted file mode 100644
index 030be47b5daf1a04b524de31f36eb93f314a8e4d..0000000000000000000000000000000000000000
--- a/openair1/PHY/TOOLS/8bit_rxdemux.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The OpenAirInterface Software Alliance licenses this file to You under
- * the OAI Public License, Version 1.1  (the "License"); you may not use this file
- * except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.openairinterface.org/?page_id=698
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *-------------------------------------------------------------------------------
- * For more information about the OpenAirInterface (OAI) Software Alliance:
- *      contact@openairinterface.org
- */
-
-#include "PHY/types.h"
-#include "PHY/defs.h"
-#include "PHY/extern.h"
-
-
-void bit8_rxdemux(int length,int offset)
-{
-
-  int i;
-  short *rx1_ptr =  (short *)&PHY_vars->rx_vars[1].RX_DMA_BUFFER[offset];
-  short *rx0_ptr =  (short *)&PHY_vars->rx_vars[0].RX_DMA_BUFFER[offset];
-  char  *rx0_ptr2 = (char *)(&PHY_vars->rx_vars[0].RX_DMA_BUFFER[offset]);
-  //  short tmp;
-  short r0,i0,r1,i1;
-
-  //  printf("demuxing: %d,%d\n",length,offset);
-
-  //  printf("%x %x\n",PHY_vars->chsch_data[0].CHSCH_f_sync[0],    PHY_vars->chsch_data[0].CHSCH_f_sync[1]);
-
-  for (i=0; i<length; i++) {
-
-
-
-
-    r0= (short)(rx0_ptr2[i<<2]);        // Re 0
-
-    i0= (short)(rx0_ptr2[(i<<2)+1]);  // Im 0
-
-    r1= (short)(rx0_ptr2[(i<<2)+2]);    // Re 1
-
-    i1= (short)(rx0_ptr2[(i<<2)+3]);  // Im 1
-
-    rx0_ptr[(i<<1)] = r0;
-    rx0_ptr[(i<<1)+1] =i0;
-    rx1_ptr[i<<1] =r1;
-    rx1_ptr[(i<<1)+1] =i1;
-  }
-
-  //  printf("%x %x\n",PHY_vars->chsch_data[0].CHSCH_f_sync[0],    PHY_vars->chsch_data[0].CHSCH_f_sync[1]);
-
-}
diff --git a/openair1/PHY/TOOLS/8bit_txmux.c b/openair1/PHY/TOOLS/8bit_txmux.c
deleted file mode 100644
index b103bc9c5cd9b49a4d0df7411da79410c518250c..0000000000000000000000000000000000000000
--- a/openair1/PHY/TOOLS/8bit_txmux.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The OpenAirInterface Software Alliance licenses this file to You under
- * the OAI Public License, Version 1.1  (the "License"); you may not use this file
- * except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.openairinterface.org/?page_id=698
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *-------------------------------------------------------------------------------
- * For more information about the OpenAirInterface (OAI) Software Alliance:
- *      contact@openairinterface.org
- */
-
-#include "PHY/types.h"
-#include "PHY/defs.h"
-#include "PHY/extern.h"
-
-
-void bit8_txmux(int length,int offset)
-{
-
-  int i;
-  short  *dest,*dest2;
-
-
-
-
-  for (i=0; i<length; i++) {
-
-    dest = (short *)&PHY_vars->tx_vars[0].TX_DMA_BUFFER[i+offset];
-    dest2 =   (short *)&PHY_vars->tx_vars[1].TX_DMA_BUFFER[i+offset];
-
-    ((char *)dest)[0] = (char)(dest[0]>>BIT8_TX_SHIFT);
-    ((char *)dest)[1] = (char)(dest[1]>>BIT8_TX_SHIFT);
-    ((char *)dest)[2] = (char)(dest2[0]>>BIT8_TX_SHIFT);
-    ((char *)dest)[3] = (char)(dest2[1]>>BIT8_TX_SHIFT);
-  }
-
-
-}
diff --git a/openair1/PHY/TOOLS/cadd_sv.c b/openair1/PHY/TOOLS/cadd_sv.c
deleted file mode 100644
index 56dcb718478c25ad843bee62eabfa535e6c11d72..0000000000000000000000000000000000000000
--- a/openair1/PHY/TOOLS/cadd_sv.c
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The OpenAirInterface Software Alliance licenses this file to You under
- * the OAI Public License, Version 1.1  (the "License"); you may not use this file
- * except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.openairinterface.org/?page_id=698
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *-------------------------------------------------------------------------------
- * For more information about the OpenAirInterface (OAI) Software Alliance:
- *      contact@openairinterface.org
- */
-
-#include "defs.h"
-
-static  __m128i alpha_128 __attribute__ ((aligned(16)));
-static  __m128i shift     __attribute__ ((aligned(16)));
-
-int add_cpx_vector(short *x,
-                   short *alpha,
-                   short *y,
-                   unsigned int N)
-{
-  unsigned int i;                 // loop counter
-
-  __m128i *x_128;
-  __m128i *y_128;
-
-  x_128 = (__m128i *)&x[0];
-  y_128 = (__m128i *)&y[0];
-
-  alpha_128 = _mm_set1_epi32(*((int*)alpha));
-
-  // we compute 4 cpx multiply for each loop
-  for(i=0; i<(N>>3); i++) {
-    y_128[0] = _mm_adds_epi16(alpha_128, x_128[0]);
-    y_128[1] = _mm_adds_epi16(alpha_128, x_128[1]);
-    y_128[2] = _mm_adds_epi16(alpha_128, x_128[2]);
-    y_128[3] = _mm_adds_epi16(alpha_128, x_128[3]);
-
-
-    x_128+=4;
-    y_128 +=4;
-
-  }
-
-  return (0);
-}
-
-int add_vector32_scalar(short *x,
-                        int alpha,
-                        short *y,
-                        unsigned int N)
-{
-  unsigned int i;                 // loop counter
-
-  __m128i *x_128;
-  __m128i *y_128;
-
-  x_128 = (__m128i *)&x[0];
-  y_128 = (__m128i *)&y[0];
-
-  alpha_128 = _mm_setr_epi32(alpha,0,alpha,0);
-
-  // we compute 4 cpx multiply for each loop
-  for(i=0; i<(N>>3); i++) {
-    y_128[0] = _mm_add_epi32(alpha_128, x_128[0]);
-    y_128[1] = _mm_add_epi32(alpha_128, x_128[1]);
-    y_128[2] = _mm_add_epi32(alpha_128, x_128[2]);
-    y_128[3] = _mm_add_epi32(alpha_128, x_128[3]);
-
-
-    x_128+=4;
-    y_128 +=4;
-
-  }
-
-  return (0);
-}
-
-
-int add_real_vector64_scalar(short *x,
-                             long long int a,
-                             short *y,
-                             unsigned int N)
-{
-  unsigned int i;                 // loop counter
-
-  __m128i *x_128;
-  __m128i *y_128;
-
-  x_128 = (__m128i *)&x[0];
-  y_128 = (__m128i *)&y[0];
-
-  alpha_128 = _mm_set1_epi64((__m64) a);
-
-  // we compute 4 cpx multiply for each loop
-  for(i=0; i<(N>>3); i++) {
-    y_128[0] = _mm_add_epi64(alpha_128, x_128[0]);
-    y_128[1] = _mm_add_epi64(alpha_128, x_128[1]);
-    y_128[2] = _mm_add_epi64(alpha_128, x_128[2]);
-    y_128[3] = _mm_add_epi64(alpha_128, x_128[3]);
-
-
-    x_128+=4;
-    y_128+=4;
-
-  }
-
-  return(0);
-}
-
-
-#ifdef MAIN
-#include <stdio.h>
-
-main ()
-{
-
-  short input[256] __attribute__((aligned(16)));
-  short output[256] __attribute__((aligned(16)));
-
-  int i;
-  c16_t alpha;
-
-  Zero_Buffer(output,256*2);
-
-  input[0] = 100;
-  input[1] = 200;
-  input[2] = 100;
-  input[3] = 200;
-  input[4] = 1234;
-  input[5] = -1234;
-  input[6] = 1234;
-  input[7] = -1234;
-  input[8] = 100;
-  input[9] = 200;
-  input[10] = 100;
-  input[11] = 200;
-  input[12] = 1000;
-  input[13] = 2000;
-  input[14] = 1000;
-  input[15] = 2000;
-
-  alpha.r = 10;
-  alpha.i = -10;
-
-  add_cpx_vector(input,(short*) &alpha,input,8);
-
-  for (i=0; i<16; i+=2)
-    printf("output[%d] = %d + %d i\n",i,input[i],input[i+1]);
-
-}
-
-#endif //MAIN
diff --git a/openair1/PHY/TOOLS/cadd_vv.c b/openair1/PHY/TOOLS/cadd_vv.c
index 08bf514cb357b2f032b3922810b0440f2d29cfb0..88a2bd98140a71b6c73a3273d01e6c980ab53dd1 100644
--- a/openair1/PHY/TOOLS/cadd_vv.c
+++ b/openair1/PHY/TOOLS/cadd_vv.c
@@ -23,153 +23,6 @@
 #include "tools_defs.h"
 
 
-int add_vector16(short *x,
-                 short *y,
-                 short *z,
-                 unsigned int N)
-{
-  unsigned int i;                 // loop counter
-
-  __m128i *x_128;
-  __m128i *y_128;
-  __m128i *z_128;
-
-  x_128 = (__m128i *)&x[0];
-  y_128 = (__m128i *)&y[0];
-  z_128 = (__m128i *)&z[0];
-
-
-  for(i=0; i<(N>>5); i++) {
-    /*
-    printf("i = %d\n",i);
-    print_shorts(x_128[0],"x[0]=");
-    print_shorts(y_128[0],"y[0]=");
-    */
-
-    z_128[0] = _mm_adds_epi16(x_128[0],y_128[0]);
-    /*
-    print_shorts(z_128[0],"z[0]=");
-
-    print_shorts(x_128[1],"x[1]=");
-    print_shorts(y_128[1],"y[1]=");
-    */
-
-    z_128[1] = _mm_adds_epi16(x_128[1],y_128[1]);
-    /*
-    print_shorts(z_128[1],"z[1]=");
-
-    print_shorts(x_128[2],"x[2]=");
-    print_shorts(y_128[2],"y[2]=");
-    */
-
-    z_128[2] = _mm_adds_epi16(x_128[2],y_128[2]);
-    /*
-    print_shorts(z_128[2],"z[2]=");
-
-    print_shorts(x_128[3],"x[3]=");
-    print_shorts(y_128[3],"y[3]=");
-    */
-
-    z_128[3] = _mm_adds_epi16(x_128[3],y_128[3]);
-    /*
-    print_shorts(z_128[3],"z[3]=");
-    */
-
-
-
-
-
-    x_128 +=4;
-    y_128 +=4;
-    z_128 +=4;
-
-  }
-
-  _mm_empty();
-  _m_empty();
-
-  return(0);
-}
-
-/*
-print_shorts64(__m64 x,char *s) {
-
-  printf("%s ",s);
-  printf("%d %d %d %d\n",((short *)&x)[0],((short *)&x)[1],((short *)&x)[2],((short *)&x)[3]);
-
-}
-*/
-
-int add_vector16_64(short *x,
-                    short *y,
-                    short *z,
-                    unsigned int N)
-{
-  unsigned int i;                 // loop counter
-
-  __m64 *x_64;
-  __m64 *y_64;
-  __m64 *z_64;
-
-  x_64 = (__m64 *)&x[0];
-  y_64 = (__m64 *)&y[0];
-  z_64 = (__m64 *)&z[0];
-
-
-  for(i=0; i<(N>>2); i++) {
-    /*
-    printf("i = %d\n",i);
-    print_shorts64(x_64[i],"x[i]=");
-    print_shorts64(y_64[i],"y[i]=");
-    */
-
-    z_64[i] = _mm_adds_pi16(x_64[i],y_64[i]);
-    /*
-    print_shorts64(z_64[i],"z[i]=");
-    */
-
-
-  }
-
-  _mm_empty();
-  _m_empty();
-  return(0);
-}
-
-int add_cpx_vector32(short *x,
-                     short *y,
-                     short *z,
-                     unsigned int N)
-{
-  unsigned int i;                 // loop counter
-
-  __m128i *x_128;
-  __m128i *y_128;
-  __m128i *z_128;
-
-  x_128 = (__m128i *)&x[0];
-  y_128 = (__m128i *)&y[0];
-  z_128 = (__m128i *)&z[0];
-
-
-  for(i=0; i<(N>>3); i++) {
-    z_128[0] = _mm_add_epi32(x_128[0],y_128[0]);
-    z_128[1] = _mm_add_epi32(x_128[1],y_128[1]);
-    z_128[2] = _mm_add_epi32(x_128[2],y_128[2]);
-    z_128[3] = _mm_add_epi32(x_128[3],y_128[3]);
-
-
-    x_128+=4;
-    y_128 +=4;
-    z_128 +=4;
-
-  }
-
-  _mm_empty();
-  _m_empty();
-  return(0);
-}
-
 int32_t sub_cpx_vector16(int16_t *x,
                      int16_t *y,
                      int16_t *z,
@@ -201,73 +54,6 @@ int32_t sub_cpx_vector16(int16_t *x,
 
 
 
-int add_real_vector64(short *x,
-                      short* y,
-                      short *z,
-                      unsigned int N)
-{
-  unsigned int i;                 // loop counter
-
-  __m128i *x_128;
-  __m128i *y_128;
-  __m128i *z_128;
-
-  x_128 = (__m128i *)&x[0];
-  y_128 = (__m128i *)&y[0];
-  z_128 = (__m128i *)&z[0];
-
-  for(i=0; i<(N>>3); i++) {
-    z_128[0] = _mm_add_epi64(x_128[0], y_128[0]);
-    z_128[1] = _mm_add_epi64(x_128[1], y_128[1]);
-    z_128[2] = _mm_add_epi64(x_128[2], y_128[2]);
-    z_128[3] = _mm_add_epi64(x_128[3], y_128[3]);
-
-
-    x_128+=4;
-    y_128+=4;
-    z_128+=4;
-
-  }
-
-  _mm_empty();
-  _m_empty();
-  return(0);
-}
-
-int sub_real_vector64(short *x,
-                      short* y,
-                      short *z,
-                      unsigned int N)
-{
-  unsigned int i;                 // loop counter
-
-  __m128i *x_128;
-  __m128i *y_128;
-  __m128i *z_128;
-
-  x_128 = (__m128i *)&x[0];
-  y_128 = (__m128i *)&y[0];
-  z_128 = (__m128i *)&z[0];
-
-  for(i=0; i<(N>>3); i++) {
-    z_128[0] = _mm_sub_epi64(x_128[0], y_128[0]);
-    z_128[1] = _mm_sub_epi64(x_128[1], y_128[1]);
-    z_128[2] = _mm_sub_epi64(x_128[2], y_128[2]);
-    z_128[3] = _mm_sub_epi64(x_128[3], y_128[3]);
-
-
-    x_128+=4;
-    y_128+=4;
-    z_128+=4;
-
-  }
-
-  _mm_empty();
-  _m_empty();
-  return(0);
-}
-
-
 #ifdef MAIN
 #include <stdio.h>
 
@@ -280,7 +66,7 @@ main ()
   int i;
   c16_t alpha;
 
-  Zero_Buffer(output,256*2);
+  memset(output,0,256*2);
 
   input[0] = 100;
   input[1] = 200;
diff --git a/openair1/PHY/TOOLS/cmult_sv.c b/openair1/PHY/TOOLS/cmult_sv.c
index fb2f4d589e5e43d87d7fa8ab0bd4410adb88bf1c..a596446a44ab2609afdda1ca05db823347892014 100644
--- a/openair1/PHY/TOOLS/cmult_sv.c
+++ b/openair1/PHY/TOOLS/cmult_sv.c
@@ -22,28 +22,6 @@
 #include "PHY/sse_intrin.h"
 #include "tools_defs.h"
 
-#if defined(__x86_64__) || defined(__i386__)
-#define simd_q15_t __m128i
-#define simdshort_q15_t __m64
-#define shiftright_int16(a,shift) _mm_srai_epi16(a,shift)
-#define set1_int16(a) _mm_set1_epi16(a)
-#define mulhi_int16(a,b) _mm_slli_epi16(_mm_mulhi_epi16(a,b),1)
-#define mulhi_s1_int16(a,b) _mm_slli_epi16(_mm_mulhi_epi16(a,b),2)
-#define adds_int16(a,b) _mm_adds_epi16(a,b)
-#define mullo_int16(a,b) _mm_mullo_epi16(a,b)
-#elif defined(__arm__)
-#define simd_q15_t int16x8_t
-#define simdshort_q15_t int16x4_t
-#define shiftright_int16(a,shift) vshrq_n_s16(a,shift)
-#define set1_int16(a) vdupq_n_s16(a)
-#define mulhi_int16(a,b) vqdmulhq_s16(a,b)
-#define mulhi_s1_int16(a,b) vshlq_n_s16(vqdmulhq_s16(a,b),1)
-#define adds_int16(a,b) vqaddq_s16(a,b)
-#define mullo_int16(a,b) vmulq_s16(a,b)
-#define _mm_empty() 
-#define _m_empty()
-#endif
-
 
 void multadd_complex_vector_real_scalar(int16_t *x,
                                         int16_t alpha,
@@ -111,217 +89,138 @@ void multadd_real_vector_complex_scalar(int16_t *x,
   }
 }
 
-void multadd_real_four_symbols_vector_complex_scalar(int16_t *x,
-                                                     int16_t *alpha,
-                                                     int16_t *y)
-{
-
-  // do 8 multiplications at a time
-  simd_q15_t alpha_r_128,alpha_i_128,yr,yi,*x_128=(simd_q15_t*)x;
-  simd_q15_t y_128;
-  y_128 = _mm_loadu_si128((simd_q15_t*)y);
-
-  alpha_r_128 = set1_int16(alpha[0]);
-  alpha_i_128 = set1_int16(alpha[1]);
-
-
-  yr     = mulhi_s1_int16(alpha_r_128,x_128[0]);
-  yi     = mulhi_s1_int16(alpha_i_128,x_128[0]);
-  y_128   = _mm_adds_epi16(y_128,_mm_unpacklo_epi16(yr,yi));
-  y_128   = _mm_adds_epi16(y_128,_mm_unpackhi_epi16(yr,yi));
-
-  _mm_storeu_si128((simd_q15_t*)y, y_128);
-
-  _mm_empty();
-  _m_empty();
 
-}
-#ifdef __AVX2__
 void rotate_cpx_vector(c16_t *x,
-                      c16_t *alpha,
-                      c16_t *y,
-                      uint32_t N,
-                      uint16_t output_shift)
+                       c16_t *alpha,
+                       c16_t *y,
+                       uint32_t N,
+                       uint16_t output_shift)
 {
-  // Multiply elementwise two complex vectors of N elements
-  // x        - input 1    in the format  |Re0  Im0 |,......,|Re(N-1) Im(N-1)|
-  //            We assume x1 with a dynamic of 15 bit maximum
-  //
-  // alpha      - input 2    in the format  |Re0 Im0|
-  //            We assume x2 with a dynamic of 15 bit maximum
-  //
-  // y        - output     in the format  |Re0  Im0|,......,|Re(N-1) Im(N-1)|
-  //
-  // N        - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
-  //
-  // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0)
-  //            WARNING: log2_amp>0 can cause overflow!!
-
-  uint32_t i;                 // loop counter
-
-
-  simd_q15_t *y_128,alpha_128;
-  int32_t *xd=(int32_t *)x; 
+  // multiply a complex vector with a complex value (alpha)
+  // stores result in y
+  // N is the number of complex numbers
+  // output_shift reduces the result of the multiplication by this number of bits
+  //AssertFatal(N%8==0, "To be developped");
+#ifdef __AVX2__
+  if ( (intptr_t)x%32 == 0  && !(intptr_t)y%32 == 0 && __builtin_cpu_supports("avx2")) {
+    // output is 32 bytes aligned, but not the input
+    
+    const c16_t for_re={alpha->r, -alpha->i};
+    __m256i const alpha_for_real =  _mm256_set1_epi32(*(uint32_t*)&for_re);
+    const c16_t for_im={alpha->i, alpha->r};
+    __m256i const alpha_for_im= _mm256_set1_epi32(*(uint32_t*)&for_im);
+    __m256i const perm_mask =
+      _mm256_set_epi8(31,30,23,22,29,28,21,20,27,26,19,18,25,24,17,16,
+                      15,14,7,6,13,12,5,4,11,10,3,2,9,8,1,0);
+    __m256i* xd= (__m256i*)x;
+    const __m256i *end=xd+N/8;
+    for( __m256i* yd = (__m256i *)y; xd<end ; yd++, xd++) {
+      const __m256i xre = _mm256_srai_epi32(_mm256_madd_epi16(*xd,alpha_for_real),
+                                            output_shift);
+      const __m256i xim = _mm256_srai_epi32(_mm256_madd_epi16(*xd,alpha_for_im),
+                                            output_shift);
+      // a bit faster than unpacklo+unpackhi+packs
+      const __m256i tmp=_mm256_packs_epi32(xre,xim);
+      *yd=_mm256_shuffle_epi8(tmp,perm_mask);
+    }
+    c16_t* alpha16=(c16_t*) alpha, *yLast;
+    yLast=((c16_t*)y)+(N/8)*8;
+    for (c16_t* xTail=(c16_t*) end;
+         xTail<((c16_t*) x)+N;
+         xTail++, yLast++) {
+      *yLast=c16mulShift(*xTail,*alpha16,output_shift);
+    }
+  } else {
+#endif
+    // Multiply elementwise two complex vectors of N elements
+    // x        - input 1    in the format  |Re0  Im0 |,......,|Re(N-1) Im(N-1)|
+    //            We assume x1 with a dynamic of 15 bit maximum
+    //
+    // alpha      - input 2    in the format  |Re0 Im0|
+    //            We assume x2 with a dynamic of 15 bit maximum
+    //
+    // y        - output     in the format  |Re0  Im0|,......,|Re(N-1) Im(N-1)|
+    //
+    // N        - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
+    //
+    // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0)
+    //            WARNING: log2_amp>0 can cause overflow!!
+
+    uint32_t i;                 // loop counter
+
+
+    simd_q15_t *y_128,alpha_128;
+    int32_t *xd=(int32_t *)x; 
 
 #if defined(__x86_64__) || defined(__i386__)
-  __m128i shift = _mm_cvtsi32_si128(output_shift);
-  register simd_q15_t m0,m1,m2,m3;
-
-  ((int16_t *)&alpha_128)[0] = alpha->r;
-  ((int16_t *)&alpha_128)[1] = -alpha->i;
-  ((int16_t *)&alpha_128)[2] = alpha->i;
-  ((int16_t *)&alpha_128)[3] = alpha->r;
-  ((int16_t *)&alpha_128)[4] = alpha->r;
-  ((int16_t *)&alpha_128)[5] = -alpha->i;
-  ((int16_t *)&alpha_128)[6] = alpha->i;
-  ((int16_t *)&alpha_128)[7] = alpha->r;
+    __m128i shift = _mm_cvtsi32_si128(output_shift);
+    register simd_q15_t m0,m1,m2,m3;
+
+    ((int16_t *)&alpha_128)[0] = alpha->r;
+    ((int16_t *)&alpha_128)[1] = -alpha->i;
+    ((int16_t *)&alpha_128)[2] = alpha->i;
+    ((int16_t *)&alpha_128)[3] = alpha->r;
+    ((int16_t *)&alpha_128)[4] = alpha->r;
+    ((int16_t *)&alpha_128)[5] = -alpha->i;
+    ((int16_t *)&alpha_128)[6] = alpha->i;
+    ((int16_t *)&alpha_128)[7] = alpha->r;
 #elif defined(__arm__)
-  int32x4_t shift;
-  int32x4_t ab_re0,ab_re1,ab_im0,ab_im1,re32,im32;
-  int16_t reflip[8]  __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1};
-  int32x4x2_t xtmp;
-
-  ((int16_t *)&alpha_128)[0] = alpha->r;
-  ((int16_t *)&alpha_128)[1] = alpha->i;
-  ((int16_t *)&alpha_128)[2] = alpha->r;
-  ((int16_t *)&alpha_128)[3] = alpha->i;
-  ((int16_t *)&alpha_128)[4] = alpha->r;
-  ((int16_t *)&alpha_128)[5] = alpha->i;
-  ((int16_t *)&alpha_128)[6] = alpha->r;
-  ((int16_t *)&alpha_128)[7] = alpha->i;
-  int16x8_t bflip = vrev32q_s16(alpha_128);
-  int16x8_t bconj = vmulq_s16(alpha_128,*(int16x8_t *)reflip);
-  shift = vdupq_n_s32(-output_shift);
+    int32x4_t shift;
+    int32x4_t ab_re0,ab_re1,ab_im0,ab_im1,re32,im32;
+    int16_t reflip[8]  __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1};
+    int32x4x2_t xtmp;
+
+    ((int16_t *)&alpha_128)[0] = alpha->r;
+    ((int16_t *)&alpha_128)[1] = alpha->i;
+    ((int16_t *)&alpha_128)[2] = alpha->r;
+    ((int16_t *)&alpha_128)[3] = alpha->i;
+    ((int16_t *)&alpha_128)[4] = alpha->r;
+    ((int16_t *)&alpha_128)[5] = alpha->i;
+    ((int16_t *)&alpha_128)[6] = alpha->r;
+    ((int16_t *)&alpha_128)[7] = alpha->i;
+    int16x8_t bflip = vrev32q_s16(alpha_128);
+    int16x8_t bconj = vmulq_s16(alpha_128,*(int16x8_t *)reflip);
+    shift = vdupq_n_s32(-output_shift);
 #endif
-  y_128 = (simd_q15_t *) y;
+    y_128 = (simd_q15_t *) y;
 
 
-  for(i=0; i<N>>2; i++) {
+    for(i=0; i<N>>2; i++) {
 #if defined(__x86_64__) || defined(__i386__)
-    m0 = _mm_setr_epi32(xd[0],xd[0],xd[1],xd[1]);
-    m1 = _mm_setr_epi32(xd[2],xd[2],xd[3],xd[3]);
-    m2 = _mm_madd_epi16(m0,alpha_128); //complex multiply. result is 32bit [Re Im Re Im]
-    m3 = _mm_madd_epi16(m1,alpha_128); //complex multiply. result is 32bit [Re Im Re Im]
-    m2 = _mm_sra_epi32(m2,shift);        // shift right by shift in order to  compensate for the input amplitude
-    m3 = _mm_sra_epi32(m3,shift);        // shift right by shift in order to  compensate for the input amplitude
-
-    y_128[0] = _mm_packs_epi32(m2,m3);        // pack in 16bit integers with saturation [re im re im re im re im]
-    //print_ints("y_128[0]=", &y_128[0]);
+      m0 = _mm_setr_epi32(xd[0],xd[0],xd[1],xd[1]);
+      m1 = _mm_setr_epi32(xd[2],xd[2],xd[3],xd[3]);
+      m2 = _mm_madd_epi16(m0,alpha_128); //complex multiply. result is 32bit [Re Im Re Im]
+      m3 = _mm_madd_epi16(m1,alpha_128); //complex multiply. result is 32bit [Re Im Re Im]
+      m2 = _mm_sra_epi32(m2,shift);        // shift right by shift in order to  compensate for the input amplitude
+      m3 = _mm_sra_epi32(m3,shift);        // shift right by shift in order to  compensate for the input amplitude
+
+      y_128[0] = _mm_packs_epi32(m2,m3);        // pack in 16bit integers with saturation [re im re im re im re im]
+      //print_ints("y_128[0]=", &y_128[0]);
 #elif defined(__arm__)
 
-  ab_re0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bconj)[0]);
-  ab_re1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bconj)[1]);
-  ab_im0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bflip)[0]);
-  ab_im1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bflip)[1]);
-  re32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_re0)[0],((int32x2_t*)&ab_re0)[1]),
-                                vpadd_s32(((int32x2_t*)&ab_re1)[0],((int32x2_t*)&ab_re1)[1])),
-                   shift);
-  im32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_im0)[0],((int32x2_t*)&ab_im0)[1]),
-                                vpadd_s32(((int32x2_t*)&ab_im1)[0],((int32x2_t*)&ab_im1)[1])),
-                   shift);
-
-  xtmp = vzipq_s32(re32,im32);
+      ab_re0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bconj)[0]);
+      ab_re1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bconj)[1]);
+      ab_im0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bflip)[0]);
+      ab_im1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bflip)[1]);
+      re32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_re0)[0],((int32x2_t*)&ab_re0)[1]),
+                                    vpadd_s32(((int32x2_t*)&ab_re1)[0],((int32x2_t*)&ab_re1)[1])),
+                       shift);
+      im32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_im0)[0],((int32x2_t*)&ab_im0)[1]),
+                                    vpadd_s32(((int32x2_t*)&ab_im1)[0],((int32x2_t*)&ab_im1)[1])),
+                       shift);
+
+      xtmp = vzipq_s32(re32,im32);
   
-  y_128[0] = vcombine_s16(vmovn_s32(xtmp.val[0]),vmovn_s32(xtmp.val[1]));
-
-#endif
+      y_128[0] = vcombine_s16(vmovn_s32(xtmp.val[0]),vmovn_s32(xtmp.val[1]));
 
-    xd+=4;
-    y_128+=1;
-  }
-
-
-  _mm_empty();
-  _m_empty();
-
-  return;
-}
 #endif
-/*
-int mult_vector32_scalar(int16_t *x1,
-                         int x2,
-                         int16_t *y,
-                         uint32_t N)
 
-{
-  // Multiply elementwise two real vectors of N elements
-  // x1       - input 1    in the format  |Re(0)  xxx  Re(1) xxx|,......,|Re(N-2) xxx Re(N-1) xxx|
-  //            We assume x1 with a dinamic of 31 bit maximum
-  //
-  // x1       - input 2
-  //
-  // y        - output     in the format  |Re0 (64bit) |,......,|Re(N-1) (64bit)|
-  //
-  // N        - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
-  //
-
-  uint32_t i;                 // loop counter
-
-  __m128i *x1_128;
-  __m128i x2_128;
-  __m128i *y_128;
-
-
-  x1_128 = (__m128i *)&x1[0];
-  x2_128 = _mm_setr_epi32(x2,0,x2,0);
-  y_128 = (__m128i *)&y[0];
-
-
-  // we compute 4 cpx multiply for each loop
-  for(i=0; i<(N>>3); i++) {
-    y_128[0] = _mm_mul_epu32(x1_128[0],x2_128);
-    y_128[1] = _mm_mul_epu32(x1_128[1],x2_128);
-    y_128[2] = _mm_mul_epu32(x1_128[2],x2_128);
-    y_128[3] = _mm_mul_epu32(x1_128[3],x2_128);
-
-
-    x1_128+=4;
-    y_128 +=4;
-  }
-
-
-  _mm_empty();
-  _m_empty();
-
-  return(0);
-}
-*/
-
-int complex_conjugate(int16_t *x1,
-                      int16_t *y,
-                      uint32_t N)
-
-{
-  uint32_t i;                 // loop counter
-
-  simd_q15_t *x1_128;
-  simd_q15_t *y_128;
-  int16_t x2[8] __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1}; 
-  simd_q15_t *x2_128 = (simd_q15_t*)&x2[0];
-  x1_128 = (simd_q15_t *)&x1[0];
-  y_128 = (simd_q15_t *)&y[0];
-
-
-  // we compute 4 cpx multiply for each loop
-  for(i=0; i<(N>>3); i++) {
-    y_128[0] = mullo_int16(x1_128[0],*x2_128);
-    y_128[1] = mullo_int16(x1_128[1],*x2_128);
-    y_128[2] = mullo_int16(x1_128[2],*x2_128);
-    y_128[3] = mullo_int16(x1_128[3],*x2_128);
-
-
-    x1_128+=4;
-    y_128 +=4;
+      xd+=4;
+      y_128+=1;
+    }
   }
-
-
-  _mm_empty();
-  _m_empty();
-
-  return(0);
 }
 
-
 #ifdef MAIN
 #define L 8
 
diff --git a/openair1/PHY/TOOLS/memory_routines.c b/openair1/PHY/TOOLS/memory_routines.c
deleted file mode 100644
index cef9928988bd79cd809b0108810d65f3c74fa262..0000000000000000000000000000000000000000
--- a/openair1/PHY/TOOLS/memory_routines.c
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The OpenAirInterface Software Alliance licenses this file to You under
- * the OAI Public License, Version 1.1  (the "License"); you may not use this file
- * except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.openairinterface.org/?page_id=698
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *-------------------------------------------------------------------------------
- * For more information about the OpenAirInterface (OAI) Software Alliance:
- *      contact@openairinterface.org
- */
-
-#include "PHY/sse_intrin.h"
-
-void Zero_Buffer(void *buf,unsigned int length)
-{
-  // zeroes the mmx_t buffer 'buf' starting from buf[0] to buf[length-1] in bytes
-  int i;
-  register __m64 mm0;
-  __m64 *mbuf = (__m64 *)buf;
-
-  //  length>>=3;                    // put length in quadwords
-  mm0 = _m_pxor(mm0,mm0);         // clear the register
-
-  for(i=0; i<length>>3; i++)     // for each i
-    mbuf[i] = mm0;                // put 0 in buf[i]
-
-  _mm_empty();
-}
-
-void mmxcopy(void *dest,void *src,int size)
-{
-
-  // copy size bytes from src to dest
-  register int i;
-  register __m64 mm0;
-  __m64 *mmsrc = (__m64 *)src, *mmdest= (__m64 *)dest;
-
-
-
-  for (i=0; i<size>>3; i++) {
-    mm0 = mmsrc[i];
-    mmdest[i] = mm0;
-  }
-
-  _mm_empty();
-}
-
-void Zero_Buffer_nommx(void *buf,unsigned int length)
-{
-
-  int i;
-
-  for (i=0; i<length>>2; i++)
-    ((int *)buf)[i] = 0;
-
-}
-
diff --git a/openair1/PHY/TOOLS/oai_dfts.c b/openair1/PHY/TOOLS/oai_dfts.c
index c2b6a55210822fd4e066adf7888d4463309f164d..cbedfd815a47e7001762c3377c17eb933a9e8dc5 100644
--- a/openair1/PHY/TOOLS/oai_dfts.c
+++ b/openair1/PHY/TOOLS/oai_dfts.c
@@ -2575,7 +2575,6 @@ const static int16_t tw64c[96] __attribute__((aligned(32))) = {
 #define simd_q15_t __m128i
 #define simdshort_q15_t __m64
 #define shiftright_int16(a,shift) _mm_srai_epi16(a,shift)
-#define set1_int16(a) _mm_set1_epi16(a);
 #define mulhi_int16(a,b) _mm_mulhrs_epi16 (a,b)
 #ifdef __AVX2__
 #define simd256_q15_t __m256i
diff --git a/openair1/PHY/TOOLS/tools_defs.h b/openair1/PHY/TOOLS/tools_defs.h
index c471978f461bb9387cac167b5e5ff8f308d8c5b1..6b81575feeb30458bf428c35e8b2b73d9ff9ad83 100644
--- a/openair1/PHY/TOOLS/tools_defs.h
+++ b/openair1/PHY/TOOLS/tools_defs.h
@@ -29,46 +29,199 @@
 
 */
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 #include <stdio.h>
 #include <stdint.h>
 #include <assert.h>
 #include "PHY/sse_intrin.h"
 #include "common/utils/assertions.h"
 
+#if defined(__x86_64__) || defined(__i386__)
+#define simd_q15_t __m128i
+#define simdshort_q15_t __m64
+#define shiftright_int16(a,shift) _mm_srai_epi16(a,shift)
+#define set1_int16(a) _mm_set1_epi16(a)
+#define mulhi_int16(a,b) _mm_mulhrs_epi16 (a,b)
+#define mulhi_s1_int16(a,b) _mm_slli_epi16(_mm_mulhi_epi16(a,b),2)
+#define adds_int16(a,b) _mm_adds_epi16(a,b)
+#define mullo_int16(a,b) _mm_mullo_epi16(a,b)
+#elif defined(__arm__)
+#define simd_q15_t int16x8_t
+#define simdshort_q15_t int16x4_t
+#define shiftright_int16(a,shift) vshrq_n_s16(a,shift)
+#define set1_int16(a) vdupq_n_s16(a)
+#define mulhi_int16(a,b) vqdmulhq_s16(a,b)
+#define mulhi_s1_int16(a,b) vshlq_n_s16(vqdmulhq_s16(a,b),1)
+#define adds_int16(a,b) vqaddq_s16(a,b)
+#define mullo_int16(a,b) vmulq_s16(a,b)
+#define _mm_empty() 
+#define _m_empty()
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define CEILIDIV(a,b) ((a+b-1)/b)
 #define ROUNDIDIV(a,b) (((a<<1)+b)/(b<<1))
 
-typedef struct complexd {
-  double r;
-  double i;
-} cd_t;
-
-typedef struct complexf {
-  float r;
-  float i;
-} cf_t;
-
-typedef struct complex16 {
-  int16_t r;
-  int16_t i;
-} c16_t;
-
-typedef struct complex32 {
-  int32_t r;
-  int32_t i;
-} c32_t;
+  typedef struct complexd {
+    double r;
+    double i;
+  } cd_t;
+
+  typedef struct complexf {
+    float r;
+    float i;
+  } cf_t;
+
+  typedef struct complex16 {
+    int16_t r;
+    int16_t i;
+  } c16_t;
+
+  typedef struct complex32 {
+    int32_t r;
+    int32_t i;
+  } c32_t;
+
+  typedef struct complex64 {
+    int64_t r;
+    int64_t i;
+  } c64_t;
+
+#define squaredMod(a) ((a).r*(a).r + (a).i*(a).i)
+#define csum(res, i1, i2) (res).r = (i1).r + (i2).r ; (res).i = (i1).i + (i2).i
+
+  __attribute__((always_inline)) inline c16_t c16mulShift(const c16_t a, const c16_t b, const int Shift) {
+    return (c16_t) {
+      .r = (int16_t)((a.r * b.r - a.i * b.i) >> Shift),
+      .i = (int16_t)((a.r * b.i + a.i * b.r) >> Shift)
+    };
+  }
   
-typedef struct complex64 {
-  int64_t r;
-  int64_t i;
-} c64_t;
+  __attribute__((always_inline)) inline c16_t c16divShift(const c16_t a, const c16_t b, const int Shift) {
+    return (c16_t) {
+      .r = (int16_t)((a.r * b.r + a.i * b.i) >> Shift),
+      .i = (int16_t)((a.r * b.i - a.i * b.r) >> Shift)
+    };
+  }
+  
+  __attribute__((always_inline)) inline c16_t c16maddShift(const c16_t a, const c16_t b, c16_t c, const int Shift) {
+    return (c16_t) {
+      .r = (int16_t)(((a.r * b.r - a.i * b.i ) >> Shift) + c.r),
+      .i = (int16_t)(((a.r * b.i + a.i * b.r ) >> Shift) + c.i)
+    };
+  }
+
+  __attribute__((always_inline)) inline c32_t c32x16mulShift(const c16_t a, const c16_t b, const int Shift) {
+    return (c32_t) {
+      .r = (a.r * b.r - a.i * b.i) >> Shift,
+      .i = (a.r * b.i + a.i * b.r) >> Shift
+    };
+  }
+
+  __attribute__((always_inline)) inline c32_t c32x16maddShift(const c16_t a, const c16_t b, const c32_t c, const int Shift) {
+    return (c32_t) {
+      .r = ((a.r * b.r - a.i * b.i) >> Shift) + c.r,
+      .i = ((a.r * b.i + a.i * b.r) >> Shift) + c.i
+    };
+  }
+
+  __attribute__((always_inline)) inline c16_t c16x32div(const c32_t a, const int div) {
+    return (c16_t) {
+      .r = (int16_t)(a.r / div),
+      .i = (int16_t)(a.i / div)
+    };
+  }
+
+
+  // On N complex numbers
+  //   y.r += (x * alpha.r) >> 14 
+  //   y.i += (x * alpha.i) >> 14 
+  // See regular C implementation at the end
+  __attribute__((always_inline)) inline void c16multaddVectRealComplex(const int16_t *x,
+                                                                       const c16_t *alpha,
+                                                                       c16_t *y,
+                                                                       const int N) {
+#ifdef __AVX2__
+    const int8_t makePairs[32] __attribute__((aligned(32)))={
+      0,1,0+16,1+16,
+      2,3,2+16,3+16,
+      4,5,4+16,5+16,
+      6,7,6+16,7+16,
+      8,9,8+16,9+16,
+      10,11,10+16,11+16,
+      12,13,12+16,13+16,
+      14,15,14+16,15+16};
+    
+    __m256i alpha256= _mm256_set1_epi32(*(int32_t *)alpha);
+    __m128i *x128=(__m128i *)x;
+    __m128i *y128=(__m128i *)y;
+    AssertFatal(N%8==0,"Not implemented\n");
+    for (int i=0; i<N/8; i++) {
+      const __m256i xduplicate=_mm256_broadcastsi128_si256(*x128);
+      const __m256i x_duplicate_ordered=_mm256_shuffle_epi8(xduplicate,*(__m256i*)makePairs);
+      const __m256i x_mul_alpha_shift15 =_mm256_mulhrs_epi16(alpha256, x_duplicate_ordered);
+      // Existing multiplication normalization is weird, constant table in alpha need to be doubled
+      const __m256i x_mul_alpha_x2= _mm256_adds_epi16(x_mul_alpha_shift15,x_mul_alpha_shift15);
+      *y128= _mm_adds_epi16(_mm256_extracti128_si256(x_mul_alpha_x2,0),*y128);
+      y128++;
+      *y128= _mm_adds_epi16(_mm256_extracti128_si256(x_mul_alpha_x2,1),*y128);
+      y128++;
+      x128++;
+    } 
+    
+#elif defined(__x86_64__) || defined(__i386__) ||  defined(__arm__)
+    uint32_t i;
+    
+    // do 8 multiplications at a time
+    simd_q15_t alpha_r_128,alpha_i_128,yr,yi,*x_128=(simd_q15_t*)x,*y_128=(simd_q15_t*)y;
+    int j;
+
+    //  printf("alpha = %d,%d\n",alpha[0],alpha[1]);
+    alpha_r_128 = set1_int16(alpha->r);
+    alpha_i_128 = set1_int16(alpha->i);
+
+    j=0;
+
+    for (i=0; i<N>>3; i++) {
+
+      yr     = mulhi_s1_int16(alpha_r_128,x_128[i]);
+      yi     = mulhi_s1_int16(alpha_i_128,x_128[i]);
+#if defined(__x86_64__) || defined(__i386__)
+      y_128[j]   = _mm_adds_epi16(y_128[j],_mm_unpacklo_epi16(yr,yi));
+      j++;
+      y_128[j]   = _mm_adds_epi16(y_128[j],_mm_unpackhi_epi16(yr,yi));
+      j++;
+#elif defined(__arm__)
+      int16x8x2_t yint;
+      yint = vzipq_s16(yr,yi);
+      y_128[j]   = adds_int16(y_128[j],yint.val[0]);
+      j++;
+      y_128[j]   = adds_int16(y_128[j],yint.val[1]);
+ 
+      j++;
+#endif
+    }
 
-#define squaredMod(a) ((a).r*(a).r+(a).i*(a).i)
-#define csum(res, i1, i2) (res).r=(i1).r+(i2).r ; (res).i=(i1).i+(i2).i  
+#else
+    for (int i=0; i<N; i++) {
+      int tmpr=y[i].r+((x[i]*alpha->r)>>14);
+      if (tmpr>INT16_MAX)
+        tmpr=INT16_MAX;
+      if (tmpr<INT16_MIN)
+        tmpr=INT16_MIN;
+      int tmpi=y[i].i+((x[i]*alpha->i)>>14);
+      if (tmpi>INT16_MAX)
+        tmpi=INT16_MAX;
+      if (tmpi<INT16_MIN)
+        tmpi=INT16_MIN;
+      y[i].r=(int16_t)tmpr;
+      y[i].i=(int16_t)tmpi;
+    }
+
+#endif
+  }
 //cmult_sv.h
 
 /*!\fn void multadd_real_vector_complex_scalar(int16_t *x,int16_t *alpha,int16_t *y,uint32_t N)
@@ -85,10 +238,29 @@ void multadd_real_vector_complex_scalar(int16_t *x,
                                         int16_t *y,
                                         uint32_t N);
 
-void multadd_real_four_symbols_vector_complex_scalar(int16_t *x,
-                                                     int16_t *alpha,
-                                                     int16_t *y);
+__attribute__((always_inline)) inline void multadd_real_four_symbols_vector_complex_scalar(int16_t *x,
+                                                                                           c16_t *alpha,
+                                                                                           c16_t *y)
+{
+
+  // do 8 multiplications at a time
+  simd_q15_t alpha_r_128,alpha_i_128,yr,yi,*x_128=(simd_q15_t*)x;
+  simd_q15_t y_128;
+  y_128 = _mm_loadu_si128((simd_q15_t*)y);
+
+  alpha_r_128 = set1_int16(alpha->r);
+  alpha_i_128 = set1_int16(alpha->i);
+
+
+  yr     = mulhi_s1_int16(alpha_r_128,x_128[0]);
+  yi     = mulhi_s1_int16(alpha_i_128,x_128[0]);
+  y_128   = _mm_adds_epi16(y_128,_mm_unpacklo_epi16(yr,yi));
+  y_128   = _mm_adds_epi16(y_128,_mm_unpackhi_epi16(yr,yi));
 
+  _mm_storeu_si128((simd_q15_t*)y, y_128);
+
+}
+  
 /*!\fn void multadd_complex_vector_real_scalar(int16_t *x,int16_t alpha,int16_t *y,uint8_t zero_flag,uint32_t N)
 This function performs componentwise multiplication and accumulation of a real scalar and a complex vector.
 @param x Vector input (Q1.15) in the format |Re0 Im0|Re1 Im 1| ...
@@ -114,7 +286,7 @@ void multadd_complex_vector_real_scalar(int16_t *x,
 
 //cmult_vv.c
 /*!
-  Multiply elementwise the complex conjugate of x1 with x2. 
+  Multiply elementwise the complex conjugate of x1 with x2.
   @param x1       - input 1    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
               We assume x1 with a dinamic of 15 bit maximum
   @param x2       - input 2    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
@@ -364,8 +536,8 @@ dft_size_idx_t get_dft(int ofdm_symbol_size)
       printf("function get_dft : unsupported ofdm symbol size \n");
       assert(0);
       break;
- }
- return DFT_SIZE_IDXTABLESIZE; // never reached and will trigger assertion in idft function;
+  }
+  return DFT_SIZE_IDXTABLESIZE; // never reached and will trigger assertion in idft function;
 }
 
 typedef enum idft_size_idx {
@@ -448,8 +620,8 @@ idft_size_idx_t get_idft(int ofdm_symbol_size)
       printf("function get_idft : unsupported ofdm symbol size \n");
       assert(0);
       break;
- }
- return IDFT_SIZE_IDXTABLESIZE; // never reached and will trigger assertion in idft function
+  }
+  return IDFT_SIZE_IDXTABLESIZE; // never reached and will trigger assertion in idft function
 }
 
 
@@ -491,58 +663,6 @@ int32_t sub_cpx_vector16(int16_t *x,
                          int16_t *z,
                          uint32_t N);
 
-int32_t add_cpx_vector32(int16_t *x,
-                         int16_t *y,
-                         int16_t *z,
-                         uint32_t N);
-
-int32_t add_real_vector64(int16_t *x,
-                          int16_t *y,
-                          int16_t *z,
-                          uint32_t N);
-
-int32_t sub_real_vector64(int16_t *x,
-                          int16_t *y,
-                          int16_t *z,
-                          uint32_t N);
-
-int32_t add_real_vector64_scalar(int16_t *x,
-                                 long long int a,
-                                 int16_t *y,
-                                 uint32_t N);
-
-/*!\fn int32_t add_vector16(int16_t *x,int16_t *y,int16_t *z,uint32_t N)
-This function performs componentwise addition of two vectors with Q1.15 components.
-@param x Vector input (Q1.15)
-@param y Scalar input (Q1.15)
-@param z Scalar output (Q1.15)
-@param N Length of x WARNING: N must be a multiple of 32
-
-The function implemented is : \f$\mathbf{z} = \mathbf{x} + \mathbf{y}\f$
-*/
-int32_t add_vector16(int16_t *x,
-                     int16_t *y,
-                     int16_t *z,
-                     uint32_t N);
-
-int32_t add_vector16_64(int16_t *x,
-                        int16_t *y,
-                        int16_t *z,
-                        uint32_t N);
-
-int32_t complex_conjugate(int16_t *x1,
-                          int16_t *y,
-                          uint32_t N);
-
-void bit8_txmux(int32_t length,int32_t offset);
-
-void bit8_rxdemux(int32_t length,int32_t offset);
-
-void Zero_Buffer(void *,uint32_t);
-void Zero_Buffer_nommx(void *buf,uint32_t length);
-
-void mmxcopy(void *dest,void *src,int size);
-
 /*!\fn int32_t signal_energy(int *,uint32_t);
 \brief Computes the signal energy per subcarrier
 */
@@ -631,7 +751,6 @@ int64_t dot_product64(int16_t *x,
 
 double interp(double x, double *xs, double *ys, int count);
 
-int write_output(const char *fname,const char *vname,void *data,int length,int dec,char format);
 
 #ifdef __cplusplus
 }
diff --git a/openair1/PHY/defs_eNB.h b/openair1/PHY/defs_eNB.h
index 3a48dc51d66db3a0d4eba955513ade438a9bc6c2..d5ad705f4e0146da7d7b1bb8db0c7aff7caac2f5 100644
--- a/openair1/PHY/defs_eNB.h
+++ b/openair1/PHY/defs_eNB.h
@@ -181,9 +181,6 @@ typedef struct {
 
 
 typedef struct {
-  /// \brief ?.
-  /// first index: ? [0..1023] (hard coded)
-  int16_t *prachF;
   /// \brief ?.
   /// first index: ce_level [0..3]
   /// second index: rx antenna [0..63] (hard coded) \note Hard coded array size indexed by \c nb_antennas_rx.
diff --git a/openair1/SCHED/prach_procedures.c b/openair1/SCHED/prach_procedures.c
index 805354ce1be5ce7d7b7043f60e44482807fc78e0..fd78ed4bfb73eb913ca5dd197207ed9f74ec3e32 100644
--- a/openair1/SCHED/prach_procedures.c
+++ b/openair1/SCHED/prach_procedures.c
@@ -49,7 +49,7 @@ extern int oai_nfapi_rach_ind(nfapi_rach_indication_t *rach_ind);
 
 void prach_procedures(PHY_VARS_eNB *eNB,
                       int br_flag ) {
-  uint16_t max_preamble[4],max_preamble_energy[4],max_preamble_delay[4],avg_preamble_energy[4];
+  uint16_t max_preamble[4]={0},max_preamble_energy[4]={0},max_preamble_delay[4]={0},avg_preamble_energy[4]={0}; 
   uint16_t i;
   int frame,subframe;
 
diff --git a/openair3/ocp-gtpu/gtp_itf.cpp b/openair3/ocp-gtpu/gtp_itf.cpp
index afef8ea4357ed5ea291d7a635ae6b26f0f363b8a..9779233538516b2c6d7004d9c45a0fc2db849762 100644
--- a/openair3/ocp-gtpu/gtp_itf.cpp
+++ b/openair3/ocp-gtpu/gtp_itf.cpp
@@ -145,7 +145,7 @@ class gtpEndPoints {
 
   ~gtpEndPoints() {
     // automatically close all sockets on quit
-    for (const auto p : instances)
+    for (const auto &p : instances)
       close(p.first);
   }
 };