diff --git a/ci-scripts/datalog_rt_stats.1x1.60.yaml b/ci-scripts/datalog_rt_stats.1x1.60.yaml
index 140f3939d7f64da016ad206858127e1f69d38934..f2cd345737d6c35551b7c628be39f36893284607 100644
--- a/ci-scripts/datalog_rt_stats.1x1.60.yaml
+++ b/ci-scripts/datalog_rt_stats.1x1.60.yaml
@@ -10,7 +10,7 @@ Ref :
   feprx : 46.0
   feptx_prec : 15.0
   feptx_ofdm : 35.0
-  feptx_total : 57.0
+  feptx_total : 50.0
   L1 Tx processing : 260.0
   DLSCH encoding : 160.0
   L1 Rx processing : 420.0
diff --git a/ci-scripts/datalog_rt_stats.default.yaml b/ci-scripts/datalog_rt_stats.default.yaml
index 3e665f1f0b04120743247b12405fa46f92a86a57..1b83aed7837053c7d6de80c2fc858ce6e1ec54be 100644
--- a/ci-scripts/datalog_rt_stats.default.yaml
+++ b/ci-scripts/datalog_rt_stats.default.yaml
@@ -10,7 +10,7 @@ Ref :
   feprx : 43.0
   feptx_prec : 13.0
   feptx_ofdm : 33.0
-  feptx_total : 55.0
+  feptx_total : 50.0
   L1 Tx processing : 200.0
   DLSCH encoding : 100.0
   L1 Rx processing : 330.0
diff --git a/ci-scripts/xml_files/t2_offload_dec_nr_ulsim.xml b/ci-scripts/xml_files/t2_offload_dec_nr_ulsim.xml
index 808f7155b5460dc095615518e591e2c32b300e16..c959d99e1b3ec043d7fe034bacd91ea650ec6391 100644
--- a/ci-scripts/xml_files/t2_offload_dec_nr_ulsim.xml
+++ b/ci-scripts/xml_files/t2_offload_dec_nr_ulsim.xml
@@ -24,19 +24,36 @@
 	<htmlTabName>Test T2 Offload Decoder</htmlTabName>
 	<htmlTabIcon>tasks</htmlTabIcon>
         <TestCaseRequestedList>
+                010204
                 010111 010112 010121 010122 010131 010132
                 010211 010212 010221 010222 010231 010232
                 010311 010312 010321 010322 010331 010332
+                402010
         </TestCaseRequestedList>
 	<TestCaseExclusionList></TestCaseExclusionList>
 
+	<testCase id="010204">
+		<class>Custom_Command</class>
+		<desc>Disable Sleep States</desc>
+		<node>caracal</node>
+		<command>sudo cpupower idle-set -D 0</command>
+	</testCase>
+
+	<testCase id="402010">
+		<class>Custom_Command</class>
+		<always_exec>true</always_exec>
+		<desc>Enable Sleep States</desc>
+		<node>caracal</node>
+		<command>sudo cpupower idle-set -E</command>
+	</testCase>
+
 	<testCase id="010111">
 		<class>Run_Physim</class>
 		<desc>Run nr_ulsim with CPU: SNR = 30, MCS = 5, 106 PRBs, 1 layer</desc>
 		<always_exec>true</always_exec>
 	        <physim_test>nr_ulsim</physim_test>
 	        <physim_time_threshold>300</physim_time_threshold>
-		<physim_run_args>-n100 -s30 -S30.2 -m5 -r106 -R106 -C10 -P</physim_run_args>
+		<physim_run_args>-n1000 -s30 -S30.2 -m5 -r106 -R106 -C10 -P</physim_run_args>
 	</testCase>
 
 	<testCase id="010112">
@@ -45,7 +62,7 @@
 		<always_exec>true</always_exec>
 		<physim_test>nr_ulsim</physim_test>
 		<physim_time_threshold>100</physim_time_threshold>
-		<physim_run_args>-n100 -s30 -S30.2 -m5 -r106 -R106 -C10 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+		<physim_run_args>-n1000 -s30 -S30.2 -m5 -r106 -R106 -C10 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
 	</testCase>
 
 	<testCase id="010121">
@@ -54,7 +71,7 @@
 		<always_exec>true</always_exec>
 		<physim_test>nr_ulsim</physim_test>
 		<physim_time_threshold>300</physim_time_threshold>
-		<physim_run_args>-n100 -s30 -S30.2 -m15 -r106 -R106 -C10 -P</physim_run_args>
+		<physim_run_args>-n1000 -s30 -S30.2 -m15 -r106 -R106 -C10 -P</physim_run_args>
 	</testCase>
 
 	<testCase id="010122">
@@ -63,7 +80,7 @@
 		<always_exec>true</always_exec>
 		<physim_test>nr_ulsim</physim_test>
 		<physim_time_threshold>150</physim_time_threshold>
-		<physim_run_args>-n100 -s30 -S30.2 -m15 -r106 -R106 -C10 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+		<physim_run_args>-n1000 -s30 -S30.2 -m15 -r106 -R106 -C10 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
 	</testCase>
 
 	<testCase id="010131">
@@ -72,7 +89,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_ulsim</physim_test>
                 <physim_time_threshold>250</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -m25 -r106 -R106 -C10 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -m25 -r106 -R106 -C10 -P</physim_run_args>
         </testCase>
 
         <testCase id="010132">
@@ -81,7 +98,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_ulsim</physim_test>
                 <physim_time_threshold>250</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -m25 -r106 -R106 -C10 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -m25 -r106 -R106 -C10 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
         </testCase>
 
         <testCase id="010211">
@@ -90,7 +107,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_ulsim</physim_test>
                 <physim_time_threshold>300</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -m5 -r273 -R273 -C10 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -m5 -r273 -R273 -C10 -P</physim_run_args>
         </testCase>
 
         <testCase id="010212">
@@ -99,7 +116,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_ulsim</physim_test>
                 <physim_time_threshold>150</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -m5 -r273 -R273 -C10 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -m5 -r273 -R273 -C10 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
 	</testCase>
 
 	<testCase id="010221">
@@ -108,7 +125,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_ulsim</physim_test>
                 <physim_time_threshold>400</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -m15 -r273 -R273 -C10 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -m15 -r273 -R273 -C10 -P</physim_run_args>
         </testCase>
 
         <testCase id="010222">
@@ -117,7 +134,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_ulsim</physim_test>
                 <physim_time_threshold>350</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -m15 -r273 -R273 -C10 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -m15 -r273 -R273 -C10 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
         </testCase>
 
         <testCase id="010231">
@@ -126,7 +143,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_ulsim</physim_test>
                 <physim_time_threshold>400</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -m25 -r273 -R273 -C10 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -m25 -r273 -R273 -C10 -P</physim_run_args>
         </testCase>
 
         <testCase id="010232">
@@ -135,7 +152,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_ulsim</physim_test>
                 <physim_time_threshold>550</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -m25 -r273 -R273 -C10 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -m25 -r273 -R273 -C10 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
         </testCase>
 
         <testCase id="010311">
@@ -144,7 +161,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_ulsim</physim_test>
                 <physim_time_threshold>300</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -m5 -r273 -R273 -C10 -W2 -z2 -y2 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -m5 -r273 -R273 -C10 -W2 -z2 -y2 -P</physim_run_args>
         </testCase>
 
         <testCase id="010312">
@@ -153,7 +170,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_ulsim</physim_test>
                 <physim_time_threshold>250</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -m5 -r273 -R273 -C10 -W2 -z2 -y2 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -m5 -r273 -R273 -C10 -W2 -z2 -y2 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
         </testCase>
 
         <testCase id="010321">
@@ -162,7 +179,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_ulsim</physim_test>
                 <physim_time_threshold>600</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -m15 -r273 -R273 -C10 -W2 -z2 -y2 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -m15 -r273 -R273 -C10 -W2 -z2 -y2 -P</physim_run_args>
         </testCase>
 
         <testCase id="010322">
@@ -171,7 +188,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_ulsim</physim_test>
                 <physim_time_threshold>650</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -m15 -r273 -R273 -C10 -W2 -z2 -y2 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -m15 -r273 -R273 -C10 -W2 -z2 -y2 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
         </testCase>
 
         <testCase id="010331">
@@ -180,7 +197,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_ulsim</physim_test>
                 <physim_time_threshold>650</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -m25 -r273 -R273 -C10 -W2 -z2 -y2 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -m25 -r273 -R273 -C10 -W2 -z2 -y2 -P</physim_run_args>
         </testCase>
 
         <testCase id="010332">
@@ -189,7 +206,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_ulsim</physim_test>
                 <physim_time_threshold>1100</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -m25 -r273 -R273 -C10 -W2 -z2 -y2 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -m25 -r273 -R273 -C10 -W2 -z2 -y2 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
         </testCase>
 
 </testCaseList>
diff --git a/ci-scripts/xml_files/t2_offload_enc_nr_dlsim.xml b/ci-scripts/xml_files/t2_offload_enc_nr_dlsim.xml
index f1217ae737891e7797081123c1e9de98f85ed199..400399d3a1a11cc9b83a798a1656e3d8a43251bc 100644
--- a/ci-scripts/xml_files/t2_offload_enc_nr_dlsim.xml
+++ b/ci-scripts/xml_files/t2_offload_enc_nr_dlsim.xml
@@ -24,20 +24,37 @@
 	<htmlTabName>Test T2 Offload Encoder</htmlTabName>
 	<htmlTabIcon>tasks</htmlTabIcon>
         <TestCaseRequestedList>
+                102040
                 000111 000112 000121 000122 000131 000132
                 000211 000212 000221 000222 000231 000232
                 000311 000312 000321 000322 000331 000332
                 000411 000412 000421 000422 000431 000432
+                040201
         </TestCaseRequestedList>
 	<TestCaseExclusionList></TestCaseExclusionList>
 
+	<testCase id="102040">
+		<class>Custom_Command</class>
+		<desc>Disable Sleep States</desc>
+		<node>caracal</node>
+		<command>sudo cpupower idle-set -D 0</command>
+	</testCase>
+
+	<testCase id="040201">
+		<class>Custom_Command</class>
+		<always_exec>true</always_exec>
+		<desc>Enable Sleep States</desc>
+		<node>caracal</node>
+		<command>sudo cpupower idle-set -E</command>
+	</testCase>
+
 	<testCase id="000111">
 		<class>Run_Physim</class>
 		<desc>Run nr_dlsim with CPU: SNR = 30, MCS = 5, 106 PRBs, 1 layer</desc>
 		<always_exec>true</always_exec>
 		<physim_test>nr_dlsim</physim_test>
 		<physim_time_threshold>230</physim_time_threshold>
-		<physim_run_args>-n100 -s30 -S30.2 -e5 -b106 -R106 -X 8,9,10,11,12 -P</physim_run_args>
+		<physim_run_args>-n1000 -s30 -S30.2 -e5 -b106 -R106 -X 8,9,10,11,12 -P</physim_run_args>
 	</testCase>
 
 	<testCase id="000112">
@@ -46,7 +63,7 @@
 		<always_exec>true</always_exec>
 		<physim_test>nr_dlsim</physim_test>
 		<physim_time_threshold>100</physim_time_threshold>
-		<physim_run_args>-n100 -s30 -S30.2 -e5 -b106 -R106 -X4,5,6,7,8,9 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+		<physim_run_args>-n1000 -s30 -S30.2 -e5 -b106 -R106 -X4,5,6,7,8,9 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
 	</testCase>
 
 	<testCase id="000121">
@@ -55,7 +72,7 @@
 		<always_exec>true</always_exec>
 		<physim_test>nr_dlsim</physim_test>
 		<physim_time_threshold>300</physim_time_threshold>
-		<physim_run_args>-n100 -s30 -S30.2 -e15 -b106 -R106 -X 8,9,10,11,12 -P</physim_run_args>
+		<physim_run_args>-n1000 -s30 -S30.2 -e15 -b106 -R106 -X 8,9,10,11,12 -P</physim_run_args>
 	</testCase>
 
 	<testCase id="000122">
@@ -64,7 +81,7 @@
 		<always_exec>true</always_exec>
 		<physim_test>nr_dlsim</physim_test>
 		<physim_time_threshold>100</physim_time_threshold>
-		<physim_run_args>-n100 -s30 -S30.2 -e15 -b106 -R106 -X4,5,6,7,8,9 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+		<physim_run_args>-n1000 -s30 -S30.2 -e15 -b106 -R106 -X4,5,6,7,8,9 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
 	</testCase>
 
 	<testCase id="000131">
@@ -73,7 +90,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>350</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e25 -b106 -R106 -X 8,9,10,11,12 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e25 -b106 -R106 -X 8,9,10,11,12 -P</physim_run_args>
         </testCase>
 
         <testCase id="000132">
@@ -82,7 +99,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>200</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e25 -b106 -R106 -X4,5,6,7,8,9 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e25 -b106 -R106 -X4,5,6,7,8,9 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
         </testCase>
 
         <testCase id="000211">
@@ -91,7 +108,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>300</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e5 -b273 -R273 -X 8,9,10,11,12 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e5 -b273 -R273 -X 8,9,10,11,12 -P</physim_run_args>
         </testCase>
 
         <testCase id="000212">
@@ -100,7 +117,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>150</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e5 -b273 -R273 -X4,5,6,7,8,9 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e5 -b273 -R273 -X4,5,6,7,8,9 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
 	</testCase>
 
 	<testCase id="000221">
@@ -109,7 +126,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>350</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e15 -b273 -R273 -X 8,9,10,11,12 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e15 -b273 -R273 -X 8,9,10,11,12 -P</physim_run_args>
         </testCase>
 
         <testCase id="000222">
@@ -118,7 +135,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>250</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e15 -b273 -R273 -X4,5,6,7,8,9 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e15 -b273 -R273 -X4,5,6,7,8,9 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
         </testCase>
 
         <testCase id="000231">
@@ -127,7 +144,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>400</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e25 -b273 -R273 -X 8,9,10,11,12 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e25 -b273 -R273 -X 8,9,10,11,12 -P</physim_run_args>
         </testCase>
 
         <testCase id="000232">
@@ -136,7 +153,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>400</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e25 -b273 -R273 -X4,5,6,7,8,9 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e25 -b273 -R273 -X4,5,6,7,8,9 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
         </testCase>
 
         <testCase id="000311">
@@ -145,7 +162,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>400</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e5 -b273 -R273 -X 8,9,10,11,12 -x2 -z2 -y2 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e5 -b273 -R273 -X 8,9,10,11,12 -x2 -z2 -y2 -P</physim_run_args>
         </testCase>
 
         <testCase id="000312">
@@ -154,7 +171,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>200</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e5 -b273 -R273 -X4,5,6,7,8,9 -x2 -z2 -y2 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e5 -b273 -R273 -X4,5,6,7,8,9 -x2 -z2 -y2 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
         </testCase>
 
         <testCase id="000321">
@@ -163,7 +180,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>450</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e15 -b273 -R273 -X 8,9,10,11,12 -x2 -z2 -y2 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e15 -b273 -R273 -X 8,9,10,11,12 -x2 -z2 -y2 -P</physim_run_args>
         </testCase>
 
         <testCase id="000322">
@@ -172,7 +189,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>500</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e15 -b273 -R273 -X4,5,6,7,8,9 -x2 -z2 -y2 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e15 -b273 -R273 -X4,5,6,7,8,9 -x2 -z2 -y2 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
         </testCase>
 
         <testCase id="000331">
@@ -181,7 +198,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>500</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e25 -b273 -R273 -X 8,9,10,11,12 -x2 -z2 -y2 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e25 -b273 -R273 -X 8,9,10,11,12 -x2 -z2 -y2 -P</physim_run_args>
         </testCase>
 
         <testCase id="000332">
@@ -190,7 +207,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>500</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e25 -b273 -R273 -X4,5,6,7,8,9 -x2 -z2 -y2 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e25 -b273 -R273 -X4,5,6,7,8,9 -x2 -z2 -y2 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
         </testCase>
 
         <testCase id="000411">
@@ -199,7 +216,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>400</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e5 -b273 -R273 -X8,9,10,11,12 -x2 -z4 -y4 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e5 -b273 -R273 -X8,9,10,11,12 -x2 -z4 -y4 -P</physim_run_args>
         </testCase>
 
         <testCase id="000412">
@@ -208,7 +225,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>200</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e5 -b273 -R273 -X4,5,6,7,8,9 -x2 -z4 -y4 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e5 -b273 -R273 -X4,5,6,7,8,9 -x2 -z4 -y4 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
         </testCase>
 
         <testCase id="000421">
@@ -217,7 +234,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>400</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e15 -b273 -R273 -X8,9,10,11,12 -x2 -z4 -y4 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e15 -b273 -R273 -X8,9,10,11,12 -x2 -z4 -y4 -P</physim_run_args>
         </testCase>
 
         <testCase id="000422">
@@ -226,7 +243,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>300</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e15 -b273 -R273 -X4,5,6,7,8,9 -x2 -z4 -y4 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e15 -b273 -R273 -X4,5,6,7,8,9 -x2 -z4 -y4 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
         </testCase>
 
         <testCase id="000431">
@@ -235,7 +252,7 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>450</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e25 -b273 -R273 -X8,9,10,11,12 -x2 -z4 -y4 -P</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e25 -b273 -R273 -X8,9,10,11,12 -x2 -z4 -y4 -P</physim_run_args>
         </testCase>
 
         <testCase id="000432">
@@ -244,6 +261,6 @@
                 <always_exec>true</always_exec>
                 <physim_test>nr_dlsim</physim_test>
                 <physim_time_threshold>450</physim_time_threshold>
-                <physim_run_args>-n100 -s30 -S30.2 -e25 -b273 -R273 -X4,5,6,7,8,9 -x2 -z4 -y4 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
+                <physim_run_args>-n1000 -s30 -S30.2 -e25 -b273 -R273 -X4,5,6,7,8,9 -x2 -z4 -y4 -P --loader.ldpc.shlibversion _t2 --nrLDPC_coding_t2.dpdk_dev d8:00.0 --nrLDPC_coding_t2.dpdk_core_list 11-12</physim_run_args>
         </testCase>
 </testCaseList>
diff --git a/cmake_targets/autotests/test_case_list.xml b/cmake_targets/autotests/test_case_list.xml
index 2c6dc73bfa629ce700e4f78fb61e094a2063577c..32876634c06bb945232ff7895e8754cdecc5ee51 100755
--- a/cmake_targets/autotests/test_case_list.xml
+++ b/cmake_targets/autotests/test_case_list.xml
@@ -318,7 +318,8 @@
                                     (Test25: Format 2 11-bit 2/273 PRB),
                                     (Test26: Format 2 12-bit 8/273 PRB),
                                     (Test27: Format 2 19-bit 8/273 PRB),
-                                    (Test28: Format 2 64-bit 16/273 PRB)</desc>
+                                    (Test28: Format 2 64-bit 16/273 PRB),
+                                    (Test29: Format 2 64-bit 16/273 PRB Delay 2us)</desc>
       <main_exec>nr_pucchsim</main_exec>
       <main_exec_args>-R 106 -i 1 -P 0 -b 1 -s-2 -n1000
                       -R 106 -i 1 -P 0 -b 2 -s-2 -n1000
@@ -347,8 +348,9 @@
                       -R 273 -z8 -i 1 -P 2 -b 11 -s6 -n1000
                       -R 273 -z8 -i 1 -P 2 -q8 -b 12 -s-3 -n1000
                       -R 273 -z8 -i 1 -P 2 -q8 -b 19 -s-3 -n1000
-                      -R 273 -z8 -i 1 -P 2 -q16 -b 64 -s-3 -n1000</main_exec_args>
-      <tags>test1 test2 test3 test4 test5 test6 test7 test8 test9 test10 test11 test12 test13 test14 test15 test16 test17 test18 test19 test20 test21 test22 test23 test24 test25 test26 test27 test28</tags>
+                      -R 273 -z8 -i 1 -P 2 -q16 -b 64 -s-3 -n1000
+                      -R 273 -z8 -i 1 -P 2 -q16 -b 64 -s-3 -d 2 -n1000</main_exec_args>
+      <tags>test1 test2 test3 test4 test5 test6 test7 test8 test9 test10 test11 test12 test13 test14 test15 test16 test17 test18 test19 test20 test21 test22 test23 test24 test25 test26 test27 test28 test29</tags>
       <search_expr_true>PUCCH test OK</search_expr_true>
       <search_expr_false>segmentation fault|assertion|exiting|fatal</search_expr_false>
       <nruns>3</nruns>
diff --git a/common/utils/LOG/log.c b/common/utils/LOG/log.c
index 80244c0912c37caa26d5a87992b5f6642b5119a9..af6c30b858d96882436a18ce7077a5ad9d33523b 100644
--- a/common/utils/LOG/log.c
+++ b/common/utils/LOG/log.c
@@ -651,6 +651,10 @@ void log_dump(int component,
       wbuf=malloc((buffsize * 10)  + 64 + MAX_LOG_TOTAL);
       break;
 
+    case LOG_DUMP_C16:
+      wbuf = malloc((buffsize * 10) + 64 + MAX_LOG_TOTAL);
+      break;
+
     case LOG_DUMP_CHAR:
     default:
       wbuf=malloc((buffsize * 3 ) + 64 + MAX_LOG_TOTAL);
@@ -669,6 +673,21 @@ void log_dump(int component,
           pos = pos + sprintf(wbuf+pos,"%04.4lf ", (double)((double *)buffer)[i]);
           break;
 
+        case LOG_DUMP_I16: {
+          int16_t *tmp = ((int16_t *)buffer) + i;
+          pos = pos + sprintf(wbuf + pos, "%d, ", *tmp);
+        } break;
+
+        case LOG_DUMP_C16: {
+          int16_t *tmp = ((int16_t *)buffer) + i * 2;
+          pos = pos + sprintf(wbuf + pos, "(%d,%d), ", *tmp, *(tmp + 1));
+        } break;
+
+        case LOG_DUMP_C32: {
+          int32_t *tmp = ((int32_t *)buffer) + i * 2;
+          pos = pos + sprintf(wbuf + pos, "(%d,%d), ", *tmp, *(tmp + 1));
+        } break;
+
         case LOG_DUMP_CHAR:
         default:
           pos = pos + sprintf(wbuf+pos,"%02x ", (unsigned char)((unsigned char *)buffer)[i]);
diff --git a/common/utils/LOG/log.h b/common/utils/LOG/log.h
index 1add78c6f1029d74ae5545b50cf799202a8e8b17..f16ee296d478ac0186fb7e2d03a180b80720e190 100644
--- a/common/utils/LOG/log.h
+++ b/common/utils/LOG/log.h
@@ -335,6 +335,9 @@ int32_t write_file_matlab(const char *fname, const char *vname, void *data, int
  * @{*/
 #define LOG_DUMP_CHAR       0
 #define LOG_DUMP_DOUBLE     1
+#define LOG_DUMP_I16 2
+#define LOG_DUMP_C16 3
+#define LOG_DUMP_C32 4
 // debugging macros
 #define LOG_F  LOG_I           /* because  LOG_F was originaly to dump a message or buffer but is also used as a regular level...., to dump use LOG_DUMPMSG */
 
diff --git a/common/utils/T/tracer/hacks/dump_nack_signal.c b/common/utils/T/tracer/hacks/dump_nack_signal.c
index f50e8a761a4807efe16a910feaed60ea37b8cb05..686425a4b93e3ff3e21a7b0256f1946ed30b5b40 100644
--- a/common/utils/T/tracer/hacks/dump_nack_signal.c
+++ b/common/utils/T/tracer/hacks/dump_nack_signal.c
@@ -4,7 +4,7 @@
 #include "utils.h"
 #include "event.h"
 #include "database.h"
-#include "config.h"
+#include "configuration.h"
 #include "../T_defs.h"
 
 void usage(void) {
diff --git a/common/utils/T/tracer/hacks/time_meas.c b/common/utils/T/tracer/hacks/time_meas.c
index ae38dcd1772e6e28d39c13c207219455dfe09703..dccba130ad143226e70fb32f33012a00cfb7ac14 100644
--- a/common/utils/T/tracer/hacks/time_meas.c
+++ b/common/utils/T/tracer/hacks/time_meas.c
@@ -4,7 +4,7 @@
 #include "utils.h"
 #include "event.h"
 #include "database.h"
-#include "config.h"
+#include "configuration.h"
 #include "../T_defs.h"
 
 void usage(void)
diff --git a/common/utils/threadPool/task_ans.c b/common/utils/threadPool/task_ans.c
index f7e876568f461551253ff62278bce35dcad9b92c..4f8fc265ab00de64d9da8f470f8248cda889496b 100644
--- a/common/utils/threadPool/task_ans.c
+++ b/common/utils/threadPool/task_ans.c
@@ -26,37 +26,50 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <time.h>
+#include "pthread_utils.h"
+#include "errno.h"
+#include <string.h>
 
-void completed_task_ans(task_ans_t* task)
-{
-  DevAssert(task != NULL);
-
-  int status = atomic_load_explicit(&task->status, memory_order_acquire);
-  AssertFatal(status == 0, "Task not expected to be finished here. Status = %d\n", status);
+#define seminit(sem)                                                                           \
+  {                                                                                            \
+    int ret = sem_init(&sem, 0, 0);                                                            \
+    AssertFatal(ret == 0, "sem_init(): ret=%d, errno=%d (%s)\n", ret, errno, strerror(errno)); \
+  }
+#define sempost(sem)                                                                           \
+  {                                                                                            \
+    int ret = sem_post(&sem);                                                                  \
+    AssertFatal(ret == 0, "sem_post(): ret=%d, errno=%d (%s)\n", ret, errno, strerror(errno)); \
+  }
+#define semwait(sem)                                                                           \
+  {                                                                                            \
+    int ret = sem_wait(&sem);                                                                  \
+    AssertFatal(ret == 0, "sem_wait(): ret=%d, errno=%d (%s)\n", ret, errno, strerror(errno)); \
+  }
+#define semdestroy(sem)                                                                           \
+  {                                                                                               \
+    int ret = sem_destroy(&sem);                                                                  \
+    AssertFatal(ret == 0, "sem_destroy(): ret=%d, errno=%d (%s)\n", ret, errno, strerror(errno)); \
+  }
 
-  atomic_store_explicit(&task->status, 1, memory_order_release);
+void init_task_ans(task_ans_t* ans, uint num_jobs)
+{
+  ans->counter = num_jobs;
+  seminit(ans->sem);
 }
 
-void join_task_ans(task_ans_t* arr, size_t len)
+void completed_many_task_ans(task_ans_t* ans, uint num_completed_jobs)
 {
-  DevAssert(len < INT_MAX);
-  DevAssert(arr != NULL);
-
-  // Spin lock inspired by:
-  // The Art of Writing Efficient Programs:
-  // An advanced programmer's guide to efficient hardware utilization
-  // and compiler optimizations using C++ examples
-  const struct timespec ns = {0, 1};
-  uint64_t i = 0;
-  int j = len - 1;
-  for (; j != -1; i++) {
-    for (; j != -1; --j) {
-      int const task_completed = 1;
-      if (atomic_load_explicit(&arr[j].status, memory_order_acquire) != task_completed)
-        break;
-    }
-    if (i % 8 == 0) {
-      nanosleep(&ns, NULL);
-    }
+  DevAssert(ans != NULL);
+  // Using atomic counter in contention scenario to avoid locking in producers
+  int num_jobs = atomic_fetch_sub_explicit(&ans->counter, num_completed_jobs, memory_order_relaxed);
+  if (num_jobs == num_completed_jobs) {
+    // Using semaphore to enable blocking call in join_task_ans
+    sempost(ans->sem);
   }
 }
+
+void join_task_ans(task_ans_t* ans)
+{
+  semwait(ans->sem);
+  semdestroy(ans->sem);
+}
diff --git a/common/utils/threadPool/task_ans.h b/common/utils/threadPool/task_ans.h
index 4d1203555df868e462bb379530b1823e6e788780..2fa7d0fc83eae577c972658e4d415f3e1edeb836 100644
--- a/common/utils/threadPool/task_ans.h
+++ b/common/utils/threadPool/task_ans.h
@@ -21,7 +21,7 @@
 
 #ifndef TASK_ANSWER_THREAD_POOL_H
 #define TASK_ANSWER_THREAD_POOL_H
-
+#include "pthread_utils.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -31,12 +31,15 @@ extern "C" {
 #include <stdatomic.h>
 #else
 #include <atomic>
+#ifndef _Atomic
 #define _Atomic(X) std::atomic<X>
+#endif
 #define _Alignas(X) alignas(X)
 #endif
 
 #include <stddef.h>
 #include <stdint.h>
+#include <semaphore.h>
 
 #if defined(__i386__) || defined(__x86_64__)
 #define LEVEL1_DCACHE_LINESIZE 64
@@ -50,9 +53,17 @@ extern "C" {
 #error Unknown CPU architecture
 #endif
 
+/** @brief
+ * A multi-producer - single-consumer synchronization mechanism built for efficiency under
+ * contention.
+ *
+ * @param sem semaphore to wait on
+ * @param counter atomic counter to keep track of the number of tasks completed. Atomic counter
+ * is used for efficiency under contention.
+ */
 typedef struct {
-  // Avoid false sharing
-  _Alignas(LEVEL1_DCACHE_LINESIZE) _Atomic(int) status;
+  sem_t sem;
+  _Alignas(LEVEL1_DCACHE_LINESIZE) _Atomic(int) counter;
 } task_ans_t;
 
 typedef struct {
@@ -62,10 +73,29 @@ typedef struct {
   task_ans_t* ans;
 } thread_info_tm_t;
 
+/// @brief Initialize a task_ans_t struct
+///
+/// @param ans task_ans_t struct
+/// @param num_jobs number of tasks to wait for
+void init_task_ans(task_ans_t* ans, unsigned int num_jobs);
+
+/// @brief Wait for all tasks to complete
+/// @param ans task_ans_t struct
+void join_task_ans(task_ans_t* arr);
 
-void join_task_ans(task_ans_t* arr, size_t len);
+/// @brief Mark a number of tasks as completed.
+///
+/// @param ans task_ans_t struct
+/// @param num_completed_jobs number of tasks to mark as completed
+void completed_many_task_ans(task_ans_t* ans, uint num_completed_jobs);
 
-void completed_task_ans(task_ans_t* task);
+/// @brief Mark 1 tasks as completed.
+///
+/// @param ans task_ans_t struct
+static inline void completed_task_ans(task_ans_t* ans)
+{
+  completed_many_task_ans(ans, 1);
+}
 
 #ifdef __cplusplus
 }
diff --git a/common/utils/threadPool/test/test_thread-pool.c b/common/utils/threadPool/test/test_thread-pool.c
index f1e3322ac97aad450eb1734acb184c85a39642f5..16d40bc552a0af2c26b0837731d514042fd6ad6a 100644
--- a/common/utils/threadPool/test/test_thread-pool.c
+++ b/common/utils/threadPool/test/test_thread-pool.c
@@ -114,18 +114,18 @@ int main()
   int nb_jobs = 4;
   for (int i = 0; i < 1000; i++) {
     int parall = nb_jobs;
-    task_ans_t task_ans[parall];
-    memset(task_ans, 0, sizeof(task_ans));
+    task_ans_t task_ans;
+    init_task_ans(&task_ans, parall);
     struct testData test_data[parall];
     memset(test_data, 0, sizeof(test_data));
     for (int j = 0; j < parall; j++) {
       task_t task = {.args = &test_data[j], .func = processing};
       struct testData *x = (struct testData *)task.args;
       x->id = i;
-      x->task_ans = &task_ans[j];
+      x->task_ans = &task_ans;
       pushTpool(&pool, task);
     }
-    join_task_ans(task_ans, parall);
+    join_task_ans(&task_ans);
     int sleepmax = 0;
     for (int j = 0; j < parall; j++) {
       if (test_data[j].sleepTime > sleepmax) {
diff --git a/executables/nr-ue.c b/executables/nr-ue.c
index 6da78819f5a6ee993a13e83ff5d7de5bf44213bb..e54af77ab6284424e1b24d1e522f13924b736fd5 100644
--- a/executables/nr-ue.c
+++ b/executables/nr-ue.c
@@ -642,8 +642,9 @@ static int UE_dl_preprocessing(PHY_VARS_NR_UE *UE,
     }
   }
 
+  bool dl_slot = false;
   if (proc->rx_slot_type == NR_DOWNLINK_SLOT || proc->rx_slot_type == NR_MIXED_SLOT) {
-
+    dl_slot = true;
     if(UE->if_inst != NULL && UE->if_inst->dl_indication != NULL) {
       nr_downlink_indication_t dl_indication;
       nr_fill_dl_indication(&dl_indication, NULL, NULL, proc, UE, phy_data);
@@ -656,7 +657,8 @@ static int UE_dl_preprocessing(PHY_VARS_NR_UE *UE,
       const int ack_nack_slot = (proc->nr_slot_rx + phy_data->dlsch[0].dlsch_config.k1_feedback) % UE->frame_parms.slots_per_frame;
       tx_wait_for_dlsch[ack_nack_slot]++;
     }
-  } else {
+  }
+  if (fp->frame_type == FDD || !dl_slot) {
     // good time to print statistics, we don't have to spend time  to decode DCI
     if (proc->frame_rx % 128 == 0) {
       if (*stats_printed == false) {
diff --git a/openair1/PHY/CODING/3gpplte_sse.c b/openair1/PHY/CODING/3gpplte_sse.c
index c161682f7d43e6915f6ebecb0b1e20ddf6b859c3..79fd825a4b443ff31e2ded502fd412a39a1b8e87 100644
--- a/openair1/PHY/CODING/3gpplte_sse.c
+++ b/openair1/PHY/CODING/3gpplte_sse.c
@@ -349,8 +349,6 @@ void threegpplte_turbo_encoder_sse(unsigned char *input,
 #ifdef DEBUG_TURBO_ENCODER
   printf("term: x0 %u, x1 %u, state1 %d\n",x[10],x[11],state1);
 #endif // DEBUG_TURBO_ENCODER
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void init_encoder_sse (void) {
diff --git a/openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c b/openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
index 77708f4489a7fc284c023f697760f3178ed13cee..57c53df1b1341c6cbd2f5d03761e629d8fd60640 100644
--- a/openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
+++ b/openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
@@ -1258,8 +1258,6 @@ unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y,
 
   //  fprintf(fdavx2,"crc %x, oldcrc %x\n",crc,oldcrc);
 
-  simde_mm_empty();
-  simde_m_empty();
 
 #ifdef DEBUG_LOGMAP
   fclose(fdavx2);
diff --git a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse.c b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse.c
index fa1b55b45eee59d90e1094b58416b092f4fe970d..e6d1f87b4e70774bc7edd5573c8bc5a234e085aa 100644
--- a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse.c
+++ b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse.c
@@ -364,8 +364,6 @@ void compute_gamma(llr_t *m11,llr_t *m10,llr_t *systematic,channel_t *y_parity,
      simde_mm_extract_epi8(m11_128[k],15));
   */
 #endif
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 #define L 40
@@ -1183,8 +1181,6 @@ void compute_alpha(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho
       break;
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -1747,8 +1743,6 @@ void compute_beta(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned shor
       break;
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void compute_ext(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,llr_t *ext, llr_t *systematic,unsigned short frame_length) {
@@ -1910,8 +1904,6 @@ void compute_ext(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,llr_t *ext, ll
   }
 
 #endif
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
diff --git a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
index c7f2c7663a3aff742658eae8c365cdd7af1d39b7..cae2093e9919081a425292599b5950739af7faab 100644
--- a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
+++ b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
@@ -1391,8 +1391,6 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
   fclose(fdsse4);
 #endif
 #if defined(__x86_64__) || defined(__i386__)
-  simde_mm_empty();
-  simde_m_empty();
 #endif
   if (iteration_cnt > max_iterations)
     set_abort(ab, true);
diff --git a/openair1/PHY/CODING/TESTBENCH/pdcch_test.c b/openair1/PHY/CODING/TESTBENCH/pdcch_test.c
index f4574bb0c523391ec3c960a2cbf34f6e0a423341..873e9e19bcfc8f93aa28269a6d1b7ef73bd191a3 100644
--- a/openair1/PHY/CODING/TESTBENCH/pdcch_test.c
+++ b/openair1/PHY/CODING/TESTBENCH/pdcch_test.c
@@ -246,7 +246,6 @@ int main(int argc, char *argv[])
   set_taus_seed(0);
 
   ccodelte_init();
-  ccodelte_init_inv();
 
   phy_generate_viterbi_tables_lte();
   lte_frame_parms = &(PHY_config->lte_frame_parms);
diff --git a/openair1/PHY/CODING/ccoding_byte_lte.c b/openair1/PHY/CODING/ccoding_byte_lte.c
index becdc590740a1e24721b22f0c67357afb560ee3d..4121cf2b3659c647eab82c7dc50662316084a489 100644
--- a/openair1/PHY/CODING/ccoding_byte_lte.c
+++ b/openair1/PHY/CODING/ccoding_byte_lte.c
@@ -32,9 +32,7 @@ static const unsigned short glte[] = {0133, 0171, 0165}; // {A,B}
 static const unsigned short glte_rev[] = {0155, 0117, 0127}; // {A,B}
 static const unsigned short gdab[] = {0133, 0171, 0145}; // {A,B}
 static const unsigned short gdab_rev[] = {0155, 0117, 0123}; // {A,B}
-unsigned char  ccodelte_table[128];      // for transmitter
-unsigned char  ccodelte_table_rev[128];  // for receiver
-
+static unsigned char ccodelte_table[128]; // for transmitter
 
 /*************************************************************************
 
@@ -45,8 +43,6 @@ unsigned char  ccodelte_table_rev[128];  // for receiver
   Trellis tail-biting is included here
 *************************************************************************/
 
-
-
 void
 ccodelte_encode (int32_t numbits,
                  uint8_t add_crc,
@@ -238,7 +234,8 @@ void ccodelte_init(void) {
 }
 
 /* Input in LSB, followed by state in 6 MSBs */
-void ccodelte_init_inv(void) {
+void ccodelte_init_inv(unsigned char ccodelte_table_rev[128])
+{
   unsigned int  i, j, k, sum;
 
   for (i = 0; i < 128; i++) {
@@ -279,7 +276,8 @@ void ccodedab_init(void) {
 }
 
 /* Input in LSB, followed by state in 6 MSBs */
-void ccodedab_init_inv(void) {
+void ccodedab_init_inv(unsigned char ccodelte_table_rev[128])
+{
   unsigned int  i, j, k, sum;
 
   for (i = 0; i < 128; i++) {
@@ -299,7 +297,6 @@ void ccodedab_init_inv(void) {
   }
 }
 
-
 /*****************************************************************/
 /**
   Test program
diff --git a/openair1/PHY/CODING/coding_defs.h b/openair1/PHY/CODING/coding_defs.h
index 7da151fea939f93a239ba590374ff9911b6845e1..3c91892d8c883023230a6f6f3cdd945a45a991ae 100644
--- a/openair1/PHY/CODING/coding_defs.h
+++ b/openair1/PHY/CODING/coding_defs.h
@@ -304,17 +304,17 @@ void ccodelte_encode(int32_t numbits, uint8_t add_crc, uint8_t *inPtr, uint8_t *
 \brief This function initializes the generator polynomials for an LTE convolutional code.*/
 void ccodelte_init(void);
 
-/*!\fn void ccodelte_init_inv(void)
+/*!\fn void ccodelte_init_inv(unsigned char  ccodelte_table_rev[128])
 \brief This function initializes the trellis structure for decoding an LTE convolutional code.*/
-void ccodelte_init_inv(void);
+void ccodelte_init_inv(unsigned char ccodelte_table_rev[128]);
 
 /*!\fn void ccodelte_init(void)
 \brief This function initializes the generator polynomials for an DAB convolutional code (first 3 bits).*/
 void ccodedab_init(void);
 
-/*!\fn void ccodelte_init_inv(void)
+/*!\fn void ccodelte_init_inv(unsigned char  ccodelte_table_rev[128])
 \brief This function initializes the trellis structure for decoding an DAB convolutional code (first 3 bits).*/
-void ccodedab_init_inv(void);
+void ccodedab_init_inv(unsigned char ccodelte_table_rev[128]);
 
 /*!\fn void crcTableInit(void)
 \brief This function initializes the different crc tables.*/
diff --git a/openair1/PHY/CODING/nrLDPC_coding/nrLDPC_coding_segment/nrLDPC_coding_segment_decoder.c b/openair1/PHY/CODING/nrLDPC_coding/nrLDPC_coding_segment/nrLDPC_coding_segment_decoder.c
index 616715269a4bc0567f8fdf4334707b87d798d78a..3d228da0089c441cae930af0a3c6d793856e056b 100644
--- a/openair1/PHY/CODING/nrLDPC_coding/nrLDPC_coding_segment/nrLDPC_coding_segment_decoder.c
+++ b/openair1/PHY/CODING/nrLDPC_coding/nrLDPC_coding_segment/nrLDPC_coding_segment_decoder.c
@@ -250,7 +250,7 @@ int nrLDPC_prepare_TB_decoding(nrLDPC_slot_decoding_parameters_t *nrLDPC_slot_de
   for (int r = 0; r < nrLDPC_TB_decoding_parameters->C; r++) {
     nrLDPC_decoding_parameters_t *rdata = &((nrLDPC_decoding_parameters_t *)t_info->buf)[t_info->len];
     DevAssert(t_info->len < t_info->cap);
-    rdata->ans = &t_info->ans[t_info->len];
+    rdata->ans = t_info->ans;
     t_info->len += 1;
 
     decParams.R = nrLDPC_TB_decoding_parameters->segments[r].R;
@@ -309,19 +309,16 @@ int32_t nrLDPC_coding_decoder(nrLDPC_slot_decoding_parameters_t *nrLDPC_slot_dec
     nbSegments += nrLDPC_TB_decoding_parameters->C;
   }
   nrLDPC_decoding_parameters_t arr[nbSegments];
-  task_ans_t ans[nbSegments];
-  memset(ans, 0, nbSegments * sizeof(task_ans_t));
-  thread_info_tm_t t_info = {.buf = (uint8_t *)arr, .len = 0, .cap = nbSegments, .ans = ans};
+  task_ans_t ans;
+  init_task_ans(&ans, nbSegments);
+  thread_info_tm_t t_info = {.buf = (uint8_t *)arr, .len = 0, .cap = nbSegments, .ans = &ans};
 
-  int nbDecode = 0;
   for (int pusch_id = 0; pusch_id < nrLDPC_slot_decoding_parameters->nb_TBs; pusch_id++) {
-    nbDecode += nrLDPC_prepare_TB_decoding(nrLDPC_slot_decoding_parameters, pusch_id, &t_info);
+    (void)nrLDPC_prepare_TB_decoding(nrLDPC_slot_decoding_parameters, pusch_id, &t_info);
   }
 
-  DevAssert(nbDecode == t_info.len);
-
-  // Execute thread poool tasks
-  join_task_ans(t_info.ans, t_info.len);
+  // Execute thread pool tasks
+  join_task_ans(t_info.ans);
 
   for (int pusch_id = 0; pusch_id < nrLDPC_slot_decoding_parameters->nb_TBs; pusch_id++) {
     nrLDPC_TB_decoding_parameters_t *nrLDPC_TB_decoding_parameters = &nrLDPC_slot_decoding_parameters->TBs[pusch_id];
diff --git a/openair1/PHY/CODING/nrLDPC_coding/nrLDPC_coding_segment/nrLDPC_coding_segment_encoder.c b/openair1/PHY/CODING/nrLDPC_coding/nrLDPC_coding_segment/nrLDPC_coding_segment_encoder.c
index 4d740de2cddc8f7e1a3079a2d1ee7c0d87fd4305..4a4e6beaafe07672368c86ada0199edacd64a667 100644
--- a/openair1/PHY/CODING/nrLDPC_coding/nrLDPC_coding_segment/nrLDPC_coding_segment_encoder.c
+++ b/openair1/PHY/CODING/nrLDPC_coding/nrLDPC_coding_segment/nrLDPC_coding_segment_encoder.c
@@ -189,7 +189,7 @@ static int nrLDPC_prepare_TB_encoding(nrLDPC_slot_encoding_parameters_t *nrLDPC_
   for (int j = 0; j < n_seg; j++) {
     ldpc8blocks_args_t *perJobImpp = &((ldpc8blocks_args_t *)t_info->buf)[t_info->len];
     DevAssert(t_info->len < t_info->cap);
-    impp.ans = &t_info->ans[t_info->len];
+    impp.ans = t_info->ans;
     t_info->len += 1;
 
     impp.macro_num = j;
@@ -211,19 +211,19 @@ int nrLDPC_coding_encoder(nrLDPC_slot_encoding_parameters_t *nrLDPC_slot_encodin
     nbTasks += n_seg;
   }
   ldpc8blocks_args_t arr[nbTasks];
-  task_ans_t ans[nbTasks];
-  memset(ans, 0, nbTasks * sizeof(task_ans_t));
-  thread_info_tm_t t_info = {.buf = (uint8_t *)arr, .len = 0, .cap = nbTasks, .ans = ans};
+  task_ans_t ans;
+  init_task_ans(&ans, nbTasks);
+  thread_info_tm_t t_info = {.buf = (uint8_t *)arr, .len = 0, .cap = nbTasks, .ans = &ans};
 
   int nbEncode = 0;
   for (int dlsch_id = 0; dlsch_id < nrLDPC_slot_encoding_parameters->nb_TBs; dlsch_id++) {
     nbEncode += nrLDPC_prepare_TB_encoding(nrLDPC_slot_encoding_parameters, dlsch_id, &t_info);
   }
-
-  DevAssert(nbEncode == t_info.len);
-
-  // Execute thread poool tasks
-  join_task_ans(ans, nbEncode);
+  if (nbEncode < nbTasks) {
+    completed_many_task_ans(&ans, nbTasks - nbEncode);
+  }
+  // Execute thread pool tasks
+  join_task_ans(&ans);
 
   return 0;
 }
diff --git a/openair1/PHY/CODING/nrLDPC_coding/nrLDPC_coding_xdma/nrLDPC_coding_xdma.c b/openair1/PHY/CODING/nrLDPC_coding/nrLDPC_coding_xdma/nrLDPC_coding_xdma.c
index 793825f14f3e946972cdde130006833941099f3c..5df0bd4406f82b65edc2c91f8f2850c9065bf95c 100644
--- a/openair1/PHY/CODING/nrLDPC_coding/nrLDPC_coding_xdma/nrLDPC_coding_xdma.c
+++ b/openair1/PHY/CODING/nrLDPC_coding/nrLDPC_coding_xdma/nrLDPC_coding_xdma.c
@@ -278,7 +278,7 @@ int decoder_xdma(nrLDPC_TB_decoding_parameters_t *TB_params, int frame_rx, int s
   DevAssert(num_threads_prepare == t_info.len);
 
   // wait for the prepare jobs to complete
-  join_task_ans(t_info.ans, t_info.len);
+  join_task_ans(t_info.ans);
 
   for (uint32_t job = 0; job < num_threads_prepare; job++) {
     args_fpga_decode_prepare_t *args = &arr[job];
diff --git a/openair1/PHY/CODING/viterbi.c b/openair1/PHY/CODING/viterbi.c
index 20242c89148459d8b23352a203ca18dc8dcfd184..4f8df56fc26611a465fda4b0374e5597959edaaa 100644
--- a/openair1/PHY/CODING/viterbi.c
+++ b/openair1/PHY/CODING/viterbi.c
@@ -314,7 +314,6 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short
     }
   }
 
-  simde_mm_empty();
 }
 
 #ifdef TEST_DEBUG
diff --git a/openair1/PHY/CODING/viterbi_lte.c b/openair1/PHY/CODING/viterbi_lte.c
index 4efab84f497037bd999447b3756a4bbfd5b7993c..b074e1c91448605e7e8d1c44bba912749610e3a9 100644
--- a/openair1/PHY/CODING/viterbi_lte.c
+++ b/openair1/PHY/CODING/viterbi_lte.c
@@ -40,8 +40,7 @@
 
 
 #include "PHY/sse_intrin.h"
-
-extern uint8_t ccodelte_table[128],ccodelte_table_rev[128];
+#include "coding_defs.h"
 
 static int8_t m0_table[64*16*16*16] __attribute__ ((aligned(16)));
 static int8_t m1_table[64*16*16*16] __attribute__ ((aligned(16)));
@@ -50,6 +49,8 @@ static int8_t m1_table[64*16*16*16] __attribute__ ((aligned(16)));
 // Set up Viterbi tables for SSE2 implementation
 void phy_generate_viterbi_tables_lte( void )
 {
+  uint8_t ccodelte_table_rev[128];
+  ccodelte_init_inv(ccodelte_table_rev);
 
   int8_t w[8],in0,in1,in2;
   uint8_t state,index0,index1;
@@ -126,52 +127,29 @@ void print_shorts(simde__m128i x,char *s) {
 void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n)
 {
   simde__m128i TB[4 * 8192];
-  simde__m128i *m0_ptr, *m1_ptr, *TB_ptr = &TB[0];
-
-  simde__m128i metrics0_15, metrics16_31, metrics32_47, metrics48_63, even0_30a, even0_30b, even32_62a, even32_62b, odd1_31a,
-      odd1_31b, odd33_63a, odd33_63b, TBeven0_30, TBeven32_62, TBodd1_31, TBodd33_63;
-
-  simde__m128i min_state, min_state2;
-
-  int8_t *in = y;
-  uint8_t prev_state0,maxm,s;
-  static uint8_t *TB_ptr2;
-  uint32_t table_offset;
-  uint8_t iter;
-  int16_t position;
-
-  // set initial metrics
-  //debug_msg("Doing viterbi\n");
-
-  metrics0_15 = simde_mm_setzero_si128();
-  metrics16_31 = simde_mm_setzero_si128();
-  metrics32_47 = simde_mm_setzero_si128();
-  metrics48_63 = simde_mm_setzero_si128();
-
-  for (iter=0; iter<2; iter++) {
-    in = y;
-    TB_ptr=&TB[0];
-
-    for (position=0; position<n; position++) {
-
+  simde__m128i metrics0_15 = {0}, metrics16_31 = {0}, metrics32_47 = {0}, metrics48_63 = {0};
+  for (int iter = 0; iter < 2; iter++) {
+    int8_t *in = y;
+    simde__m128i *TB_ptr = TB;
 
+    for (int position = 0; position < n; position++) {
       // get branch metric offsets for the 64 states
-      table_offset = (in[0]+8 + ((in[1]+8)<<4) + ((in[2]+8)<<8))<<6;
+      uint table_offset = (in[0] + 8 + ((in[1] + 8) << 4) + ((in[2] + 8) << 8)) << 6;
 
-      m0_ptr = (simde__m128i *)&m0_table[table_offset];
-      m1_ptr = (simde__m128i *)&m1_table[table_offset];
+      simde__m128i *m0_ptr = (simde__m128i *)&m0_table[table_offset];
+      simde__m128i *m1_ptr = (simde__m128i *)&m1_table[table_offset];
 
       // even states
-      even0_30a = simde_mm_adds_epu8(metrics0_15, m0_ptr[0]);
-      even32_62a = simde_mm_adds_epu8(metrics16_31, m0_ptr[1]);
-      even0_30b = simde_mm_adds_epu8(metrics32_47, m0_ptr[2]);
-      even32_62b = simde_mm_adds_epu8(metrics48_63, m0_ptr[3]);
+      simde__m128i even0_30a = simde_mm_adds_epu8(metrics0_15, m0_ptr[0]);
+      simde__m128i even32_62a = simde_mm_adds_epu8(metrics16_31, m0_ptr[1]);
+      simde__m128i even0_30b = simde_mm_adds_epu8(metrics32_47, m0_ptr[2]);
+      simde__m128i even32_62b = simde_mm_adds_epu8(metrics48_63, m0_ptr[3]);
 
       // odd states
-      odd1_31a = simde_mm_adds_epu8(metrics0_15, m1_ptr[0]);
-      odd33_63a = simde_mm_adds_epu8(metrics16_31, m1_ptr[1]);
-      odd1_31b = simde_mm_adds_epu8(metrics32_47, m1_ptr[2]);
-      odd33_63b = simde_mm_adds_epu8(metrics48_63, m1_ptr[3]);
+      simde__m128i odd1_31a = simde_mm_adds_epu8(metrics0_15, m1_ptr[0]);
+      simde__m128i odd33_63a = simde_mm_adds_epu8(metrics16_31, m1_ptr[1]);
+      simde__m128i odd1_31b = simde_mm_adds_epu8(metrics32_47, m1_ptr[2]);
+      simde__m128i odd33_63b = simde_mm_adds_epu8(metrics48_63, m1_ptr[3]);
 
       // select maxima
 
@@ -182,33 +160,31 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n)
 
       // Traceback information
 
-      TBeven0_30 = simde_mm_cmpeq_epi8(even0_30a, even0_30b);
-      TBeven32_62 = simde_mm_cmpeq_epi8(even32_62a, even32_62b);
-      TBodd1_31 = simde_mm_cmpeq_epi8(odd1_31a, odd1_31b);
-      TBodd33_63 = simde_mm_cmpeq_epi8(odd33_63a, odd33_63b);
+      simde__m128i TBeven0_30 = simde_mm_cmpeq_epi8(even0_30a, even0_30b);
+      simde__m128i TBeven32_62 = simde_mm_cmpeq_epi8(even32_62a, even32_62b);
+      simde__m128i TBodd1_31 = simde_mm_cmpeq_epi8(odd1_31a, odd1_31b);
+      simde__m128i TBodd33_63 = simde_mm_cmpeq_epi8(odd33_63a, odd33_63b);
 
       metrics0_15 = simde_mm_unpacklo_epi8(even0_30a, odd1_31a);
       metrics16_31 = simde_mm_unpackhi_epi8(even0_30a, odd1_31a);
       metrics32_47 = simde_mm_unpacklo_epi8(even32_62a, odd33_63a);
       metrics48_63 = simde_mm_unpackhi_epi8(even32_62a, odd33_63a);
 
-      TB_ptr[0] = simde_mm_unpacklo_epi8(TBeven0_30, TBodd1_31);
-      TB_ptr[1] = simde_mm_unpackhi_epi8(TBeven0_30, TBodd1_31);
-      TB_ptr[2] = simde_mm_unpacklo_epi8(TBeven32_62, TBodd33_63);
-      TB_ptr[3] = simde_mm_unpackhi_epi8(TBeven32_62, TBodd33_63);
-
-      in+=3;
-      TB_ptr += 4;
+      *TB_ptr++ = simde_mm_unpacklo_epi8(TBeven0_30, TBodd1_31);
+      *TB_ptr++ = simde_mm_unpackhi_epi8(TBeven0_30, TBodd1_31);
+      *TB_ptr++ = simde_mm_unpacklo_epi8(TBeven32_62, TBodd33_63);
+      *TB_ptr++ = simde_mm_unpackhi_epi8(TBeven32_62, TBodd33_63);
 
+      in += 3;
       // rescale by subtracting minimum
       /****************************************************
       USE SSSE instruction phminpos!!!!!!!
       ****************************************************/
-      min_state = simde_mm_min_epu8(metrics0_15, metrics16_31);
+      simde__m128i min_state = simde_mm_min_epu8(metrics0_15, metrics16_31);
       min_state = simde_mm_min_epu8(min_state, metrics32_47);
       min_state = simde_mm_min_epu8(min_state, metrics48_63);
 
-      min_state2 = min_state;
+      simde__m128i min_state2 = min_state;
       min_state = simde_mm_unpacklo_epi8(min_state, min_state);
       min_state2 = simde_mm_unpackhi_epi8(min_state2, min_state2);
       min_state = simde_mm_min_epu8(min_state, min_state2);
@@ -237,37 +213,34 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n)
   } // iteration
 
   // Traceback
-  prev_state0 = 0;
-  maxm = 0;
-
-  for (s=0; s<16; s++)
-    if (((uint8_t *)&metrics0_15)[s] > maxm) {
-      maxm = ((uint8_t *)&metrics0_15)[s];
+  uint prev_state0 = 0;
+  uint maxm = 0;
+  uint s = 0;
+  for (uint8_t *ptr = (uint8_t *)&metrics0_15; s < 16; s++, ptr++)
+    if (*ptr > maxm) {
+      maxm = *ptr;
       prev_state0 = s;
     }
 
-  for (s=0; s<16; s++)
-    if (((uint8_t *)&metrics16_31)[s] > maxm) {
-      maxm = ((uint8_t *)&metrics16_31)[s];
-      prev_state0 = s+16;
+  for (uint8_t *ptr = (uint8_t *)&metrics16_31; s < 32; s++, ptr++)
+    if (*ptr > maxm) {
+      maxm = *ptr;
+      prev_state0 = s;
     }
-
-  for (s=0; s<16; s++)
-    if (((uint8_t *)&metrics32_47)[s] > maxm) {
-      maxm = ((uint8_t *)&metrics32_47)[s];
-      prev_state0 = s+32;
+  for (uint8_t *ptr = (uint8_t *)&metrics32_47; s < 48; s++, ptr++)
+    if (*ptr > maxm) {
+      maxm = *ptr;
+      prev_state0 = s;
     }
-
-  for (s=0; s<16; s++)
-    if (((uint8_t *)&metrics48_63)[s] > maxm) {
-      maxm = ((uint8_t *)&metrics48_63)[s];
-      prev_state0 = s+48;
+  for (uint8_t *ptr = (uint8_t *)&metrics48_63; s < 64; s++, ptr++)
+    if (*ptr > maxm) {
+      maxm = *ptr;
+      prev_state0 = s;
     }
 
-  TB_ptr2 = (uint8_t *)&TB[(n-1)*4];
-
-  for (position = n-1 ; position>-1; position--) {
+  uint8_t *TB_ptr2 = (uint8_t *)&TB[(n - 1) * 4];
 
+  for (int position = n - 1; position > -1; position--) {
     decoded_bytes[(position)>>3] += (prev_state0 & 0x1)<<(7-(position & 0x7));
 
 
@@ -278,9 +251,6 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n)
 
     TB_ptr2-=64;
   }
-
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 #ifdef TEST_DEBUG
@@ -306,10 +276,8 @@ int test_viterbi(uint8_t dabflag)
 
   if (dabflag==0) {
     ccodelte_init();
-    ccodelte_init_inv();
   } else {
     ccodedab_init();
-    ccodedab_init_inv();
     printf("Running with DAB polynomials\n");
   }
 
diff --git a/openair1/PHY/INIT/init_top.c b/openair1/PHY/INIT/init_top.c
index 15b27b4db1dd18afd8bff61e862379b76889c6bb..b4ac0617a6673dd8c79061bbafc9ecbe05d9b32c 100644
--- a/openair1/PHY/INIT/init_top.c
+++ b/openair1/PHY/INIT/init_top.c
@@ -63,7 +63,6 @@ void generate_qpsk_table(void) {
 
 void init_lte_top(LTE_DL_FRAME_PARMS *frame_parms) {
   ccodelte_init();
-  ccodelte_init_inv();
   phy_generate_viterbi_tables_lte();
   load_codinglib();
   generate_ul_ref_sigs();
diff --git a/openair1/PHY/INIT/nr_init.c b/openair1/PHY/INIT/nr_init.c
index ca14ca99bea5b39f67e6729bfcbf743cf063b435..581338fa454f42b3bdadadddd32c25f30cacc5e2 100644
--- a/openair1/PHY/INIT/nr_init.c
+++ b/openair1/PHY/INIT/nr_init.c
@@ -138,6 +138,7 @@ void phy_init_nr_gNB(PHY_VARS_gNB *gNB)
 
   gNB->max_nb_pdsch = MAX_MOBILES_PER_GNB;
   init_delay_table(fp->ofdm_symbol_size, MAX_DELAY_COMP, NR_MAX_OFDM_SYMBOL_SIZE, fp->delay_table);
+  init_delay_table(128, MAX_DELAY_COMP, 128, fp->delay_table128);
 
   gNB->bad_pucch = 0;
   if (gNB->TX_AMP == 0)
diff --git a/openair1/PHY/LTE_ESTIMATION/lte_ue_measurements.c b/openair1/PHY/LTE_ESTIMATION/lte_ue_measurements.c
index c9ec2449e16f964b8e018bf1239c541c50763c6d..6abe02c941513a85ee4956430758fa875c01c5fc 100644
--- a/openair1/PHY/LTE_ESTIMATION/lte_ue_measurements.c
+++ b/openair1/PHY/LTE_ESTIMATION/lte_ue_measurements.c
@@ -493,8 +493,6 @@ void conjch0_mult_ch1(int *ch0,
     dl_ch1_128+=1;
     ch0conj_ch1_128+=1;
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void construct_HhH_elements(int *ch0conj_ch0, //00_00
@@ -567,8 +565,6 @@ void construct_HhH_elements(int *ch0conj_ch0, //00_00
     after_mf_10_128+=1;
     after_mf_11_128+=1;
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -595,8 +591,6 @@ void squared_matrix_element(int32_t *Hh_h_00,
     Hh_h_00_sq_128+=1;
     Hh_h_00_128+=1;
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -645,8 +639,6 @@ void det_HhH(int32_t *after_mf_00,
     //after_mf_10_128+=1;
     after_mf_11_128+=1;
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void numer(int32_t *Hh_h_00_sq,
@@ -691,8 +683,6 @@ void numer(int32_t *Hh_h_00_sq,
     h_h_10_sq_128+=1;
     h_h_11_sq_128+=1;
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void dlsch_channel_level_TM34_meas(int *ch00,
@@ -768,8 +758,6 @@ void dlsch_channel_level_TM34_meas(int *ch00,
   avg_0[0] = min (avg_0[0], avg_1[0]);
   avg_1[0] = avg_0[0];
 
-  simde_mm_empty();
-  simde_m_empty();
 
 }
 
@@ -1339,8 +1327,6 @@ void lte_ue_measurements(PHY_VARS_UE *ue,
     // printf("in lte_ue_measurements: selected rx_antenna[eNB_id==0]:%u\n", ue->measurements.selected_rx_antennas[eNB_id][i]);
   }  // eNB_id loop
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_coding.c b/openair1/PHY/LTE_TRANSPORT/dlsch_coding.c
index 52905cfaf9e39084126df17eed41fea2ad4d671d..8e98677872d61278604118ad63d0493da6adec64 100644
--- a/openair1/PHY/LTE_TRANSPORT/dlsch_coding.c
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_coding.c
@@ -348,12 +348,12 @@ int dlsch_encoding(PHY_VARS_eNB *eNB,
   }
 
   turboEncode_t arr[hadlsch->C];
-  task_ans_t ans[hadlsch->C];
-  memset(ans, 0, hadlsch->C * sizeof(task_ans_t));
+  task_ans_t ans;
+  init_task_ans(&ans, hadlsch->C);
 
   for (int r = 0, r_offset = 0; r < hadlsch->C; r++) {
     turboEncode_t *rdata = &arr[r];
-    rdata->ans = &ans[r];
+    rdata->ans = &ans;
 
     rdata->input=hadlsch->c[r];
     rdata->Kr_bytes= ( r<hadlsch->Cminus ? hadlsch->Kminus : hadlsch->Kplus) >>3;
@@ -382,7 +382,7 @@ int dlsch_encoding(PHY_VARS_eNB *eNB,
       r_offset += Nl*Qm * ((GpmodC==0?0:1) + (Gp/C));
   }
 
-  join_task_ans(ans, hadlsch->C);
+  join_task_ans(&ans);
 
   VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_ENB_DLSCH_ENCODING, VCD_FUNCTION_OUT);
   return(0);
@@ -449,12 +449,12 @@ int dlsch_encoding_fembms_pmch(PHY_VARS_eNB *eNB,
       return(-1);
   }
   turboEncode_t arr[hadlsch->C];
-  task_ans_t ans[hadlsch->C];
-  memset(ans, 0, hadlsch->C * sizeof(task_ans_t));
+  task_ans_t ans;
+  init_task_ans(&ans, hadlsch->C);
 
   for (int r = 0, r_offset = 0; r < hadlsch->C; r++) {
     turboEncode_t *rdata = &arr[r];
-    rdata->ans = &ans[r];
+    rdata->ans = &ans;
 
     rdata->input=hadlsch->c[r];
     rdata->Kr_bytes= ( r<hadlsch->Cminus ? hadlsch->Kminus : hadlsch->Kplus) >>3;
@@ -483,7 +483,7 @@ int dlsch_encoding_fembms_pmch(PHY_VARS_eNB *eNB,
       r_offset += Nl*Qm * ((GpmodC==0?0:1) + (Gp/C));
   }
 
-  join_task_ans(ans, hadlsch->C);
+  join_task_ans(&ans);
 
   return(0);
 }
diff --git a/openair1/PHY/LTE_TRANSPORT/ulsch_decoding.c b/openair1/PHY/LTE_TRANSPORT/ulsch_decoding.c
index 9755455762dfaef79356e210af0ff8e75a60c55f..6b499f14db7ba69d26590e965ab52cd24f4a43ee 100644
--- a/openair1/PHY/LTE_TRANSPORT/ulsch_decoding.c
+++ b/openair1/PHY/LTE_TRANSPORT/ulsch_decoding.c
@@ -341,7 +341,7 @@ static int ulsch_decoding_data(PHY_VARS_eNB *eNB,
 
     turboDecode_t *rdata = &((turboDecode_t *)t_info->buf)[t_info->len];
     DevAssert(t_info->len < t_info->cap);
-    rdata->ans = &t_info->ans[t_info->len];
+    rdata->ans = t_info->ans;
     t_info->len += 1;
 
     rdata->eNB=eNB;
diff --git a/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c b/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c
index 0370285e0754172295cee582a290b1dd63a48db2..25068537054b769c1dbd0ba25e870fb8944606cf 100644
--- a/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c
+++ b/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c
@@ -216,8 +216,6 @@ int32_t ulsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms,
     (*llrp128)++;
   }
 
-  simde_mm_empty();
-  simde_m_empty();
   return(0);
 }
 
@@ -245,8 +243,6 @@ void ulsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
     //    print_bytes("rxF[i+1]",&rxF[i+1]);
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void ulsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
@@ -289,8 +285,6 @@ void ulsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
     (*llrp32)+=12;
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void ulsch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms,
@@ -347,8 +341,6 @@ void ulsch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms,
       }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void ulsch_extract_rbs_single(int32_t **rxdataF,
@@ -524,8 +516,6 @@ void ulsch_channel_compensation(int32_t **rxdataF_ext,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void ulsch_channel_level(int32_t **drs_ch_estimates_ext,
@@ -556,8 +546,6 @@ void ulsch_channel_level(int32_t **drs_ch_estimates_ext,
                        ((float *)&avg128U)[3])/(float)(nb_rb*12));
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int ulsch_power_LUT[750];
diff --git a/openair1/PHY/LTE_UE_TRANSPORT/dci_ue.c b/openair1/PHY/LTE_UE_TRANSPORT/dci_ue.c
index 381dfc0cdff6b1897865baa3d16c47e00dffb986..f3d14b80eacc9f2f23881c3e8078614c640385fb 100644
--- a/openair1/PHY/LTE_UE_TRANSPORT/dci_ue.c
+++ b/openair1/PHY/LTE_UE_TRANSPORT/dci_ue.c
@@ -384,8 +384,6 @@ void pdcch_channel_level(int32_t **dl_ch_estimates_ext,
       //      printf("Channel level : %d\n",avg[(aatx<<1)+aarx]);
     }
 
-  simde_mm_empty();
-  simde_m_empty();
 
 }
 
@@ -433,8 +431,6 @@ void pdcch_detection_mrc_i(LTE_DL_FRAME_PARMS *frame_parms,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -1078,8 +1074,6 @@ void pdcch_channel_compensation(int32_t **rxdataF_ext,
 
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void pdcch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms,
@@ -1103,8 +1097,6 @@ void pdcch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 
 }
 
diff --git a/openair1/PHY/LTE_UE_TRANSPORT/dlsch_demodulation.c b/openair1/PHY/LTE_UE_TRANSPORT/dlsch_demodulation.c
index b9ba9347cf07686b988d5be972a8f4fc45ca1588..0d13a1c38d6b881ef7eb738ba8d91a8c9d881308 100644
--- a/openair1/PHY/LTE_UE_TRANSPORT/dlsch_demodulation.c
+++ b/openair1/PHY/LTE_UE_TRANSPORT/dlsch_demodulation.c
@@ -1481,8 +1481,6 @@ void dlsch_channel_compensation(int **rxdataF_ext,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void dlsch_channel_compensation_core(int **rxdataF_ext,
@@ -1666,8 +1664,6 @@ void dlsch_channel_compensation_core(int **rxdataF_ext,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -1706,8 +1702,6 @@ void prec2A_TM56_128(unsigned char pmi,simde__m128i *ch0,simde__m128i *ch1) {
 
   ch0[0] = simde_mm_mulhi_epi16(ch0[0],amp);
   ch0[0] = simde_mm_slli_epi16(ch0[0],1);
-  simde_mm_empty();
-  simde_m_empty();
 }
 // precoding is stream 0 .5(1,1)  .5(1,-1) .5(1,1)  .5(1,-1)
 //              stream 1 .5(1,-1) .5(1,1)  .5(1,-1) .5(1,1)
@@ -1740,8 +1734,6 @@ void prec2A_TM3_128(simde__m128i *ch0,simde__m128i *ch1) {
   //ch1[0] = simde_mm_srai_epi16(ch1[0],1);
   //  print_shorts("prec2A_TM3 ch0 (after):",ch0);
   //  print_shorts("prec2A_TM3 ch1 (after):",ch1);
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 // pmi = 0 => stream 0 (1,1), stream 1 (1,-1)
@@ -1782,8 +1774,6 @@ void prec2A_TM4_128(int pmi,simde__m128i *ch0,simde__m128i *ch1) {
   // ch1[0] = simde_mm_srai_epi16(ch1[0],1); //divide by 2
   //print_shorts("prec2A_TM4 ch0 (end):",ch0);
   //print_shorts("prec2A_TM4 ch1 (end):",ch1);
-  simde_mm_empty();
-  simde_m_empty();
   // print_shorts("prec2A_TM4 ch0 (end):",ch0);
   //print_shorts("prec2A_TM4 ch1 (end):",ch1);
 }
@@ -2359,8 +2349,6 @@ void dlsch_channel_compensation_TM34(LTE_DL_FRAME_PARMS *frame_parms,
   measurements->precoded_cqi_dB[eNB_id][1] = dB_fixed2(precoded_signal_strength1,measurements->n0_power_tot);
   // printf("eNB_id %d, symbol %d: precoded CQI %d dB\n",eNB_id,symbol,
   //  measurements->precoded_cqi_dB[eNB_id][0]);
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -2453,8 +2441,6 @@ void dlsch_dual_stream_correlation(LTE_DL_FRAME_PARMS *frame_parms,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -2526,8 +2512,6 @@ void dlsch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void dlsch_detection_mrc_TM34(LTE_DL_FRAME_PARMS *frame_parms,
@@ -2651,8 +2635,6 @@ void dlsch_detection_mrc_TM34(LTE_DL_FRAME_PARMS *frame_parms,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void dlsch_scale_channel(int **dl_ch_estimates_ext,
@@ -2765,8 +2747,6 @@ void dlsch_channel_level(int **dl_ch_estimates_ext,
           ((int32_t *)&avg128D)[3])/y;
     }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void dlsch_channel_level_core(int **dl_ch_estimates_ext,
@@ -2809,8 +2789,6 @@ void dlsch_channel_level_core(int **dl_ch_estimates_ext,
       //printf("Channel level [%d]: %d\n",aatx*n_rx + aarx, avg[aatx*n_rx + aarx]);
     }
 
-  simde_mm_empty();
-  simde_m_empty();
   /* FIXME This part needs to be adapted like the one above */
 }
 
@@ -2857,8 +2835,6 @@ void dlsch_channel_level_median(int **dl_ch_estimates_ext,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void mmse_processing_oai(LTE_UE_PDSCH *pdsch_vars,
@@ -3189,8 +3165,6 @@ void dlsch_channel_aver_band(int **dl_ch_estimates_ext,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void rxdataF_to_float(int32_t **rxdataF_ext,
@@ -3558,8 +3532,6 @@ void dlsch_channel_level_TM34(int **dl_ch_estimates_ext,
   // printf("From Chan_level aver stream 1 final =%d\n", avg_1[0]);
   avg_0[0] = min (avg_0[0], avg_1[0]);
   avg_1[0] = avg_0[0];
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 //compute average channel_level of effective (precoded) channel
@@ -3624,8 +3596,6 @@ void dlsch_channel_level_TM56(int **dl_ch_estimates_ext,
 
   // choose maximum of the 2 effective channels
   avg[0] = cmax(avg[0],avg[1]);
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 //compute average channel_level for TM7
@@ -3684,8 +3654,6 @@ void dlsch_channel_level_TM7(int **dl_bf_ch_estimates_ext,
       //            printf("Channel level : %d\n",avg[(aatx<<1)+aarx]);
     }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 //#define ONE_OVER_2_Q15 16384
 void dlsch_alamouti(LTE_DL_FRAME_PARMS *frame_parms,
@@ -3753,8 +3721,6 @@ void dlsch_alamouti(LTE_DL_FRAME_PARMS *frame_parms,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 //==============================================================================================
@@ -5587,8 +5553,6 @@ unsigned short dlsch_extract_rbs_TM7(int **rxdataF,
     }
   }
   
-  simde_mm_empty();
-  simde_m_empty();
   return(nb_rb/frame_parms->nb_antennas_rx);
 }
 
diff --git a/openair1/PHY/LTE_UE_TRANSPORT/dlsch_llr_computation_avx2.c b/openair1/PHY/LTE_UE_TRANSPORT/dlsch_llr_computation_avx2.c
index 9995f9b8d9d50fbb720b2a95eb4286ae7fbd92f2..72422d4dcf2a45653a4937f4d462270753da7ff9 100644
--- a/openair1/PHY/LTE_UE_TRANSPORT/dlsch_llr_computation_avx2.c
+++ b/openair1/PHY/LTE_UE_TRANSPORT/dlsch_llr_computation_avx2.c
@@ -1668,8 +1668,6 @@ void qam64_qam16_avx2(short *stream0_in,
 
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 
 }
 
@@ -3499,8 +3497,6 @@ void qam64_qam64_avx2(int32_t *stream0_in,
 
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/openair1/PHY/LTE_UE_TRANSPORT/pbch_ue.c b/openair1/PHY/LTE_UE_TRANSPORT/pbch_ue.c
index cf62a3a945136fc17ff97dee85defa7b388c7359..9eefe212c70cf38d5def31d1f7e2e6dde8346f0b 100644
--- a/openair1/PHY/LTE_UE_TRANSPORT/pbch_ue.c
+++ b/openair1/PHY/LTE_UE_TRANSPORT/pbch_ue.c
@@ -185,8 +185,6 @@ int pbch_channel_level(int **dl_ch_estimates_ext,
       //msg("Channel level : %d, %d\n",avg1, avg2);
     }
 
-  simde_mm_empty();
-  simde_m_empty();
   return(avg2);
 }
 
@@ -277,8 +275,6 @@ void pbch_channel_compensation(int **rxdataF_ext,
       }
     }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void pbch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms,
@@ -302,8 +298,6 @@ void pbch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void pbch_unscrambling(LTE_DL_FRAME_PARMS *frame_parms,
diff --git a/openair1/PHY/LTE_UE_TRANSPORT/pmch_ue.c b/openair1/PHY/LTE_UE_TRANSPORT/pmch_ue.c
index a3b33cf54b87adf7128130c4a880130e1c9863c0..04546b2a193b76574c2e4f01d6cba3ad6b71333e 100644
--- a/openair1/PHY/LTE_UE_TRANSPORT/pmch_ue.c
+++ b/openair1/PHY/LTE_UE_TRANSPORT/pmch_ue.c
@@ -244,8 +244,6 @@ void mch_channel_level(int **dl_ch_estimates_ext,
     //            printf("Channel level : %d\n",avg[(aatx<<1)+aarx]);
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void mch_channel_level_khz_1dot25(int **dl_ch_estimates_ext,
@@ -285,8 +283,6 @@ void mch_channel_level_khz_1dot25(int **dl_ch_estimates_ext,
                 //printf("Channel level : %d\n",avg[(aatx<<1)+aarx]);
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -393,8 +389,6 @@ void mch_channel_compensation(int **rxdataF_ext,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -501,8 +495,6 @@ void mch_channel_compensation_khz_1dot25(int **rxdataF_ext,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -533,8 +525,6 @@ void mch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -564,8 +554,6 @@ void mch_detection_mrc_khz_1dot25(LTE_DL_FRAME_PARMS *frame_parms,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -603,8 +591,6 @@ int mch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms,
   }
 
   *llr32p = (short *)llr32;
-  simde_mm_empty();
-  simde_m_empty();
   return(0);
 }
 
@@ -634,8 +620,6 @@ int mch_qpsk_llr_khz_1dot25(LTE_DL_FRAME_PARMS *frame_parms,
   }
 
   *llr32p = (short *)llr32;
-  simde_mm_empty();
-  simde_m_empty();
   return(0);
 }
 
@@ -700,8 +684,6 @@ void mch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
     llr32 += 8;
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void mch_16qam_llr_khz_1dot25(LTE_DL_FRAME_PARMS *frame_parms,
@@ -750,8 +732,6 @@ void mch_16qam_llr_khz_1dot25(LTE_DL_FRAME_PARMS *frame_parms,
     llr32 += 8;
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 //----------------------------------------------------------------------------------------------
@@ -843,8 +823,6 @@ void mch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
   }
 
   *llr_save = llr;
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void mch_64qam_llr_khz_1dot25(LTE_DL_FRAME_PARMS *frame_parms,
@@ -924,8 +902,6 @@ void mch_64qam_llr_khz_1dot25(LTE_DL_FRAME_PARMS *frame_parms,
   }
 
   *llr_save = llr;
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 int avg_pmch[4];
diff --git a/openair1/PHY/LTE_UE_TRANSPORT/rar_tools_ue.c b/openair1/PHY/LTE_UE_TRANSPORT/rar_tools_ue.c
index 44855e551e80b22b18ea73f944051190d3ad6ad4..ad513a424e124b84214d3fbfff86f717f3756bd7 100644
--- a/openair1/PHY/LTE_UE_TRANSPORT/rar_tools_ue.c
+++ b/openair1/PHY/LTE_UE_TRANSPORT/rar_tools_ue.c
@@ -39,7 +39,7 @@
 #include "PHY/LTE_TRANSPORT/transport_vars.h"
 
 #include "assertions.h"
-static int8_t delta_PUSCH_msg2[8] = {-6, -4, -2, 0, 2, 4, 6, 8};
+static const int8_t delta_PUSCH_msg2[8] = {-6, -4, -2, 0, 2, 4, 6, 8};
 
 int generate_ue_ulsch_params_from_rar(PHY_VARS_UE *ue,
 				      UE_rxtx_proc_t *proc,
diff --git a/openair1/PHY/LTE_UE_TRANSPORT/sss_ue.c b/openair1/PHY/LTE_UE_TRANSPORT/sss_ue.c
index 79b20dc10c8f26165e6b1f06fe49156c29436d69..6d155f896f9a1af2b2b05fde9fd6c52ed62d1a7e 100644
--- a/openair1/PHY/LTE_UE_TRANSPORT/sss_ue.c
+++ b/openair1/PHY/LTE_UE_TRANSPORT/sss_ue.c
@@ -225,26 +225,9 @@ int pss_sss_extract(PHY_VARS_UE *phy_vars_ue,
   return _do_pss_sss_extract(phy_vars_ue, pss_ext, sss_ext, 1 /* doPss */, 1 /* doSss */, subframe);
 }
 
-int pss_only_extract(PHY_VARS_UE *phy_vars_ue,
-                    int32_t pss_ext[4][72],
-                    uint8_t subframe)
-{
-  static int32_t dummy[4][72];
-  return _do_pss_sss_extract(phy_vars_ue, pss_ext, dummy, 1 /* doPss */, 0 /* doSss */, subframe);
-}
-
-
-int sss_only_extract(PHY_VARS_UE *phy_vars_ue,
-                    int32_t sss_ext[4][72],
-                    uint8_t subframe)
-{
-  static int32_t dummy[4][72];
-  return _do_pss_sss_extract(phy_vars_ue, dummy, sss_ext, 0 /* doPss */, 1 /* doSss */, subframe);
-}
-
 
-int16_t phase_re[7] = {16383, 25101, 30791, 32767, 30791, 25101, 16383};
-int16_t phase_im[7] = {-28378, -21063, -11208, 0, 11207, 21062, 28377};
+static const int16_t phase_re[7] = {16383, 25101, 30791, 32767, 30791, 25101, 16383};
+static const int16_t phase_im[7] = {-28378, -21063, -11208, 0, 11207, 21062, 28377};
 
 
 int rx_sss(PHY_VARS_UE *ue,int32_t *tot_metric,uint8_t *flip_max,uint8_t *phase_max)
diff --git a/openair1/PHY/LTE_UE_TRANSPORT/transport_proto_ue.h b/openair1/PHY/LTE_UE_TRANSPORT/transport_proto_ue.h
index 18eb375f62b753f7347cceb103c8583689cacf8b..9aab56e52a54d5bdc998545cdd777125951b9e8a 100644
--- a/openair1/PHY/LTE_UE_TRANSPORT/transport_proto_ue.h
+++ b/openair1/PHY/LTE_UE_TRANSPORT/transport_proto_ue.h
@@ -1082,16 +1082,6 @@ int pss_sss_extract(PHY_VARS_UE *phy_vars_ue,
                     int32_t sss_ext[4][72],
                     uint8_t subframe);
 
-/*! \brief Extract only PSS resource elements
-  @param phy_vars_ue Pointer to UE variables
-  @param[out] pss_ext contain the PSS signals after the extraction
-@param subframe
-  @returns 0 on success
-*/
-int pss_only_extract(PHY_VARS_UE *phy_vars_ue,
-                     int32_t pss_ext[4][72],
-                     uint8_t subframe);
-
 /*! \brief Extract only SSS resource elements
   @param phy_vars_ue Pointer to UE variables
   @param[out] sss_ext contain the SSS signals after the extraction
diff --git a/openair1/PHY/NR_ESTIMATION/nr_ul_channel_estimation.c b/openair1/PHY/NR_ESTIMATION/nr_ul_channel_estimation.c
index cae3a301607baa766c0f80f41aad78ad806e07cb..6fdf6cd99538400f331aee2b49b41e0ed8514330 100644
--- a/openair1/PHY/NR_ESTIMATION/nr_ul_channel_estimation.c
+++ b/openair1/PHY/NR_ESTIMATION/nr_ul_channel_estimation.c
@@ -506,8 +506,8 @@ int nr_pusch_channel_estimation(PHY_VARS_gNB *gNB,
   int num_jobs = CEILIDIV(gNB->frame_parms.nb_antennas_rx, numAntennas);
   puschAntennaProc_t rdatas[num_jobs];
   memset(rdatas, 0, sizeof(rdatas));
-  task_ans_t ans[num_jobs];
-  memset(ans, 0, sizeof(ans));
+  task_ans_t ans;
+  init_task_ans(&ans, num_jobs);
   for (int job_id = 0; job_id < num_jobs; job_id++) {
     puschAntennaProc_t *rdata = &rdatas[job_id];
     task_t task = {.func = nr_pusch_antenna_processing, .args = rdata};
@@ -531,7 +531,7 @@ int nr_pusch_channel_estimation(PHY_VARS_gNB *gNB,
     rdata->pusch_vars = &gNB->pusch_vars[ul_id];
     rdata->chest_freq = gNB->chest_freq;
     rdata->rxdataF = gNB->common_vars.rxdataF;
-    rdata->ans = &ans[job_id];
+    rdata->ans = &ans;
     // Call the nr_pusch_antenna_processing function
     if (job_id == num_jobs - 1) {
       // Run the last job inline
@@ -539,10 +539,9 @@ int nr_pusch_channel_estimation(PHY_VARS_gNB *gNB,
     } else {
       pushTpool(&gNB->threadPool, task);
     }
-    LOG_D(PHY, "Added Antenna (count %d/%d) to process, in pipe\n", job_id, num_jobs);
   } // Antenna Loop
 
-  join_task_ans(ans, num_jobs - 1);
+  join_task_ans(&ans);
 
   stop_meas(&gNB->pusch_channel_estimation_antenna_processing_stats);
   for (int aarx = 0; aarx < gNB->frame_parms.nb_antennas_rx; aarx++) {
diff --git a/openair1/PHY/NR_TRANSPORT/nr_ulsch_demodulation.c b/openair1/PHY/NR_TRANSPORT/nr_ulsch_demodulation.c
index 8c8b8212a2ee68e84e458b0ae231d0984ecf1a37..396c8d89f4f3c9101dc5f645b7c6602ee80b52c6 100644
--- a/openair1/PHY/NR_TRANSPORT/nr_ulsch_demodulation.c
+++ b/openair1/PHY/NR_TRANSPORT/nr_ulsch_demodulation.c
@@ -223,8 +223,6 @@ static void nr_ulsch_channel_level(int size_est,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static void nr_ulsch_channel_compensation(c16_t *rxFext,
@@ -334,8 +332,6 @@ static void nr_ulsch_channel_compensation(c16_t *rxFext,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 // Zero Forcing Rx function: nr_det_HhH()
@@ -397,8 +393,6 @@ static void nr_ulsch_det_HhH (int32_t *after_mf_00,//a
     after_mf_10_128+=1;
     after_mf_11_128+=1;
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 /* Zero Forcing Rx function: nr_conjch0_mult_ch1()
@@ -444,8 +438,6 @@ static void nr_ulsch_conjch0_mult_ch1(int *ch0,
     dl_ch1_128+=1;
     ch0conj_ch1_128+=1;
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static simde__m128i nr_ulsch_comp_muli_sum(simde__m128i input_x,
@@ -507,8 +499,6 @@ static simde__m128i nr_ulsch_comp_muli_sum(simde__m128i input_x,
   //print_ints("unpack hi:",&tmp_z1[0]);
   output = simde_mm_packs_epi32(tmp_z0,tmp_z1);
 
-  simde_mm_empty();
-  simde_m_empty();
   return(output);
 }
 
@@ -625,8 +615,6 @@ static void nr_ulsch_construct_HhH_elements(int *conjch00_ch00,
     after_mf_10_128 += 1;
     after_mf_11_128 += 1;
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 // MMSE Rx function: nr_ulsch_mmse_2layers()
@@ -1017,8 +1005,6 @@ static uint8_t nr_ulsch_mmse_2layers(NR_DL_FRAME_PARMS *frame_parms,
     after_mf_c_128 += 1;
     after_mf_d_128 += 1;
   }
-  simde_mm_empty();
-  simde_m_empty();
    return(0);
 }
 
@@ -1260,7 +1246,6 @@ int nr_rx_pusch_tp(PHY_VARS_gNB *gNB,
   nfapi_nr_pusch_pdu_t *rel15_ul = &gNB->ulsch[ulsch_id].harq_process->ulsch_pdu;
 
   NR_gNB_PUSCH *pusch_vars = &gNB->pusch_vars[ulsch_id];
-  int nbSymb = 0;
   uint32_t bwp_start_subcarrier = ((rel15_ul->rb_start + rel15_ul->bwp_start) * NR_NB_SC_PER_RB + frame_parms->first_carrier_offset) % frame_parms->ofdm_symbol_size;
   LOG_D(PHY,"pusch %d.%d : bwp_start_subcarrier %d, rb_start %d, first_carrier_offset %d\n", frame,slot,bwp_start_subcarrier, rel15_ul->rb_start, frame_parms->first_carrier_offset);
   LOG_D(PHY,"pusch %d.%d : ul_dmrs_symb_pos %x\n",frame,slot,rel15_ul->ul_dmrs_symb_pos);
@@ -1483,9 +1468,9 @@ int nr_rx_pusch_tp(PHY_VARS_gNB *gNB,
   int total_res = 0;
   int const loop_iter = CEILIDIV(rel15_ul->nr_of_symbols, numSymbols);
   puschSymbolProc_t arr[loop_iter];
-  task_ans_t arr_ans[loop_iter];
+  task_ans_t ans;
+  init_task_ans(&ans, loop_iter);
 
-  memset(arr_ans, 0, sizeof(arr_ans));
   int sz_arr = 0;
   for(uint8_t task_index = 0; task_index < loop_iter; task_index++) {
     int symbol = task_index * numSymbols + rel15_ul->start_symbol_index;
@@ -1500,7 +1485,7 @@ int nr_rx_pusch_tp(PHY_VARS_gNB *gNB,
     total_res += res_per_task;
     if (res_per_task > 0) {
       puschSymbolProc_t *rdata = &arr[sz_arr];
-      rdata->ans = &arr_ans[sz_arr];
+      rdata->ans = &ans;
       ++sz_arr;
 
       rdata->gNB = gNB;
@@ -1522,16 +1507,15 @@ int nr_rx_pusch_tp(PHY_VARS_gNB *gNB,
       } else {
         task_t t = {.func = &nr_pusch_symbol_processing, .args = rdata};
         pushTpool(&gNB->threadPool, t);
-        nbSymb++;
       }
 
-      LOG_D(PHY, "%d.%d Added symbol %d (count %d) to process, in pipe\n", frame, slot, symbol, nbSymb);
+      LOG_D(PHY, "%d.%d Added symbol %d to process, in pipe\n", frame, slot, symbol);
+    } else {
+      completed_task_ans(&ans);
     }
   } // symbol loop
 
-  if (nbSymb > 0) {
-    join_task_ans(arr_ans, sz_arr);
-  }
+  join_task_ans(&ans);
   stop_meas(&gNB->rx_pusch_symbol_processing_stats);
 
   // Copy the data to the scope. This cannot be performed in one call to gNBscopeCopy because the data is not contiguous in the
diff --git a/openair1/PHY/NR_TRANSPORT/nr_ulsch_llr_computation.c b/openair1/PHY/NR_TRANSPORT/nr_ulsch_llr_computation.c
index 6c5c37385bd85de75f5de4ded8ec0d47b6884433..286386297fa9794f881a9613c02c2dd7220c899d 100644
--- a/openair1/PHY/NR_TRANSPORT/nr_ulsch_llr_computation.c
+++ b/openair1/PHY/NR_TRANSPORT/nr_ulsch_llr_computation.c
@@ -388,7 +388,6 @@ void nr_ulsch_qpsk_qpsk(c16_t *stream0_in, c16_t *stream1_in, c16_t *stream0_out
     }
   }
 #endif
-  simde_mm_empty();
 }
 
 
@@ -1056,7 +1055,6 @@ void nr_ulsch_qam16_qam16(c16_t *stream0_in,
     stream0_128i_out[3] = simde_mm_unpackhi_epi32(xmm1_128, xmm3_128); // 8 LLRs, 2 REs
   }
 #endif
-  simde_mm_empty();
 }
 
 /*
@@ -1777,7 +1775,6 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in,
     }
   }
 #endif
-  simde_mm_empty();
 }
 
 static void nr_ulsch_shift_llr(int16_t **llr_layers, uint32_t nb_re, uint32_t rxdataF_ext_offset, uint8_t mod_order, int shift)
diff --git a/openair1/PHY/NR_TRANSPORT/pucch_rx.c b/openair1/PHY/NR_TRANSPORT/pucch_rx.c
index e545766206255e4d40b52d2ee177fcf30f196600..6f2ccc8bc190db0e6d6c0678bf02a4843bf3dd61 100644
--- a/openair1/PHY/NR_TRANSPORT/pucch_rx.c
+++ b/openair1/PHY/NR_TRANSPORT/pucch_rx.c
@@ -53,6 +53,7 @@
 #include "SCHED_NR/sched_nr.h"
 
 #include "T.h"
+#include "nr_phy_common.h"
 
 //#define DEBUG_NR_PUCCH_RX 1
 
@@ -280,6 +281,7 @@ void nr_decode_pucch0(PHY_VARS_gNB *gNB,
     }
   }
   signal_energy /= (pucch_pdu->nr_of_symbols * frame_parms->nb_antennas_rx);
+  signal_energy_ant0 /= pucch_pdu->nr_of_symbols;
   int pucch_power_dBtimes10 = 10 * dB_fixed(signal_energy);
 
   //int32_t no_corr = 0;
@@ -403,7 +405,7 @@ void nr_decode_pucch0(PHY_VARS_gNB *gNB,
   uci_pdu->rnti = pucch_pdu->rnti;
   uci_pdu->ul_cqi = cqi;
   uci_pdu->timing_advance = 0xffff; // currently not valid
-  uci_pdu->rssi = 1280 - (10 * dB_fixed(32767 * 32767)) - dB_fixed_times10(signal_energy_ant0);
+  uci_pdu->rssi = 1280 - (10 * dB_fixed(32767 * 32767) - dB_fixed_times10(signal_energy_ant0));
 
   if (pucch_pdu->bit_len_harq==0) {
     uci_pdu->sr.sr_confidence_level = SNRtimes10 < gNB->pucch0_thres;
@@ -693,10 +695,6 @@ void nr_decode_pucch1(c16_t **rxdataF,
           table_6_3_2_4_1_1_N_SF_mprime_PUCCH_1_noHop[nrofSymbols - 1]; // only if intra-slot hopping not enabled (PUCCH)
       int N_SF_mprime_PUCCH_DMRS_1 =
           table_6_4_1_3_1_1_1_N_SF_mprime_PUCCH_1_noHop[nrofSymbols - 1]; // only if intra-slot hopping not enabled (DM-RS)
-#ifdef DEBUG_NR_PUCCH_RX
-      printf("\t [nr_generate_pucch1] w_index = %d, N_SF_mprime_PUCCH_1 = %d, N_SF_mprime_PUCCH_DMRS_1 = %d, N_SF_mprime0_PUCCH_1 = %d, N_SF_mprime0_PUCCH_DMRS_1 = %d\n",
-             w_index, N_SF_mprime_PUCCH_1,N_SF_mprime_PUCCH_DMRS_1,N_SF_mprime0_PUCCH_1,N_SF_mprime0_PUCCH_DMRS_1);
-#endif
 
       if(l%2==1){
         for (int m=0; m < N_SF_mprime_PUCCH_1; m++) {
@@ -860,164 +858,59 @@ void nr_decode_pucch1(c16_t **rxdataF,
   }
 }
 
-static simde__m256i pucch2_3bit[8 * 2];
-static simde__m256i pucch2_4bit[16 * 2];
-static simde__m256i pucch2_5bit[32 * 2];
-static simde__m256i pucch2_6bit[64 * 2];
-static simde__m256i pucch2_7bit[128 * 2];
-static simde__m256i pucch2_8bit[256 * 2];
-static simde__m256i pucch2_9bit[512 * 2];
-static simde__m256i pucch2_10bit[1024 * 2];
-static simde__m256i pucch2_11bit[2048 * 2];
-
-static simde__m256i *pucch2_lut[9] =
+typedef struct {c16_t cw[16];} cw_t;
+static  cw_t pucch2_3bit[8] __attribute__((aligned(32)));
+static cw_t pucch2_4bit[16] __attribute__((aligned(32)));
+static cw_t pucch2_5bit[32] __attribute__((aligned(32)));
+static cw_t  pucch2_6bit[64] __attribute__((aligned(32)));
+static cw_t pucch2_7bit[128] __attribute__((aligned(32)));
+static cw_t pucch2_8bit[256] __attribute__((aligned(32)));
+static cw_t pucch2_9bit[512] __attribute__((aligned(32)));
+static cw_t pucch2_10bit[1024] __attribute__((aligned(32)));
+static cw_t pucch2_11bit[2048] __attribute__((aligned(32)));
+
+static cw_t* pucch2_lut[9] =
     {pucch2_3bit, pucch2_4bit, pucch2_5bit, pucch2_6bit, pucch2_7bit, pucch2_8bit, pucch2_9bit, pucch2_10bit, pucch2_11bit};
 
-static simde__m64 pucch2_polar_4bit[16];
-static simde__m128i pucch2_polar_llr_num_lut[256], pucch2_polar_llr_den_lut[256];
+typedef struct {
+  int16_t cw[4];
+} cw4bit_t;
+static cw4bit_t pucch2_polar_4bit[16] __attribute__((aligned(32)));
+static simde__m128i pucch2_polar_llr_num_lut[256];
 
-void init_pucch2_luts() {
-
-  uint32_t out;
-  int8_t bit; 
-  
+void init_pucch2_luts()
+{
   for (int b=3;b<12;b++) {
-    for (int i = 0; i < (1 << b); i++) {
-      out = encodeSmallBlock(i, b);
-#ifdef DEBUG_NR_PUCCH_RX
-      if (b==3) printf("in %d, out %x\n",i,out);
-#endif
-      simde__m256i *lut_i=&pucch2_lut[b-3][i<<1];
-      simde__m256i *lut_ip1=&pucch2_lut[b-3][1+(i<<1)];
-      bit = (out&0x1) > 0 ? -1 : 1;
-      *lut_i = simde_mm256_insert_epi16(*lut_i,bit,0);
-      bit = (out&0x2) > 0 ? -1 : 1;
-      *lut_ip1 = simde_mm256_insert_epi16(*lut_ip1,bit,0);
-      bit = (out&0x4) > 0 ? -1 : 1;
-      *lut_i = simde_mm256_insert_epi16(*lut_i,bit,1);
-      bit = (out&0x8) > 0 ? -1 : 1;
-      *lut_ip1 = simde_mm256_insert_epi16(*lut_ip1,bit,1);
-      bit = (out&0x10) > 0 ? -1 : 1;
-      *lut_i = simde_mm256_insert_epi16(*lut_i,bit,2);
-      bit = (out&0x20) > 0 ? -1 : 1;
-      *lut_ip1 = simde_mm256_insert_epi16(*lut_ip1,bit,2);
-      bit = (out&0x40) > 0 ? -1 : 1;
-      *lut_i = simde_mm256_insert_epi16(*lut_i,bit,3);
-      bit = (out&0x80) > 0 ? -1 : 1;
-      *lut_ip1 = simde_mm256_insert_epi16(*lut_ip1,bit,3);
-      bit = (out&0x100) > 0 ? -1 : 1;
-      *lut_i = simde_mm256_insert_epi16(*lut_i,bit,4);
-      bit = (out&0x200) > 0 ? -1 : 1;
-      *lut_ip1 = simde_mm256_insert_epi16(*lut_ip1,bit,4);
-      bit = (out&0x400) > 0 ? -1 : 1;
-      *lut_i = simde_mm256_insert_epi16(*lut_i,bit,5);
-      bit = (out&0x800) > 0 ? -1 : 1;
-      *lut_ip1 = simde_mm256_insert_epi16(*lut_ip1,bit,5);
-      bit = (out&0x1000) > 0 ? -1 : 1;
-      *lut_i = simde_mm256_insert_epi16(*lut_i,bit,6);
-      bit = (out&0x2000) > 0 ? -1 : 1;
-      *lut_ip1 = simde_mm256_insert_epi16(*lut_ip1,bit,6);
-      bit = (out&0x4000) > 0 ? -1 : 1;
-      *lut_i = simde_mm256_insert_epi16(*lut_i,bit,7);
-      bit = (out&0x8000) > 0 ? -1 : 1;
-      *lut_ip1 = simde_mm256_insert_epi16(*lut_ip1,bit,7);
-      bit = (out&0x10000) > 0 ? -1 : 1;
-      *lut_i = simde_mm256_insert_epi16(*lut_i,bit,8);
-      bit = (out&0x20000) > 0 ? -1 : 1;
-      *lut_ip1 = simde_mm256_insert_epi16(*lut_ip1,bit,8);
-      bit = (out&0x40000) > 0 ? -1 : 1;
-      *lut_i = simde_mm256_insert_epi16(*lut_i,bit,9);
-      bit = (out&0x80000) > 0 ? -1 : 1;
-      *lut_ip1 = simde_mm256_insert_epi16(*lut_ip1,bit,9);
-      bit = (out&0x100000) > 0 ? -1 : 1;
-      *lut_i = simde_mm256_insert_epi16(*lut_i,bit,10);
-      bit = (out&0x200000) > 0 ? -1 : 1;
-      *lut_ip1 = simde_mm256_insert_epi16(*lut_ip1,bit,10);
-      bit = (out&0x400000) > 0 ? -1 : 1;
-      *lut_i = simde_mm256_insert_epi16(*lut_i,bit,11);
-      bit = (out&0x800000) > 0 ? -1 : 1;
-      *lut_ip1 = simde_mm256_insert_epi16(*lut_ip1,bit,11);
-      bit = (out&0x1000000) > 0 ? -1 : 1;
-      *lut_i = simde_mm256_insert_epi16(*lut_i,bit,12);
-      bit = (out&0x2000000) > 0 ? -1 : 1;
-      *lut_ip1 = simde_mm256_insert_epi16(*lut_ip1,bit,12);
-      bit = (out&0x4000000) > 0 ? -1 : 1;
-      *lut_i = simde_mm256_insert_epi16(*lut_i,bit,13);
-      bit = (out&0x8000000) > 0 ? -1 : 1;
-      *lut_ip1 = simde_mm256_insert_epi16(*lut_ip1,bit,13);
-      bit = (out&0x10000000) > 0 ? -1 : 1;
-      *lut_i = simde_mm256_insert_epi16(*lut_i,bit,14);
-      bit = (out&0x20000000) > 0 ? -1 : 1;
-      *lut_ip1 = simde_mm256_insert_epi16(*lut_ip1,bit,14);
-      bit = (out&0x40000000) > 0 ? -1 : 1;
-      *lut_i = simde_mm256_insert_epi16(*lut_i,bit,15);
-      bit = (out&0x80000000) > 0 ? -1 : 1;
-      *lut_ip1 = simde_mm256_insert_epi16(*lut_ip1,bit,15);
+    for (int cw = 0; cw < (1 << b); cw++) {
+      uint32_t out = encodeSmallBlock(cw, b);
+      uint16_t *tmp = (uint16_t *)pucch2_lut[b - 3][cw].cw;
+      for (int j = 0; j < 32; j++)
+        *tmp++ = (out & (1U<<j)) > 0 ? -1 : 1;
     }
   }
   for (int i = 0; i < 16; i++) {
-    simde__m64 *lut_i=&pucch2_polar_4bit[i];
-
-    bit = (i&0x1) > 0 ? -1 : 1;
-    *lut_i = simde_mm_insert_pi16(*lut_i,bit,0);
-    bit = (i&0x2) > 0 ? -1 : 1;
-    *lut_i = simde_mm_insert_pi16(*lut_i,bit,1);
-    bit = (i&0x4) > 0 ? -1 : 1;
-    *lut_i = simde_mm_insert_pi16(*lut_i,bit,2);
-    bit = (i&0x8) > 0 ? -1 : 1;
-    *lut_i = simde_mm_insert_pi16(*lut_i,bit,3);
+    int16_t *lut_i = pucch2_polar_4bit[i].cw;
+    *lut_i++ = (i & 0x1) <= 0;
+    *lut_i++ = (i & 0x2) <= 0;
+    *lut_i++ = (i & 0x4) <= 0;
+    *lut_i++ = (i & 0x8) <= 0;
   }
-  for (int i=0;i<256;i++) {
-    simde__m128i *lut_num_i=&pucch2_polar_llr_num_lut[i];
-    simde__m128i *lut_den_i=&pucch2_polar_llr_den_lut[i];
-    bit = (i&0x1) > 0 ? 0 : 1;
-    *lut_num_i = simde_mm_insert_epi16(*lut_num_i, bit, 0);
-    *lut_den_i = simde_mm_insert_epi16(*lut_den_i, 1 - bit, 0);
-
-    bit = (i&0x10) > 0 ? 0 : 1;
-    *lut_num_i = simde_mm_insert_epi16(*lut_num_i, bit, 1);
-    *lut_den_i = simde_mm_insert_epi16(*lut_den_i, 1 - bit, 1);
-
-    bit = (i&0x2) > 0 ? 0 : 1;
-    *lut_num_i = simde_mm_insert_epi16(*lut_num_i, bit, 2);
-    *lut_den_i = simde_mm_insert_epi16(*lut_den_i, 1 - bit, 2);
-
-    bit = (i&0x20) > 0 ? 0 : 1;
-    *lut_num_i = simde_mm_insert_epi16(*lut_num_i, bit, 3);
-    *lut_den_i = simde_mm_insert_epi16(*lut_den_i, 1 - bit, 3);
-
-    bit = (i&0x4) > 0 ? 0 : 1;
-    *lut_num_i = simde_mm_insert_epi16(*lut_num_i, bit, 4);
-    *lut_den_i = simde_mm_insert_epi16(*lut_den_i, 1 - bit, 4);
-
-    bit = (i&0x40) > 0 ? 0 : 1;
-    *lut_num_i = simde_mm_insert_epi16(*lut_num_i, bit, 5);
-    *lut_den_i = simde_mm_insert_epi16(*lut_den_i, 1 - bit, 5);
-
-    bit = (i&0x8) > 0 ? 0 : 1;
-    *lut_num_i = simde_mm_insert_epi16(*lut_num_i, bit, 6);
-    *lut_den_i = simde_mm_insert_epi16(*lut_den_i, 1 - bit, 6);
-
-    bit = (i&0x80) > 0 ? 0 : 1;
-    *lut_num_i = simde_mm_insert_epi16(*lut_num_i, bit, 7);
-    *lut_den_i = simde_mm_insert_epi16(*lut_den_i, 1 - bit, 7);
-
+  for (int cw = 0; cw < 256; cw++) {
+    int16_t *lut_num_i = (int16_t *)&pucch2_polar_llr_num_lut[cw];
+    *lut_num_i++ = (cw & 0x1) <= 0;
+    *lut_num_i++ = (cw & 0x10) <= 0;
+    *lut_num_i++ = (cw & 0x2) <= 0;
+    *lut_num_i++ = (cw & 0x20) <= 0;
+    *lut_num_i++ = (cw & 0x4) <= 0;
+    *lut_num_i++ = (cw & 0x40) <= 0;
+    *lut_num_i++ = (cw & 0x8) <= 0;
+    *lut_num_i++ = (cw & 0x80) <= 0;
 #ifdef DEBUG_NR_PUCCH_RX
-    printf("i %d, lut_num (%d,%d,%d,%d,%d,%d,%d,%d)\n",
-           i,
-           ((int16_t *)lut_num_i)[0],
-           ((int16_t *)lut_num_i)[1],
-           ((int16_t *)lut_num_i)[2],
-           ((int16_t *)lut_num_i)[3],
-           ((int16_t *)lut_num_i)[4],
-           ((int16_t *)lut_num_i)[5],
-           ((int16_t *)lut_num_i)[6],
-           ((int16_t *)lut_num_i)[7]);
+    log_dump(PHY, pucch2_polar_llr_num_lut, 8, LOG_DUMP_C16, "lut_num %d:", i);
 #endif
   }
 }
 
-
 void nr_decode_pucch2(PHY_VARS_gNB *gNB,
                       c16_t **rxdataF,
                       int frame,
@@ -1026,11 +919,14 @@ void nr_decode_pucch2(PHY_VARS_gNB *gNB,
                       nfapi_nr_pucch_pdu_t* pucch_pdu)
 {
   NR_DL_FRAME_PARMS *frame_parms = &gNB->frame_parms;
+  const simde__m256i conj256 = simde_mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+
   //pucch_GroupHopping_t pucch_GroupHopping = pucch_pdu->group_hop_flag + (pucch_pdu->sequence_hop_flag<<1);
+  const int nb_symbols=pucch_pdu->nr_of_symbols;
 
-  AssertFatal(pucch_pdu->nr_of_symbols == 1 || pucch_pdu->nr_of_symbols == 2,
+  AssertFatal(nb_symbols == 1 || nb_symbols == 2,
               "Illegal number of symbols  for PUCCH 2 %d\n",
-              pucch_pdu->nr_of_symbols);
+              nb_symbols);
 
   AssertFatal((pucch_pdu->prb_start-((pucch_pdu->prb_start>>2)<<2))==0,
               "Current pucch2 receiver implementation requires a PRB offset multiple of 4. The one selected is %d",
@@ -1040,35 +936,30 @@ void nr_decode_pucch2(PHY_VARS_gNB *gNB,
 
   int l2 = pucch_pdu->start_symbol_index;
   int soffset = (slot % RU_RX_SLOT_DEPTH) * frame_parms->symbols_per_slot * frame_parms->ofdm_symbol_size;
-  int re_offset[2];
+  int re_offset[nb_symbols];
   re_offset[0] =
       (12 * (pucch_pdu->prb_start + pucch_pdu->bwp_start) + frame_parms->first_carrier_offset) % frame_parms->ofdm_symbol_size;
-  if (pucch_pdu->freq_hop_flag == 0)
-    re_offset[1] = re_offset[0];
-  else {
-    re_offset[1] = 12*(pucch_pdu->second_hop_prb+pucch_pdu->bwp_start) + frame_parms->first_carrier_offset;
-    if (re_offset[1]>= frame_parms->ofdm_symbol_size)
-      re_offset[1]-=frame_parms->ofdm_symbol_size;
+  if (nb_symbols==2) {
+    if (pucch_pdu->freq_hop_flag )
+      re_offset[1] = (12*(pucch_pdu->second_hop_prb+pucch_pdu->bwp_start) + frame_parms->first_carrier_offset) % frame_parms->ofdm_symbol_size ;
+    else
+      re_offset[1] = re_offset[0];
   }
-  AssertFatal(pucch_pdu->prb_size * pucch_pdu->nr_of_symbols > 1,
+  AssertFatal(pucch_pdu->prb_size * nb_symbols > 1,
               "number of PRB*SYMB (%d,%d)< 2",
               pucch_pdu->prb_size,
-              pucch_pdu->nr_of_symbols);
+              nb_symbols);
 
   int Prx = gNB->gNB_config.carrier_config.num_rx_ant.value;
   //  AssertFatal((pucch_pdu->prb_size&1) == 0,"prb_size %d is not a multiple of2\n",pucch_pdu->prb_size);
-  int Prx2 = (Prx==1)?2:Prx;
   // use 2 for Nb antennas in case of single antenna to allow the following allocations
-  int nb_re_pucch = 12*pucch_pdu->prb_size;
-  int prb_size_ext = pucch_pdu->prb_size+(pucch_pdu->prb_size&1);
-
-  c16_t rp[Prx2][2][nb_re_pucch];
+  const int nb_re_pucch = 12 * pucch_pdu->prb_size;
+  c16_t rp[Prx][nb_symbols][nb_re_pucch];
   memset(rp, 0, sizeof(rp));
 
   int64_t pucch2_lev = 0;
-
   for (int aa=0;aa<Prx;aa++){
-    for (int symb=0;symb<pucch_pdu->nr_of_symbols;symb++) {
+    for (int symb=0;symb<nb_symbols;symb++) {
       c16_t *tmp_rp = ((c16_t *)&rxdataF[aa][soffset + (l2 + symb) * frame_parms->ofdm_symbol_size]);
 
       if (re_offset[symb] + nb_re_pucch < frame_parms->ofdm_symbol_size) {
@@ -1084,23 +975,14 @@ void nr_decode_pucch2(PHY_VARS_gNB *gNB,
     }
   }
 
-  pucch2_lev /= Prx * pucch_pdu->nr_of_symbols;
+  pucch2_lev /= Prx * nb_symbols;
   int pucch2_levdB = dB_fixed(pucch2_lev);
-  int scaling = 0;
-  if (pucch2_levdB > 72)
-    scaling = 4;
-  else if (pucch2_levdB > 66)
-    scaling = 3;
-  else if (pucch2_levdB > 60)
-    scaling = 2;
-  else if (pucch2_levdB > 54)
-    scaling = 1;
-
+  int scaling = max((log2_approx64(pucch2_lev) >> 1) - 8, 0);
   LOG_D(NR_PHY,
         "%d.%d Decoding pucch2 for %d symbols, %d PRB, nb_harq %d, nb_sr %d, nb_csi %d/%d, pucch2_lev %d dB (scaling %d)\n",
         frame,
         slot,
-        pucch_pdu->nr_of_symbols,
+        nb_symbols,
         pucch_pdu->prb_size,
         pucch_pdu->bit_len_harq,
         pucch_pdu->sr_flag,
@@ -1109,68 +991,87 @@ void nr_decode_pucch2(PHY_VARS_gNB *gNB,
         pucch2_levdB,
         scaling);
 
+  int prb_size_ext = pucch_pdu->prb_size + (pucch_pdu->prb_size & 1);
   int nc_group_size=1; // 2 PRB
   int ngroup = prb_size_ext/nc_group_size/2;
-  int32_t corr32_re[2][ngroup][Prx2],corr32_im[2][ngroup][Prx2];
-  memset(corr32_re, 0, sizeof(corr32_re));
-  memset(corr32_im, 0, sizeof(corr32_im));
-
-  int16_t r_re_ext[Prx2][2][8 * prb_size_ext] __attribute__((aligned(32)));
-  int16_t r_im_ext[Prx2][2][8 * prb_size_ext] __attribute__((aligned(32)));
-  int16_t r_re_ext2[Prx2][2][8 * prb_size_ext] __attribute__((aligned(32)));
-  int16_t r_im_ext2[Prx2][2][8 * prb_size_ext] __attribute__((aligned(32)));
-  int16_t rd_re_ext[Prx2][2][4 * prb_size_ext] __attribute__((aligned(32)));
-  int16_t rd_im_ext[Prx2][2][4 * prb_size_ext] __attribute__((aligned(32)));
-
-  if (pucch_pdu->prb_size != prb_size_ext) {
-    // if the number of PRBs is odd
-    // we fill the unsed part of the arrays
-    for (int aa = 0; aa < Prx; aa++) {
-      for (int symb = 0; symb < pucch_pdu->nr_of_symbols; symb++) {
-        const int sz = pucch_pdu->prb_size;
-        memset(r_re_ext[aa][symb] + 8 * sz, 0, 8 * sizeof(int16_t));
-        memset(r_im_ext[aa][symb] + 8 * sz, 0, 8 * sizeof(int16_t));
-        memset(rd_re_ext[aa][symb] + 4 * sz, 0, 4 * sizeof(int16_t));
-        memset(rd_im_ext[aa][symb] + 4 * sz, 0, 4 * sizeof(int16_t));
-      }
-    }
-  }
+  c32_t corr32[nb_symbols][ngroup][Prx];
+  memset(corr32, 0, sizeof(corr32));
+  const int nb_re_data = 8 * prb_size_ext;
+  const int nb_re_dmrs = 4 * prb_size_ext;
+  c16_t r_ext[Prx][nb_symbols][nb_re_data] __attribute__((aligned(32)));
+  c16_t r_ext2[Prx][nb_symbols][nb_re_data] __attribute__((aligned(32)));
+  const simde__m256i swap = simde_mm256_set_epi8(29,
+                                                 28,
+                                                 31,
+                                                 30,
+                                                 25,
+                                                 24,
+                                                 27,
+                                                 26,
+                                                 21,
+                                                 20,
+                                                 23,
+                                                 22,
+                                                 17,
+                                                 16,
+                                                 19,
+                                                 18,
+                                                 13,
+                                                 12,
+                                                 15,
+                                                 14,
+                                                 9,
+                                                 8,
+                                                 11,
+                                                 10,
+                                                 5,
+                                                 4,
+                                                 7,
+                                                 6,
+                                                 1,
+                                                 0,
+                                                 3,
+                                                 2);
+  // prepare scrambling sequence for data
+  uint32_t x2 = ((pucch_pdu->rnti) << 15) + pucch_pdu->data_scrambling_id;
+#ifdef DEBUG_NR_PUCCH_RX
+  printf("x2 %x\n", x2);
+#endif
+  c16_t scramb_data[nb_re_data] __attribute__((aligned(32)));
 
-  for (int symb=0; symb<pucch_pdu->nr_of_symbols;symb++) {
-    // 24 REs contains 48x16-bit, so 6x8x16-bit
+  uint32_t *sGold = gold_cache(x2, nb_symbols * nb_re_data/2);
+  uint8_t *sGold8 = (uint8_t *)sGold;
+  for (int i = 0; i < nb_re_data; i += 4)
+    *(simde__m128i *)(scramb_data + i) = byte2m128i[*sGold8++];
+  
+
+  for (int symb=0; symb<nb_symbols;symb++) {
+    c16_t rdmrs_ext[Prx][nb_re_dmrs] __attribute__((aligned(32)));
+
+    // extract DMRS
     for (int aa = 0; aa < Prx; aa++) {
+      c16_t *rdmrs_ext_p = rdmrs_ext[aa];
+      c16_t *rp_base = rp[aa][symb];
       for (int prb = 0; prb < pucch_pdu->prb_size; prb++) {
-        int16_t *r_re_ext_p = &r_re_ext[aa][symb][8 * prb];
-        int16_t *r_im_ext_p = &r_im_ext[aa][symb][8 * prb];
-        int16_t *rd_re_ext_p = &rd_re_ext[aa][symb][4 * prb];
-        int16_t *rd_im_ext_p = &rd_im_ext[aa][symb][4 * prb];
-
         for (int idx = 0; idx < 4; idx++) {
-          c16_t *rp_base = rp[aa][symb] + prb * 12 + 3 * idx;
-          AssertFatal(prb * 12 + 3 * idx + 2 < nb_re_pucch, "");
-          r_re_ext_p[idx << 1] = rp_base->r >> scaling;
-          r_im_ext_p[idx << 1] = rp_base->i >> scaling;
           rp_base++;
-          rd_re_ext_p[idx] = rp_base->r >> scaling;
-          rd_im_ext_p[idx] = rp_base->i >> scaling;
+          *rdmrs_ext_p++ = *rp_base++;
           rp_base++;
-          r_re_ext_p[1 + (idx << 1)] = rp_base->r >> scaling;
-          r_im_ext_p[1 + (idx << 1)] = rp_base->i >> scaling;
         }
+      }
+      if (pucch_pdu->prb_size != prb_size_ext)
+        // if the number of PRBs is odd
+        // we fill the unsed part of the arrays
+        memset(rdmrs_ext[aa] + pucch_pdu->prb_size * 4, 0, 4 * sizeof(c16_t));
+    }
 
 #ifdef DEBUG_NR_PUCCH_RX
-        for (int i = 0; i < 8; i++)
-          printf("Ant %d PRB %d dmrs[%d] -> (%d,%d)\n", aa, prb + (i >> 2), i, rd_re_ext_p[i], rd_im_ext_p[i]);
-        for (int i = 0; i < 16; i++)
-          printf("Ant %d PRB %d data[%d] -> (%d,%d)\n", aa, prb + (i >> 3), i, r_re_ext_p[i], r_im_ext_p[i]);
+    for (int aa = 0; aa < Prx; aa++)
+      log_dump(PHY, rdmrs_ext[aa], nb_re_dmrs, LOG_DUMP_C16, "Ant %d dmrs:\n", aa);
 #endif
-      }
-    }
 
     // first compute DMRS component
-
     const int scramble = pucch_pdu->dmrs_scrambling_id * 2;
-    // fixme: when MR2754 will be merged, use the gold sequence cache instead of regenerate each time
     uint32_t x2 =
         ((1ULL << 17) * ((NR_NUMBER_OF_SYMBOLS_PER_SLOT * slot + pucch_pdu->start_symbol_index + symb + 1) * (scramble + 1))
          + scramble)
@@ -1180,453 +1081,210 @@ void nr_decode_pucch2(PHY_VARS_gNB *gNB,
            slot,pucch_pdu->start_symbol_index,symb,pucch_pdu->dmrs_scrambling_id);
 #endif
     uint32_t *sGold = gold_cache(x2, pucch_pdu->prb_start / 4 + ngroup / 2);
-    for (int group = 0, goldIdx = pucch_pdu->prb_start / 4; group < ngroup; group++) {
-      // each group has 8*nc_group_size elements, compute 1 complex correlation with DMRS per group
-      // non-coherent combining across groups
-      uint8_t *sGold8 = (uint8_t *)&sGold[goldIdx];
-      simde__m64 dmrs_re = byte2m64_re[sGold8[(group & 1) << 1]];
-      int16_t *dmrs_re16 = (int16_t *)&dmrs_re;
-      simde__m64 dmrs_im = byte2m64_im[sGold8[(group & 1) << 1]];
-      int16_t *dmrs_im16 = (int16_t *)&dmrs_im;
-#ifdef DEBUG_NR_PUCCH_RX
-      printf("Group %d: x2 %x ((%d,%d),(%d,%d),(%d,%d),(%d,%d))\n",
-             group,
-             x2,
-             dmrs_re16[0],
-             dmrs_im16[0],
-             dmrs_re16[1],
-             dmrs_im16[1],
-             dmrs_re16[2],
-             dmrs_im16[2],
-             dmrs_re16[3],
-             dmrs_im16[3]);
-#endif
-      for (int aa=0;aa<Prx;aa++) {
-        int16_t *rd_re_ext_p = &rd_re_ext[aa][symb][8 * group];
-        int16_t *rd_im_ext_p = &rd_im_ext[aa][symb][8 * group];
+    // Compute pilot conjugate
+    c16_t pil_dmrs[nb_re_dmrs] __attribute__((aligned(32)));
+    uint8_t *sGold8 = (uint8_t *)(sGold + pucch_pdu->prb_start / 4);
+    for (int group = 0; group < nb_re_dmrs; group += 4)
+      *(simde__m128i *)(pil_dmrs + group) = simde_mm_sign_epi16(byte2m128i[*sGold8++], *(simde__m128i *)&conj256);
+
+    // Compute delay
+    c16_t ch_ls[128] __attribute__((aligned(32))) = {0};
+    {
+      c16_t rdmrs_gold[nb_re_dmrs] __attribute__((aligned(32)));
+      for (int aa = 0; aa < Prx; aa++) {
+        mult_complex_vectors(rdmrs_ext[aa], pil_dmrs, rdmrs_gold, nb_re_dmrs, 0);
+        c16_t *ch_ls_ptr = ch_ls;
+        c16_t *end = ch_ls_ptr + 128;
+        for (int i = 0; i < nb_re_dmrs; i++)
+          for (int k = 0; k < 3 && ch_ls_ptr < end; k++)
+            *ch_ls_ptr++ = rdmrs_gold[i];
+      }
+    }
+    c16_t ch_temp[128] __attribute__((aligned(32))) = {0};
+    delay_t delay = {0};
+    nr_est_delay(128, ch_ls, ch_temp, &delay);
+
+    // Apply delay compensation on the input
+    if (delay.est_delay != 0) {
+      int delay_idx = get_delay_idx(delay.est_delay, MAX_DELAY_COMP);
+      c16_t *delay_table = frame_parms->delay_table128[delay_idx];
+      for (int aa = 0; aa < Prx; aa++)
+        mult_complex_vectors(rp[aa][symb], delay_table, rp[aa][symb], nb_re_pucch, 8);
+    }
 
-#ifdef DEBUG_NR_PUCCH_RX
-        printf("Group %d: rd ((%d,%d),(%d,%d),(%d,%d),(%d,%d))\n",
-               group,
-               rd_re_ext_p[0],rd_im_ext_p[0],
-               rd_re_ext_p[1],rd_im_ext_p[1],
-               rd_re_ext_p[2],rd_im_ext_p[2],
-               rd_re_ext_p[3],rd_im_ext_p[3]);
-#endif
-        for (int z = 0; z < 4; z++) {
-          corr32_re[symb][group][aa] += rd_re_ext_p[z] * dmrs_re16[z] + rd_im_ext_p[z] * dmrs_im16[z];
-          corr32_im[symb][group][aa] += -rd_re_ext_p[z] * dmrs_im16[z] + rd_im_ext_p[z] * dmrs_re16[z];
+    // extract again DMRS, and signal, after delay compensation
+    for (int aa = 0; aa < Prx; aa++) {
+      c16_t *r_ext_p = r_ext[aa][symb];
+      c16_t *rdmrs_ext_p = rdmrs_ext[aa];
+      c16_t *rp_base = rp[aa][symb];
+      for (int prb = 0; prb < pucch_pdu->prb_size; prb++) {
+        for (int idx = 0; idx < 4; idx++) {
+          *r_ext_p++ = *rp_base++;
+          *rdmrs_ext_p++ = *rp_base++;
+          *r_ext_p++ = *rp_base++;
         }
       }
-      dmrs_re = byte2m64_re[sGold8[1 + ((group & 1) << 1)]];
-      dmrs_im = byte2m64_im[sGold8[1 + ((group & 1) << 1)]];
+      if (pucch_pdu->prb_size != prb_size_ext) {
+        // if the number of PRBs is odd
+        // we fill the unsed part of the arrays
+        memset(rdmrs_ext[aa] + pucch_pdu->prb_size * 4, 0, 4 * sizeof(c16_t));
+        memset(r_ext[aa][symb] + pucch_pdu->prb_size * 8, 0, 8 * sizeof(c16_t));
+      }
+    }
 #ifdef DEBUG_NR_PUCCH_RX
-      printf("Group %d: s %x ((%d,%d),(%d,%d),(%d,%d),(%d,%d))\n",
-             group,
-             ((uint16_t *)&sGold)[1],
-             dmrs_re16[0],
-             dmrs_im16[0],
-             dmrs_re16[1],
-             dmrs_im16[1],
-             dmrs_re16[2],
-             dmrs_im16[2],
-             dmrs_re16[3],
-             dmrs_im16[3]);
+    for (int aa = 0; aa < Prx; aa++) {
+      log_dump(PHY, rdmrs_ext[aa], nb_re_dmrs, LOG_DUMP_C16, "after delay compensation ant %d dmrs:\n", aa);
+      log_dump(PHY, r_ext[aa], nb_re_data, LOG_DUMP_C16, "after delay compensation ant %d data:\n", aa);
+    }
 #endif
+      c16_t rdmrs_gold[Prx][nb_re_dmrs] __attribute__((aligned(32)));
+      for (int aa = 0; aa < Prx; aa++)
+        mult_complex_vectors(rdmrs_ext[aa], pil_dmrs, rdmrs_gold[aa], nb_re_dmrs, 0);
       for (int aa=0;aa<Prx;aa++) {
-        int16_t *rd_re_ext_p = &rd_re_ext[aa][symb][8 * group];
-        int16_t *rd_im_ext_p = &rd_im_ext[aa][symb][8 * group];
-#ifdef DEBUG_NR_PUCCH_RX
-        printf("Group %d: rd ((%d,%d),(%d,%d),(%d,%d),(%d,%d))\n",
-               group,
-               rd_re_ext_p[4],rd_im_ext_p[4],
-               rd_re_ext_p[5],rd_im_ext_p[5],
-               rd_re_ext_p[6],rd_im_ext_p[6],
-               rd_re_ext_p[7],rd_im_ext_p[7]);
-#endif
-        for (int z = 0; z < 4; z++) {
-          corr32_re[symb][group][aa] += rd_re_ext_p[z + 4] * dmrs_re16[z] + rd_im_ext_p[z + 4] * dmrs_im16[z];
-          corr32_im[symb][group][aa] += -rd_re_ext_p[z + 4] * dmrs_im16[z] + rd_im_ext_p[z + 4] * dmrs_re16[z];
+        c16_t *pil_ptr = pil_dmrs;
+        for (int group = 0; group < ngroup; group++) {
+          // each group has 8*nc_group_size elements, compute 1 complex correlation with DMRS per group
+          // non-coherent combining across groups
+          c16_t *rdmrs_p = &rdmrs_ext[aa][8 * group];
+          for (int z = 0; z < 8; z++) {
+            c16_t tmp = c16mulShift(*rdmrs_p++, *pil_ptr++, scaling);
+            corr32[symb][group][aa].r += tmp.r;
+            corr32[symb][group][aa].i += tmp.i;
+          }
         }
-        /*      corr32_re[group][aa]>>=5;
-                corr32_im[group][aa]>>=5;*/
-#ifdef DEBUG_NR_PUCCH_RX
-        printf("Group %d: corr32 (%d,%d)\n",group,corr32_re[symb][group][aa],corr32_im[symb][group][aa]);
-#endif
-      } //aa    
-
-      if ((group & 1) == 1)
-        goldIdx++;
-    } // group
-  } // symb
-
-  // unscrambling
-  uint32_t x2 = ((pucch_pdu->rnti) << 15) + pucch_pdu->data_scrambling_id;
-#ifdef DEBUG_NR_PUCCH_RX
-  printf("x2 %x\n", x2);
-#endif
-  uint32_t *sGold = gold_cache(x2, pucch_pdu->nr_of_symbols * prb_size_ext / 2);
-  int goldIdx = 0;
-  for (int symb=0;symb<pucch_pdu->nr_of_symbols;symb++) {
-    simde__m64 c_re[4], c_im[4];
-    int re_off=0;
-    for (int prb=0;prb<prb_size_ext;prb+=2,re_off+=16) {
-      uint8_t *sGold8 = (uint8_t *)(sGold + goldIdx);
-      for (int z = 0; z < 4; z++) {
-        c_re[z] = byte2m64_re[sGold8[z]];
-        c_im[z] = byte2m64_im[sGold8[z]];
       }
-
-      for (int aa=0;aa<Prx;aa++) {
 #ifdef DEBUG_NR_PUCCH_RX
-        printf("prb %d: rd ((%d,%d),(%d,%d),(%d,%d),(%d,%d),(%d,%d),(%d,%d),(%d,%d),(%d,%d))\n",
-               prb,
-               r_re_ext[aa][symb][re_off],
-               r_im_ext[aa][symb][re_off],
-               r_re_ext[aa][symb][re_off + 1],
-               r_im_ext[aa][symb][re_off + 1],
-               r_re_ext[aa][symb][re_off + 2],
-               r_im_ext[aa][symb][re_off + 2],
-               r_re_ext[aa][symb][re_off + 3],
-               r_im_ext[aa][symb][re_off + 3],
-               r_re_ext[aa][symb][re_off + 4],
-               r_im_ext[aa][symb][re_off + 4],
-               r_re_ext[aa][symb][re_off + 5],
-               r_im_ext[aa][symb][re_off + 5],
-               r_re_ext[aa][symb][re_off + 6],
-               r_im_ext[aa][symb][re_off + 6],
-               r_re_ext[aa][symb][re_off + 7],
-               r_im_ext[aa][symb][re_off + 7]);
-        printf("prb %d: rd ((%d,%d),(%d,%d),(%d,%d),(%d,%d),(%d,%d),(%d,%d),(%d,%d),(%d,%d))\n",
-               prb+1,
-               r_re_ext[aa][symb][re_off + 8],
-               r_im_ext[aa][symb][re_off + 8],
-               r_re_ext[aa][symb][re_off + 9],
-               r_im_ext[aa][symb][re_off + 9],
-               r_re_ext[aa][symb][re_off + 10],
-               r_im_ext[aa][symb][re_off + 10],
-               r_re_ext[aa][symb][re_off + 11],
-               r_im_ext[aa][symb][re_off + 11],
-               r_re_ext[aa][symb][re_off + 12],
-               r_im_ext[aa][symb][re_off + 12],
-               r_re_ext[aa][symb][re_off + 13],
-               r_im_ext[aa][symb][re_off + 13],
-               r_re_ext[aa][symb][re_off + 14],
-               r_im_ext[aa][symb][re_off + 14],
-               r_re_ext[aa][symb][re_off + 15],
-               r_im_ext[aa][symb][re_off + 15]);
+    log_dump(PHY, corr32[symb][0], 8, LOG_DUMP_C32, "corr32:");
 #endif
 
-        simde__m64 *r_re_ext_64 = (simde__m64 *)&r_re_ext[aa][symb][re_off];
-        simde__m64 *r_re_ext2_64 = (simde__m64 *)&r_re_ext2[aa][symb][re_off];
-        simde__m64 *r_im_ext_64 = (simde__m64 *)&r_im_ext[aa][symb][re_off];
-        simde__m64 *r_im_ext2_64 = (simde__m64 *)&r_im_ext2[aa][symb][re_off];
-        for (int z = 0; z < 4; z++) {
-          r_re_ext2_64[z] = simde_mm_mullo_pi16(r_re_ext_64[z], c_im[z]);
-          r_re_ext_64[z] = simde_mm_mullo_pi16(r_re_ext_64[z], c_re[z]);
-          r_im_ext2_64[z] = simde_mm_mullo_pi16(r_im_ext_64[z], c_re[z]);
-          r_im_ext_64[z] = simde_mm_mullo_pi16(r_im_ext_64[z], c_im[z]);
-        }
-
-#ifdef DEBUG_NR_PUCCH_RX
-        printf("prb %d: r ((%d,%d),(%d,%d),(%d,%d),(%d,%d),(%d,%d),(%d,%d),(%d,%d),(%d,%d))\n",
-               prb,
-               r_re_ext[aa][symb][re_off],r_im_ext[aa][symb][re_off],
-               r_re_ext[aa][symb][re_off+1],r_im_ext[aa][symb][re_off+1],
-               r_re_ext[aa][symb][re_off+2],r_im_ext[aa][symb][re_off+2],
-               r_re_ext[aa][symb][re_off+3],r_im_ext[aa][symb][re_off+3],
-               r_re_ext[aa][symb][re_off+4],r_im_ext[aa][symb][re_off+4],
-               r_re_ext[aa][symb][re_off+5],r_im_ext[aa][symb][re_off+5],
-               r_re_ext[aa][symb][re_off+6],r_im_ext[aa][symb][re_off+6],
-               r_re_ext[aa][symb][re_off+7],r_im_ext[aa][symb][re_off+7]);
-        printf("prb %d: r ((%d,%d),(%d,%d),(%d,%d),(%d,%d),(%d,%d),(%d,%d),(%d,%d),(%d,%d))\n",
-               prb+1,
-               r_re_ext[aa][symb][re_off+8],r_im_ext[aa][symb][re_off+8],
-               r_re_ext[aa][symb][re_off+9],r_im_ext[aa][symb][re_off+9],
-               r_re_ext[aa][symb][re_off+10],r_im_ext[aa][symb][re_off+10],
-               r_re_ext[aa][symb][re_off+11],r_im_ext[aa][symb][re_off+11],
-               r_re_ext[aa][symb][re_off+12],r_im_ext[aa][symb][re_off+12],
-               r_re_ext[aa][symb][re_off+13],r_im_ext[aa][symb][re_off+13],
-               r_re_ext[aa][symb][re_off+14],r_im_ext[aa][symb][re_off+14],
-               r_re_ext[aa][symb][re_off+15],r_im_ext[aa][symb][re_off+15]);
-#endif      
+    // apply gold sequence on data symbols
+    for (int aa = 0; aa < Prx; aa++) {
+      simde__m256i *pil_ptr = (simde__m256i *)scramb_data;
+      simde__m256i *end = (simde__m256i *)(scramb_data + nb_re_data);
+      for (simde__m256i *ptr = (simde__m256i *)r_ext[aa][symb], *ptr2 = (simde__m256i *)r_ext2[aa][symb]; pil_ptr < end;
+           ptr++, pil_ptr++, ptr2++) {
+        simde__m256i tmp = simde_mm256_srai_epi16(*ptr, scaling);
+        *ptr2 = simde_mm256_sign_epi16(simde_mm256_sign_epi16(simde_mm256_shuffle_epi8(tmp, swap), *pil_ptr), conj256);
+        *ptr = simde_mm256_sign_epi16(tmp, *pil_ptr);
       }
-      goldIdx++;
-#ifdef DEBUG_NR_PUCCH_RX
-      printf("\n");
-#endif
     }
-  } //symb
+  }
+
   int nb_bit = pucch_pdu->bit_len_harq+pucch_pdu->sr_flag+pucch_pdu->bit_len_csi_part1+pucch_pdu->bit_len_csi_part2;
-  AssertFatal(nb_bit > 2  && nb_bit< 65,"illegal length (%d : %d,%d,%d,%d)\n",nb_bit,pucch_pdu->bit_len_harq,pucch_pdu->sr_flag,pucch_pdu->bit_len_csi_part1,pucch_pdu->bit_len_csi_part2);
+  AssertFatal(nb_bit > 2 && nb_bit < 65,
+              "illegal length (%d : %d,%d,%d,%d)\n",
+              nb_bit,
+              pucch_pdu->bit_len_harq,
+              pucch_pdu->sr_flag,
+              pucch_pdu->bit_len_csi_part1,
+              pucch_pdu->bit_len_csi_part2);
 
-  uint64_t decodedPayload[2];
+  uint64_t decodedPayload[nb_symbols];
+  memset(decodedPayload,0,sizeof(decodedPayload));
   uint8_t corr_dB;
   int decoderState = 2;
   if (pucch2_levdB < gNB->measurements.n0_subband_power_avg_dB + (gNB->pucch0_thres / 10))
     decoderState = 1; // assuming missed detection, only attempt to decode for polar case (with CRC)
   LOG_D(NR_PHY, "n0+thres %d decoderState %d\n", gNB->measurements.n0_subband_power_avg_dB + (gNB->pucch0_thres / 10), decoderState);
+
   if (nb_bit < 12 && decoderState == 2) { // short blocklength case
-    simde__m256i *rp_re[Prx2][2];
-    simde__m256i *rp2_re[Prx2][2];
-    simde__m256i *rp_im[Prx2][2];
-    simde__m256i *rp2_im[Prx2][2];
-    for (int aa=0;aa<Prx;aa++) {
-      for (int symb=0;symb<pucch_pdu->nr_of_symbols;symb++) {
-        rp_re[aa][symb] = (simde__m256i*)r_re_ext[aa][symb];
-        rp_im[aa][symb] = (simde__m256i*)r_im_ext[aa][symb];
-        rp2_re[aa][symb] = (simde__m256i*)r_re_ext2[aa][symb];
-        rp2_im[aa][symb] = (simde__m256i*)r_im_ext2[aa][symb];
-      }
-    }
-    simde__m256i prod_re[Prx2],prod_im[Prx2];
     uint64_t corr=0;
     int cw_ML=0;
-
-    for (int cw=0;cw<1<<nb_bit;cw++) {
-#ifdef DEBUG_NR_PUCCH_RX
-      printf("cw %d:",cw);
-      for (int i=0;i<32;i+=2) {
-        printf("%d,%d,",
-               ((int16_t *)&pucch2_lut[nb_bit - 3][cw << 1])[i >> 1],
-               ((int16_t *)&pucch2_lut[nb_bit - 3][cw << 1])[1 + (i >> 1)]);
-      }
-      printf("\n");
-#endif
+    for (int cw = 0; cw < 1 << nb_bit; cw++) {
       uint64_t corr_tmp = 0;
-
-      for (int symb=0;symb<pucch_pdu->nr_of_symbols;symb++) {
+      for (int symb=0;symb<nb_symbols;symb++) {
         for (int group=0;group<ngroup;group++) {
           // do complex correlation
-          for (int aa=0;aa<Prx;aa++) {
-            prod_re[aa] = /*simde_mm256_srai_epi16(*/ simde_mm256_adds_epi16(
-                simde_mm256_mullo_epi16(pucch2_lut[nb_bit - 3][cw << 1], rp_re[aa][symb][group]),
-                simde_mm256_mullo_epi16(pucch2_lut[nb_bit - 3][(cw << 1) + 1], rp_im[aa][symb][group])) /*,5)*/;
-            prod_im[aa] = /*simde_mm256_srai_epi16(*/ simde_mm256_subs_epi16(
-                simde_mm256_mullo_epi16(pucch2_lut[nb_bit - 3][cw << 1], rp2_im[aa][symb][group]),
-                simde_mm256_mullo_epi16(pucch2_lut[nb_bit - 3][(cw << 1) + 1], rp2_re[aa][symb][group])) /*,5)*/;
-#ifdef DEBUG_NR_PUCCH_RX
-            printf("prod_re[%d] => (%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d)\n",aa,
-                   ((int16_t*)&prod_re[aa])[0],((int16_t*)&prod_re[aa])[1],((int16_t*)&prod_re[aa])[2],((int16_t*)&prod_re[aa])[3],
-                   ((int16_t*)&prod_re[aa])[4],((int16_t*)&prod_re[aa])[5],((int16_t*)&prod_re[aa])[6],((int16_t*)&prod_re[aa])[7],
-                   ((int16_t*)&prod_re[aa])[8],((int16_t*)&prod_re[aa])[9],((int16_t*)&prod_re[aa])[10],((int16_t*)&prod_re[aa])[11],
-                   ((int16_t*)&prod_re[aa])[12],((int16_t*)&prod_re[aa])[13],((int16_t*)&prod_re[aa])[14],((int16_t*)&prod_re[aa])[15]);
-            printf("prod_im[%d] => (%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d)\n",aa,
-                   ((int16_t*)&prod_im[aa])[0],((int16_t*)&prod_im[aa])[1],((int16_t*)&prod_im[aa])[2],((int16_t*)&prod_im[aa])[3],
-                   ((int16_t*)&prod_im[aa])[4],((int16_t*)&prod_im[aa])[5],((int16_t*)&prod_im[aa])[6],((int16_t*)&prod_im[aa])[7],
-                   ((int16_t*)&prod_im[aa])[8],((int16_t*)&prod_im[aa])[9],((int16_t*)&prod_im[aa])[10],((int16_t*)&prod_im[aa])[11],
-                   ((int16_t*)&prod_im[aa])[12],((int16_t*)&prod_im[aa])[13],((int16_t*)&prod_im[aa])[14],((int16_t*)&prod_im[aa])[15]);
-
-#endif
-            prod_re[aa] = simde_mm256_hadds_epi16(prod_re[aa],prod_re[aa]);// 0+1
-#ifdef DEBUG_NR_PUCCH_RX
-            printf("0.prod_re[%d] => (%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d)\n",aa,
-                   ((int16_t*)&prod_re[aa])[0],((int16_t*)&prod_re[aa])[1],((int16_t*)&prod_re[aa])[2],((int16_t*)&prod_re[aa])[3],
-                   ((int16_t*)&prod_re[aa])[4],((int16_t*)&prod_re[aa])[5],((int16_t*)&prod_re[aa])[6],((int16_t*)&prod_re[aa])[7],
-                   ((int16_t*)&prod_re[aa])[8],((int16_t*)&prod_re[aa])[9],((int16_t*)&prod_re[aa])[10],((int16_t*)&prod_re[aa])[11],
-                   ((int16_t*)&prod_re[aa])[12],((int16_t*)&prod_re[aa])[13],((int16_t*)&prod_re[aa])[14],((int16_t*)&prod_re[aa])[15]);
-#endif
-            prod_im[aa] = simde_mm256_hadds_epi16(prod_im[aa],prod_im[aa]);
-            prod_re[aa] = simde_mm256_hadds_epi16(prod_re[aa],prod_re[aa]);// 0+1+2+3
-#ifdef DEBUG_NR_PUCCH_RX
-            printf("1.prod_re[%d] => (%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d)\n",aa,
-                   ((int16_t*)&prod_re[aa])[0],((int16_t*)&prod_re[aa])[1],((int16_t*)&prod_re[aa])[2],((int16_t*)&prod_re[aa])[3],
-                   ((int16_t*)&prod_re[aa])[4],((int16_t*)&prod_re[aa])[5],((int16_t*)&prod_re[aa])[6],((int16_t*)&prod_re[aa])[7],
-                   ((int16_t*)&prod_re[aa])[8],((int16_t*)&prod_re[aa])[9],((int16_t*)&prod_re[aa])[10],((int16_t*)&prod_re[aa])[11],
-                   ((int16_t*)&prod_re[aa])[12],((int16_t*)&prod_re[aa])[13],((int16_t*)&prod_re[aa])[14],((int16_t*)&prod_re[aa])[15]);
-#endif
-            prod_im[aa] = simde_mm256_hadds_epi16(prod_im[aa],prod_im[aa]);
-            prod_re[aa] = simde_mm256_hadds_epi16(prod_re[aa],prod_re[aa]);// 0+1+2+3+4+5+6+7
+          for (int aa = 0; aa < Prx; aa++) {
+            const simde__m256i *coeff = (simde__m256i *)&pucch2_lut[nb_bit - 3][cw].cw;
+            const simde__m256i *rext = (simde__m256i *)r_ext[aa][symb];
+            const simde__m256i *rext2 = (simde__m256i *)r_ext2[aa][symb];
+            simde__m256i re = simde_mm256_madd_epi16(coeff[0], rext[group]);
+            simde__m256i im = simde_mm256_madd_epi16(coeff[0], rext2[group]);
+            simde__m256i re2 = simde_mm256_madd_epi16(coeff[1], rext[group + 1]);
+            simde__m256i im2 = simde_mm256_madd_epi16(coeff[1], rext2[group + 1]);
+            re = simde_mm256_add_epi32(re, re2);
+            im = simde_mm256_add_epi32(im, im2);
+            re = simde_mm256_hadd_epi32(re, re);
+            re = simde_mm256_hadd_epi32(re, re);
+            im = simde_mm256_hadd_epi32(im, im);
+            im = simde_mm256_hadd_epi32(im, im);
+            int32_t *re32 = (int32_t *)&re;
+            int32_t *im32 = (int32_t *)&im;
+            c64_t prod = (c64_t){re32[0] + re32[5], im32[0] + im32[5]};
+            csum(prod, prod, corr32[symb][group][aa]);
+            corr_tmp += squaredMod(prod);
 #ifdef DEBUG_NR_PUCCH_RX
-            printf("2.prod_re[%d] => (%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d)\n",aa,
-                   ((int16_t*)&prod_re[aa])[0],((int16_t*)&prod_re[aa])[1],((int16_t*)&prod_re[aa])[2],((int16_t*)&prod_re[aa])[3],
-                   ((int16_t*)&prod_re[aa])[4],((int16_t*)&prod_re[aa])[5],((int16_t*)&prod_re[aa])[6],((int16_t*)&prod_re[aa])[7],
-                   ((int16_t*)&prod_re[aa])[8],((int16_t*)&prod_re[aa])[9],((int16_t*)&prod_re[aa])[10],((int16_t*)&prod_re[aa])[11],
-                   ((int16_t*)&prod_re[aa])[12],((int16_t*)&prod_re[aa])[13],((int16_t*)&prod_re[aa])[14],((int16_t*)&prod_re[aa])[15]);
-#endif
-            prod_im[aa] = simde_mm256_hadds_epi16(prod_im[aa],prod_im[aa]);
-          }
-          int64_t corr_re=0,corr_im=0;
-
-
-          for (int aa=0;aa<Prx;aa++) {
-
-            corr_re = ( corr32_re[symb][group][aa]+((int16_t*)(&prod_re[aa]))[0]+((int16_t*)(&prod_re[aa]))[8]);
-            corr_im = ( corr32_im[symb][group][aa]+((int16_t*)(&prod_im[aa]))[0]+((int16_t*)(&prod_im[aa]))[8]);
-#ifdef DEBUG_NR_PUCCH_RX
-            printf("pucch2 cw %d group %d aa %d: (%d,%d)+(%d,%d) = (%ld,%ld)\n",
+            printf("pucch2 cw %d group %d aa %d: (%d,%d)+prod=(%ld,%ld)\n",
                    cw,
                    group,
                    aa,
-                   corr32_re[symb][group][aa],
-                   corr32_im[symb][group][aa],
-                   ((int16_t *)(&prod_re[aa]))[0] + ((int16_t *)(&prod_re[aa]))[8],
-                   ((int16_t *)(&prod_im[aa]))[0] + ((int16_t *)(&prod_im[aa]))[8],
-                   corr_re,
-                   corr_im);
+                   corr32[symb][group][aa].r,
+                   corr32[symb][group][aa].i,
+                   prod.r,
+                   prod.i);
 
 #endif
-
-            corr_tmp += corr_re*corr_re + corr_im*corr_im;
-          } // aa loop
+          }
         }// group loop
       } // symb loop
       if (corr_tmp > corr) {
         corr = corr_tmp;
         cw_ML = cw;
-#ifdef DEBUG_NR_PUCCH_RX
+#ifdef DEBUG_NR_PUCCH_RX 
         printf("slot %d PUCCH2 cw_ML %d, corr %lu\n", slot, cw_ML, corr);
 #endif
       }
     } // cw loop
-    corr_dB = dB_fixed64((uint64_t)corr);
+    corr_dB = dB_fixed64(corr);
 #ifdef DEBUG_NR_PUCCH_RX
     printf("slot %d PUCCH2 cw_ML %d, metric %d \n",slot,cw_ML,corr_dB);
 #endif
     decodedPayload[0]=(uint64_t)cw_ML;
+    
   } else if (nb_bit >= 12) { // polar coded case
-    simde__m64 *rp_re[Prx2][2];
-    simde__m64 *rp2_re[Prx2][2];
-    simde__m64 *rp_im[Prx2][2];
-    simde__m64 *rp2_im[Prx2][2];
-    simde__m128i llrs[pucch_pdu->prb_size*2*pucch_pdu->nr_of_symbols];
-
-    for (int aa=0;aa<Prx;aa++) {
-      for (int symb=0;symb<pucch_pdu->nr_of_symbols;symb++) {
-        rp_re[aa][symb] = (simde__m64*)r_re_ext[aa][symb];
-        rp_im[aa][symb] = (simde__m64*)r_im_ext[aa][symb];
-        rp2_re[aa][symb] = (simde__m64*)r_re_ext2[aa][symb];
-        rp2_im[aa][symb] = (simde__m64*)r_im_ext2[aa][symb];
-      }
-    }
-    simde__m64 prod_re[Prx2],prod_im[Prx2];
-
-#ifdef DEBUG_NR_PUCCH_RX
-    for (int cw=0;cw<16;cw++) {
-
-      printf("cw %d:",cw);
-      for (int i=0;i<4;i++) {
-        printf("%d,", ((int16_t *)&pucch2_polar_4bit[cw])[i >> 1]);
-      }
-      printf("\n");
-    }
-#endif
     
+    simde__m128i llrs[pucch_pdu->prb_size * 2 * nb_symbols];
     // non-coherent LLR computation on groups of 4 REs (half-PRBs)
-    int32_t corr_re,corr_im,corr_tmp;
-    simde__m128i corr16,llr_num,llr_den;
     uint64_t corr = 0;
-    for (int symb=0;symb<pucch_pdu->nr_of_symbols;symb++) {
+    const simde__m128i ones = simde_mm_set1_epi16(1);
+    for (int symb=0;symb<nb_symbols;symb++) {
       for (int half_prb=0;half_prb<(2*pucch_pdu->prb_size);half_prb++) {
-        llr_num=simde_mm_set1_epi16(0);llr_den=simde_mm_set1_epi16(0);
+        simde__m128i llr_num = simde_mm_set1_epi16(0);
+       simde__m128i llr_den = simde_mm_set1_epi16(0);
         for (int cw=0;cw<256;cw++) {
-          corr_tmp=0;
+          int32_t corr_tmp=0;
           for (int aa=0;aa<Prx;aa++) {
-            prod_re[aa] =
-                simde_mm_srai_pi16(simde_mm_adds_pi16(simde_mm_mullo_pi16(pucch2_polar_4bit[cw & 15], rp_re[aa][symb][half_prb]),
-                                                      simde_mm_mullo_pi16(pucch2_polar_4bit[cw >> 4], rp_im[aa][symb][half_prb])),
-                                   5);
-            prod_im[aa] =
-                simde_mm_srai_pi16(simde_mm_subs_pi16(simde_mm_mullo_pi16(pucch2_polar_4bit[cw & 15], rp2_im[aa][symb][half_prb]),
-                                                      simde_mm_mullo_pi16(pucch2_polar_4bit[cw >> 4], rp2_re[aa][symb][half_prb])),
-                                   5);
-            prod_re[aa] = simde_mm_hadds_pi16(prod_re[aa],prod_re[aa]);// 0+1
-            prod_im[aa] = simde_mm_hadds_pi16(prod_im[aa],prod_im[aa]);
-            prod_re[aa] = simde_mm_hadds_pi16(prod_re[aa],prod_re[aa]);// 0+1+2+3
-            prod_im[aa] = simde_mm_hadds_pi16(prod_im[aa],prod_im[aa]);
-
+            simde__m128i part1 = simde_mm_set_epi64x(0ULL, *(int64_t *)&pucch2_polar_4bit[cw & 15].cw);
+            simde__m128i part2 = simde_mm_set_epi64x(0ULL, *(int64_t *)&pucch2_polar_4bit[cw >> 4].cw);
+            simde__m128i factor = simde_mm_unpacklo_epi16(part1, part2);
+            simde__m128i re = *(simde__m128i *)&r_ext[aa][symb][half_prb * 4];
+            simde__m128i im = *(simde__m128i *)&r_ext2[aa][symb][half_prb * 4];
+            simde__m128i prod_re = simde_mm_madd_epi16(re, factor);
+            simde__m128i prod_im = simde_mm_madd_epi16(im, factor);
+            prod_re = simde_mm_hadd_epi32(prod_re, prod_re);
+            prod_im = simde_mm_hadd_epi32(prod_im, prod_im);
+            prod_re = simde_mm_hadd_epi32(prod_re, prod_re);
+            prod_im = simde_mm_hadd_epi32(prod_im, prod_im);
+            simde__m128i prod = simde_mm_srai_epi32(simde_mm_unpacklo_epi32(prod_re, prod_im), 5);
+            c64_t corr64 = (c64_t){corr32[symb][half_prb >> 2][aa].r / (2 * nc_group_size * 4 / 2),
+                                   corr32[symb][half_prb >> 2][aa].i / (2 * nc_group_size * 4 / 2)};
+            //  _mm_srai_epi64 is missing in SIMDE package, we need to update it
+            c64_t prod2 = {simde_mm_extract_epi32(prod, 0), simde_mm_extract_epi32(prod, 1)};
+            csum(prod2, prod2, corr64);
+            corr_tmp += squaredMod(prod2) >> (Prx / 2);
             // this is for UL CQI measurement
-            if (cw==0) corr += ((int64_t)corr32_re[symb][half_prb>>2][aa]*corr32_re[symb][half_prb>>2][aa])+
-                         ((int64_t)corr32_im[symb][half_prb>>2][aa]*corr32_im[symb][half_prb>>2][aa]);
-
-
-            corr_re = ( corr32_re[symb][half_prb>>2][aa]/(2*nc_group_size*4/2)+((int16_t*)(&prod_re[aa]))[0]);
-            corr_im = ( corr32_im[symb][half_prb>>2][aa]/(2*nc_group_size*4/2)+((int16_t*)(&prod_im[aa]))[0]);
-            corr_tmp += (corr_re*corr_re + corr_im*corr_im)>>(Prx/2);
-
-            LOG_D(PHY,
-                  "pucch2 half_prb %d cw %d (%d,%d) aa %d: (%d,%d,%d,%d,%d,%d,%d,%d)x(%d,%d,%d,%d,%d,%d,%d,%d) (%d,%d)+(%d,%d) = "
-                  "(%d,%d) => %d\n",
-                  half_prb,
-                  cw,
-                  cw & 15,
-                  cw >> 4,
-                  aa,
-                  ((int16_t *)&pucch2_polar_4bit[cw & 15])[0],
-                  ((int16_t *)&pucch2_polar_4bit[cw >> 4])[0],
-                  ((int16_t *)&pucch2_polar_4bit[cw & 15])[1],
-                  ((int16_t *)&pucch2_polar_4bit[cw >> 4])[1],
-                  ((int16_t *)&pucch2_polar_4bit[cw & 15])[2],
-                  ((int16_t *)&pucch2_polar_4bit[cw >> 4])[2],
-                  ((int16_t *)&pucch2_polar_4bit[cw & 15])[3],
-                  ((int16_t *)&pucch2_polar_4bit[cw >> 4])[3],
-                  ((int16_t *)&rp_re[aa][half_prb])[0],
-                  ((int16_t *)&rp_im[aa][half_prb])[0],
-                  ((int16_t *)&rp_re[aa][half_prb])[1],
-                  ((int16_t *)&rp_im[aa][half_prb])[1],
-                  ((int16_t *)&rp_re[aa][half_prb])[2],
-                  ((int16_t *)&rp_im[aa][half_prb])[2],
-                  ((int16_t *)&rp_re[aa][half_prb])[3],
-                  ((int16_t *)&rp_im[aa][half_prb])[3],
-                  corr32_re[symb][half_prb >> 2][aa] / (2 * nc_group_size * 4 / 2),
-                  corr32_im[symb][half_prb >> 2][aa] / (2 * nc_group_size * 4 / 2),
-                  ((int16_t *)(&prod_re[aa]))[0],
-                  ((int16_t *)(&prod_im[aa]))[0],
-                  corr_re,
-                  corr_im,
-                  corr_tmp);
+            if (cw == 0)
+              corr += squaredMod(corr32[symb][half_prb >> 2][aa]);
           }
-          corr16 = simde_mm_set1_epi16((int16_t)(corr_tmp >> 8));
-
-          LOG_D(PHY, "half_prb %d cw %d corr16 %d\n", half_prb, cw, corr_tmp >> 8);
-
+          simde__m128i corr16 = simde_mm_set1_epi16((int16_t)(corr_tmp >> 8));
+          simde__m128i den = simde_mm_xor_si128(pucch2_polar_llr_num_lut[cw], ones);
           llr_num = simde_mm_max_epi16(simde_mm_mullo_epi16(corr16, pucch2_polar_llr_num_lut[cw]), llr_num);
-          llr_den = simde_mm_max_epi16(simde_mm_mullo_epi16(corr16, pucch2_polar_llr_den_lut[cw]), llr_den);
-
-          LOG_D(PHY,
-                "lut_num (%d,%d,%d,%d,%d,%d,%d,%d)\n",
-                ((int16_t *)&pucch2_polar_llr_num_lut[cw])[0],
-                ((int16_t *)&pucch2_polar_llr_num_lut[cw])[1],
-                ((int16_t *)&pucch2_polar_llr_num_lut[cw])[2],
-                ((int16_t *)&pucch2_polar_llr_num_lut[cw])[3],
-                ((int16_t *)&pucch2_polar_llr_num_lut[cw])[4],
-                ((int16_t *)&pucch2_polar_llr_num_lut[cw])[5],
-                ((int16_t *)&pucch2_polar_llr_num_lut[cw])[6],
-                ((int16_t *)&pucch2_polar_llr_num_lut[cw])[7]);
-
-          LOG_D(PHY,
-                "llr_num (%d,%d,%d,%d,%d,%d,%d,%d)\n",
-                ((int16_t *)&llr_num)[0],
-                ((int16_t *)&llr_num)[1],
-                ((int16_t *)&llr_num)[2],
-                ((int16_t *)&llr_num)[3],
-                ((int16_t *)&llr_num)[4],
-                ((int16_t *)&llr_num)[5],
-                ((int16_t *)&llr_num)[6],
-                ((int16_t *)&llr_num)[7]);
-          LOG_D(PHY,
-                "llr_den (%d,%d,%d,%d,%d,%d,%d,%d)\n",
-                ((int16_t *)&llr_den)[0],
-                ((int16_t *)&llr_den)[1],
-                ((int16_t *)&llr_den)[2],
-                ((int16_t *)&llr_den)[3],
-                ((int16_t *)&llr_den)[4],
-                ((int16_t *)&llr_den)[5],
-                ((int16_t *)&llr_den)[6],
-                ((int16_t *)&llr_den)[7]);
+          llr_den = simde_mm_max_epi16(simde_mm_mullo_epi16(corr16, den), llr_den);
         }
         // compute llrs
-        llrs[half_prb + (symb*2*pucch_pdu->prb_size)] = simde_mm_subs_epi16(llr_num,llr_den);
-        LOG_D(PHY,"llrs[%d] : (%d,%d,%d,%d,%d,%d,%d,%d)\n",
-              half_prb,
-              ((int16_t*)&llrs[half_prb])[0],
-              ((int16_t*)&llrs[half_prb])[1],
-              ((int16_t*)&llrs[half_prb])[2],
-              ((int16_t*)&llrs[half_prb])[3],
-              ((int16_t*)&llrs[half_prb])[4],
-              ((int16_t*)&llrs[half_prb])[5],
-              ((int16_t*)&llrs[half_prb])[6],
-              ((int16_t*)&llrs[half_prb])[7]);
+        llrs[half_prb + symb * 2 * pucch_pdu->prb_size] = simde_mm_subs_epi16(llr_num, llr_den);
+        LOG_DDUMP(PHY,   llrs+half_prb + symb * 2 * pucch_pdu->prb_size, 8,   LOG_DUMP_I16,  "llrs:");
       } // half_prb
     } // symb
 
@@ -1636,10 +1294,12 @@ void nr_decode_pucch2(PHY_VARS_gNB *gNB,
     // Decoder reversal
     decodedPayload[0] = reverse_bits(decodedPayload[0], nb_bit);
 
-    if (decoderState>0) decoderState=1;
+    if (decoderState > 0)
+      decoderState = 1;
     corr_dB = dB_fixed64(corr);
-    LOG_D(PHY,"metric %d dB\n",corr_dB);
-  }
+    LOG_D(PHY, "metric %d dB\n", corr_dB);
+  } else
+    LOG_E(PHY, "PUCCH not processed: nb_bit %d decoderState %d\n", nb_bit, decoderState);
 
   LOG_D(PHY, "UCI decoderState %d, payload[0] %llu\n", decoderState, (unsigned long long)decodedPayload[0]);
 
@@ -1647,42 +1307,46 @@ void nr_decode_pucch2(PHY_VARS_gNB *gNB,
   // TODO this computation is wrong -> to be ignored at MAC for now
   int cqi = 0xff;
   /*int SNRtimes10 =
-      dB_fixed_times10(signal_energy_nodc((int32_t *)&rxdataF[0][soffset + (l2 * frame_parms->ofdm_symbol_size) + re_offset[0]],
-                                          12 * pucch_pdu->prb_size))
-      - (10 * gNB->measurements.n0_power_tot_dB);
-  int cqi,bit_left;
-  if (SNRtimes10 < -640) cqi=0;
-  else if (SNRtimes10 >  635) cqi=255;
-  else cqi=(640+SNRtimes10)/5;*/
+    dB_fixed_times10(signal_energy_nodc((int32_t *)&rxdataF[0][soffset + (l2 * frame_parms->ofdm_symbol_size) + re_offset[0]],
+    12 * pucch_pdu->prb_size))
+    - (10 * gNB->measurements.n0_power_tot_dB);
+    int cqi,bit_left;
+    if (SNRtimes10 < -640) cqi=0;
+    else if (SNRtimes10 >  635) cqi=255;
+    else cqi=(640+SNRtimes10)/5;*/
 
   uci_pdu->harq.harq_bit_len = pucch_pdu->bit_len_harq;
-  uci_pdu->pduBitmap=0;
-  uci_pdu->rnti=pucch_pdu->rnti;
-  uci_pdu->handle=pucch_pdu->handle;
-  uci_pdu->pucch_format=0;
-  uci_pdu->ul_cqi=cqi;
-  uci_pdu->timing_advance=0xffff; // currently not valid
-  uci_pdu->rssi=1280 - (10*dB_fixed(32767*32767)-dB_fixed_times10(signal_energy_nodc(&rxdataF[0][soffset+(l2*frame_parms->ofdm_symbol_size)+re_offset[0]],12*pucch_pdu->prb_size)));
-  if (pucch_pdu->bit_len_harq>0) {
-    int harq_bytes=pucch_pdu->bit_len_harq>>3;
-    if ((pucch_pdu->bit_len_harq&7) > 0) harq_bytes++;
-    uci_pdu->pduBitmap|=2;
-    uci_pdu->harq.harq_payload = (uint8_t*)malloc(harq_bytes);
+  uci_pdu->pduBitmap = 0;
+  uci_pdu->rnti = pucch_pdu->rnti;
+  uci_pdu->handle = pucch_pdu->handle;
+  uci_pdu->pucch_format = 0;
+  uci_pdu->ul_cqi = cqi;
+  uci_pdu->timing_advance = 0xffff; // currently not valid
+  uci_pdu->rssi =
+      1280
+      - (10 * dB_fixed(32767 * 32767)
+         - dB_fixed_times10(signal_energy_nodc(&rxdataF[0][soffset + (l2 * frame_parms->ofdm_symbol_size) + re_offset[0]],
+                                               12 * pucch_pdu->prb_size)));
+  if (pucch_pdu->bit_len_harq > 0) {
+    int harq_bytes = pucch_pdu->bit_len_harq >> 3;
+    if ((pucch_pdu->bit_len_harq & 7) > 0)
+      harq_bytes++;
+    uci_pdu->pduBitmap |= 2;
+    uci_pdu->harq.harq_payload = (uint8_t *)malloc(harq_bytes);
     uci_pdu->harq.harq_crc = decoderState;
-    LOG_D(PHY,"[DLSCH/PDSCH/PUCCH2] %d.%d HARQ bytes (%d) Decoder state %d\n",
-          frame,slot,harq_bytes,decoderState);
-    int i=0;
-    for (;i<harq_bytes-1;i++) {
+    LOG_D(PHY, "[DLSCH/PDSCH/PUCCH2] %d.%d HARQ bytes (%d) Decoder state %d\n", frame, slot, harq_bytes, decoderState);
+    int i = 0;
+    for (; i < harq_bytes - 1; i++) {
       uci_pdu->harq.harq_payload[i] = decodedPayload[0] & 255;
       LOG_D(PHY, "[DLSCH/PDSCH/PUCCH2] %d.%d HARQ payload (%d) = %d\n", frame, slot, i, uci_pdu->harq.harq_payload[i]);
-      decodedPayload[0]>>=8;
+      decodedPayload[0] >>= 8;
     }
     int bit_left = pucch_pdu->bit_len_harq - ((harq_bytes - 1) << 3);
     uci_pdu->harq.harq_payload[i] = decodedPayload[0] & ((1 << bit_left) - 1);
     LOG_D(PHY, "[DLSCH/PDSCH/PUCCH2] %d.%d HARQ payload (%d) = %d\n", frame, slot, i, uci_pdu->harq.harq_payload[i]);
     decodedPayload[0] >>= pucch_pdu->bit_len_harq;
   }
-  
+
   if (pucch_pdu->sr_flag == 1) {
     uci_pdu->pduBitmap|=1;
     uci_pdu->sr.sr_bit_len = 1;
diff --git a/openair1/PHY/NR_UE_TRANSPORT/csi_rx.c b/openair1/PHY/NR_UE_TRANSPORT/csi_rx.c
index bcd24fef4a8073196e7baffb7356f42081b84c39..c80bc4e5f227119ad2f729b1a699613db4565de5 100644
--- a/openair1/PHY/NR_UE_TRANSPORT/csi_rx.c
+++ b/openair1/PHY/NR_UE_TRANSPORT/csi_rx.c
@@ -86,8 +86,6 @@ void nr_det_A_MF_2x2(int32_t *a_mf_00,
     a_mf_10_128+=1;
     a_mf_11_128+=1;
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void nr_squared_matrix_element(int32_t *a,
@@ -100,8 +98,6 @@ void nr_squared_matrix_element(int32_t *a,
     a_sq_128+=1;
     a_128+=1;
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void nr_numer_2x2(int32_t *a_00_sq,
@@ -125,8 +121,6 @@ void nr_numer_2x2(int32_t *a_00_sq,
     a_10_sq_128+=1;
     a_11_sq_128+=1;
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 bool is_csi_rs_in_symbol(const fapi_nr_dl_config_csirs_pdu_rel15_t csirs_config_pdu, const int symbol) {
diff --git a/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_demodulation.c b/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_demodulation.c
index 217a3700545fe6e021153e08075ae933375acfb5..e5442aab4833262fe971728dbbf609068d9f0b4f 100644
--- a/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_demodulation.c
+++ b/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_demodulation.c
@@ -1304,8 +1304,6 @@ static void nr_dlsch_detection_mrc(uint32_t rx_size_symbol,
         rho128_0[i] = simde_mm_adds_epi16(simde_mm_srai_epi16(rho128_0[i],1),simde_mm_srai_epi16(rho128_1[i],1));
       }*/
       }
-      simde_mm_empty();
-      simde_m_empty();
   }
 }
 
@@ -1617,8 +1615,6 @@ void nr_conjch0_mult_ch1(int *ch0,
     dl_ch1_128+=1;
     ch0conj_ch1_128+=1;
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 /*
diff --git a/openair1/PHY/NR_UE_TRANSPORT/nr_initial_sync.c b/openair1/PHY/NR_UE_TRANSPORT/nr_initial_sync.c
index b6b76ab8a863dda8a33fd6d4b0314a517172f318..3143333a3cc6356dd81898569fc2bfcf07ad19a5 100644
--- a/openair1/PHY/NR_UE_TRANSPORT/nr_initial_sync.c
+++ b/openair1/PHY/NR_UE_TRANSPORT/nr_initial_sync.c
@@ -308,8 +308,8 @@ nr_initial_sync_t nr_initial_sync(UE_nr_rxtx_proc_t *proc,
         fp->N_RB_DL,
         numGscn);
 
-  task_ans_t ans[numGscn];
-  memset(ans, 0, sizeof(ans));
+  task_ans_t ans;
+  init_task_ans(&ans, numGscn);
   nr_ue_ssb_scan_t ssb_info[numGscn];
   for (int s = 0; s < numGscn; s++) {
     nr_ue_ssb_scan_t *ssbInfo = &ssb_info[s];
@@ -331,7 +331,7 @@ nr_initial_sync_t nr_initial_sync(UE_nr_rxtx_proc_t *proc,
           ssbInfo->gscnInfo.gscn,
           ssbInfo->gscnInfo.ssbFirstSC,
           ssbInfo->gscnInfo.ssRef);
-    ssbInfo->ans = &ans[s];
+    ssbInfo->ans = &ans;
     task_t t = {.func = nr_scan_ssb, .args = ssbInfo};
     pushTpool(&get_nrUE_params()->Tpool, t);
   }
@@ -339,7 +339,7 @@ nr_initial_sync_t nr_initial_sync(UE_nr_rxtx_proc_t *proc,
   // Collect the scan results
   nr_ue_ssb_scan_t res = {0};
   if (numGscn > 0) {
-    join_task_ans(ans, numGscn);
+    join_task_ans(&ans);
     for (int i = 0; i < numGscn; i++) {
       nr_ue_ssb_scan_t *ssbInfo = &ssb_info[i];
       if (ssbInfo->syncRes.cell_detected) {
diff --git a/openair1/PHY/NR_UE_TRANSPORT/nr_pbch.c b/openair1/PHY/NR_UE_TRANSPORT/nr_pbch.c
index 229191269f4223dd4db2c0cff051eea0b63f28c4..1de1996126b0265f4321b871a6948f7361cf8fb6 100644
--- a/openair1/PHY/NR_UE_TRANSPORT/nr_pbch.c
+++ b/openair1/PHY/NR_UE_TRANSPORT/nr_pbch.c
@@ -263,8 +263,6 @@ void nr_pbch_detection_mrc(NR_DL_FRAME_PARMS *frame_parms,
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void nr_pbch_unscrambling(int16_t *demod_pbch_e,
diff --git a/openair1/PHY/TOOLS/cadd_vv.c b/openair1/PHY/TOOLS/cadd_vv.c
index f1c61cc60adc2959babb523dc980ecb13137b165..7eade29bb0f80dc28fa7f3c56aaa4da647704681 100644
--- a/openair1/PHY/TOOLS/cadd_vv.c
+++ b/openair1/PHY/TOOLS/cadd_vv.c
@@ -47,8 +47,6 @@ int32_t sub_cpx_vector16(int16_t *x,
 
   }
 
-  simde_mm_empty();
-  simde_m_empty();
   return(0);
 }
 
diff --git a/openair1/PHY/TOOLS/cmult_vv.c b/openair1/PHY/TOOLS/cmult_vv.c
index 6bee29551f101c327fea487dcbf22faa27483089..111bfcabcb8f6c04018b554556daf7aba0cb80e1 100644
--- a/openair1/PHY/TOOLS/cmult_vv.c
+++ b/openair1/PHY/TOOLS/cmult_vv.c
@@ -27,7 +27,6 @@ static const int16_t conjug[8]__attribute__((aligned(16))) = {-1,1,-1,1,-1,1,-1,
 static const int16_t conjug2[8]__attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1} ;
 
 #define simd_q15_t simde__m128i
-#define simdshort_q15_t simde__m64
 #define set1_int16(a) simde_mm_set1_epi16(a)
 #define setr_int16(a0, a1, a2, a3, a4, a5, a6, a7) simde_mm_setr_epi16(a0, a1, a2, a3, a4, a5, a6, a7 )
 
@@ -87,8 +86,6 @@ int mult_cpx_conj_vector(int16_t *x1,
   }
 
 
-  simde_mm_empty();
-  simde_m_empty();
 
   return(0);
 }
@@ -150,8 +147,6 @@ int mult_cpx_vector(int16_t *x1, //Q15
     x2_128++;
     y_128++;
   }
-  simde_mm_empty();
-  simde_m_empty();
   return(0);
 }
 
@@ -207,7 +202,5 @@ int multadd_cpx_vector(int16_t *x1,
     x2_128++;
     y_128++;
   }
-  simde_mm_empty();
-  simde_m_empty();
   return(0);
 }
diff --git a/openair1/PHY/TOOLS/invSqrt.c b/openair1/PHY/TOOLS/invSqrt.c
index b3e0d9749afed846cd04f2a15aac314166fad728..fce565b1d3f33866d5c23b9215140fcde4813a7d 100644
--- a/openair1/PHY/TOOLS/invSqrt.c
+++ b/openair1/PHY/TOOLS/invSqrt.c
@@ -19,7 +19,7 @@
  *      contact@openairinterface.org
  */
 
-static short lookup_table[1025] = {0x7fff, 0x16a0, 0x1000, 0xd10, 0xb50, 0xa1e, 0x93c, 0x88d, 0x800, 0x78a, 0x727, 0x6d2, 0x688, 0x646, 0x60c, 0x5d7, 0x5a8, 0x57c, 0x555, 0x530, 0x50f, 0x4f0, 0x4d2, 0x4b7, 0x49e, 0x486, 0x470, 0x45a, 0x446, 0x433, 0x421, 0x410, 0x400, 0x3f0, 0x3e1, 0x3d3, 0x3c5, 0x3b8, 0x3ab, 0x39f, 0x393, 0x388, 0x37d, 0x373, 0x369, 0x35f, 0x356, 0x34c, 0x344, 0x33b, 0x333, 0x32b, 0x323, 0x31b, 0x314, 0x30d, 0x306, 0x2ff, 0x2f8, 0x2f2, 0x2eb, 0x2e5, 0x2df, 0x2d9, 0x2d4, 0x2ce, 0x2c9, 0x2c3, 0x2be, 0x2b9, 0x2b4, 0x2af, 0x2aa, 0x2a5, 0x2a1, 0x29c, 0x298, 0x294, 0x28f, 0x28b, 0x287, 0x283, 0x27f, 0x27b, 0x278, 0x274, 0x270, 0x26d, 0x269, 0x266, 0x262, 0x25f, 0x25b, 0x258, 0x255, 0x252, 0x24f, 0x24c, 0x249, 0x246, 0x243, 0x240, 0x23d, 0x23a, 0x238, 0x235, 0x232, 0x22f, 0x22d, 0x22a, 0x228, 0x225, 0x223, 0x220, 0x21e, 0x21c, 0x219, 0x217, 0x215, 0x213, 0x210, 0x20e, 0x20c, 0x20a, 0x208, 0x206, 0x204, 0x202, 0x200, 0x1fe, 0x1fc, 0x1fa, 0x1f8, 0x1f6, 0x1f4, 0x1f2, 0x1f0, 0x1ee, 0x1ed, 0x1eb, 0x1e9, 0x1e7, 0x1e6, 0x1e4, 0x1e2, 0x1e1, 0x1df, 0x1dd, 0x1dc, 0x1da, 0x1d8, 0x1d7, 0x1d5, 0x1d4, 0x1d2, 0x1d1, 0x1cf, 0x1ce, 0x1cc, 0x1cb, 0x1c9, 0x1c8, 0x1c7, 0x1c5, 0x1c4, 0x1c2, 0x1c1, 0x1c0, 0x1be, 0x1bd, 0x1bc, 0x1ba, 0x1b9, 0x1b8, 0x1b7, 0x1b5, 0x1b4, 0x1b3, 0x1b2, 0x1b0, 0x1af, 0x1ae, 0x1ad, 0x1ac, 0x1ab, 0x1a9, 0x1a8, 0x1a7, 0x1a6, 0x1a5, 0x1a4, 0x1a3, 0x1a2, 0x1a0, 0x19f, 0x19e, 0x19d, 0x19c, 0x19b, 0x19a, 0x199, 0x198, 0x197, 0x196, 0x195, 0x194, 0x193, 0x192, 0x191, 0x190, 0x18f, 0x18e, 0x18d, 0x18c, 0x18b, 0x18b, 0x18a, 0x189, 0x188, 0x187, 0x186, 0x185, 0x184, 0x183, 0x183, 0x182, 0x181, 0x180, 0x17f, 0x17e, 0x17d, 0x17d, 0x17c, 0x17b, 0x17a, 0x179, 0x179, 0x178, 0x177, 0x176, 0x175, 0x175, 0x174, 0x173, 0x172, 0x172, 0x171, 0x170, 0x16f, 0x16f, 0x16e, 0x16d, 0x16c, 0x16c, 0x16b, 0x16a, 0x16a, 0x169, 0x168, 0x167, 0x167, 0x166, 0x165, 0x165, 0x164, 0x163, 0x163, 0x162, 0x161, 0x161, 0x160, 0x15f, 0x15f, 0x15e, 0x15d, 0x15d, 0x15c, 0x15c, 0x15b, 0x15a, 0x15a, 0x159, 0x158, 0x158, 0x157, 0x157, 0x156, 0x155, 0x155, 0x154, 0x154, 0x153, 0x152, 0x152, 0x151, 0x151, 0x150, 0x150, 0x14f, 0x14e, 0x14e, 0x14d, 0x14d, 0x14c, 0x14c, 0x14b, 0x14b, 0x14a, 0x14a, 0x149, 0x148, 0x148, 0x147, 0x147, 0x146, 0x146, 0x145, 0x145, 0x144, 0x144, 0x143, 0x143, 0x142, 0x142, 0x141, 0x141, 0x140, 0x140, 0x13f, 0x13f, 0x13e, 0x13e, 0x13d, 0x13d, 0x13c, 0x13c, 0x13c, 0x13b, 0x13b, 0x13a, 0x13a, 0x139, 0x139, 0x138, 0x138, 0x137, 0x137, 0x136, 0x136, 0x136, 0x135, 0x135, 0x134, 0x134, 0x133, 0x133, 0x133, 0x132, 0x132, 0x131, 0x131, 0x130, 0x130, 0x130, 0x12f, 0x12f, 0x12e, 0x12e, 0x12d, 0x12d, 0x12d, 0x12c, 0x12c, 0x12b, 0x12b, 0x12b, 0x12a, 0x12a, 0x129, 0x129, 0x129, 0x128, 0x128, 0x127, 0x127, 0x127, 0x126, 0x126, 0x126, 0x125, 0x125, 0x124, 0x124, 0x124, 0x123, 0x123, 0x123, 0x122, 0x122, 0x121, 0x121, 0x121, 0x120, 0x120, 0x120, 0x11f, 0x11f, 0x11f, 0x11e, 0x11e, 0x11e, 0x11d, 0x11d, 0x11d, 0x11c, 0x11c, 0x11c, 0x11b, 0x11b, 0x11a, 0x11a, 0x11a, 0x119, 0x119, 0x119, 0x118, 0x118, 0x118, 0x117, 0x117, 0x117, 0x117, 0x116, 0x116, 0x116, 0x115, 0x115, 0x115, 0x114, 0x114, 0x114, 0x113, 0x113, 0x113, 0x112, 0x112, 0x112, 0x111, 0x111, 0x111, 0x111, 0x110, 0x110, 0x110, 0x10f, 0x10f, 0x10f, 0x10e, 0x10e, 0x10e, 0x10e, 0x10d, 0x10d, 0x10d, 0x10c, 0x10c, 0x10c, 0x10c, 0x10b, 0x10b, 0x10b, 0x10a, 0x10a, 0x10a, 0x10a, 0x109, 0x109, 0x109, 0x108, 0x108, 0x108, 0x108, 0x107, 0x107, 0x107, 0x107, 0x106, 0x106, 0x106, 0x105, 0x105, 0x105, 0x105, 0x104, 0x104, 0x104, 0x104, 0x103, 0x103, 0x103, 0x103, 0x102, 0x102, 0x102, 0x102, 0x101, 0x101, 0x101, 0x101, 0x100, 0x100, 0x100, 0x100, 0xff, 0xff, 0xff, 0xff, 0xfe, 0xfe, 0xfe, 0xfe, 0xfd, 0xfd, 0xfd, 0xfd, 0xfc, 0xfc, 0xfc, 0xfc, 0xfb, 0xfb, 0xfb, 0xfb, 0xfa, 0xfa, 0xfa, 0xfa, 0xf9, 0xf9, 0xf9, 0xf9, 0xf9, 0xf8, 0xf8, 0xf8, 0xf8, 0xf7, 0xf7, 0xf7, 0xf7, 0xf6, 0xf6, 0xf6, 0xf6, 0xf6, 0xf5, 0xf5, 0xf5, 0xf5, 0xf5, 0xf4, 0xf4, 0xf4, 0xf4, 0xf3, 0xf3, 0xf3, 0xf3, 0xf3, 0xf2, 0xf2, 0xf2, 0xf2, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xef, 0xef, 0xef, 0xef, 0xef, 0xee, 0xee, 0xee, 0xee, 0xee, 0xed, 0xed, 0xed, 0xed, 0xed, 0xec, 0xec, 0xec, 0xec, 0xec, 0xeb, 0xeb, 0xeb, 0xeb, 0xeb, 0xea, 0xea, 0xea, 0xea, 0xea, 0xe9, 0xe9, 0xe9, 0xe9, 0xe9, 0xe9, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe7, 0xe7, 0xe7, 0xe7, 0xe7, 0xe6, 0xe6, 0xe6, 0xe6, 0xe6, 0xe6, 0xe5, 0xe5, 0xe5, 0xe5, 0xe5, 0xe4, 0xe4, 0xe4, 0xe4, 0xe4, 0xe4, 0xe3, 0xe3, 0xe3, 0xe3, 0xe3, 0xe3, 0xe2, 0xe2, 0xe2, 0xe2, 0xe2, 0xe1, 0xe1, 0xe1, 0xe1, 0xe1, 0xe1, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xdf, 0xdf, 0xdf, 0xdf, 0xdf, 0xdf, 0xde, 0xde, 0xde, 0xde, 0xde, 0xde, 0xdd, 0xdd, 0xdd, 0xdd, 0xdd, 0xdd, 0xdd, 0xdc, 0xdc, 0xdc, 0xdc, 0xdc, 0xdc, 0xdb, 0xdb, 0xdb, 0xdb, 0xdb, 0xdb, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xd9, 0xd9, 0xd9, 0xd9, 0xd9, 0xd9, 0xd8, 0xd8, 0xd8, 0xd8, 0xd8, 0xd8, 0xd8, 0xd7, 0xd7, 0xd7, 0xd7, 0xd7, 0xd7, 0xd6, 0xd6, 0xd6, 0xd6, 0xd6, 0xd6, 0xd6, 0xd5, 0xd5, 0xd5, 0xd5, 0xd5, 0xd5, 0xd5, 0xd4, 0xd4, 0xd4, 0xd4, 0xd4, 0xd4, 0xd4, 0xd3, 0xd3, 0xd3, 0xd3, 0xd3, 0xd3, 0xd3, 0xd2, 0xd2, 0xd2, 0xd2, 0xd2, 0xd2, 0xd2, 0xd1, 0xd1, 0xd1, 0xd1, 0xd1, 0xd1, 0xd1, 0xd1, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xcf, 0xcf, 0xcf, 0xcf, 0xcf, 0xcf, 0xcf, 0xcf, 0xce, 0xce, 0xce, 0xce, 0xce, 0xce, 0xce, 0xcd, 0xcd, 0xcd, 0xcd, 0xcd, 0xcd, 0xcd, 0xcd, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcb, 0xcb, 0xcb, 0xcb, 0xcb, 0xcb, 0xcb, 0xcb, 0xca, 0xca, 0xca, 0xca, 0xca, 0xca, 0xca, 0xca, 0xc9, 0xc9, 0xc9, 0xc9, 0xc9, 0xc9, 0xc9, 0xc9, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc7, 0xc7, 0xc7, 0xc7, 0xc7, 0xc7, 0xc7, 0xc7, 0xc7, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc5, 0xc5, 0xc5, 0xc5, 0xc5, 0xc5, 0xc5, 0xc5, 0xc5, 0xc4, 0xc4, 0xc4, 0xc4, 0xc4, 0xc4, 0xc4, 0xc4, 0xc4, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xc2, 0xc2, 0xc2, 0xc2, 0xc2, 0xc2, 0xc2, 0xc2, 0xc2, 0xc1, 0xc1, 0xc1, 0xc1, 0xc1, 0xc1, 0xc1, 0xc1, 0xc1, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbe, 0xbe, 0xbe, 0xbe, 0xbe, 0xbe, 0xbe, 0xbe, 0xbe, 0xbe, 0xbd, 0xbd, 0xbd, 0xbd, 0xbd, 0xbd, 0xbd, 0xbd, 0xbd, 0xbd, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0xba, 0xba, 0xba, 0xba, 0xba, 0xba, 0xba, 0xba, 0xba, 0xba, 0xb9, 0xb9, 0xb9, 0xb9, 0xb9, 0xb9, 0xb9, 0xb9, 0xb9, 0xb9, 0xb9, 0xb8, 0xb8, 0xb8, 0xb8, 0xb8, 0xb8, 0xb8, 0xb8, 0xb8, 0xb8, 0xb8, 0xb7, 0xb7, 0xb7, 0xb7, 0xb7, 0xb7, 0xb7, 0xb7, 0xb7, 0xb7, 0xb6, 0xb6, 0xb6, 0xb6, 0xb6, 0xb6, 0xb6, 0xb6, 0xb6, 0xb6, 0xb6, 0xb5, 0xb5, 0xb5, 0xb5, 0xb5, 0xb5, 0xb5, 0xb5, 0xb5, 0xb5, 0xb5, 0xb5};
+static const short lookup_table[1025] = {0x7fff, 0x16a0, 0x1000, 0xd10, 0xb50, 0xa1e, 0x93c, 0x88d, 0x800, 0x78a, 0x727, 0x6d2, 0x688, 0x646, 0x60c, 0x5d7, 0x5a8, 0x57c, 0x555, 0x530, 0x50f, 0x4f0, 0x4d2, 0x4b7, 0x49e, 0x486, 0x470, 0x45a, 0x446, 0x433, 0x421, 0x410, 0x400, 0x3f0, 0x3e1, 0x3d3, 0x3c5, 0x3b8, 0x3ab, 0x39f, 0x393, 0x388, 0x37d, 0x373, 0x369, 0x35f, 0x356, 0x34c, 0x344, 0x33b, 0x333, 0x32b, 0x323, 0x31b, 0x314, 0x30d, 0x306, 0x2ff, 0x2f8, 0x2f2, 0x2eb, 0x2e5, 0x2df, 0x2d9, 0x2d4, 0x2ce, 0x2c9, 0x2c3, 0x2be, 0x2b9, 0x2b4, 0x2af, 0x2aa, 0x2a5, 0x2a1, 0x29c, 0x298, 0x294, 0x28f, 0x28b, 0x287, 0x283, 0x27f, 0x27b, 0x278, 0x274, 0x270, 0x26d, 0x269, 0x266, 0x262, 0x25f, 0x25b, 0x258, 0x255, 0x252, 0x24f, 0x24c, 0x249, 0x246, 0x243, 0x240, 0x23d, 0x23a, 0x238, 0x235, 0x232, 0x22f, 0x22d, 0x22a, 0x228, 0x225, 0x223, 0x220, 0x21e, 0x21c, 0x219, 0x217, 0x215, 0x213, 0x210, 0x20e, 0x20c, 0x20a, 0x208, 0x206, 0x204, 0x202, 0x200, 0x1fe, 0x1fc, 0x1fa, 0x1f8, 0x1f6, 0x1f4, 0x1f2, 0x1f0, 0x1ee, 0x1ed, 0x1eb, 0x1e9, 0x1e7, 0x1e6, 0x1e4, 0x1e2, 0x1e1, 0x1df, 0x1dd, 0x1dc, 0x1da, 0x1d8, 0x1d7, 0x1d5, 0x1d4, 0x1d2, 0x1d1, 0x1cf, 0x1ce, 0x1cc, 0x1cb, 0x1c9, 0x1c8, 0x1c7, 0x1c5, 0x1c4, 0x1c2, 0x1c1, 0x1c0, 0x1be, 0x1bd, 0x1bc, 0x1ba, 0x1b9, 0x1b8, 0x1b7, 0x1b5, 0x1b4, 0x1b3, 0x1b2, 0x1b0, 0x1af, 0x1ae, 0x1ad, 0x1ac, 0x1ab, 0x1a9, 0x1a8, 0x1a7, 0x1a6, 0x1a5, 0x1a4, 0x1a3, 0x1a2, 0x1a0, 0x19f, 0x19e, 0x19d, 0x19c, 0x19b, 0x19a, 0x199, 0x198, 0x197, 0x196, 0x195, 0x194, 0x193, 0x192, 0x191, 0x190, 0x18f, 0x18e, 0x18d, 0x18c, 0x18b, 0x18b, 0x18a, 0x189, 0x188, 0x187, 0x186, 0x185, 0x184, 0x183, 0x183, 0x182, 0x181, 0x180, 0x17f, 0x17e, 0x17d, 0x17d, 0x17c, 0x17b, 0x17a, 0x179, 0x179, 0x178, 0x177, 0x176, 0x175, 0x175, 0x174, 0x173, 0x172, 0x172, 0x171, 0x170, 0x16f, 0x16f, 0x16e, 0x16d, 0x16c, 0x16c, 0x16b, 0x16a, 0x16a, 0x169, 0x168, 0x167, 0x167, 0x166, 0x165, 0x165, 0x164, 0x163, 0x163, 0x162, 0x161, 0x161, 0x160, 0x15f, 0x15f, 0x15e, 0x15d, 0x15d, 0x15c, 0x15c, 0x15b, 0x15a, 0x15a, 0x159, 0x158, 0x158, 0x157, 0x157, 0x156, 0x155, 0x155, 0x154, 0x154, 0x153, 0x152, 0x152, 0x151, 0x151, 0x150, 0x150, 0x14f, 0x14e, 0x14e, 0x14d, 0x14d, 0x14c, 0x14c, 0x14b, 0x14b, 0x14a, 0x14a, 0x149, 0x148, 0x148, 0x147, 0x147, 0x146, 0x146, 0x145, 0x145, 0x144, 0x144, 0x143, 0x143, 0x142, 0x142, 0x141, 0x141, 0x140, 0x140, 0x13f, 0x13f, 0x13e, 0x13e, 0x13d, 0x13d, 0x13c, 0x13c, 0x13c, 0x13b, 0x13b, 0x13a, 0x13a, 0x139, 0x139, 0x138, 0x138, 0x137, 0x137, 0x136, 0x136, 0x136, 0x135, 0x135, 0x134, 0x134, 0x133, 0x133, 0x133, 0x132, 0x132, 0x131, 0x131, 0x130, 0x130, 0x130, 0x12f, 0x12f, 0x12e, 0x12e, 0x12d, 0x12d, 0x12d, 0x12c, 0x12c, 0x12b, 0x12b, 0x12b, 0x12a, 0x12a, 0x129, 0x129, 0x129, 0x128, 0x128, 0x127, 0x127, 0x127, 0x126, 0x126, 0x126, 0x125, 0x125, 0x124, 0x124, 0x124, 0x123, 0x123, 0x123, 0x122, 0x122, 0x121, 0x121, 0x121, 0x120, 0x120, 0x120, 0x11f, 0x11f, 0x11f, 0x11e, 0x11e, 0x11e, 0x11d, 0x11d, 0x11d, 0x11c, 0x11c, 0x11c, 0x11b, 0x11b, 0x11a, 0x11a, 0x11a, 0x119, 0x119, 0x119, 0x118, 0x118, 0x118, 0x117, 0x117, 0x117, 0x117, 0x116, 0x116, 0x116, 0x115, 0x115, 0x115, 0x114, 0x114, 0x114, 0x113, 0x113, 0x113, 0x112, 0x112, 0x112, 0x111, 0x111, 0x111, 0x111, 0x110, 0x110, 0x110, 0x10f, 0x10f, 0x10f, 0x10e, 0x10e, 0x10e, 0x10e, 0x10d, 0x10d, 0x10d, 0x10c, 0x10c, 0x10c, 0x10c, 0x10b, 0x10b, 0x10b, 0x10a, 0x10a, 0x10a, 0x10a, 0x109, 0x109, 0x109, 0x108, 0x108, 0x108, 0x108, 0x107, 0x107, 0x107, 0x107, 0x106, 0x106, 0x106, 0x105, 0x105, 0x105, 0x105, 0x104, 0x104, 0x104, 0x104, 0x103, 0x103, 0x103, 0x103, 0x102, 0x102, 0x102, 0x102, 0x101, 0x101, 0x101, 0x101, 0x100, 0x100, 0x100, 0x100, 0xff, 0xff, 0xff, 0xff, 0xfe, 0xfe, 0xfe, 0xfe, 0xfd, 0xfd, 0xfd, 0xfd, 0xfc, 0xfc, 0xfc, 0xfc, 0xfb, 0xfb, 0xfb, 0xfb, 0xfa, 0xfa, 0xfa, 0xfa, 0xf9, 0xf9, 0xf9, 0xf9, 0xf9, 0xf8, 0xf8, 0xf8, 0xf8, 0xf7, 0xf7, 0xf7, 0xf7, 0xf6, 0xf6, 0xf6, 0xf6, 0xf6, 0xf5, 0xf5, 0xf5, 0xf5, 0xf5, 0xf4, 0xf4, 0xf4, 0xf4, 0xf3, 0xf3, 0xf3, 0xf3, 0xf3, 0xf2, 0xf2, 0xf2, 0xf2, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xef, 0xef, 0xef, 0xef, 0xef, 0xee, 0xee, 0xee, 0xee, 0xee, 0xed, 0xed, 0xed, 0xed, 0xed, 0xec, 0xec, 0xec, 0xec, 0xec, 0xeb, 0xeb, 0xeb, 0xeb, 0xeb, 0xea, 0xea, 0xea, 0xea, 0xea, 0xe9, 0xe9, 0xe9, 0xe9, 0xe9, 0xe9, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe7, 0xe7, 0xe7, 0xe7, 0xe7, 0xe6, 0xe6, 0xe6, 0xe6, 0xe6, 0xe6, 0xe5, 0xe5, 0xe5, 0xe5, 0xe5, 0xe4, 0xe4, 0xe4, 0xe4, 0xe4, 0xe4, 0xe3, 0xe3, 0xe3, 0xe3, 0xe3, 0xe3, 0xe2, 0xe2, 0xe2, 0xe2, 0xe2, 0xe1, 0xe1, 0xe1, 0xe1, 0xe1, 0xe1, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xdf, 0xdf, 0xdf, 0xdf, 0xdf, 0xdf, 0xde, 0xde, 0xde, 0xde, 0xde, 0xde, 0xdd, 0xdd, 0xdd, 0xdd, 0xdd, 0xdd, 0xdd, 0xdc, 0xdc, 0xdc, 0xdc, 0xdc, 0xdc, 0xdb, 0xdb, 0xdb, 0xdb, 0xdb, 0xdb, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xd9, 0xd9, 0xd9, 0xd9, 0xd9, 0xd9, 0xd8, 0xd8, 0xd8, 0xd8, 0xd8, 0xd8, 0xd8, 0xd7, 0xd7, 0xd7, 0xd7, 0xd7, 0xd7, 0xd6, 0xd6, 0xd6, 0xd6, 0xd6, 0xd6, 0xd6, 0xd5, 0xd5, 0xd5, 0xd5, 0xd5, 0xd5, 0xd5, 0xd4, 0xd4, 0xd4, 0xd4, 0xd4, 0xd4, 0xd4, 0xd3, 0xd3, 0xd3, 0xd3, 0xd3, 0xd3, 0xd3, 0xd2, 0xd2, 0xd2, 0xd2, 0xd2, 0xd2, 0xd2, 0xd1, 0xd1, 0xd1, 0xd1, 0xd1, 0xd1, 0xd1, 0xd1, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xcf, 0xcf, 0xcf, 0xcf, 0xcf, 0xcf, 0xcf, 0xcf, 0xce, 0xce, 0xce, 0xce, 0xce, 0xce, 0xce, 0xcd, 0xcd, 0xcd, 0xcd, 0xcd, 0xcd, 0xcd, 0xcd, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcb, 0xcb, 0xcb, 0xcb, 0xcb, 0xcb, 0xcb, 0xcb, 0xca, 0xca, 0xca, 0xca, 0xca, 0xca, 0xca, 0xca, 0xc9, 0xc9, 0xc9, 0xc9, 0xc9, 0xc9, 0xc9, 0xc9, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc7, 0xc7, 0xc7, 0xc7, 0xc7, 0xc7, 0xc7, 0xc7, 0xc7, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc5, 0xc5, 0xc5, 0xc5, 0xc5, 0xc5, 0xc5, 0xc5, 0xc5, 0xc4, 0xc4, 0xc4, 0xc4, 0xc4, 0xc4, 0xc4, 0xc4, 0xc4, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xc2, 0xc2, 0xc2, 0xc2, 0xc2, 0xc2, 0xc2, 0xc2, 0xc2, 0xc1, 0xc1, 0xc1, 0xc1, 0xc1, 0xc1, 0xc1, 0xc1, 0xc1, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbe, 0xbe, 0xbe, 0xbe, 0xbe, 0xbe, 0xbe, 0xbe, 0xbe, 0xbe, 0xbd, 0xbd, 0xbd, 0xbd, 0xbd, 0xbd, 0xbd, 0xbd, 0xbd, 0xbd, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0xba, 0xba, 0xba, 0xba, 0xba, 0xba, 0xba, 0xba, 0xba, 0xba, 0xb9, 0xb9, 0xb9, 0xb9, 0xb9, 0xb9, 0xb9, 0xb9, 0xb9, 0xb9, 0xb9, 0xb8, 0xb8, 0xb8, 0xb8, 0xb8, 0xb8, 0xb8, 0xb8, 0xb8, 0xb8, 0xb8, 0xb7, 0xb7, 0xb7, 0xb7, 0xb7, 0xb7, 0xb7, 0xb7, 0xb7, 0xb7, 0xb6, 0xb6, 0xb6, 0xb6, 0xb6, 0xb6, 0xb6, 0xb6, 0xb6, 0xb6, 0xb6, 0xb5, 0xb5, 0xb5, 0xb5, 0xb5, 0xb5, 0xb5, 0xb5, 0xb5, 0xb5, 0xb5, 0xb5};
 
 short invSqrt(short x)
 {
diff --git a/openair1/PHY/TOOLS/oai_dfts.c b/openair1/PHY/TOOLS/oai_dfts.c
index dcb206b5a40e3f2adb0f8fbcfa5d4611c1c19882..36bae5bd3ddea5c9781fd6070f746650226fdfd9 100644
--- a/openair1/PHY/TOOLS/oai_dfts.c
+++ b/openair1/PHY/TOOLS/oai_dfts.c
@@ -1513,7 +1513,6 @@ const static int16_t tw64c[96] __attribute__((aligned(32))) = {
 -12539,-30273,-20787,-25330,-27244,-18205,-31356,-9512
                                                  };
 #define simd_q15_t simde__m128i
-#define simdshort_q15_t simde__m64
 #define shiftright_int16(a,shift) simde_mm_srai_epi16(a,shift)
 #define mulhi_int16(a,b) simde_mm_mulhrs_epi16 (a,b)
 #define simd256_q15_t simde__m256i
@@ -1661,8 +1660,6 @@ void dft64(int16_t *x,int16_t *y,unsigned char scale)
     y256[7] = shiftright_int16_simd256(y256[7], 1);
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void idft64(int16_t *x,int16_t *y,unsigned char scale)
@@ -1760,8 +1757,6 @@ void idft64(int16_t *x,int16_t *y,unsigned char scale)
     y256[7]  = shiftright_int16_simd256(y256[7],3);
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static const int16_t tw128[128] __attribute__((aligned(32))) = {  32767,0,32727,-1608,32609,-3212,32412,-4808,32137,-6393,31785,-7962,31356,-9512,30851,-11039,30272,-12540,29621,-14010,28897,-15447,28105,-16846,27244,-18205,26318,-19520,25329,-20788,24278,-22005,23169,-23170,22004,-24279,20787,-25330,19519,-26319,18204,-27245,16845,-28106,15446,-28898,14009,-29622,12539,-30273,11038,-30852,9511,-31357,7961,-31786,6392,-32138,4807,-32413,3211,-32610,1607,-32728,0,-32767,-1608,-32728,-3212,-32610,-4808,-32413,-6393,-32138,-7962,-31786,-9512,-31357,-11039,-30852,-12540,-30273,-14010,-29622,-15447,-28898,-16846,-28106,-18205,-27245,-19520,-26319,-20788,-25330,-22005,-24279,-23170,-23170,-24279,-22005,-25330,-20788,-26319,-19520,-27245,-18205,-28106,-16846,-28898,-15447,-29622,-14010,-30273,-12540,-30852,-11039,-31357,-9512,-31786,-7962,-32138,-6393,-32413,-4808,-32610,-3212,-32728,-1608};
@@ -1998,8 +1993,6 @@ void dft256(int16_t *x,int16_t *y,unsigned char scale)
 
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void idft256(int16_t *x,int16_t *y,unsigned char scale)
@@ -2082,8 +2075,6 @@ void idft256(int16_t *x,int16_t *y,unsigned char scale)
 
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static const int16_t tw512[512] __attribute__((aligned(32))) = {
@@ -2307,8 +2298,6 @@ void dft1024(int16_t *x,int16_t *y,unsigned char scale)
 
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void idft1024(int16_t *x,int16_t *y,unsigned char scale)
@@ -2362,8 +2351,6 @@ void idft1024(int16_t *x,int16_t *y,unsigned char scale)
 
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 int16_t tw2048[2048] __attribute__((aligned(32)));
@@ -2454,8 +2441,6 @@ void dft2048(int16_t *x,int16_t *y,unsigned char scale)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void idft2048(int16_t *x,int16_t *y,unsigned char scale)
@@ -2543,8 +2528,6 @@ void idft2048(int16_t *x,int16_t *y,unsigned char scale)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 int16_t tw4096[3*2*1024];
@@ -2600,8 +2583,6 @@ void dft4096(int16_t *x,int16_t *y,unsigned char scale)
 
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void idft4096(int16_t *x,int16_t *y,unsigned char scale)
@@ -2655,8 +2636,6 @@ void idft4096(int16_t *x,int16_t *y,unsigned char scale)
 
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 int16_t tw8192[2*4096] __attribute__((aligned(32)));
@@ -2747,8 +2726,6 @@ void dft8192(int16_t *x,int16_t *y,unsigned char scale)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void idft8192(int16_t *x,int16_t *y,unsigned char scale)
@@ -2836,8 +2813,6 @@ void idft8192(int16_t *x,int16_t *y,unsigned char scale)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 int16_t tw16384[3*2*4096] __attribute__((aligned(32)));
@@ -2893,8 +2868,6 @@ void dft16384(int16_t *x,int16_t *y,unsigned char scale)
 
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void idft16384(int16_t *x,int16_t *y,unsigned char scale)
@@ -2948,8 +2921,6 @@ void idft16384(int16_t *x,int16_t *y,unsigned char scale)
 
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 int16_t tw32768[2*16384] __attribute__((aligned(32)));
@@ -3040,8 +3011,6 @@ void dft32768(int16_t *x,int16_t *y,unsigned char scale)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void idft32768(int16_t *x,int16_t *y,unsigned char scale)
@@ -3129,8 +3098,6 @@ void idft32768(int16_t *x,int16_t *y,unsigned char scale)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 int16_t twa768[512],twb768[512];
@@ -3183,8 +3150,6 @@ void idft768(int16_t *input, int16_t *output, unsigned char scale)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void dft768(int16_t *input, int16_t *output, unsigned char scale)
@@ -3246,8 +3211,6 @@ void dft768(int16_t *input, int16_t *output, unsigned char scale)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 int16_t twa1536[1024],twb1536[1024];
 
@@ -3299,8 +3262,6 @@ void idft1536(int16_t *input, int16_t *output, unsigned char scale)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void dft1536(int16_t *input, int16_t *output, unsigned char scale)
@@ -3362,8 +3323,6 @@ void dft1536(int16_t *input, int16_t *output, unsigned char scale)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 int16_t twa3072[2048] __attribute__((aligned(32)));
@@ -3415,8 +3374,6 @@ void dft3072(int16_t *input, int16_t *output,unsigned char scale)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void idft3072(int16_t *input, int16_t *output,unsigned char scale)
@@ -3465,8 +3422,6 @@ void idft3072(int16_t *input, int16_t *output,unsigned char scale)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -3527,8 +3482,6 @@ void idft6144(int16_t *input, int16_t *output,unsigned char scale)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -3590,8 +3543,6 @@ void dft6144(int16_t *input, int16_t *output,unsigned char scale)
       y128p+=16;
     }
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 int16_t twa12288[8192] __attribute__((aligned(32)));
@@ -3654,8 +3605,6 @@ void dft12288(int16_t *input, int16_t *output,unsigned char scale)
       y128p+=16;
     }
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void idft12288(int16_t *input, int16_t *output,unsigned char scale)
@@ -3712,8 +3661,6 @@ void idft12288(int16_t *input, int16_t *output,unsigned char scale)
       y128p+=16;
     }
   }
-  simde_mm_empty();
-  simde_m_empty();
 #ifndef MR_MAIN
   if (LOG_DUMPFLAG(DEBUG_DFT)) {
      LOG_M("idft12288out.m","out",output,6144,1,1);
@@ -3768,8 +3715,6 @@ void dft18432(int16_t *input, int16_t *output,unsigned char scale) {
       y128p+=16;
     }
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void idft18432(int16_t *input, int16_t *output,unsigned char scale) {
@@ -3816,8 +3761,6 @@ void idft18432(int16_t *input, int16_t *output,unsigned char scale) {
       y128p+=16;
     }
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -3882,8 +3825,6 @@ void dft24576(int16_t *input, int16_t *output,unsigned char scale)
       y128p+=16;
     }
   }
-  simde_mm_empty();
-  simde_m_empty();
 #ifndef MR_MAIN
   if (LOG_DUMPFLAG(DEBUG_DFT)) {
      LOG_M("out.m","out",output,24576,1,1);
@@ -3942,8 +3883,6 @@ void idft24576(int16_t *input, int16_t *output,unsigned char scale)
       y128p+=16;
     }
   }
-  simde_mm_empty();
-  simde_m_empty();
 #ifndef MR_MAIN
   if (LOG_DUMPFLAG(DEBUG_DFT)) {
     LOG_M("idft24576out.m","out",output,24576,1,1);
@@ -4006,8 +3945,6 @@ void dft36864(int16_t *input, int16_t *output,uint8_t scale) {
       y128p+=16;
     }
   }
-  simde_mm_empty();
-  simde_m_empty();
 #ifndef MR_MAIN
   if (LOG_DUMPFLAG(DEBUG_DFT)) {
      LOG_M("out.m","out",output,36864,1,1);
@@ -4059,8 +3996,6 @@ void idft36864(int16_t *input, int16_t *output,uint8_t scale) {
       y128p+=16;
     }
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 int16_t twa49152[32768] __attribute__((aligned(32)));
@@ -4111,8 +4046,6 @@ void dft49152(int16_t *input, int16_t *output,uint8_t scale) {
       y128p+=16;
     }
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void idft49152(int16_t *input, int16_t *output,uint8_t scale) {
@@ -4159,8 +4092,6 @@ void idft49152(int16_t *input, int16_t *output,uint8_t scale) {
       y128p+=16;
     }
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 int16_t tw65536[3*2*16384] __attribute__((aligned(32)));
@@ -4216,8 +4147,6 @@ void idft65536(int16_t *x,int16_t *y,unsigned char scale)
 
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 int16_t twa98304[65536] __attribute__((aligned(32)));
@@ -4267,8 +4196,6 @@ void dft98304(int16_t *input, int16_t *output,uint8_t scale) {
       y128p+=16;
     }
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 void idft98304(int16_t *input, int16_t *output,uint8_t scale) {
@@ -4315,8 +4242,6 @@ void idft98304(int16_t *input, int16_t *output,uint8_t scale) {
       y128p+=16;
     }
   }
-  simde_mm_empty();
-  simde_m_empty();
 }
 
  
@@ -4470,8 +4395,6 @@ void dft12(int16_t *x,int16_t *y ,unsigned char scale_flag)
          &y128[10],
          &y128[11]);
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static const int16_t W1_12s_256[16] __attribute__((aligned(32))) =
@@ -4641,8 +4564,6 @@ void dft12_simd256(int16_t *x,int16_t *y)
                  &y256[10],
                  &y256[11]);
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t tw24[88]__attribute__((aligned(32)));
@@ -4735,8 +4656,6 @@ void dft24(int16_t *x,int16_t *y,unsigned char scale_flag)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t twa36[88]__attribute__((aligned(32)));
@@ -4857,8 +4776,6 @@ void dft36(int16_t *x,int16_t *y,unsigned char scale_flag)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t twa48[88]__attribute__((aligned(32)));
@@ -5017,8 +4934,6 @@ void dft48(int16_t *x, int16_t *y,unsigned char scale_flag)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t twa60[88]__attribute__((aligned(32)));
@@ -5201,8 +5116,6 @@ void dft60(int16_t *x,int16_t *y,unsigned char scale)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t tw72[280]__attribute__((aligned(32)));
@@ -5244,8 +5157,6 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t tw96[376]__attribute__((aligned(32)));
@@ -5289,8 +5200,6 @@ void dft96(int16_t *x,int16_t *y,unsigned char scale_flag)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t twa108[280]__attribute__((aligned(32)));
@@ -5339,8 +5248,6 @@ void dft108(int16_t *x,int16_t *y,unsigned char scale_flag)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t tw120[472]__attribute__((aligned(32)));
@@ -5380,8 +5287,6 @@ void dft120(int16_t *x,int16_t *y, unsigned char scale_flag)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t twa144[376]__attribute__((aligned(32)));
@@ -5430,8 +5335,6 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t twa180[472]__attribute__((aligned(32)));
@@ -5481,8 +5384,6 @@ void dft180(int16_t *x,int16_t *y,unsigned char scale_flag)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t twa192[376]__attribute__((aligned(32)));
@@ -5539,8 +5440,6 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t twa216[568]__attribute__((aligned(32)));
@@ -5590,8 +5489,6 @@ void dft216(int16_t *x,int16_t *y,unsigned char scale_flag)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t twa240[472]__attribute__((aligned(32)));
@@ -5648,8 +5545,6 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t twa288[760]__attribute__((aligned(32)));
@@ -5699,8 +5594,6 @@ void dft288(int16_t *x,int16_t *y,unsigned char scale_flag)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t twa300[472]__attribute__((aligned(32)));
@@ -5764,8 +5657,6 @@ void dft300(int16_t *x,int16_t *y,unsigned char scale_flag)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t twa324[107*2*4];
@@ -5814,8 +5705,6 @@ void dft324(int16_t *x,int16_t *y,unsigned char scale_flag)  // 108 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa360[119*2*4];
@@ -5864,8 +5753,6 @@ void dft360(int16_t *x,int16_t *y,unsigned char scale_flag)  // 120 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa384[95*2*4];
@@ -5921,8 +5808,6 @@ void dft384(int16_t *x,int16_t *y,unsigned char scale_flag)  // 96 x 4
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa432[107*2*4];
@@ -5977,8 +5862,6 @@ void dft432(int16_t *x,int16_t *y,unsigned char scale_flag)  // 108 x 4
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 static int16_t twa480[119*2*4];
 static int16_t twb480[119*2*4];
@@ -6033,8 +5916,6 @@ void dft480(int16_t *x,int16_t *y,unsigned char scale_flag)  // 120 x 4
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 
@@ -6084,8 +5965,6 @@ void dft540(int16_t *x,int16_t *y,unsigned char scale_flag)  // 180 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa576[191*2*4];
@@ -6135,8 +6014,6 @@ void dft576(int16_t *x,int16_t *y,unsigned char scale_flag)  // 192 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 
@@ -6179,8 +6056,6 @@ void dft600(int16_t *x,int16_t *y,unsigned char scale_flag)  // 300 x 2
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 
@@ -6230,8 +6105,6 @@ void dft648(int16_t *x,int16_t *y,unsigned char scale_flag)  // 216 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 
@@ -6289,8 +6162,6 @@ void dft720(int16_t *x,int16_t *y,unsigned char scale_flag)  // 180 x 4
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa768p[191*2*4];
@@ -6346,8 +6217,6 @@ void dft768p(int16_t *x,int16_t *y,unsigned char scale_flag) { // 192x 4;
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t twa384i[256];
@@ -6400,8 +6269,6 @@ void idft384(int16_t *input, int16_t *output, unsigned char scale)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -6451,8 +6318,6 @@ void dft864(int16_t *x,int16_t *y,unsigned char scale_flag)  // 288 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa900[299*2*4];
@@ -6501,8 +6366,6 @@ void dft900(int16_t *x,int16_t *y,unsigned char scale_flag)  // 300 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 
@@ -6560,8 +6423,6 @@ void dft960(int16_t *x,int16_t *y,unsigned char scale_flag)  // 240 x 4
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 
@@ -6611,8 +6472,6 @@ void dft972(int16_t *x,int16_t *y,unsigned char scale_flag)  // 324 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa1080[359*2*4];
@@ -6661,8 +6520,6 @@ void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag)  // 360 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa1152[287*2*4];
@@ -6719,8 +6576,6 @@ void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag)  // 288 x 4
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 int16_t twa1200[4784];
@@ -6776,8 +6631,6 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 
@@ -6828,8 +6681,6 @@ void dft1296(int16_t *x,int16_t *y,unsigned char scale_flag) //432 * 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 
@@ -6879,8 +6730,6 @@ void dft1440(int16_t *x,int16_t *y,unsigned char scale_flag)  // 480 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa1500[2392]__attribute__((aligned(32)));
@@ -6944,8 +6793,6 @@ void dft1500(int16_t *x,int16_t *y,unsigned char scale_flag)
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t twa1620[539*2*4];
@@ -6994,8 +6841,6 @@ void dft1620(int16_t *x,int16_t *y,unsigned char scale_flag)  // 540 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa1728[575*2*4];
@@ -7044,8 +6889,6 @@ void dft1728(int16_t *x,int16_t *y,unsigned char scale_flag)  // 576 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa1800[599*2*4];
@@ -7094,8 +6937,6 @@ void dft1800(int16_t *x,int16_t *y,unsigned char scale_flag)  // 600 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa1920[479*2*4];
@@ -7150,8 +6991,6 @@ void dft1920(int16_t *x,int16_t *y,unsigned char scale_flag)  // 480 x 4
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa1944[647*2*4];
@@ -7200,8 +7039,6 @@ void dft1944(int16_t *x,int16_t *y,unsigned char scale_flag)  // 648 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa2160[719*2*4];
@@ -7250,8 +7087,6 @@ void dft2160(int16_t *x,int16_t *y,unsigned char scale_flag)  // 720 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa2304[767*2*4];
@@ -7300,8 +7135,6 @@ void dft2304(int16_t *x,int16_t *y,unsigned char scale_flag)  // 768 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa2400[599*2*4];
@@ -7357,8 +7190,6 @@ void dft2400(int16_t *x,int16_t *y,unsigned char scale_flag)  // 600 x 4
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa2592[863*2*4];
@@ -7407,8 +7238,6 @@ void dft2592(int16_t *x,int16_t *y,unsigned char scale_flag)  // 864 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa2700[899*2*4];
@@ -7457,8 +7286,6 @@ void dft2700(int16_t *x,int16_t *y,unsigned char scale_flag)  // 900 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa2880[959*2*4];
@@ -7507,8 +7334,6 @@ void dft2880(int16_t *x,int16_t *y,unsigned char scale_flag)  // 960 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa2916[971*2*4];
@@ -7557,8 +7382,6 @@ void dft2916(int16_t *x,int16_t *y,unsigned char scale_flag)  // 972 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 static int16_t twa3000[599*8]__attribute__((aligned(32)));
@@ -7622,8 +7445,6 @@ void dft3000(int16_t *x,int16_t *y,unsigned char scale_flag) // 600 * 5
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 static int16_t twa3240[1079*2*4];
@@ -7672,8 +7493,6 @@ void dft3240(int16_t *x,int16_t *y,unsigned char scale_flag)  // 1080 x 3
     }
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 };
 
 void init_rad4(int N,int16_t *tw) {
diff --git a/openair1/PHY/TOOLS/oai_dfts_neon.c b/openair1/PHY/TOOLS/oai_dfts_neon.c
index f3a2e5cb46a0bb264a7b6ee0410dfe00cb2ced45..ddf8a59bf524dd99c0a44cb08d535c86a115abeb 100644
--- a/openair1/PHY/TOOLS/oai_dfts_neon.c
+++ b/openair1/PHY/TOOLS/oai_dfts_neon.c
@@ -909,9 +909,6 @@ const static int16_t tw64c[96] __attribute__((aligned(32))) = {
 #ifdef simd_q15_t
 #undef simd_q15_t
 #endif
-#ifdef simdshort_q15_t 
-#undef simdshort_q15_t
-#endif
 #ifdef shiftright_int16
 #undef shiftright_int16
 #endif
@@ -3633,8 +3630,6 @@ void idft65536(int16_t *x,int16_t *y,unsigned char scale)
 
   }
 
-  simde_mm_empty();
-  simde_m_empty();
 }
 
 int16_t twa98304[65536] __attribute__((aligned(32)));
diff --git a/openair1/PHY/TOOLS/signal_energy.c b/openair1/PHY/TOOLS/signal_energy.c
index 012677e8bb0ac54c5257ef3d62fd0af738344f93..93544da75883f13f89648e72d06026eb25b7b48a 100644
--- a/openair1/PHY/TOOLS/signal_energy.c
+++ b/openair1/PHY/TOOLS/signal_energy.c
@@ -65,52 +65,6 @@ int32_t signal_energy(int32_t *input,uint32_t length)
   return temp;
 }
 
-int32_t signal_energy_amp_shift(int32_t *input,uint32_t length)
-{
-
-  int32_t i;
-  int32_t temp,temp2;
-  register simde__m64 mm0,mm1,mm2,mm3;
-  simde__m64 *in = (simde__m64 *)input;
-
-  mm0 = simde_mm_setzero_si64();
-  mm3 = simde_mm_setzero_si64();
-
-  for (i=0; i<length>>1; i++) {
-
-    mm1 = in[i];
-    mm2 = mm1;
-    mm1 = simde_m_pmaddwd(mm1,mm1);
-    mm1 = simde_m_psradi(mm1,AMP_SHIFT);// shift any 32 bits blocs of the word by the value shift_p9
-    mm0 = simde_m_paddd(mm0,mm1);// add the two 64 bits words 4 bytes by 4 bytes
-    mm3 = simde_m_paddw(mm3,mm2);// add the two 64 bits words 2 bytes by 2 bytes
-  }
-
-  mm1 = mm0;
-  mm0 = simde_m_psrlqi(mm0,32);
-  mm0 = simde_m_paddd(mm0,mm1);
-  temp = simde_m_to_int(mm0);
-  temp/=length; // this is the average of x^2
-
-
-  // now remove the DC component
-
-
-  mm2 = simde_m_psrlqi(mm3,32);
-  mm2 = simde_m_paddw(mm2,mm3);
-  mm2 = simde_m_pmaddwd(mm2,mm2);
-  mm2 = simde_m_psradi(mm2,AMP_SHIFT); // fixed point representation of elements
-  temp2 = simde_m_to_int(mm2);
-  temp2/=(length*length);
-
-  temp -= temp2;
-
-  simde_mm_empty();
-  simde_m_empty();
-
-  return((temp>0)?temp:1);
-}
-
 uint32_t signal_energy_nodc(const c16_t *input, uint32_t length)
 {
   // init
diff --git a/openair1/PHY/TOOLS/simde_operations.c b/openair1/PHY/TOOLS/simde_operations.c
index 03a611e08dd69f212e0fe34e9a5f3385be83f028..f0dd306033f91298375948028e9bc4100c2e49d7 100644
--- a/openair1/PHY/TOOLS/simde_operations.c
+++ b/openair1/PHY/TOOLS/simde_operations.c
@@ -57,4 +57,4 @@ void simde_mm256_separate_real_imag_parts(simde__m256i *out_re, simde__m256i *ou
 
   *out_re = simde_mm256_permute4x64_epi64(tmp0, 0xd8);
   *out_im = simde_mm256_permute4x64_epi64(tmp1, 0xd8);
-}
\ No newline at end of file
+}
diff --git a/openair1/PHY/TOOLS/tools_defs.h b/openair1/PHY/TOOLS/tools_defs.h
index 41af6ba0a95c8ecd065c4d0e5208bf0035aa0e41..d7e738a9f44fc5cc39f8c25013187d8f61425c33 100644
--- a/openair1/PHY/TOOLS/tools_defs.h
+++ b/openair1/PHY/TOOLS/tools_defs.h
@@ -41,7 +41,6 @@
 #include "common/utils/LOG/log.h"
 
 #define simd_q15_t simde__m128i
-#define simdshort_q15_t simde__m64
 #define shiftright_int16(a,shift) simde_mm_srai_epi16(a,shift)
 #define set1_int16(a) simde_mm_set1_epi16(a)
 #define mulhi_int16(a,b) simde_mm_mulhrs_epi16 (a,b)
@@ -727,8 +726,6 @@ int32_t sub_cpx_vector16(int16_t *x,
 */
 int32_t signal_energy(int32_t *,uint32_t);
 
-int32_t signal_energy_amp_shift(int32_t *input, uint32_t length);
-
 #ifdef LOCALIZATION
 /*!\fn int32_t signal_energy(int *,uint32_t);
 \brief Computes the signal energy per subcarrier
diff --git a/openair1/PHY/defs_nr_UE.h b/openair1/PHY/defs_nr_UE.h
index b64cb20752ec503cdc8fac09c5ed21b3022192f6..dd035a73e3b5111609e0f4ddc35467c62697e43f 100644
--- a/openair1/PHY/defs_nr_UE.h
+++ b/openair1/PHY/defs_nr_UE.h
@@ -34,8 +34,10 @@
 
 #ifdef __cplusplus
 #include <atomic>
+#ifndef _Atomic
 #define _Atomic(X) std::atomic< X >
 #endif
+#endif
 
 #include "defs_nr_common.h"
 #include "CODING/nrPolar_tools/nr_polar_pbch_defs.h"
diff --git a/openair1/PHY/defs_nr_common.h b/openair1/PHY/defs_nr_common.h
index 64f3ef303ca6c025c240c03f0c136a401d425d9a..f6f18eb9427e2a8b02da52d0f21f2db83baaebe5 100644
--- a/openair1/PHY/defs_nr_common.h
+++ b/openair1/PHY/defs_nr_common.h
@@ -217,6 +217,8 @@ struct NR_DL_FRAME_PARMS {
   c16_t timeshift_symbol_rotation[4096*2] __attribute__ ((aligned (16)));
   /// Table used to apply the delay compensation in DL/UL
   c16_t delay_table[2 * MAX_DELAY_COMP + 1][NR_MAX_OFDM_SYMBOL_SIZE];
+  /// Table used to apply the delay compensation in PUCCH2
+  c16_t delay_table128[2 * MAX_DELAY_COMP + 1][128];
   /// SRS configuration from TS 38.331 RRC
   SRS_NR srs_nr;
   /// Power used by SSB in order to estimate signal strength and path loss
diff --git a/openair1/PHY/nr_phy_common/src/nr_phy_common.c b/openair1/PHY/nr_phy_common/src/nr_phy_common.c
index 11a40fd69e21ed2ef234736dc77718ad72cfaf2f..9aa45e20b385f1092739671d25f5f643315ce7da 100644
--- a/openair1/PHY/nr_phy_common/src/nr_phy_common.c
+++ b/openair1/PHY/nr_phy_common/src/nr_phy_common.c
@@ -25,6 +25,8 @@
 #define USE_128BIT
 #endif
 
+#define PEAK_DETECT_THRESHOLD 15
+
 int16_t saturating_sub(int16_t a, int16_t b)
 {
   int32_t result = (int32_t)a - (int32_t)b;
@@ -109,7 +111,6 @@ void nr_16qam_llr(int32_t *rxdataF_comp, int32_t *ch_mag_in, int16_t *llr, uint3
     ch_mag_128++;
   }
 
-  simde_mm_empty();
 
   nb_re &= 0x3;
   int16_t *rxDataF_i16 = (int16_t *)rxF_128;
@@ -195,9 +196,6 @@ void nr_64qam_llr(int32_t *rxdataF_comp, int32_t *ch_mag, int32_t *ch_mag2, int1
   simde__m128i *rxF_128 = (simde__m128i *)rxF;
   simde__m128i *ch_mag_128 = (simde__m128i *)ch_maga;
   simde__m128i *ch_magb_128 = (simde__m128i *)ch_magb;
-
-  simde__m64 *llr64 = (simde__m64 *)llr_32;
-
   // Each iteration does 4 RE (gives 24 16bit-llrs)
   for (int i = 0; i < (nb_re >> 2); i++) {
     simde__m128i xmm0, xmm1, xmm2;
@@ -207,12 +205,18 @@ void nr_64qam_llr(int32_t *rxdataF_comp, int32_t *ch_mag, int32_t *ch_mag2, int1
     xmm2 = simde_mm_abs_epi16(xmm1);
     xmm2 = simde_mm_subs_epi16(*ch_magb_128, xmm2);
 
-    *llr64++ = simde_mm_set_pi32(simde_mm_extract_epi32(xmm1, 0), simde_mm_extract_epi32(xmm0, 0));
-    *llr64++ = simde_mm_set_pi32(simde_mm_extract_epi32(xmm0, 1), simde_mm_extract_epi32(xmm2, 0));
-    *llr64++ = simde_mm_set_pi32(simde_mm_extract_epi32(xmm2, 1), simde_mm_extract_epi32(xmm1, 1));
-    *llr64++ = simde_mm_set_pi32(simde_mm_extract_epi32(xmm1, 2), simde_mm_extract_epi32(xmm0, 2));
-    *llr64++ = simde_mm_set_pi32(simde_mm_extract_epi32(xmm0, 3), simde_mm_extract_epi32(xmm2, 2));
-    *llr64++ = simde_mm_set_pi32(simde_mm_extract_epi32(xmm2, 3), simde_mm_extract_epi32(xmm1, 3));
+    *llr_32++ = simde_mm_extract_epi32(xmm0, 0);
+    *llr_32++ = simde_mm_extract_epi32(xmm1, 0);
+    *llr_32++ = simde_mm_extract_epi32(xmm2, 0);
+    *llr_32++ = simde_mm_extract_epi32(xmm0, 1);
+    *llr_32++ = simde_mm_extract_epi32(xmm1, 1);
+    *llr_32++ = simde_mm_extract_epi32(xmm2, 1);
+    *llr_32++ = simde_mm_extract_epi32(xmm0, 2);
+    *llr_32++ = simde_mm_extract_epi32(xmm1, 2);
+    *llr_32++ = simde_mm_extract_epi32(xmm2, 2);
+    *llr_32++ = simde_mm_extract_epi32(xmm0, 3);
+    *llr_32++ = simde_mm_extract_epi32(xmm1, 3);
+    *llr_32++ = simde_mm_extract_epi32(xmm2, 3);
     rxF_128++;
     ch_mag_128++;
     ch_magb_128++;
@@ -223,7 +227,7 @@ void nr_64qam_llr(int32_t *rxdataF_comp, int32_t *ch_mag, int32_t *ch_mag2, int1
   int16_t *rxDataF_i16 = (int16_t *)rxF_128;
   int16_t *ch_mag_i16 = (int16_t *)ch_mag_128;
   int16_t *ch_magb_i16 = (int16_t *)ch_magb_128;
-  int16_t *llr_i16 = (int16_t *)llr64;
+  int16_t *llr_i16 = (int16_t *)llr_32;
   for (int i = 0; i < nb_re; i++) {
     int16_t real = rxDataF_i16[2 * i];
     int16_t imag = rxDataF_i16[2 * i + 1];
@@ -238,7 +242,6 @@ void nr_64qam_llr(int32_t *rxdataF_comp, int32_t *ch_mag, int32_t *ch_mag2, int1
     llr_i16[6 * i + 4] = saturating_sub(mag_realb, abs(llr_i16[6 * i + 2]));
     llr_i16[6 * i + 5] = saturating_sub(mag_imagb, abs(llr_i16[6 * i + 3]));
   }
-  simde_mm_empty();
 }
 
 void nr_256qam_llr(int32_t *rxdataF_comp, int32_t *ch_mag, int32_t *ch_mag2, int32_t *ch_mag3, int16_t *llr, uint32_t nb_re)
@@ -350,7 +353,6 @@ void nr_256qam_llr(int32_t *rxdataF_comp, int32_t *ch_mag, int32_t *ch_mag2, int
       llr_i16[8 * i + 7] = saturating_sub(magc_imag, abs(llr_i16[8 * i + 5]));
     }
   }
-  simde_mm_empty();
 }
 
 void freq2time(uint16_t ofdm_symbol_size, int16_t *freq_signal, int16_t *time_signal)
@@ -367,20 +369,28 @@ void nr_est_delay(int ofdm_symbol_size, const c16_t *ls_est, c16_t *ch_estimates
   int max_val = delay->delay_max_val;
   const int sync_pos = 0;
 
+  uint64_t mean_val = 0;
   for (int i = 0; i < ofdm_symbol_size; i++) {
     int temp = c16amp2(ch_estimates_time[i]) >> 1;
+    mean_val += temp;
     if (temp > max_val) {
       max_pos = i;
       max_val = temp;
     }
   }
+  mean_val /= ofdm_symbol_size;
 
   if (max_pos > ofdm_symbol_size / 2)
     max_pos = max_pos - ofdm_symbol_size;
 
   delay->delay_max_pos = max_pos;
   delay->delay_max_val = max_val;
-  delay->est_delay = max_pos - sync_pos;
+
+  // The peak in general is quite clear. It only gives a small peak when the noise is high, generally obtaining an incorrect
+  // estimated delay, and causing the delay compensation to worsen the result instead of improving it. After analyzing several
+  // peaks, and doing many tests, a PEAK_DETECT_THRESHOLD = 15 is an adequate value, to apply delay compensation only when there is
+  // clearly a peak
+  delay->est_delay = mean_val > 0 && max_val / mean_val > PEAK_DETECT_THRESHOLD ? max_pos - sync_pos : 0;
 }
 
 unsigned int nr_get_tx_amp(int power_dBm, int power_max_dBm, int total_nb_rb, int nb_rb)
diff --git a/openair1/SCHED/phy_procedures_lte_eNb.c b/openair1/SCHED/phy_procedures_lte_eNb.c
index 2bde391979fbb41869728c6f515514505ec53f01..5d885cac91b80d9bd56f219625b4750ac00fbad9 100644
--- a/openair1/SCHED/phy_procedures_lte_eNb.c
+++ b/openair1/SCHED/phy_procedures_lte_eNb.c
@@ -1329,8 +1329,9 @@ void pusch_procedures(PHY_VARS_eNB *eNB,L1_rxtx_proc_t *proc) {
   uint32_t harq_pid0 = subframe2harq_pid(&eNB->frame_parms,frame,subframe);
 
   turboDecode_t arr[64] = {0};
-  task_ans_t ans[64] = {0};
-  thread_info_tm_t t_info = {.ans = ans, .cap = 64, .len = 0, .buf = (uint8_t *)arr};
+  task_ans_t ans;
+  init_task_ans(&ans, 64);
+  thread_info_tm_t t_info = {.ans = &ans, .cap = 64, .len = 0, .buf = (uint8_t *)arr};
 
   for (i = 0; i < NUMBER_OF_ULSCH_MAX; i++) {
     ulsch = eNB->ulsch[i];
@@ -1420,7 +1421,10 @@ void pusch_procedures(PHY_VARS_eNB *eNB,L1_rxtx_proc_t *proc) {
   const bool decode = proc->nbDecode;
   DevAssert(t_info.len == proc->nbDecode);
   if (proc->nbDecode > 0) {
-    join_task_ans(t_info.ans, t_info.len);
+    if (t_info.len != t_info.cap) {
+      completed_many_task_ans(t_info.ans, t_info.cap - t_info.len);
+    }
+    join_task_ans(t_info.ans);
     for (size_t i = 0; i < t_info.len; ++i) {
       postDecode(proc, &arr[i]);
     }
diff --git a/openair1/SCHED_NR/nr_ru_procedures.c b/openair1/SCHED_NR/nr_ru_procedures.c
index b385a1b540b2dabd9e4487f5cef5d6f59d82c95e..a0b3a69ba8c24becbbc24e8a6aa5fd8d5abb3c9c 100644
--- a/openair1/SCHED_NR/nr_ru_procedures.c
+++ b/openair1/SCHED_NR/nr_ru_procedures.c
@@ -300,14 +300,14 @@ void nr_feptx_tp(RU_t *ru, int frame_tx, int slot)
   start_meas(&ru->ofdm_total_stats);
 
   size_t const sz = ru->nb_tx + (ru->half_slot_parallelization > 0) * ru->nb_tx;
-  AssertFatal(sz < 64, "Please, increase the buffer size");
-  feptx_cmd_t arr[64] = {0};
-  task_ans_t ans[64] = {0};
+  feptx_cmd_t arr[sz];
+  task_ans_t ans;
+  init_task_ans(&ans, sz);
 
   int nbfeptx = 0;
   for (int aid = 0; aid < ru->nb_tx; aid++) {
     feptx_cmd_t *feptx_cmd = &arr[nbfeptx];
-    feptx_cmd->ans = &ans[nbfeptx];
+    feptx_cmd->ans = &ans;
 
     feptx_cmd->aid = aid;
     feptx_cmd->ru = ru;
@@ -321,7 +321,7 @@ void nr_feptx_tp(RU_t *ru, int frame_tx, int slot)
     nbfeptx++;
     if (ru->half_slot_parallelization > 0) {
       feptx_cmd_t *feptx_cmd = &arr[nbfeptx];
-      feptx_cmd->ans = &ans[nbfeptx];
+      feptx_cmd->ans = &ans;
 
       feptx_cmd->aid = aid;
       feptx_cmd->ru = ru;
@@ -334,8 +334,7 @@ void nr_feptx_tp(RU_t *ru, int frame_tx, int slot)
       nbfeptx++;
     }
   }
-
-  join_task_ans(ans, nbfeptx);
+  join_task_ans(&ans);
 
   stop_meas(&ru->ofdm_total_stats);
   if (ru->idx == 0)
@@ -379,13 +378,13 @@ void nr_fep_tp(RU_t *ru, int slot) {
   start_meas(&ru->ofdm_demod_stats);
 
   size_t const sz = ru->nb_rx + (ru->half_slot_parallelization > 0) * ru->nb_rx;
-  AssertFatal(sz < 64, "Please, increase buffer size");
-  feprx_cmd_t arr[64] = {0};
-  task_ans_t ans[64] = {0};
+  feprx_cmd_t arr[sz];
+  task_ans_t ans;
+  init_task_ans(&ans, sz);
 
   for (int aid=0;aid<ru->nb_rx;aid++) {
     feprx_cmd_t *feprx_cmd = &arr[nbfeprx];
-    feprx_cmd->ans = &ans[nbfeprx];
+    feprx_cmd->ans = &ans;
 
     feprx_cmd->aid = aid;
     feprx_cmd->ru = ru;
@@ -399,7 +398,7 @@ void nr_fep_tp(RU_t *ru, int slot) {
     nbfeprx++;
     if (ru->half_slot_parallelization > 0) {
       feprx_cmd_t *feprx_cmd = &arr[nbfeprx];
-      feprx_cmd->ans = &ans[nbfeprx];
+      feprx_cmd->ans = &ans;
 
       feprx_cmd->aid = aid;
       feprx_cmd->ru = ru;
@@ -413,8 +412,7 @@ void nr_fep_tp(RU_t *ru, int slot) {
       nbfeprx++;
     }
   }
-
-  join_task_ans(ans, nbfeprx);
+  join_task_ans(&ans);
 
   stop_meas(&ru->ofdm_demod_stats);
   if (ru->idx == 0) VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME( VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_PROCEDURES_RU_FEPRX, 0 );
diff --git a/openair1/SCHED_NR_UE/phy_frame_config_nr_ue.c b/openair1/SCHED_NR_UE/phy_frame_config_nr_ue.c
index 4fbe2412239c43d21caa6a44c2146a72b0b49fc6..049d79483260a38ade0a7d4b4abeee7a97306454 100644
--- a/openair1/SCHED_NR_UE/phy_frame_config_nr_ue.c
+++ b/openair1/SCHED_NR_UE/phy_frame_config_nr_ue.c
@@ -139,4 +139,4 @@ int sl_nr_ue_slot_select(const sl_nr_phy_config_request_t *cfg, int slot, uint8_
   }
 
   return slot_type;
-}
\ No newline at end of file
+}
diff --git a/openair1/SIMULATION/NR_PHY/pucchsim.c b/openair1/SIMULATION/NR_PHY/pucchsim.c
index 81d6fff9b3cd890fe55688accbe7396e1a2d5847..ab73a13626b55c0ccc45ae23f70897c4e869092a 100644
--- a/openair1/SIMULATION/NR_PHY/pucchsim.c
+++ b/openair1/SIMULATION/NR_PHY/pucchsim.c
@@ -144,6 +144,7 @@ int main(int argc, char **argv)
   int sr_flag = 0;
   int pucch_DTX_thres = 0;
   cpuf = get_cpu_freq_GHz();
+  bool print_perf = false;
 
   if ((uniqCfg = load_configmodule(argc, argv, CONFIG_ENABLECMDLINEONLY)) == 0) {
     exit_fun("[NR_PUCCHSIM] Error, configuration module init failed\n");
@@ -153,8 +154,7 @@ int main(int argc, char **argv)
   logInit();
 
   int c;
-  while ((c = getopt (argc, argv, "--:O:f:hA:f:g:i:I:P:B:b:t:T:m:n:r:o:s:S:x:y:z:N:F:GR:IL:q:cd:")) != -1) {
-
+  while ((c = getopt(argc, argv, "--:O:f:hA:f:g:i:I:P:B:b:t:T:m:n:r:o:s:S:x:y:z:N:F:GR:IL:q:cd:C")) != -1) {
     /* ignore long options starting with '--', option '-O' and their arguments that are handled by configmodule */
     /* with this opstring getopt returns 1 for non-option arguments, refer to 'man 3 getopt' */
     if (c == 1 || c == '-' || c == 'O')
@@ -349,6 +349,10 @@ int main(int argc, char **argv)
       //nacktoack_flag=(uint8_t)atoi(optarg);
       target_error_rate=0.001;
       break;
+    case 'C':
+      print_perf = 1;
+      cpu_meas_enabled = 1;
+      break;
     default:
     case 'h':
       printf("%s -h(elp) -p(extended_prefix) -N cell_id -f output_filename -F input_filename -g channel_model -n n_frames -t Delayspread -s snr0 -S snr1 -x transmission_mode -y TXant -z RXant -i Intefrence0 -j Interference1 -A interpolation_file -C(alibration offset dB) -N CellId\n", argv[0]);
@@ -381,6 +385,7 @@ int main(int argc, char **argv)
       printf("-x Transmission mode (1,2,6 for the moment)\n");
       printf("-y Number of TX antennas used in eNB\n");
       printf("-z Number of RX antennas used in UE\n");
+      printf("-C print CPU cost\n");
       exit (-1);
       break;
     }
@@ -406,8 +411,8 @@ int main(int argc, char **argv)
   if ((format < 2) && (actual_payload == 4)) do_DTX=1;
 
   if (random_payload) {
-    srand(time(NULL));   // Initialization, should only be called once.
-    actual_payload = rand();      // Returns a pseudo-random integer between 0 and RAND_MAX.
+    double tmp = uniformrandom();
+    memcpy(&actual_payload, &tmp, sizeof(actual_payload));
   }
   actual_payload &= nr_bit < 64 ? (1UL << nr_bit) - 1: 0xffffffffffffffff;
 
@@ -638,6 +643,7 @@ int main(int argc, char **argv)
 
       // noise measurement (all PRBs)
       gNB_I0_measurements(gNB, nr_slot_tx, 0, gNB->frame_parms.symbols_per_slot, rb_mask_ul);
+      start_meas(&gNB->phy_proc_rx);
 
       if (n_trials==1) printf("noise rxlev %d (%d dB), rxlev pucch %d dB sigma2 %f dB, SNR %f, TX %f, I0 (pucch) %d, I0 (avg) %d\n",rxlev,dB_fixed(rxlev),dB_fixed(rxlev_pucch),sigma2_dB,SNR,10*log10((double)txlev*UE->frame_parms.ofdm_symbol_size/12),gNB->measurements.n0_subband_power_tot_dB[startingPRB],gNB->measurements.n0_subband_power_avg_dB);
       if(format==0){
@@ -731,10 +737,21 @@ int main(int argc, char **argv)
         free(uci_pdu.csi_part1.csi_part1_payload);
 
       }
+      stop_meas(&gNB->phy_proc_rx);
+
       n_errors=((actual_payload^payload_received)&1)+(((actual_payload^payload_received)&2)>>1)+(((actual_payload^payload_received)&4)>>2)+n_errors;
     }
     if (sr_flag == 1)
       printf("SR: SNR=%f, n_trials=%d, n_bit_errors=%d\n",SNR,n_trials,sr_errors);
+    if (print_perf) {
+      time_stats_t *ts = &gNB->phy_proc_rx;
+      printf("cpu time for pucch format %d: per block %.2f us; nb blocks %d, max time %.2f;\n",
+             format,
+             ts->diff / ts->trials / cpuf / 1000.0,
+             ts->trials,
+             ts->max / cpuf / 1000.0);
+      reset_meas(ts);
+    }
     if(nr_bit > 0)
       printf("ACK/NACK: SNR=%f, n_trials=%d, n_bit_errors=%d\n",SNR,n_trials,ack_nack_errors);
     if((float)(ack_nack_errors+sr_errors)/(float)(n_trials)<=target_error_rate){
diff --git a/openair2/LAYER2/NR_MAC_UE/nr_ue_scheduler.c b/openair2/LAYER2/NR_MAC_UE/nr_ue_scheduler.c
index 8e4fbf7154c361a4512b2a4bbd5fc79bcedb3517..300789084459b2366859f0b1756309c30a9caa10 100644
--- a/openair2/LAYER2/NR_MAC_UE/nr_ue_scheduler.c
+++ b/openair2/LAYER2/NR_MAC_UE/nr_ue_scheduler.c
@@ -3416,8 +3416,8 @@ static bool fill_mac_sdu(NR_UE_MAC_INST_t *mac,
     } else {
       *header = (NR_MAC_SUBHEADER_LONG){.R = 0, .F = 1, .LCID = lcid, .L = htons(sdu_length)};
 #ifdef ENABLE_MAC_PAYLOAD_DEBUG
-      LOG_I(NR_MAC, "dumping MAC SDU with length %d: \n", sduL);
-      log_dump(NR_MAC, header, sduL, LOG_DUMP_CHAR, "\n");
+      LOG_I(NR_MAC, "dumping MAC SDU with length %d: \n", sdu_length);
+      log_dump(NR_MAC, header, sdu_length, LOG_DUMP_CHAR, "\n");
 #endif
     }
     mac_ce_p->cur_ptr += header_sz + sdu_length;
@@ -3588,10 +3588,23 @@ static uint8_t nr_ue_get_sdu(NR_UE_MAC_INST_t *mac,
     LOG_D(NR_MAC, "Filling remainder %d bytes to the UL PDU \n", remain);
     *(NR_MAC_SUBHEADER_FIXED *)mac_ce_info.cur_ptr = (NR_MAC_SUBHEADER_FIXED){.R = 0, .LCID = UL_SCH_LCID_PADDING};
     mac_ce_info.cur_ptr++;
-    memset(mac_ce_info.cur_ptr, 0, mac_ce_info.pdu_end - mac_ce_info.cur_ptr);
+    if (get_softmodem_params()->phy_test || get_softmodem_params()->do_ra) {
+      uint8_t *buf = mac_ce_info.cur_ptr;
+      uint8_t *end = mac_ce_info.pdu_end;
+      for (; buf < end && ((intptr_t)buf) % 4; buf++)
+        *buf = lrand48() & 0xff;
+      for (; buf < end - 3; buf += 4) {
+        uint32_t *buf32 = (uint32_t *)buf;
+        *buf32 = lrand48();
+      }
+      for (; buf < end; buf++)
+        *buf = lrand48() & 0xff;
+    } else {
+      memset(mac_ce_info.cur_ptr, 0, mac_ce_info.pdu_end - mac_ce_info.cur_ptr);
+    }
   }
 #ifdef ENABLE_MAC_PAYLOAD_DEBUG
-  LOG_I(NR_MAC, "MAC PDU %d bytes \n", mac_ce_p->cur_ptr - ulsch_buffer);
+  LOG_I(NR_MAC, "MAC PDU %ld bytes\n", mac_ce_info.cur_ptr - ulsch_buffer);
   log_dump(NR_MAC, ulsch_buffer, buflen, LOG_DUMP_CHAR, "\n");
 #endif
 
diff --git a/openair2/NR_UE_PHY_INTERFACE/NR_IF_Module.c b/openair2/NR_UE_PHY_INTERFACE/NR_IF_Module.c
index 0f88066cc13c120657b813ca6fbcf158537d43f0..00f3b303e9d0928e0c6e945b20d439c067b90fe0 100644
--- a/openair2/NR_UE_PHY_INTERFACE/NR_IF_Module.c
+++ b/openair2/NR_UE_PHY_INTERFACE/NR_IF_Module.c
@@ -70,6 +70,8 @@ static void save_pdsch_pdu_for_crnti(nfapi_nr_dl_tti_request_t *dl_tti_request);
 void print_ue_mac_stats(const module_id_t mod, const int frame_rx, const int slot_rx)
 {
   NR_UE_MAC_INST_t *mac = get_mac_inst(mod);
+  if (mac->state != UE_CONNECTED)
+    return;
   int ret = pthread_mutex_lock(&mac->if_mutex);
   AssertFatal(!ret, "mutex failed %d\n", ret);