Commit 8c5e8126 authored by Florian Kaltenberger
Browse files

Added support for ARM NEON, lots of changes in openair1 and some in cmake_targets

git-svn-id: http://svn.eurecom.fr/openair4G/trunk@7543 818b1a75-f10b-46b9-bf7c-635c3b92a50f
parent 85ff3abc
......@@ -126,7 +126,7 @@ add_list_string_option(CMAKE_BUILD_TYPE "RelWithDebInfo" "Choose the type of bui
Message("Architecture is ${CMAKE_SYSTEM_PROCESSOR}")
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
set(C_FLAGS_PROCESSOR "-mfloat-abi=softfp -mfpu=neon")
set(C_FLAGS_PROCESSOR "-gdwarf-2 -mfloat-abi=hard -mfpu=neon -lgcc -lrt")
else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
set(C_FLAGS_PROCESSOR "-msse4.2")
endif()
......@@ -140,8 +140,8 @@ set(CMAKE_C_FLAGS
# set a flag for changes in the source code
# these changes are related to hardcoded path to include .h files
add_definitions(-DCMAKER)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} -ggdb -DMALLOC_CHECK_=3")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -ggdb -DMALLOC_CHECK_=3 -O2")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O2")
# The section below has been commented out because it does not work with
# SVN authentication.
......@@ -778,7 +778,6 @@ set(PHY_SRC
${OPENAIR1_DIR}/PHY/TOOLS/log2_approx.c
${OPENAIR1_DIR}/PHY/TOOLS/cmult_sv.c
${OPENAIR1_DIR}/PHY/TOOLS/cmult_vv.c
${OPENAIR1_DIR}/PHY/TOOLS/cadd_vv.c
${OPENAIR1_DIR}/PHY/TOOLS/cdot_prod.c
${OPENAIR1_DIR}/PHY/TOOLS/signal_energy.c
${OPENAIR1_DIR}/PHY/TOOLS/dB_routines.c
......@@ -1692,7 +1691,7 @@ foreach(myExe dlsim ulsim pbchsim scansim mbmssim pdcchsim pucchsim prachsim syn
${XFORMS_SOURCE}
)
target_link_libraries (${myExe}
-Wl,--start-group SIMU UTIL SCHED_LIB PHY LFDS MSC ${ITTI_LIB} -Wl,--end-group
-Wl,--start-group SIMU UTIL SCHED_LIB PHY LFDS ${ITTI_LIB} -Wl,--end-group
pthread m rt ${CONFIG_LIBRARIES} ${ATLAS_LIBRARIES} ${XFORMS_LIBRARIES}
)
endforeach(myExe)
......
......@@ -10,5 +10,6 @@ set(RANDOM_BF False)
set(PBS_SIM False)
set(PERFECT_CE False)
set(NAS_UE False)
set(MESSAGE_CHART_GENERATOR False)
include(${CMAKE_CURRENT_SOURCE_DIR}/../CMakeLists.txt)
......@@ -31,8 +31,9 @@
author: raymond.knopp@eurecom.fr
date: 10.2009
*/
#include "defs.h"
//#include "lte_interleaver_inline.h"
#ifndef TC_MAIN
//#include "defs.h"
#endif
#include "extern_3GPPinterleaver.h"
......
This diff is collapsed.
TURBO_SRC = 3gpplte.c 3gpplte_turbo_decoder_sse.c crc_byte.c
TURBO_SRC = 3gpplte_sse.c 3gpplte_turbo_decoder_sse.c crc_byte.c
RATE13CC_SRC = ccoding_byte_lte.c viterbi_lte.c crc_byte.c
RATE12CC_SRC = ccoding_byte.c viterbi.c crc_byte.c
all: turbolte_test rate13cc_test rate12cc_test run_turbo run_rate13cc run_rate13ccdab run_rate12cc
all: 3gpplte_sse
turbolte_test: $(TURBO_SRC)
gcc -o turbo_test $(TURBO_SRC) -DTEST_DEBUG -DUSER_MODE -msse2 -mssse3 -Wall
3gpplte_sse: $(TURBO_SRC)
gcc -o 3gpplte_sse 3gpplte_sse.c -msse4 -Wall -g -ggdb -DMAIN
rate13cc_test: $(RATE13CC_SRC)
gcc -o rate13cc_test $(RATE13CC_SRC) -DTEST_DEBUG -DUSER_MODE -msse2 -mssse3 -Wall
rate12cc_test: $(RATE12CC_SRC)
gcc -o rate12cc_test $(RATE12CC_SRC) -DTEST_DEBUG -DUSER_MODE -msse2 -mssse3 -Wall
run_turbo: turbolte_test
./turbo_test
run_rate13cc: rate13cc_test
./rate13cc_test
run_rate13ccdab: rate13cc_test
./rate13cc_test -d
run_rate12cc: rate12cc_test
./rate12cc_test
clean:
rm *.o
......@@ -55,22 +55,22 @@ unsigned char ccodelte_table_rev[128]; // for receiver
void
ccodelte_encode (unsigned int numbits,
unsigned char add_crc,
unsigned char *inPtr,
unsigned char *outPtr,
unsigned short rnti)
ccodelte_encode (int32_t numbits,
uint8_t add_crc,
uint8_t *inPtr,
uint8_t *outPtr,
uint16_t rnti)
{
unsigned int state;
uint32_t state;
unsigned char c, out, first_bit;
char shiftbit=0;
unsigned short c16;
unsigned short next_last_byte=0;
unsigned int crc=0;
uint8_t c, out, first_bit;
int8_t shiftbit=0;
uint16_t c16;
uint16_t next_last_byte=0;
uint32_t crc=0;
#ifdef DEBUG_CCODE
unsigned int dummy=0;
uint32_t dummy=0;
#endif //DEBUG_CCODE
/* The input bit is shifted in position 8 of the state.
......@@ -80,20 +80,19 @@ ccodelte_encode (unsigned int numbits,
if (add_crc == 1) {
crc = crc8(inPtr,numbits);
first_bit = 2;
c = (unsigned char)(crc>>24);
c = (uint8_t)(crc>>24);
} else if (add_crc == 2) {
crc = crc16(inPtr,numbits);
#ifdef DEBUG_CCODE
printf("ccode_lte : crc %x\n",crc);
#endif
// scramble with RNTI
crc ^= (((unsigned int)rnti)<<16);
crc ^= (((uint32_t)rnti)<<16);
#ifdef DEBUG_CCODE
printf("ccode_lte : crc %x (rnti %x)\n",crc,rnti);
#endif
first_bit = 2;
// c = (unsigned char)(crc>>24);
c = (unsigned char)((crc>>16)&0xff);
c = (uint8_t)((crc>>16)&0xff);
} else {
next_last_byte = numbits>>3;
first_bit = (numbits-6)&7;
......@@ -182,7 +181,7 @@ ccodelte_encode (unsigned int numbits,
// now code 8-bit CRC for UCI
if (add_crc == 1) {
c = (unsigned char)(crc>>24);
c = (uint8_t)(crc>>24);
// for (shiftbit = 0; (shiftbit<8);shiftbit++) {
for (shiftbit = 7; (shiftbit>=0); shiftbit--) {
......@@ -209,7 +208,7 @@ ccodelte_encode (unsigned int numbits,
// now code 16-bit CRC for DCI
if (add_crc == 2) {
c16 = (unsigned short)(crc>>16);
c16 = (uint16_t)(crc>>16);
// for (shiftbit = 0; (shiftbit<16);shiftbit++) {
for (shiftbit = 15; (shiftbit>=0); shiftbit--) {
......
......@@ -320,7 +320,7 @@ void threegpplte_turbo_encoder(uint8_t *input,
uint16_t interleaver_f2);
/** \fn void ccodelte_encode(uint32_t numbits,uint8_t add_crc, uint8_t *inPtr,uint8_t *outPtr,uint16_t rnti)
/** \fn void ccodelte_encode(int32_t numbits,uint8_t add_crc, uint8_t *inPtr,uint8_t *outPtr,uint16_t rnti)
\brief This function implements the LTE convolutional code of rate 1/3
with a constraint length of 7 bits. The inputs are bit packed in octets
(from MSB to LSB). Trellis tail-biting is included here.
......@@ -331,7 +331,7 @@ void threegpplte_turbo_encoder(uint8_t *input,
@param rnti RNTI for CRC scrambling
*/
void
ccodelte_encode (uint32_t numbits,
ccodelte_encode (int32_t numbits,
uint8_t add_crc,
uint8_t *inPtr,
uint8_t *outPtr,
......
......@@ -33,9 +33,8 @@
*/
#ifndef EXPRESSMIMO_TARGET
#include "PHY/sse_intrin.h"
#endif //EXPRESSMIMO_TARGET
extern unsigned char ccodedot11_table[128],ccodedot11_table_rev[128];
......@@ -46,12 +45,6 @@ static unsigned char inputs[64][2048];
static unsigned short survivors[64][2048];
static short partial_metrics[64],partial_metrics_new[64];
#ifdef __KERNEL__
#define printf rt_printk
#endif
#ifndef EXPRESSMIMO_TARGET
void phy_viterbi_dot11(char *y,unsigned char *decoded_bytes,unsigned short n)
{
......@@ -191,22 +184,34 @@ void phy_generate_viterbi_tables(void)
#define INIT0 0x00000080
#define RESCALE 0x00000040
static __m128i __attribute__((aligned(16))) TB[4*4095*8];
static __m128i metrics0_15,metrics16_31,metrics32_47,metrics48_63,even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,
TBodd33_63 __attribute__((aligned(16)));
static __m128i rescale,min_state,min_state2 __attribute__((aligned(16)));
void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short n,int offset, int traceback )
{
#if defined(__x86_64__) || defined(__i386__)
__m128i TB[4*4095*8]; // 4 __m128i per input bit (64 states, 8-bits per state = 16-way), 4095 is largest packet size in bytes, 8 bits/byte
__m128i metrics0_15,metrics16_31,metrics32_47,metrics48_63,even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,TBodd33_63;
__m128i min_state,min_state2;
__m128i *m0_ptr,*m1_ptr,*TB_ptr = &TB[offset<<2];
#elif defined(__arm__)
uint8x16x2_t TB[2*4095*8]; // 2 int8x16_t per input bit, 8 bits / byte, 4095 is largest packet size in bytes
uint8x16_t even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,TBodd33_63;
uint8x16x2_t metrics0_31,metrics32_63;
uint8x16_t min_state;
uint8x16_t *m0_ptr,*m1_ptr;
uint8x16x2_t *TB_ptr = &TB[offset<<1];
#endif
char *in = y;
unsigned char prev_state0;
......@@ -216,6 +221,7 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short
short position;
// printf("offset %d, TB_ptr %p\n",offset,TB_ptr);
#if defined(__x86_64__) || defined(__i386__)
if (offset == 0) {
// set initial metrics
......@@ -225,129 +231,64 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short
metrics48_63 = _mm_setzero_si128();
}
rescale = _mm_cvtsi32_si128(RESCALE);
#elif defined(__arm__)
if (offset == 0) {
// set initial metrics
/*
print_bytes(metrics0_15,"metrics0_15");
print_bytes(metrics16_31,"metrics16_31");
print_bytes(metrics32_47,"metrics32_47");
print_bytes(metrics48_63,"metrics48_63");
*/
metrics0_31.val[0] = vdupq_n_u8(0); metrics0_31.val[0] = vsetq_lane_u8(INIT0,metrics0_31.val[0],0);
metrics0_31.val[1] = vdupq_n_u8(0);
metrics32_63.val[0] = vdupq_n_u8(0);
metrics32_63.val[1] = vdupq_n_u8(0);
}
for (position=offset; position<(offset+n); position++) {
#endif
for (position=offset; position<(offset+n); position++) {
//printf("%d : (%d,%d)\n",position,in[0],in[1]);
// get branch metric offsets for the 64 states
table_offset = (in[0]+8 + ((in[1]+8)<<4))<<6;
// printf("Table_offset = %u (in[0]=%d,in[1]=%d)\n",table_offset,in[0],in[1]);
#if defined(__x86_64__) || defined(__i386__)
m0_ptr = (__m128i *)&m0_table[table_offset];
m1_ptr = (__m128i *)&m1_table[table_offset];
// printf("\n");
// even states
even0_30a = _mm_adds_epu8(metrics0_15,m0_ptr[0]);
// print_bytes(even0_30a,"even0_30a");
even32_62a = _mm_adds_epu8(metrics16_31,m0_ptr[1]);
// print_bytes(even32_62a,"even32_62a");
even0_30b = _mm_adds_epu8(metrics32_47,m0_ptr[2]);
// print_bytes(even0_30b,"even0_30b");
even32_62b = _mm_adds_epu8(metrics48_63,m0_ptr[3]);
// print_bytes(even32_62b,"even32_62b");
// printf("\n");
// odd states
odd1_31a = _mm_adds_epu8(metrics0_15,m1_ptr[0]);
// print_bytes(odd1_31a,"odd1_31a");
odd33_63a = _mm_adds_epu8(metrics16_31,m1_ptr[1]);
// print_bytes(odd33_63a,"odd33_63a");
odd1_31b = _mm_adds_epu8(metrics32_47,m1_ptr[2]);
// print_bytes(odd1_31b,"odd1_31b");
odd33_63b = _mm_adds_epu8(metrics48_63,m1_ptr[3]);
// print_bytes(odd33_63b,"odd33_63b");
// select maxima
// printf("\n");
even0_30a = _mm_max_epu8(even0_30a,even0_30b);
// print_bytes(even0_30a,"even0_30a");
even32_62a = _mm_max_epu8(even32_62a,even32_62b);
// print_bytes(even32_62a,"even32_62a");
odd1_31a = _mm_max_epu8(odd1_31a,odd1_31b);
// print_bytes(odd1_31a,"odd1_31a");
odd33_63a = _mm_max_epu8(odd33_63a,odd33_63b);
// print_bytes(odd33_63a,"odd33_63a");
// printf("\n");
// Traceback information
TBeven0_30 = _mm_cmpeq_epi8(even0_30a,even0_30b);
TBeven32_62 = _mm_cmpeq_epi8(even32_62a,even32_62b);
TBodd1_31 = _mm_cmpeq_epi8(odd1_31a,odd1_31b);
TBodd33_63 = _mm_cmpeq_epi8(odd33_63a,odd33_63b);
metrics0_15 = _mm_unpacklo_epi8(even0_30a ,odd1_31a);
metrics16_31 = _mm_unpackhi_epi8(even0_30a ,odd1_31a);
metrics32_47 = _mm_unpacklo_epi8(even32_62a,odd33_63a);
metrics48_63 = _mm_unpackhi_epi8(even32_62a,odd33_63a);
//print_bytes(metrics0_15,"metrics0_15");
//print_bytes(metrics16_31,"metrics16_31");
//print_bytes(metrics32_47,"metrics32_47");
//print_bytes(metrics48_63,"metrics48_63");
TB_ptr[0] = _mm_unpacklo_epi8(TBeven0_30,TBodd1_31);
// print_bytes(TB_ptr[0],"TB0_15");
TB_ptr[0] = _mm_unpacklo_epi8(TBeven0_30,TBodd1_31);
TB_ptr[1] = _mm_unpackhi_epi8(TBeven0_30,TBodd1_31);
// print_bytes(TB_ptr[1],"TB16_31");
TB_ptr[2] = _mm_unpacklo_epi8(TBeven32_62,TBodd33_63);
// print_bytes(TB_ptr[2],"TB32_47");
TB_ptr[3] = _mm_unpackhi_epi8(TBeven32_62,TBodd33_63);
// print_bytes(TB_ptr[3],"TB48_63");
in+=2;
TB_ptr += 4;
......@@ -359,50 +300,92 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short
min_state =_mm_min_epu8(min_state,metrics32_47);
min_state =_mm_min_epu8(min_state,metrics48_63);
// print_bytes(min_state,"min_state");
min_state2 = min_state;
min_state = _mm_unpacklo_epi8(min_state,min_state);
min_state2 = _mm_unpackhi_epi8(min_state2,min_state2);
min_state = _mm_min_epu8(min_state,min_state2);
// print_bytes(min_state,"min_state");
min_state2 = min_state;
min_state = _mm_unpacklo_epi8(min_state,min_state);
min_state2 = _mm_unpackhi_epi8(min_state2,min_state2);
min_state = _mm_min_epu8(min_state,min_state2);
// print_bytes(min_state,"min_state");
min_state2 = min_state;
min_state = _mm_unpacklo_epi8(min_state,min_state);
min_state2 = _mm_unpackhi_epi8(min_state2,min_state2);
min_state = _mm_min_epu8(min_state,min_state2);
// print_bytes(min_state,"min_state");
min_state2 = min_state;
min_state = _mm_unpacklo_epi8(min_state,min_state);
min_state2 = _mm_unpackhi_epi8(min_state2,min_state2);
min_state = _mm_min_epu8(min_state,min_state2);
// print_bytes(min_state,"min_state");
metrics0_15 = _mm_subs_epu8(metrics0_15,min_state);
metrics16_31 = _mm_subs_epu8(metrics16_31,min_state);
metrics32_47 = _mm_subs_epu8(metrics32_47,min_state);
metrics48_63 = _mm_subs_epu8(metrics48_63,min_state);
#elif defined(__arm__)
m0_ptr = (uint8x16_t *)&m0_table[table_offset];
m1_ptr = (uint8x16_t *)&m1_table[table_offset];
// even states
even0_30a = vqaddq_u8(metrics0_31.val[0],m0_ptr[0]);
even32_62a = vqaddq_u8(metrics0_31.val[1],m0_ptr[1]);
even0_30b = vqaddq_u8(metrics32_63.val[0],m0_ptr[2]);
even32_62b = vqaddq_u8(metrics32_63.val[1],m0_ptr[3]);
// odd states
odd1_31a = vqaddq_u8(metrics0_31.val[0],m1_ptr[0]);
odd33_63a = vqaddq_u8(metrics0_31.val[1],m1_ptr[1]);
odd1_31b = vqaddq_u8(metrics32_63.val[0],m1_ptr[2]);
odd33_63b = vqaddq_u8(metrics32_63.val[1],m1_ptr[3]);
// select maxima
even0_30a = vmaxq_u8(even0_30a,even0_30b);
even32_62a = vmaxq_u8(even32_62a,even32_62b);
odd1_31a = vmaxq_u8(odd1_31a,odd1_31b);
odd33_63a = vmaxq_u8(odd33_63a,odd33_63b);
// Traceback information
TBeven0_30 = vceqq_u8(even0_30a,even0_30b);
TBeven32_62 = vceqq_u8(even32_62a,even32_62b);
TBodd1_31 = vceqq_u8(odd1_31a,odd1_31b);
TBodd33_63 = vceqq_u8(odd33_63a,odd33_63b);
/*
print_bytes(metrics0_15,"metrics0_15");
print_bytes(metrics16_31,"metrics16_31");
print_bytes(metrics32_47,"metrics32_47");
print_bytes(metrics48_63,"metrics48_63");
*/
metrics0_31 = vzipq_u8(even0_30a,odd1_31a);
metrics32_63 = vzipq_u8(even32_62a,odd33_63a);
TB_ptr[0] = vzipq_u8(TBeven0_30,TBodd1_31);
TB_ptr[1] = vzipq_u8(TBeven32_62,TBodd33_63);
in+=2;
TB_ptr += 2;
// rescale by subtracting the minimum metric over all 64 states
// (the metrics are accumulated above with saturating 8-bit adds)
/****************************************************
NOTE: PHMINPOS is an x86 instruction and is not available here;
on NEON the horizontal minimum is obtained with the pairwise
vpmin_u8 reduction below.
****************************************************/
// lane-wise minimum across the four metric vectors
min_state =vminq_u8(metrics0_31.val[0],metrics0_31.val[1]);
min_state =vminq_u8(min_state,metrics32_63.val[0]);
min_state =vminq_u8(min_state,metrics32_63.val[1]);
// here we have 16 minimum metrics from the 64 states
// Pairwise-min the LOW half against the HIGH half of min_state so that all
// 16 lanes take part in the reduction (using the low half for both operands
// would ignore lanes 8..15).
uint8x8_t min_state2 = vpmin_u8(((uint8x8_t*)&min_state)[0],((uint8x8_t*)&min_state)[1]);
// now the 8 minimum in min_state2
min_state2 = vpmin_u8(min_state2,min_state2);
// now the 4 minimum in min_state2, repeated twice
min_state2 = vpmin_u8(min_state2,min_state2);
// now the 2 minimum in min_state2, repeated 4 times
min_state2 = vpmin_u8(min_state2,min_state2);
// now the 1 minimum in min_state2, repeated 8 times
min_state = vcombine_u8(min_state2,min_state2);
// now the 1 minimum in min_state, repeated 16 times
// saturating subtract of the global minimum from every state metric
metrics0_31.val[0] = vqsubq_u8(metrics0_31.val[0],min_state);
metrics0_31.val[1] = vqsubq_u8(metrics0_31.val[1],min_state);
metrics32_63.val[0] = vqsubq_u8(metrics32_63.val[0],min_state);
metrics32_63.val[1] = vqsubq_u8(metrics32_63.val[1],min_state);
#endif
}
// Traceback
......@@ -429,29 +412,10 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short
}
}
#if defined(__x86_64) || defined(__i386__)
_mm_empty();
}
#else //EXPRESSMIMO_TARGET
void phy_viterbi_dot11(char *y,unsigned char *decoded_bytes,unsigned short n)
{
}
#endif //EXPRESSMIMO_TARGET
/*
void print_bytes(__m128i x,char *s) {
unsigned char *tempb = (unsigned char *)&x;
printf("%s : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n",s,
tempb[0],tempb[1],tempb[2],tempb[3],tempb[4],tempb[5],tempb[6],tempb[7],
tempb[8],tempb[9],tempb[10],tempb[11],tempb[12],tempb[13],tempb[14],tempb[15]);
#endif
}
*/
#ifdef TEST_DEBUG
#include <stdio.h>
......
......@@ -49,21 +49,14 @@
#define msg printf
#endif
#ifndef EXPRESSMIMO_TARGET
#include "PHY/sse_intrin.h"
#endif //EXPRESSMIMO_TARGET
extern uint8_t ccodelte_table[128],ccodelte_table_rev[128];
#ifdef __KERNEL__
#define printf rt_printk
#endif
#ifndef EXPRESSMIMO_TARGET
static int8_t m0_table[64*16*16*16] __attribute__ ((aligned(16)));
static int8_t m1_table[64*16*16*16] __attribute__ ((aligned(16)));
......@@ -143,20 +136,33 @@ void print_shorts(__m128i x,char *s) {
#endif // USER_MODE
static __m128i TB[4*8192];
static __m128i metrics0_15,metrics16_31,metrics32_47,metrics48_63,even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,
TBodd33_63;// __attribute__((aligned(16)));
static __m128i min_state,min_state2;// __attribute__((aligned(16)));
void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n)
{
static __m128i *m0_ptr,*m1_ptr,*TB_ptr = &TB[0];
#if defined(__x86_64__) || defined(__i386__)
__m128i TB[4*8192];
__m128i *m0_ptr,*m1_ptr,*TB_ptr = &TB[0];
__m128i metrics0_15,metrics16_31,metrics32_47,metrics48_63,even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,
TBodd33_63;
__m128i min_state,min_state2;
#elif defined(__arm__)
uint8x16x2_t TB[2*8192]; // 2 int8x16_t per input bit, 8 bits / byte, 8192 is largest packet size in bits
uint8x16_t even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,TBodd33_63;
uint8x16x2_t metrics0_31,metrics32_63;
uint8x16_t min_state;
uint8x16_t *m0_ptr,*m1_ptr;
uint8x16x2_t *TB_ptr = &TB[0];
#endif
int8_t *in = y;
uint8_t prev_state0,maxm,s;
static uint8_t *TB_ptr2;
......@@ -167,140 +173,70 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n)
// set initial metrics
//debug_msg("Doing viterbi\n");
metrics0_15 = _mm_setzero_si128();
#if defined(__x86_64__) || defined(__i386__)
metrics0_15 = _mm_setzero_si128();
metrics16_31 = _mm_setzero_si128();