viterbi_lte.c 14.2 KB
Newer Older
1
2
3
4
5
/*
 * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The OpenAirInterface Software Alliance licenses this file to You under
Cedric Roux's avatar
Cedric Roux committed
6
 * the OAI Public License, Version 1.1  (the "License"); you may not use this file
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
 * except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.openairinterface.org/?page_id=698
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *-------------------------------------------------------------------------------
 * For more information about the OpenAirInterface (OAI) Software Alliance:
 *      contact@openairinterface.org
 */

22
23
24
25
/* file: viterbit_lte.c
   purpose: SIMD optimized LTE Viterbi Decoder for rate 1/3 Tail-biting convolutional code.  Performs two iterations
            of code.  First pass does Viterbi with all initial partial metrics set to zero.  Second pass does Viterbi
            with initial partial metrics set to values from final state values after first pass. Max is selected at
26
      end to do trace-back.
27
   author: raymond.knopp@eurecom.fr
28
   date: 21.10.2009
29
30
31
32
33
34
35
36
37
38
39
40
*/

#ifndef TEST_DEBUG
#include "PHY/defs.h"
#include "PHY/extern.h"
#else
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define msg printf
#endif

41

42
#include "PHY/sse_intrin.h"
43

gauthier's avatar
gauthier committed
44
extern uint8_t ccodelte_table[128],ccodelte_table_rev[128];
45
46
47
48




gauthier's avatar
gauthier committed
49
50
static int8_t m0_table[64*16*16*16] __attribute__ ((aligned(16)));
static int8_t m1_table[64*16*16*16] __attribute__ ((aligned(16)));
51
52
53


// Set up Viterbi tables for SSE2 implementation
54
55
void phy_generate_viterbi_tables_lte()
{
56

gauthier's avatar
gauthier committed
57
58
  int8_t w[8],in0,in1,in2;
  uint8_t state,index0,index1;
59
60

  for (in0 = -8; in0 <8 ; in0++) {   // use 4-bit quantization
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
    for (in1 = -8; in1 <8 ; in1++) {
      for (in2 = -8; in2 <8 ; in2++) {


        w[0] = 24 - in0 - in1 - in2;           // -1,-1,-1
        w[1] = 24 + in0 - in1 - in2;           // -1, 1,-1
        w[2] = 24 - in0 + in1 - in2;           //  1,-1,-1
        w[3] = 24 + in0 + in1 - in2;           //  1, 1,-1
        w[4] = 24 - in0 - in1 + in2;           // -1,-1, 1
        w[5] = 24 + in0 - in1 + in2;           // -1, 1, 1
        w[6] = 24 - in0 + in1 + in2;           //  1,-1, 1
        w[7] = 24 + in0 + in1 + in2;           //  1, 1, 1

        //    printf("w: %d %d %d %d\n",w[0],w[1],w[2],w[3]);
        for (state=0; state<64 ; state++) {

          // input 0
          index0 = (state<<1);

          m0_table[(in0+8 + (16*(in1+8)) + (256*(in2+8)))*64 +state] = w[ccodelte_table_rev[index0]];


          //    if (position < 8)
          //    printf("%d,%d : prev_state0 = %d,m0 = %d,w=%d (%d)\n",position,state,prev_state0%64,m0,w[ccodelte_table[prev_state0]],partial_metrics[prev_state0%64]);

          // input 1
          index1 = (1+ (state<<1));
          m1_table[(in0+8 + (16*(in1+8)) + (256*(in2+8)))*64 +state] = w[ccodelte_table_rev[index1]];

        }
91
92
93
94
95
96
97
98
99
100
101
102
      }
    }
  }
}


#define INIT0 0x00000080
#define RESCALE 0x00000040

//#define DEBUG_VITERBI

#ifdef DEBUG_VITERBI
103
104
void print_bytes(char *s,__m128i *x)
{
105

gauthier's avatar
gauthier committed
106
  uint8_t *tempb = (uint8_t *)x;
107
108

  printf("%s  : %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",s,
109
110
         tempb[0],tempb[1],tempb[2],tempb[3],tempb[4],tempb[5],tempb[6],tempb[7],
         tempb[8],tempb[9],tempb[10],tempb[11],tempb[12],tempb[13],tempb[14],tempb[15]);
111
112
113
114
115
116

}

/*
void print_shorts(__m128i x,char *s) {

gauthier's avatar
gauthier committed
117
  int16_t *tempb = (int16_t *)&x;
118
119

  printf("%s  : %d,%d,%d,%d,%d,%d,%d,%d\n",s,
120
121
   tempb[0],tempb[1],tempb[2],tempb[3],tempb[4],tempb[5],tempb[6],tempb[7]
   );
122
123
124

}
*/
Mongazon's avatar
Mongazon committed
125
#endif // DEBUG_VITERBI
126
127
128



129
130
131
void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n)
{

132

133
134
135
136
137
138
139
140
#if defined(__x86_64__) || defined(__i386__)
  __m128i  TB[4*8192];
  __m128i *m0_ptr,*m1_ptr,*TB_ptr = &TB[0];
  
  __m128i metrics0_15,metrics16_31,metrics32_47,metrics48_63,even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,
    TBodd33_63;
  
  __m128i min_state,min_state2;
141

142
143
#elif defined(__arm__)
  uint8x16x2_t TB[2*8192];  // 2 int8x16_t per input bit, 8 bits / byte, 8192 is largest packet size in bits
144

145
146
147
148
149
150
151
152
153
154
  uint8x16_t even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,TBodd33_63;
  uint8x16x2_t metrics0_31,metrics32_63;

  uint8x16_t min_state;

  uint8x16_t *m0_ptr,*m1_ptr;
  uint8x16x2_t *TB_ptr = &TB[0];


#endif
gauthier's avatar
gauthier committed
155
156
157
158
159
160
  int8_t *in = y;
  uint8_t prev_state0,maxm,s;
  static uint8_t *TB_ptr2;
  uint32_t table_offset;
  uint8_t iter;
  int16_t position;
161
162
163
164

  // set initial metrics
  //debug_msg("Doing viterbi\n");

165
166
167
#if defined(__x86_64__) || defined(__i386__)

  metrics0_15  = _mm_setzero_si128();
168
169
170
  metrics16_31 = _mm_setzero_si128();
  metrics32_47 = _mm_setzero_si128();
  metrics48_63 = _mm_setzero_si128();
171
172
173
174
175
#elif defined(__arm__)
    metrics0_31.val[0]  = vdupq_n_u8(0); 
    metrics0_31.val[1]  = vdupq_n_u8(0);
    metrics32_63.val[0] = vdupq_n_u8(0);
    metrics32_63.val[1] = vdupq_n_u8(0);
176
177
#endif

178
  for (iter=0; iter<2; iter++) {
179
180
181
    in = y;
    TB_ptr=&TB[0];

182
183
184
    for (position=0; position<n; position++) {


185
186
      // get branch metric offsets for the 64 states
      table_offset = (in[0]+8 + ((in[1]+8)<<4) + ((in[2]+8)<<8))<<6;
187
188

#if defined(__x86_64__) || defined(__i386__)
189
190
      m0_ptr = (__m128i *)&m0_table[table_offset];
      m1_ptr = (__m128i *)&m1_table[table_offset];
191

192
193
194
195
196
      // even states
      even0_30a  = _mm_adds_epu8(metrics0_15,m0_ptr[0]);
      even32_62a = _mm_adds_epu8(metrics16_31,m0_ptr[1]);
      even0_30b  = _mm_adds_epu8(metrics32_47,m0_ptr[2]);
      even32_62b = _mm_adds_epu8(metrics48_63,m0_ptr[3]);
197
198


199
200
201
202
203
      // odd states
      odd1_31a   = _mm_adds_epu8(metrics0_15,m1_ptr[0]);
      odd33_63a  = _mm_adds_epu8(metrics16_31,m1_ptr[1]);
      odd1_31b   = _mm_adds_epu8(metrics32_47,m1_ptr[2]);
      odd33_63b  = _mm_adds_epu8(metrics48_63,m1_ptr[3]);
204

205
      // select maxima
206

207
208
209
210
      even0_30a  = _mm_max_epu8(even0_30a,even0_30b);
      even32_62a = _mm_max_epu8(even32_62a,even32_62b);
      odd1_31a   = _mm_max_epu8(odd1_31a,odd1_31b);
      odd33_63a  = _mm_max_epu8(odd33_63a,odd33_63b);
211

212
      // Traceback information
213

214
215
216
217
      TBeven0_30  = _mm_cmpeq_epi8(even0_30a,even0_30b);
      TBeven32_62 = _mm_cmpeq_epi8(even32_62a,even32_62b);
      TBodd1_31   = _mm_cmpeq_epi8(odd1_31a,odd1_31b);
      TBodd33_63  = _mm_cmpeq_epi8(odd33_63a,odd33_63b);
218

219
220
221
222
      metrics0_15        = _mm_unpacklo_epi8(even0_30a ,odd1_31a);
      metrics16_31       = _mm_unpackhi_epi8(even0_30a ,odd1_31a);
      metrics32_47       = _mm_unpacklo_epi8(even32_62a,odd33_63a);
      metrics48_63       = _mm_unpackhi_epi8(even32_62a,odd33_63a);
223

224
225
226
227
      TB_ptr[0]  = _mm_unpacklo_epi8(TBeven0_30,TBodd1_31);
      TB_ptr[1] = _mm_unpackhi_epi8(TBeven0_30,TBodd1_31);
      TB_ptr[2] = _mm_unpacklo_epi8(TBeven32_62,TBodd33_63);
      TB_ptr[3] = _mm_unpackhi_epi8(TBeven32_62,TBodd33_63);
228
229


230
231
      in+=3;
      TB_ptr += 4;
232

233
234
      // rescale by subtracting minimum
      /****************************************************
235
      USE SSSE instruction phminpos!!!!!!!
236
237
238
239
      ****************************************************/
      min_state =_mm_min_epu8(metrics0_15,metrics16_31);
      min_state =_mm_min_epu8(min_state,metrics32_47);
      min_state =_mm_min_epu8(min_state,metrics48_63);
240

241
242
243
244
      min_state2 = min_state;
      min_state  = _mm_unpacklo_epi8(min_state,min_state);
      min_state2 = _mm_unpackhi_epi8(min_state2,min_state2);
      min_state  = _mm_min_epu8(min_state,min_state2);
245

246
247
248
249
      min_state2 = min_state;
      min_state  = _mm_unpacklo_epi8(min_state,min_state);
      min_state2 = _mm_unpackhi_epi8(min_state2,min_state2);
      min_state  = _mm_min_epu8(min_state,min_state2);
250

251
252
253
254
      min_state2 = min_state;
      min_state  = _mm_unpacklo_epi8(min_state,min_state);
      min_state2 = _mm_unpackhi_epi8(min_state2,min_state2);
      min_state  = _mm_min_epu8(min_state,min_state2);
255

256
257
258
259
      min_state2 = min_state;
      min_state  = _mm_unpacklo_epi8(min_state,min_state);
      min_state2 = _mm_unpackhi_epi8(min_state2,min_state2);
      min_state  = _mm_min_epu8(min_state,min_state2);
260

261
262
263
264
      metrics0_15  = _mm_subs_epu8(metrics0_15,min_state);
      metrics16_31 = _mm_subs_epu8(metrics16_31,min_state);
      metrics32_47 = _mm_subs_epu8(metrics32_47,min_state);
      metrics48_63 = _mm_subs_epu8(metrics48_63,min_state);
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
#elif defined(__arm__)
    m0_ptr = (uint8x16_t *)&m0_table[table_offset];
    m1_ptr = (uint8x16_t *)&m1_table[table_offset];


    // even states
    even0_30a  = vqaddq_u8(metrics0_31.val[0],m0_ptr[0]);
    even32_62a = vqaddq_u8(metrics0_31.val[1],m0_ptr[1]);
    even0_30b  = vqaddq_u8(metrics32_63.val[0],m0_ptr[2]);
    even32_62b = vqaddq_u8(metrics32_63.val[1],m0_ptr[3]);

    // odd states
    odd1_31a   = vqaddq_u8(metrics0_31.val[0],m1_ptr[0]);
    odd33_63a  = vqaddq_u8(metrics0_31.val[1],m1_ptr[1]);
    odd1_31b   = vqaddq_u8(metrics32_63.val[0],m1_ptr[2]);
    odd33_63b  = vqaddq_u8(metrics32_63.val[1],m1_ptr[3]);
    // select maxima
    even0_30a  = vmaxq_u8(even0_30a,even0_30b);
    even32_62a = vmaxq_u8(even32_62a,even32_62b);
    odd1_31a   = vmaxq_u8(odd1_31a,odd1_31b);
    odd33_63a  = vmaxq_u8(odd33_63a,odd33_63b);

    // Traceback information
    TBeven0_30  = vceqq_u8(even0_30a,even0_30b);
    TBeven32_62 = vceqq_u8(even32_62a,even32_62b);
    TBodd1_31   = vceqq_u8(odd1_31a,odd1_31b);
    TBodd33_63  = vceqq_u8(odd33_63a,odd33_63b);

    metrics0_31  = vzipq_u8(even0_30a,odd1_31a);
    metrics32_63 = vzipq_u8(even32_62a,odd33_63a);

    TB_ptr[0] = vzipq_u8(TBeven0_30,TBodd1_31);
    TB_ptr[1] = vzipq_u8(TBeven32_62,TBodd33_63);

    in+=2;
    TB_ptr += 2;

    // rescale by subtracting minimum
    /****************************************************
    USE SSSE instruction phminpos!!!!!!!
    ****************************************************/
    min_state =vminq_u8(metrics0_31.val[0],metrics0_31.val[1]);
    min_state =vminq_u8(min_state,metrics32_63.val[0]);
    min_state =vminq_u8(min_state,metrics32_63.val[1]);
    // here we have 16 maximum metrics from the 64 states
    uint8x8_t min_state2 = vpmin_u8(((uint8x8_t*)&min_state)[0],((uint8x8_t*)&min_state)[0]);
    // now the 8 maximum in min_state2
    min_state2 = vpmin_u8(min_state2,min_state2);
    // now the 4 maximum in min_state2, repeated twice
    min_state2 = vpmin_u8(min_state2,min_state2);
    // now the 2 maximum in min_state2, repeated 4 times
    min_state2 = vpmin_u8(min_state2,min_state2);
    // now the 1 maximum in min_state2, repeated 8 times
    min_state  = vcombine_u8(min_state2,min_state2);
    // now the 1 maximum in min_state, repeated 16 times
    metrics0_31.val[0]  = vqsubq_u8(metrics0_31.val[0],min_state);
    metrics0_31.val[1]  = vqsubq_u8(metrics0_31.val[1],min_state);
    metrics32_63.val[0] = vqsubq_u8(metrics32_63.val[0],min_state);
    metrics32_63.val[1] = vqsubq_u8(metrics32_63.val[1],min_state);
#endif
325
326
327
328
329
330
331
332
    }

  } // iteration

  // Traceback
  prev_state0 = 0;
  maxm = 0;

333
#if defined(__x86_64__) || defined(__i386__)
334
  for (s=0; s<16; s++)
gauthier's avatar
gauthier committed
335
336
    if (((uint8_t *)&metrics0_15)[s] > maxm) {
      maxm = ((uint8_t *)&metrics0_15)[s];
337
338
339
      prev_state0 = s;
    }

340
  for (s=0; s<16; s++)
gauthier's avatar
gauthier committed
341
342
    if (((uint8_t *)&metrics16_31)[s] > maxm) {
      maxm = ((uint8_t *)&metrics16_31)[s];
343
344
345
      prev_state0 = s+16;
    }

346
  for (s=0; s<16; s++)
gauthier's avatar
gauthier committed
347
348
    if (((uint8_t *)&metrics32_47)[s] > maxm) {
      maxm = ((uint8_t *)&metrics32_47)[s];
349
350
351
      prev_state0 = s+32;
    }

352
  for (s=0; s<16; s++)
gauthier's avatar
gauthier committed
353
354
    if (((uint8_t *)&metrics48_63)[s] > maxm) {
      maxm = ((uint8_t *)&metrics48_63)[s];
355
356
357
      prev_state0 = s+48;
    }

358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384

#elif defined(__arm__)
  for (s=0; s<16; s++)
    if (((uint8_t *)&metrics0_31.val[0])[s] > maxm) {
      maxm = ((uint8_t *)&metrics0_31.val[0])[s];
      prev_state0 = s;
    }

  for (s=0; s<16; s++)
    if (((uint8_t *)&metrics0_31.val[1])[s] > maxm) {
      maxm = ((uint8_t *)&metrics0_31.val[1])[s];
      prev_state0 = s+16;
    }

  for (s=0; s<16; s++)
    if (((uint8_t *)&metrics32_63.val[0])[s] > maxm) {
      maxm = ((uint8_t *)&metrics32_63.val[0])[s];
      prev_state0 = s+32;
    }

  for (s=0; s<16; s++)
    if (((uint8_t *)&metrics32_63.val[1])[s] > maxm) {
      maxm = ((uint8_t *)&metrics32_63.val[1])[s];
      prev_state0 = s+48;
    }
#endif

gauthier's avatar
gauthier committed
385
  TB_ptr2 = (uint8_t *)&TB[(n-1)*4];
386

387
388
389
  for (position = n-1 ; position>-1; position--) {

    decoded_bytes[(position)>>3] += (prev_state0 & 0x1)<<(7-(position & 0x7));
390
391
392


    if (TB_ptr2[prev_state0] == 0)
393
394
395
396
397
398
      prev_state0 = (prev_state0 >> 1);
    else
      prev_state0 = 32 + (prev_state0>>1);

    TB_ptr2-=64;
  }
399

400
401

#if defined(__x86_64__) || defined(__i386__)
402
403
  _mm_empty();
  _m_empty();
404
#endif
405
406
407
}

#ifdef TEST_DEBUG
gauthier's avatar
gauthier committed
408
int test_viterbi(uint8_t dabflag)
409
{
gauthier's avatar
gauthier committed
410
411
412
  uint8_t test[8];
  //_declspec(align(16))  int8_t channel_output[512];
  //_declspec(align(16))  uint8_t output[512],decoded_output[16], *inPtr, *outPtr;
413

gauthier's avatar
gauthier committed
414
415
416
  int8_t channel_output[512];
  uint8_t output[512],decoded_output[16], *inPtr, *outPtr;
  uint32_t i;
417

418

419
420
421
422
423
424
425
426
  test[0] = 7;
  test[1] = 0xa5;
  test[2] = 0;
  test[3] = 0xfe;
  test[4] = 0x1a;
  test[5] = 0x33;
  test[6] = 0x99;
  test[7] = 0x14;
427

428
429
430
  if (dabflag==0) {
    ccodelte_init();
    ccodelte_init_inv();
431
  } else {
432
433
434
435
    ccodedab_init();
    ccodedab_init_inv();
    printf("Running with DAB polynomials\n");
  }
436

437
438
439
  inPtr = test;
  outPtr = output;
  phy_generate_viterbi_tables_lte();
440
441
442
443
444
  ccodelte_encode(64,     //input length in bits
                  0,      // add 16-bit crc with rnti scrambling
                  inPtr,  // input pointer
                  outPtr, // output pointer
                  0);     // rnti (optional)
445

446
  for (i = 0; i < 64*3; i++) {
447
448
449
450
451
452
    channel_output[i] = 7*(2*output[i] - 1);
  }

  memset(decoded_output,0,16);
  phy_viterbi_lte_sse2(channel_output,decoded_output,64);
  printf("Optimized Viterbi :");
453

454
455
456
457
458
459
460
  for (i =0 ; i<8 ; i++)
    printf("input %d : %x => %x\n",i,inPtr[i],decoded_output[i]);
}




461
462
int main(int argc, char **argv)
{
463
464

  char c;
gauthier's avatar
gauthier committed
465
  uint8_t dabflag=0;
466
467
468
469
470
471
472
473
474
475
476
477
478

  while ((c = getopt (argc, argv, "d")) != -1) {
    if (c=='d')
      dabflag=1;
  }

  test_viterbi(dabflag);
  return(0);
}

#endif // TEST_DEBUG