/*
 * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The OpenAirInterface Software Alliance licenses this file to You under
 * the OAI Public License, Version 1.1  (the "License"); you may not use this file
 * except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.openairinterface.org/?page_id=698
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *-------------------------------------------------------------------------------
 * For more information about the OpenAirInterface (OAI) Software Alliance:
 *      contact@openairinterface.org
 */

/* file: 3gpplte_turbo_decoder_sse_8bit.c
   purpose: Routines for implementing max-logmap decoding of Turbo-coded (DLSCH) transport channels from 36-212, V8.6 2009-03
   authors: raymond.knopp@eurecom.fr, Laurent Thomas (Alcatel-Lucent)
   date: 21.10.2009

   Note: This routine currently requires SSE2, SSSE3 and SSE4.1 equipped computers.  It uses 16-bit inputs for LLRs and 8-bit arithmetic for internal computations!

   Changelog: 17.11.2009 FK SSE4.1 not required anymore
   Aug. 2012 new parallelization options for higher speed (8-way parallelization)
   Jan. 2013 8-bit LLR support with 16-way parallelization
   Feb. 2013 New interleaving and hard-decision optimizations (L. Thomas)
   May 2013 Extracted 8-bit code
*/

39
#include "PHY/sse_intrin.h"
40

41
#ifndef TEST_DEBUG
42
43
44
  #include "PHY/defs_common.h"
  #include "PHY/CODING/coding_defs.h"
  #include "PHY/CODING/lte_interleaver_inline.h"
45
46
#else

47
48
49
50
  #include "defs.h"
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
51
52
53
#endif

#ifdef MEX
54
  #include "mex.h"
55
56
#endif

57
58
#include "common/ran_context.h"

59
#define SHUFFLE16(a,b,c,d,e,f,g,h) _mm_set_epi8(h==-1?-1:h*2+1, \
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
    h==-1?-1:h*2, \
    g==-1?-1:g*2+1, \
    g==-1?-1:g*2, \
    f==-1?-1:f*2+1, \
    f==-1?-1:f*2, \
    e==-1?-1:e*2+1, \
    e==-1?-1:e*2, \
    d==-1?-1:d*2+1, \
    d==-1?-1:d*2, \
    c==-1?-1:c*2+1, \
    c==-1?-1:c*2, \
    b==-1?-1:b*2+1, \
    b==-1?-1:b*2, \
    a==-1?-1:a*2+1, \
    a==-1?-1:a*2);
75
76
77
78
79
80
81





//#define DEBUG_LOGMAP

82

83
84
85
86
87
88

typedef int8_t llr_t; // internal decoder LLR data is 8-bit fixed
typedef int8_t channel_t;
#define MAX8 127


89
void log_map8(llr_t *systematic,channel_t *y_parity, llr_t *m11, llr_t *m10, llr_t *alpha, llr_t *beta, llr_t *ext,unsigned short frame_length,unsigned char term_flag,unsigned char F,int offset8_flag,
90
              time_stats_t *alpha_stats,time_stats_t *beta_stats,time_stats_t *gamma_stats,time_stats_t *ext_stats);
91
92
93
94
void compute_gamma8(llr_t *m11,llr_t *m10,llr_t *systematic, channel_t *y_parity, unsigned short frame_length,unsigned char term_flag);
void compute_alpha8(llr_t *alpha,llr_t *beta, llr_t *m11,llr_t *m10, unsigned short frame_length,unsigned char F);
void compute_beta8(llr_t *alpha, llr_t *beta,llr_t *m11,llr_t *m10, unsigned short frame_length,unsigned char F,int offset8_flag);
void compute_ext8(llr_t *alpha,llr_t *beta,llr_t *m11,llr_t *m10,llr_t *extrinsic, llr_t *ap, unsigned short frame_length);
95
96


97
void print_bytes(char *s, int8_t *x) {
98
  printf("%s  : %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",s,
99
100
         x[0],x[1],x[2],x[3],x[4],x[5],x[6],x[7],
         x[8],x[9],x[10],x[11],x[12],x[13],x[14],x[15]);
101
102
103
}


104
105
106
107
void log_map8(llr_t *systematic,
              channel_t *y_parity,
              llr_t *m11,
              llr_t *m10,
108
109
              llr_t *alpha,
              llr_t *beta,
110
              llr_t *ext,
111
112
113
114
115
116
117
              unsigned short frame_length,
              unsigned char term_flag,
              unsigned char F,
              int offset8_flag,
              time_stats_t *alpha_stats,
              time_stats_t *beta_stats,
              time_stats_t *gamma_stats,
118
              time_stats_t *ext_stats) {
119
#ifdef DEBUG_LOGMAP
120
  printf("log_map, frame_length %d\n",frame_length);
121
122
#endif

123
  if (gamma_stats) start_meas(gamma_stats) ;
124

125
  compute_gamma8(m11,m10,systematic,y_parity,frame_length,term_flag) ;
126

127
  if (gamma_stats) stop_meas(gamma_stats);
128

129
  if (alpha_stats) start_meas(alpha_stats) ;
130

131
  compute_alpha8(alpha,beta,m11,m10,frame_length,F)                  ;
132

133
  if (alpha_stats) stop_meas(alpha_stats);
134

135
  if (beta_stats) start_meas(beta_stats)  ;
136

137
  compute_beta8(alpha,beta,m11,m10,frame_length,F,offset8_flag)      ;
138

139
  if (beta_stats) stop_meas(beta_stats);
140

141
  if (ext_stats) start_meas(ext_stats)   ;
142

143
  compute_ext8(alpha,beta,m11,m10,ext,systematic,frame_length)       ;
144

145
  if (ext_stats) stop_meas(ext_stats);
146
147
}

148
149
void compute_gamma8(llr_t *m11,llr_t *m10,llr_t *systematic,channel_t *y_parity,
                    unsigned short frame_length,unsigned char term_flag) {
150
  int k,K1;
151
#if defined(__x86_64__)||defined(__i386__)
152
153
154
155
  __m128i *systematic128 = (__m128i *)systematic;
  __m128i *y_parity128   = (__m128i *)y_parity;
  __m128i *m10_128        = (__m128i *)m10;
  __m128i *m11_128        = (__m128i *)m11;
156
157
158
159
160
161
#elif defined(__arm__)
  int8x16_t *systematic128  = (int8x16_t *)systematic;
  int8x16_t *y_parity128    = (int8x16_t *)y_parity;
  int8x16_t *m10_128        = (int8x16_t *)m10;
  int8x16_t *m11_128        = (int8x16_t *)m11;
#endif
162
#ifdef DEBUG_LOGMAP
163
  printf("compute_gamma, %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
164
#endif
165
#if defined(__x86_64__) || defined(__i386__)
166
  register __m128i sl,sh,ypl,yph; //K128=_mm_set1_epi8(-128);
167
#endif
168
169
170
  K1 = (frame_length>>4);

  for (k=0; k<K1; k++) {
171
#if defined(__x86_64__) || defined(__i386__)
172
    sl  = _mm_cvtepi8_epi16(systematic128[k]);
173
    sh  = _mm_cvtepi8_epi16(_mm_srli_si128(systematic128[k],8));
174
175
    ypl = _mm_cvtepi8_epi16(y_parity128[k]);
    yph = _mm_cvtepi8_epi16(_mm_srli_si128(y_parity128[k],8));
176
    m11_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(sl,ypl),1),
177
                                 _mm_srai_epi16(_mm_adds_epi16(sh,yph),1));
178
    m10_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_subs_epi16(sl,ypl),1),
179
                                 _mm_srai_epi16(_mm_subs_epi16(sh,yph),1));
180
181
182
183
#elif defined(__arm__)
    m11_128[k] = vhaddq_s8(systematic128[k],y_parity128[k]);
    m10_128[k] = vhsubq_s8(systematic128[k],y_parity128[k]);
#endif
184
  }
185

186
  // Termination
187
#if defined(__x86_64__) || defined(__i386__)
188
189
190
191
192
193
194
195
  sl  = _mm_cvtepi8_epi16(systematic128[k+term_flag]);
  sh = _mm_cvtepi8_epi16(_mm_srli_si128(systematic128[k],8));
  ypl = _mm_cvtepi8_epi16(y_parity128[k+term_flag]);
  yph = _mm_cvtepi8_epi16(_mm_srli_si128(y_parity128[k],8));
  m11_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(sl,ypl),1),
                               _mm_srai_epi16(_mm_adds_epi16(sh,yph),1));
  m10_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_subs_epi16(sl,ypl),1),
                               _mm_srai_epi16(_mm_subs_epi16(sh,yph),1));
196
197
198
199
#elif defined(__arm__)
  m11_128[k] = vhaddq_s8(systematic128[k+term_flag],y_parity128[k]);
  m10_128[k] = vhsubq_s8(systematic128[k+term_flag],y_parity128[k]);
#endif
200
201
202
203
}

#define L 16

204
void compute_alpha8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned short frame_length,unsigned char F) {
205
  int k,loopval,rerun_flag;
206
#if defined(__x86_64__) || defined(__i386__)
207
208
209
210
211
  __m128i *alpha128=(__m128i *)alpha,*alpha_ptr;
  __m128i *m11p,*m10p;
  __m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
  __m128i new0,new1,new2,new3,new4,new5,new6,new7;
  __m128i alpha_max;
212
213
214
215
216
217
218
#elif defined(__arm__)
  int8x16_t *alpha128=(int8x16_t *)alpha,*alpha_ptr;
  int8x16_t *m11p,*m10p;
  int8x16_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
  int8x16_t new0,new1,new2,new3,new4,new5,new6,new7;
  int8x16_t alpha_max;
#endif
219
220
  // Set initial state: first colum is known
  // the other columns are unknown, so all states are set to same value
221
#if defined(__x86_64__) || defined(__i386__)
222
223
224
225
226
227
228
  alpha128[0] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,0);
  alpha128[1] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
  alpha128[2] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
  alpha128[3] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
  alpha128[4] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
  alpha128[5] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
  alpha128[6] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
229
230
  alpha128[7] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);

231
  for (loopval=frame_length>>4, rerun_flag=0; rerun_flag<2; loopval=L, rerun_flag++) {
232
    alpha_ptr = &alpha128[0];
233
234
    m11p = (__m128i *)m_11;
    m10p = (__m128i *)m_10;
235
236

    for (k=0;  k<loopval;  k++) {
237
      m_b0 = _mm_adds_epi8(alpha_ptr[1],*m11p);  // m11
238
      m_b4 = _mm_subs_epi8(alpha_ptr[1],*m11p);  // m00=-m11
239
240
241
242
243
      m_b1 = _mm_subs_epi8(alpha_ptr[3],*m10p);  // m01=-m10
      m_b5 = _mm_adds_epi8(alpha_ptr[3],*m10p);  // m10
      m_b2 = _mm_adds_epi8(alpha_ptr[5],*m10p);  // m10
      m_b6 = _mm_subs_epi8(alpha_ptr[5],*m10p);  // m01=-m10
      m_b3 = _mm_subs_epi8(alpha_ptr[7],*m11p);  // m00=-m11
244
      m_b7 = _mm_adds_epi8(alpha_ptr[7],*m11p);  // m11
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
      new0 = _mm_subs_epi8(alpha_ptr[0],*m11p);  // m00=-m11
      new4 = _mm_adds_epi8(alpha_ptr[0],*m11p);  // m11
      new1 = _mm_adds_epi8(alpha_ptr[2],*m10p);  // m10
      new5 = _mm_subs_epi8(alpha_ptr[2],*m10p);  // m01=-m10
      new2 = _mm_subs_epi8(alpha_ptr[4],*m10p);  // m01=-m10
      new6 = _mm_adds_epi8(alpha_ptr[4],*m10p);  // m10
      new3 = _mm_adds_epi8(alpha_ptr[6],*m11p);  // m11
      new7 = _mm_subs_epi8(alpha_ptr[6],*m11p);  // m00=-m11
      alpha_ptr += 8;
      m11p++;
      m10p++;
      alpha_ptr[0] = _mm_max_epi8(m_b0,new0);
      alpha_ptr[1] = _mm_max_epi8(m_b1,new1);
      alpha_ptr[2] = _mm_max_epi8(m_b2,new2);
      alpha_ptr[3] = _mm_max_epi8(m_b3,new3);
      alpha_ptr[4] = _mm_max_epi8(m_b4,new4);
      alpha_ptr[5] = _mm_max_epi8(m_b5,new5);
      alpha_ptr[6] = _mm_max_epi8(m_b6,new6);
      alpha_ptr[7] = _mm_max_epi8(m_b7,new7);
      // compute and subtract maxima
      alpha_max = _mm_max_epi8(alpha_ptr[0],alpha_ptr[1]);
      alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[2]);
      alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[3]);
      alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[4]);
      alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[5]);
      alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[6]);
      alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[7]);
      alpha_ptr[0] = _mm_subs_epi8(alpha_ptr[0],alpha_max);
      alpha_ptr[1] = _mm_subs_epi8(alpha_ptr[1],alpha_max);
      alpha_ptr[2] = _mm_subs_epi8(alpha_ptr[2],alpha_max);
      alpha_ptr[3] = _mm_subs_epi8(alpha_ptr[3],alpha_max);
      alpha_ptr[4] = _mm_subs_epi8(alpha_ptr[4],alpha_max);
      alpha_ptr[5] = _mm_subs_epi8(alpha_ptr[5],alpha_max);
      alpha_ptr[6] = _mm_subs_epi8(alpha_ptr[6],alpha_max);
      alpha_ptr[7] = _mm_subs_epi8(alpha_ptr[7],alpha_max);
    }

282
    // Set intial state for next iteration from the last state
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
    // as acolum end states are the first states of the next column
    int K1= frame_length>>1;
    alpha128[0] = _mm_slli_si128(alpha128[K1],1);
    alpha128[1] = _mm_slli_si128(alpha128[1+K1],1);
    alpha128[2] = _mm_slli_si128(alpha128[2+K1],1);
    alpha128[3] = _mm_slli_si128(alpha128[3+K1],1);
    alpha128[4] = _mm_slli_si128(alpha128[4+K1],1);
    alpha128[5] = _mm_slli_si128(alpha128[5+K1],1);
    alpha128[6] = _mm_slli_si128(alpha128[6+K1],1);
    alpha128[7] = _mm_slli_si128(alpha128[7+K1],1);
    alpha[16] =  -MAX8/2;
    alpha[32] = -MAX8/2;
    alpha[48] = -MAX8/2;
    alpha[64] = -MAX8/2;
    alpha[80] = -MAX8/2;
    alpha[96] = -MAX8/2;
    alpha[112] = -MAX8/2;
300
  }
301

302
303
304
305
306
307
308
309
310
311
312
#elif defined(__arm__)
  alpha128[0] = vdupq_n_s8(-MAX8/2);
  alpha128[0] = vsetq_lane_s8(0,alpha128[0],0);
  alpha128[1] = vdupq_n_s8(-MAX8/2);
  alpha128[2] = vdupq_n_s8(-MAX8/2);
  alpha128[3] = vdupq_n_s8(-MAX8/2);
  alpha128[4] = vdupq_n_s8(-MAX8/2);
  alpha128[5] = vdupq_n_s8(-MAX8/2);
  alpha128[6] = vdupq_n_s8(-MAX8/2);
  alpha128[7] = vdupq_n_s8(-MAX8/2);

313
  for (loopval=frame_length>>4, rerun_flag=0; rerun_flag<2; loopval=L, rerun_flag++) {
314
    alpha_ptr = &alpha128[0];
315
316
    m11p = (int8x16_t *)m_11;
    m10p = (int8x16_t *)m_10;
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366

    for (k=0;  k<loopval;  k++) {
      m_b0 = vqaddq_s8(alpha_ptr[1],*m11p);  // m11
      m_b4 = vqsubq_s8(alpha_ptr[1],*m11p);  // m00=-m11
      m_b1 = vqsubq_s8(alpha_ptr[3],*m10p);  // m01=-m10
      m_b5 = vqaddq_s8(alpha_ptr[3],*m10p);  // m10
      m_b2 = vqaddq_s8(alpha_ptr[5],*m10p);  // m10
      m_b6 = vqsubq_s8(alpha_ptr[5],*m10p);  // m01=-m10
      m_b3 = vqsubq_s8(alpha_ptr[7],*m11p);  // m00=-m11
      m_b7 = vqaddq_s8(alpha_ptr[7],*m11p);  // m11
      new0 = vqsubq_s8(alpha_ptr[0],*m11p);  // m00=-m11
      new4 = vqaddq_s8(alpha_ptr[0],*m11p);  // m11
      new1 = vqaddq_s8(alpha_ptr[2],*m10p);  // m10
      new5 = vqsubq_s8(alpha_ptr[2],*m10p);  // m01=-m10
      new2 = vqsubq_s8(alpha_ptr[4],*m10p);  // m01=-m10
      new6 = vqaddq_s8(alpha_ptr[4],*m10p);  // m10
      new3 = vqaddq_s8(alpha_ptr[6],*m11p);  // m11
      new7 = vqsubq_s8(alpha_ptr[6],*m11p);  // m00=-m11
      alpha_ptr += 8;
      m11p++;
      m10p++;
      alpha_ptr[0] = vmaxq_s8(m_b0,new0);
      alpha_ptr[1] = vmaxq_s8(m_b1,new1);
      alpha_ptr[2] = vmaxq_s8(m_b2,new2);
      alpha_ptr[3] = vmaxq_s8(m_b3,new3);
      alpha_ptr[4] = vmaxq_s8(m_b4,new4);
      alpha_ptr[5] = vmaxq_s8(m_b5,new5);
      alpha_ptr[6] = vmaxq_s8(m_b6,new6);
      alpha_ptr[7] = vmaxq_s8(m_b7,new7);
      // compute and subtract maxima
      alpha_max = vmaxq_s8(alpha_ptr[0],alpha_ptr[1]);
      alpha_max = vmaxq_s8(alpha_max,alpha_ptr[2]);
      alpha_max = vmaxq_s8(alpha_max,alpha_ptr[3]);
      alpha_max = vmaxq_s8(alpha_max,alpha_ptr[4]);
      alpha_max = vmaxq_s8(alpha_max,alpha_ptr[5]);
      alpha_max = vmaxq_s8(alpha_max,alpha_ptr[6]);
      alpha_max = vmaxq_s8(alpha_max,alpha_ptr[7]);
      alpha_ptr[0] = vqsubq_s8(alpha_ptr[0],alpha_max);
      alpha_ptr[1] = vqsubq_s8(alpha_ptr[1],alpha_max);
      alpha_ptr[2] = vqsubq_s8(alpha_ptr[2],alpha_max);
      alpha_ptr[3] = vqsubq_s8(alpha_ptr[3],alpha_max);
      alpha_ptr[4] = vqsubq_s8(alpha_ptr[4],alpha_max);
      alpha_ptr[5] = vqsubq_s8(alpha_ptr[5],alpha_max);
      alpha_ptr[6] = vqsubq_s8(alpha_ptr[6],alpha_max);
      alpha_ptr[7] = vqsubq_s8(alpha_ptr[7],alpha_max);
    }

    // Set intial state for next iteration from the last state
    // as a column end states are the first states of the next column
    int K1= frame_length>>1;
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
    alpha128[0] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[K1],8);
    alpha128[0] = vsetq_lane_s8(alpha[8],alpha128[0],7);
    alpha128[1] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[1+K1],8);
    alpha128[1] = vsetq_lane_s8(alpha[24],alpha128[0],7);
    alpha128[2] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[2+K1],8);
    alpha128[2] = vsetq_lane_s8(alpha[40],alpha128[0],7);
    alpha128[3] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[3+K1],8);
    alpha128[3] = vsetq_lane_s8(alpha[56],alpha128[0],7);
    alpha128[4] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[4+K1],8);
    alpha128[4] = vsetq_lane_s8(alpha[72],alpha128[0],7);
    alpha128[5] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[5+K1],8);
    alpha128[5] = vsetq_lane_s8(alpha[88],alpha128[0],7);
    alpha128[6] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[6+K1],8);
    alpha128[6] = vsetq_lane_s8(alpha[104],alpha128[0],7);
    alpha128[7] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[7+K1],8);
    alpha128[7] = vsetq_lane_s8(alpha[120],alpha128[0],7);
383
384
385
386
387
388
389
390
    alpha[16] =  -MAX8/2;
    alpha[32] = -MAX8/2;
    alpha[48] = -MAX8/2;
    alpha[64] = -MAX8/2;
    alpha[80] = -MAX8/2;
    alpha[96] = -MAX8/2;
    alpha[112] = -MAX8/2;
  }
391

392
#endif
393
394
395
}


396
void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned short frame_length,unsigned char F,int offset8_flag) {
397
  int k,rerun_flag, loopval;
398
#if defined(__x86_64__) || defined(__i386__)
399
400
401
402
  __m128i m11_128,m10_128;
  __m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
  __m128i new0,new1,new2,new3,new4,new5,new6,new7;
  __m128i *beta128,*alpha128,*beta_ptr;
403
  __m128i beta_max;
404
405
406
407
408
409
410
#elif defined(__arm__)
  int8x16_t m11_128,m10_128;
  int8x16_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
  int8x16_t new0,new1,new2,new3,new4,new5,new6,new7;
  int8x16_t *beta128,*alpha128,*beta_ptr;
  int8x16_t beta_max;
#endif
411
412
  llr_t beta0,beta1;
  llr_t beta2,beta3,beta4,beta5,beta6,beta7;
413

414
  if (frame_length > 6144) {
415
416
417
418
    LOG_E(PHY,"compute_beta: frame_length %d\n",frame_length);
    return;
  }

419
420
  // we are supposed to run compute_alpha just before compute_beta
  // so the initial states of backward computation can be set from last value of alpha states (forward computation)
421
#if defined(__x86_64__) || defined(__i386__)
422
423
  beta_ptr   = (__m128i *)&beta[frame_length<<3];
  alpha128   = (__m128i *)&alpha[0];
424
#elif defined(__arm__)
425
426
  beta_ptr   = (int8x16_t *)&beta[frame_length<<3];
  alpha128   = (int8x16_t *)&alpha[0];
427
#endif
428
429
430
431
432
433
434
435
  beta_ptr[0] = alpha128[(frame_length>>1)];
  beta_ptr[1] = alpha128[1+(frame_length>>1)];
  beta_ptr[2] = alpha128[2+(frame_length>>1)];
  beta_ptr[3] = alpha128[3+(frame_length>>1)];
  beta_ptr[4] = alpha128[4+(frame_length>>1)];
  beta_ptr[5] = alpha128[5+(frame_length>>1)];
  beta_ptr[6] = alpha128[6+(frame_length>>1)];
  beta_ptr[7] = alpha128[7+(frame_length>>1)];
436
437
  int overlap = (frame_length>>4)> L ? (frame_length>>4)-L : 0 ;

438
439
  for (rerun_flag=0, loopval=0;
       rerun_flag<2 ;
440
       loopval=overlap,rerun_flag++) {
441
    if (offset8_flag==0) {
442
443
444
      // FIXME! beta0-beta7 are used uninitialized. FIXME!
      // workaround: init with 0
      beta0 = beta1 = beta2 = beta3 = beta4 = beta5 = beta6 = beta7 = 0;
445
#if defined(__x86_64__) || defined(__i386__)
446
447
448
449
450
451
452
453
      beta_ptr[0] = _mm_insert_epi8(beta_ptr[0],beta0,15);
      beta_ptr[1] = _mm_insert_epi8(beta_ptr[1],beta1,15);
      beta_ptr[2] = _mm_insert_epi8(beta_ptr[2],beta2,15);
      beta_ptr[3] = _mm_insert_epi8(beta_ptr[3],beta3,15);
      beta_ptr[4] = _mm_insert_epi8(beta_ptr[4],beta4,15);
      beta_ptr[5] = _mm_insert_epi8(beta_ptr[5],beta5,15);
      beta_ptr[6] = _mm_insert_epi8(beta_ptr[6],beta6,15);
      beta_ptr[7] = _mm_insert_epi8(beta_ptr[7],beta7,15);
454
455
456
457
458
459
460
461
462
463
#elif defined(__arm__)
      beta_ptr[0] = vsetq_lane_s8(beta0,beta_ptr[0],15);
      beta_ptr[1] = vsetq_lane_s8(beta1,beta_ptr[1],15);
      beta_ptr[2] = vsetq_lane_s8(beta2,beta_ptr[2],15);
      beta_ptr[3] = vsetq_lane_s8(beta3,beta_ptr[3],15);
      beta_ptr[4] = vsetq_lane_s8(beta4,beta_ptr[4],15);
      beta_ptr[5] = vsetq_lane_s8(beta5,beta_ptr[5],15);
      beta_ptr[6] = vsetq_lane_s8(beta6,beta_ptr[6],15);
      beta_ptr[7] = vsetq_lane_s8(beta7,beta_ptr[7],15);
#endif
464
465
    }

466
#if defined(__x86_64__) || defined(__i386__)
467
    beta_ptr = (__m128i *)&beta[frame_length<<3];
468
#elif defined(__arm__)
469
    beta_ptr = (int8x16_t *)&beta[frame_length<<3];
470
#endif
471

472
    for (k=(frame_length>>4)-1;
473
474
         k>=loopval;
         k--) {
475
#if defined(__x86_64__) || defined(__i386__)
476
477
      m11_128=((__m128i *)m_11)[k];
      m10_128=((__m128i *)m_10)[k];
478
479
480
481
      m_b0 = _mm_adds_epi8(beta_ptr[4],m11_128);  //m11
      m_b1 = _mm_subs_epi8(beta_ptr[4],m11_128);  //m00
      m_b2 = _mm_subs_epi8(beta_ptr[5],m10_128);  //m01
      m_b3 = _mm_adds_epi8(beta_ptr[5],m10_128);  //m10
482
      m_b4 = _mm_adds_epi8(beta_ptr[6],m10_128);  //m10
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
      m_b5 = _mm_subs_epi8(beta_ptr[6],m10_128);  //m01
      m_b6 = _mm_subs_epi8(beta_ptr[7],m11_128);  //m00
      m_b7 = _mm_adds_epi8(beta_ptr[7],m11_128);  //m11
      new0 = _mm_subs_epi8(beta_ptr[0],m11_128);  //m00
      new1 = _mm_adds_epi8(beta_ptr[0],m11_128);  //m11
      new2 = _mm_adds_epi8(beta_ptr[1],m10_128);  //m10
      new3 = _mm_subs_epi8(beta_ptr[1],m10_128);  //m01
      new4 = _mm_subs_epi8(beta_ptr[2],m10_128);  //m01
      new5 = _mm_adds_epi8(beta_ptr[2],m10_128);  //m10
      new6 = _mm_adds_epi8(beta_ptr[3],m11_128);  //m11
      new7 = _mm_subs_epi8(beta_ptr[3],m11_128);  //m00
      beta_ptr-=8;
      beta_ptr[0] = _mm_max_epi8(m_b0,new0);
      beta_ptr[1] = _mm_max_epi8(m_b1,new1);
      beta_ptr[2] = _mm_max_epi8(m_b2,new2);
      beta_ptr[3] = _mm_max_epi8(m_b3,new3);
      beta_ptr[4] = _mm_max_epi8(m_b4,new4);
      beta_ptr[5] = _mm_max_epi8(m_b5,new5);
      beta_ptr[6] = _mm_max_epi8(m_b6,new6);
      beta_ptr[7] = _mm_max_epi8(m_b7,new7);
      beta_max = _mm_max_epi8(beta_ptr[0],beta_ptr[1]);
      beta_max = _mm_max_epi8(beta_max   ,beta_ptr[2]);
      beta_max = _mm_max_epi8(beta_max   ,beta_ptr[3]);
      beta_max = _mm_max_epi8(beta_max   ,beta_ptr[4]);
      beta_max = _mm_max_epi8(beta_max   ,beta_ptr[5]);
      beta_max = _mm_max_epi8(beta_max   ,beta_ptr[6]);
      beta_max = _mm_max_epi8(beta_max   ,beta_ptr[7]);
      beta_ptr[0] = _mm_subs_epi8(beta_ptr[0],beta_max);
      beta_ptr[1] = _mm_subs_epi8(beta_ptr[1],beta_max);
      beta_ptr[2] = _mm_subs_epi8(beta_ptr[2],beta_max);
      beta_ptr[3] = _mm_subs_epi8(beta_ptr[3],beta_max);
      beta_ptr[4] = _mm_subs_epi8(beta_ptr[4],beta_max);
      beta_ptr[5] = _mm_subs_epi8(beta_ptr[5],beta_max);
      beta_ptr[6] = _mm_subs_epi8(beta_ptr[6],beta_max);
      beta_ptr[7] = _mm_subs_epi8(beta_ptr[7],beta_max);
518
#elif defined(__arm__)
519
520
      m11_128=((int8x16_t *)m_11)[k];
      m10_128=((int8x16_t *)m_10)[k];
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
      m_b0 = vqaddq_s8(beta_ptr[4],m11_128);  //m11
      m_b1 = vqsubq_s8(beta_ptr[4],m11_128);  //m00
      m_b2 = vqsubq_s8(beta_ptr[5],m10_128);  //m01
      m_b3 = vqaddq_s8(beta_ptr[5],m10_128);  //m10
      m_b4 = vqaddq_s8(beta_ptr[6],m10_128);  //m10
      m_b5 = vqsubq_s8(beta_ptr[6],m10_128);  //m01
      m_b6 = vqsubq_s8(beta_ptr[7],m11_128);  //m00
      m_b7 = vqaddq_s8(beta_ptr[7],m11_128);  //m11
      new0 = vqsubq_s8(beta_ptr[0],m11_128);  //m00
      new1 = vqaddq_s8(beta_ptr[0],m11_128);  //m11
      new2 = vqaddq_s8(beta_ptr[1],m10_128);  //m10
      new3 = vqsubq_s8(beta_ptr[1],m10_128);  //m01
      new4 = vqsubq_s8(beta_ptr[2],m10_128);  //m01
      new5 = vqaddq_s8(beta_ptr[2],m10_128);  //m10
      new6 = vqaddq_s8(beta_ptr[3],m11_128);  //m11
      new7 = vqsubq_s8(beta_ptr[3],m11_128);  //m00
      beta_ptr-=8;
      beta_ptr[0] = vmaxq_s8(m_b0,new0);
      beta_ptr[1] = vmaxq_s8(m_b1,new1);
      beta_ptr[2] = vmaxq_s8(m_b2,new2);
      beta_ptr[3] = vmaxq_s8(m_b3,new3);
      beta_ptr[4] = vmaxq_s8(m_b4,new4);
      beta_ptr[5] = vmaxq_s8(m_b5,new5);
      beta_ptr[6] = vmaxq_s8(m_b6,new6);
      beta_ptr[7] = vmaxq_s8(m_b7,new7);
      beta_max = vmaxq_s8(beta_ptr[0],beta_ptr[1]);
      beta_max = vmaxq_s8(beta_max   ,beta_ptr[2]);
      beta_max = vmaxq_s8(beta_max   ,beta_ptr[3]);
      beta_max = vmaxq_s8(beta_max   ,beta_ptr[4]);
      beta_max = vmaxq_s8(beta_max   ,beta_ptr[5]);
      beta_max = vmaxq_s8(beta_max   ,beta_ptr[6]);
      beta_max = vmaxq_s8(beta_max   ,beta_ptr[7]);
      beta_ptr[0] = vqsubq_s8(beta_ptr[0],beta_max);
      beta_ptr[1] = vqsubq_s8(beta_ptr[1],beta_max);
      beta_ptr[2] = vqsubq_s8(beta_ptr[2],beta_max);
      beta_ptr[3] = vqsubq_s8(beta_ptr[3],beta_max);
      beta_ptr[4] = vqsubq_s8(beta_ptr[4],beta_max);
      beta_ptr[5] = vqsubq_s8(beta_ptr[5],beta_max);
      beta_ptr[6] = vqsubq_s8(beta_ptr[6],beta_max);
      beta_ptr[7] = vqsubq_s8(beta_ptr[7],beta_max);
#endif
562
563
564
    }

    // Set intial state for next iteration from the last state
565
    // as column last states are the first states of the next column
566
567
    // The initial state of column 0 is coming from tail bits (to be computed)
#if defined(__x86_64__) || defined(__i386__)
568
569
    beta128 = (__m128i *)&beta[0];
    beta_ptr   = (__m128i *)&beta[frame_length<<3];
570
571
572
573
574
575
576
577
    beta_ptr[0] = _mm_srli_si128(beta128[0],1);
    beta_ptr[1] = _mm_srli_si128(beta128[1],1);
    beta_ptr[2] = _mm_srli_si128(beta128[2],1);
    beta_ptr[3] = _mm_srli_si128(beta128[3],1);
    beta_ptr[4] = _mm_srli_si128(beta128[4],1);
    beta_ptr[5] = _mm_srli_si128(beta128[5],1);
    beta_ptr[6] = _mm_srli_si128(beta128[6],1);
    beta_ptr[7] = _mm_srli_si128(beta128[7],1);
578
#elif defined(__arm__)
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
    beta128 = (int8x16_t *)&beta[0];
    beta_ptr   = (int8x16_t *)&beta[frame_length<<3];
    beta_ptr[0] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[0],8);
    beta_ptr[0] = vsetq_lane_s8(beta[7],beta_ptr[0],8);
    beta_ptr[1] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[1],8);
    beta_ptr[1] = vsetq_lane_s8(beta[23],beta_ptr[1],8);
    beta_ptr[2] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[2],8);
    beta_ptr[2] = vsetq_lane_s8(beta[39],beta_ptr[2],8);
    beta_ptr[3] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[3],8);
    beta_ptr[3] = vsetq_lane_s8(beta[55],beta_ptr[3],8);
    beta_ptr[4] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[4],8);
    beta_ptr[4] = vsetq_lane_s8(beta[71],beta_ptr[4],8);
    beta_ptr[5] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[5],8);
    beta_ptr[5] = vsetq_lane_s8(beta[87],beta_ptr[5],8);
    beta_ptr[6] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[6],8);
    beta_ptr[6] = vsetq_lane_s8(beta[103],beta_ptr[6],8);
    beta_ptr[7] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[7],8);
    beta_ptr[7] = vsetq_lane_s8(beta[119],beta_ptr[7],8);
597
#endif
598
599
600
  }
}

601
void compute_ext8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,llr_t *ext, llr_t *systematic,unsigned short frame_length) {
602
#if defined(__x86_64__) || defined(__i386__)
603
604
605
606
607
608
609
610
  __m128i *alpha128=(__m128i *)alpha;
  __m128i *beta128=(__m128i *)beta;
  __m128i *m11_128,*m10_128,*ext_128;
  __m128i *alpha_ptr,*beta_ptr;
  __m128i m00_1,m00_2,m00_3,m00_4;
  __m128i m01_1,m01_2,m01_3,m01_4;
  __m128i m10_1,m10_2,m10_3,m10_4;
  __m128i m11_1,m11_2,m11_3,m11_4;
611
612
613
614
615
616
617
618
619
620
#elif defined(__arm__)
  int8x16_t *alpha128=(int8x16_t *)alpha;
  int8x16_t *beta128=(int8x16_t *)beta;
  int8x16_t *m11_128,*m10_128,*ext_128;
  int8x16_t *alpha_ptr,*beta_ptr;
  int8x16_t m00_1,m00_2,m00_3,m00_4;
  int8x16_t m01_1,m01_2,m01_3,m01_4;
  int8x16_t m10_1,m10_2,m10_3,m10_4;
  int8x16_t m11_1,m11_2,m11_3,m11_4;
#endif
621
622
623
624
625
  int k;
  //
  // LLR computation, 8 consequtive bits per loop
  //
#ifdef DEBUG_LOGMAP
626
  printf("compute_ext, %p, %p, %p, %p, %p, %p ,framelength %d\n",alpha,beta,m_11,m_10,ext,systematic,frame_length);
627
628
629
630
#endif
  alpha_ptr = alpha128;
  beta_ptr = &beta128[8];

631
  for (k=0; k<(frame_length>>4); k++) {
632
#if defined(__x86_64__) || defined(__i386__)
633
634
635
    m11_128        = (__m128i *)&m_11[k<<4];
    m10_128        = (__m128i *)&m_10[k<<4];
    ext_128        = (__m128i *)&ext[k<<4];
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
    m00_4 = _mm_adds_epi8(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
    m11_4 = _mm_adds_epi8(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11;
    m00_3 = _mm_adds_epi8(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00;
    m11_3 = _mm_adds_epi8(alpha_ptr[6],beta_ptr[3]); //ALPHA_BETA_3m11;
    m00_2 = _mm_adds_epi8(alpha_ptr[1],beta_ptr[4]); //ALPHA_BETA_2m00;
    m11_2 = _mm_adds_epi8(alpha_ptr[1],beta_ptr[0]); //ALPHA_BETA_2m11;
    m11_1 = _mm_adds_epi8(alpha_ptr[0],beta_ptr[4]); //ALPHA_BETA_1m11;
    m00_1 = _mm_adds_epi8(alpha_ptr[0],beta_ptr[0]); //ALPHA_BETA_1m00;
    m01_4 = _mm_adds_epi8(alpha_ptr[5],beta_ptr[6]); //ALPHA_BETA_4m01;
    m10_4 = _mm_adds_epi8(alpha_ptr[5],beta_ptr[2]); //ALPHA_BETA_4m10;
    m01_3 = _mm_adds_epi8(alpha_ptr[4],beta_ptr[2]); //ALPHA_BETA_3m01;
    m10_3 = _mm_adds_epi8(alpha_ptr[4],beta_ptr[6]); //ALPHA_BETA_3m10;
    m01_2 = _mm_adds_epi8(alpha_ptr[3],beta_ptr[1]); //ALPHA_BETA_2m01;
    m10_2 = _mm_adds_epi8(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10;
    m10_1 = _mm_adds_epi8(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
    m01_1 = _mm_adds_epi8(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;
    m01_1 = _mm_max_epi8(m01_1,m01_2);
    m01_1 = _mm_max_epi8(m01_1,m01_3);
    m01_1 = _mm_max_epi8(m01_1,m01_4);
    m00_1 = _mm_max_epi8(m00_1,m00_2);
    m00_1 = _mm_max_epi8(m00_1,m00_3);
    m00_1 = _mm_max_epi8(m00_1,m00_4);
    m10_1 = _mm_max_epi8(m10_1,m10_2);
    m10_1 = _mm_max_epi8(m10_1,m10_3);
    m10_1 = _mm_max_epi8(m10_1,m10_4);
    m11_1 = _mm_max_epi8(m11_1,m11_2);
    m11_1 = _mm_max_epi8(m11_1,m11_3);
    m11_1 = _mm_max_epi8(m11_1,m11_4);
    m01_1 = _mm_subs_epi8(m01_1,*m10_128);
    m00_1 = _mm_subs_epi8(m00_1,*m11_128);
    m10_1 = _mm_adds_epi8(m10_1,*m10_128);
    m11_1 = _mm_adds_epi8(m11_1,*m11_128);
    m01_1 = _mm_max_epi8(m01_1,m00_1);
    m10_1 = _mm_max_epi8(m10_1,m11_1);
    *ext_128 = _mm_subs_epi8(m10_1,m01_1);
    alpha_ptr+=8;
    beta_ptr+=8;
673
#elif defined(__arm__)
674
675
676
    m11_128        = (int8x16_t *)&m_11[k<<4];
    m10_128        = (int8x16_t *)&m_10[k<<4];
    ext_128        = (int8x16_t *)&ext[k<<4];
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
    m00_4 = vqaddq_s8(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
    m11_4 = vqaddq_s8(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11;
    m00_3 = vqaddq_s8(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00;
    m11_3 = vqaddq_s8(alpha_ptr[6],beta_ptr[3]); //ALPHA_BETA_3m11;
    m00_2 = vqaddq_s8(alpha_ptr[1],beta_ptr[4]); //ALPHA_BETA_2m00;
    m11_2 = vqaddq_s8(alpha_ptr[1],beta_ptr[0]); //ALPHA_BETA_2m11;
    m11_1 = vqaddq_s8(alpha_ptr[0],beta_ptr[4]); //ALPHA_BETA_1m11;
    m00_1 = vqaddq_s8(alpha_ptr[0],beta_ptr[0]); //ALPHA_BETA_1m00;
    m01_4 = vqaddq_s8(alpha_ptr[5],beta_ptr[6]); //ALPHA_BETA_4m01;
    m10_4 = vqaddq_s8(alpha_ptr[5],beta_ptr[2]); //ALPHA_BETA_4m10;
    m01_3 = vqaddq_s8(alpha_ptr[4],beta_ptr[2]); //ALPHA_BETA_3m01;
    m10_3 = vqaddq_s8(alpha_ptr[4],beta_ptr[6]); //ALPHA_BETA_3m10;
    m01_2 = vqaddq_s8(alpha_ptr[3],beta_ptr[1]); //ALPHA_BETA_2m01;
    m10_2 = vqaddq_s8(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10;
    m10_1 = vqaddq_s8(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
    m01_1 = vqaddq_s8(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;
    m01_1 = vmaxq_s8(m01_1,m01_2);
    m01_1 = vmaxq_s8(m01_1,m01_3);
    m01_1 = vmaxq_s8(m01_1,m01_4);
    m00_1 = vmaxq_s8(m00_1,m00_2);
    m00_1 = vmaxq_s8(m00_1,m00_3);
    m00_1 = vmaxq_s8(m00_1,m00_4);
    m10_1 = vmaxq_s8(m10_1,m10_2);
    m10_1 = vmaxq_s8(m10_1,m10_3);
    m10_1 = vmaxq_s8(m10_1,m10_4);
    m11_1 = vmaxq_s8(m11_1,m11_2);
    m11_1 = vmaxq_s8(m11_1,m11_3);
    m11_1 = vmaxq_s8(m11_1,m11_4);
    m01_1 = vqsubq_s8(m01_1,*m10_128);
    m00_1 = vqsubq_s8(m00_1,*m11_128);
    m10_1 = vqaddq_s8(m10_1,*m10_128);
    m11_1 = vqaddq_s8(m11_1,*m11_128);
    m01_1 = vmaxq_s8(m01_1,m00_1);
    m10_1 = vmaxq_s8(m10_1,m11_1);
    *ext_128 = vqsubq_s8(m10_1,m01_1);
    alpha_ptr+=8;
    beta_ptr+=8;
#endif
715
716
717
718
719
720
721
722
  }
}



// Per-block-size interleaver/deinterleaver index tables for the 8-bit decoder,
// one entry per row of the 3GPP TS 36.212 Table 5.1.3-3 size table (188 sizes).
// Allocated by init_td8(), released by free_td8().
int *pi2tab8[188],*pi5tab8[188],*pi4tab8[188],*pi6tab8[188];

723
void free_td8(void) {
724
725
726
  int ind;

  for (ind=0; ind<188; ind++) {
727
728
729
730
    free_and_zero(pi2tab8[ind]);
    free_and_zero(pi5tab8[ind]);
    free_and_zero(pi4tab8[ind]);
    free_and_zero(pi6tab8[ind]);
731
  }
732
733
}

734
735
736
737


extern RAN_CONTEXT_t RC;

738
void init_td8(void) {
739
  int ind,i,j,n,n2,pi,pi3;
740
  short *base_interleaver;
741

742
  for (ind=0; ind<188; ind++) {
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
    n = f1f2mat[ind].nb_bits;
    base_interleaver=il_tb+f1f2mat[ind].beg_index;
#ifdef MEX
    // This is needed for the Mex implementation to make the memory persistent
    pi2tab8[ind] = mxMalloc((n+8)*sizeof(int));
    pi5tab8[ind] = mxMalloc((n+8)*sizeof(int));
    pi4tab8[ind] = mxMalloc((n+8)*sizeof(int));
    pi6tab8[ind] = mxMalloc((n+8)*sizeof(int));
#else
    pi2tab8[ind] = malloc((n+8)*sizeof(int));
    pi5tab8[ind] = malloc((n+8)*sizeof(int));
    pi4tab8[ind] = malloc((n+8)*sizeof(int));
    pi6tab8[ind] = malloc((n+8)*sizeof(int));
#endif

    if ((n&15)>0) {
      n2 = n+8;
760
    } else
761
762
      n2 = n;

763
    for (j=0,i=0; i<n2; i++,j+=16) {
764
      if (j>=n2)
765
766
        j-=(n2-1);

767
768
769
      pi2tab8[ind][i] = j;
      //    printf("pi2[%d] = %d\n",i,j);
    }
770

771
    for (i=0; i<n2; i++) {
772
773
774
775
776
      pi = base_interleaver[i];//(unsigned int)threegpplte_interleaver(f1,f2,n);
      pi3 = pi2tab8[ind][pi];
      pi4tab8[ind][pi2tab8[ind][i]] = pi3;
      pi5tab8[ind][pi3] = pi2tab8[ind][i];
      pi6tab8[ind][pi] = pi2tab8[ind][i];
777
    }
778
779
780
  }
}

781
uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
782
783
784
785
786
787
788
789
790
791
792
793
794
795
                                       int16_t *y2,
                                       uint8_t *decoded_bytes,
                                       uint8_t *decoded_bytes2,
                                       uint16_t n,
                                       uint8_t max_iterations,
                                       uint8_t crc_type,
                                       uint8_t F,
                                       time_stats_t *init_stats,
                                       time_stats_t *alpha_stats,
                                       time_stats_t *beta_stats,
                                       time_stats_t *gamma_stats,
                                       time_stats_t *ext_stats,
                                       time_stats_t *intl1_stats,
                                       time_stats_t *intl2_stats) {
796
797
798
799
800
  /*  y is a pointer to the input
      decoded_bytes is a pointer to the decoded output
      n is the size in bits of the coded block, with the tail */
  int n2;
  llr_t y8[3*(n+16)] __attribute__((aligned(16)));
Thomas Laurent's avatar
Thomas Laurent committed
801
802
803
804
805
  llr_t systematic0[n+32] __attribute__ ((aligned(16)));
  llr_t systematic1[n+32] __attribute__ ((aligned(16)));
  llr_t systematic2[n+32] __attribute__ ((aligned(16)));
  llr_t yparity1[n+32] __attribute__ ((aligned(16)));
  llr_t yparity2[n+32] __attribute__ ((aligned(16)));
806
807
808
809
810
811
  llr_t ext[n+128] __attribute__((aligned(16)));
  llr_t ext2[n+128] __attribute__((aligned(16)));
  llr_t alpha[(n+16)*8] __attribute__ ((aligned(16)));
  llr_t beta[(n+16)*8] __attribute__ ((aligned(16)));
  llr_t m11[n+16] __attribute__ ((aligned(16)));
  llr_t m10[n+16] __attribute__ ((aligned(16)));
812
813
  //  int *pi2_p,*pi4_p,*pi5_p,*pi6_p;
  int *pi4_p,*pi5_p,*pi6_p;
Thomas Laurent's avatar
Thomas Laurent committed
814
  llr_t *s1,*s2,*yp1,*yp2,*yp;
815
816
817
  unsigned int i,j,iind;//,pi;
  unsigned char iteration_cnt=0;
  unsigned int crc,oldcrc,crc_len;
gauthier's avatar
gauthier committed
818
  uint8_t temp;
819
820
#if defined(__x86_64__) || defined(__i386__)
  __m128i *yp128;
821
  __m128i tmp128[(n+8)>>3];
822
  __m128i tmp={0}, zeros=_mm_setzero_si128();
823
824
825
826
#elif defined(__arm__)
  int8x16_t *yp128;
  int8x16_t tmp128[(n+8)>>3];
  int8x16_t tmp, zeros=vdupq_n_s8(0);
827
828
  const uint8_t __attribute__ ((aligned (16))) _Powers[16]=
  { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
829
830
831
  // Set the powers of 2 (do it once for all, if applicable)
  uint8x16_t Powers= vld1q_u8(_Powers);
#endif
832
833
834
  int offset8_flag=0;

  if (crc_type > 3) {
835
    printf("Illegal crc length!\n");
836
837
838
    return 255;
  }

839
  if (init_stats) start_meas(init_stats);
840
841
842
843

  if ((n&15)>0) {
    n2 = n+8;
    offset8_flag=1;
844
  } else
845
846
    n2 = n;

847
  for (iind=0; iind < 188 && f1f2mat[iind].nb_bits != n; iind++);
848

849
  if ( iind == 188 ) {
850
    printf("Illegal frame length!\n");
851
852
    return 255;
  }
853

854
  switch (crc_type) {
855
856
857
858
    case CRC24_A:
    case CRC24_B:
      crc_len=3;
      break;
859

860
861
862
    case CRC16:
      crc_len=2;
      break;
863

864
865
866
    case CRC8:
      crc_len=1;
      break;
867

868
869
    default:
      crc_len=3;
870
871
  }

872
#if defined(__x86_64__) || defined(__i386__)
873
  // note: this makes valgrind freak
874
  __m128i avg=_mm_set1_epi32(0);
875
876

  for (i=0; i<(3*(n>>4))+1; i++) {
877
878
    __m128i tmp=_mm_abs_epi16(_mm_unpackhi_epi16(((__m128i *)y)[i],((__m128i *)y)[i]));
    avg=_mm_add_epi32(_mm_cvtepi16_epi32(_mm_abs_epi16(((__m128i *)y)[i])),avg);
879
880
    avg=_mm_add_epi32(_mm_cvtepi16_epi32(tmp),avg);
  }
881

882
  int32_t round_avg=(_mm_extract_epi32(avg,0)+_mm_extract_epi32(avg,1)+_mm_extract_epi32(avg,2)+_mm_extract_epi32(avg,3))/(n*3);
883

884
  //printf("avg input turbo: %d sum %d taille bloc %d\n",round_avg,round_sum,n);
885

886
887
  if (round_avg < 16 )
    for (i=0,j=0; i<(3*(n2>>4))+1; i++,j+=2)
888
      ((__m128i *)y8)[i] = _mm_packs_epi16(((__m128i *)y)[j],((__m128i *)y)[j+1]);
889
890
  else if (round_avg < 32)
    for (i=0,j=0; i<(3*(n2>>4))+1; i++,j+=2)
891
892
      ((__m128i *)y8)[i] = _mm_packs_epi16(_mm_srai_epi16(((__m128i *)y)[j],1),_mm_srai_epi16(((__m128i *)y)[j+1],1));
  else if (round_avg < 64 )
893
    for (i=0,j=0; i<(3*(n2>>4))+1; i++,j+=2)
894
      ((__m128i *)y8)[i] = _mm_packs_epi16(_mm_srai_epi16(((__m128i *)y)[j],2),_mm_srai_epi16(((__m128i *)y)[j+1],2));
895
  else if (round_avg < 128)
896
    for (i=0,j=0; i<(3*(n2>>4))+1; i++,j+=2)
897
      ((__m128i *)y8)[i] = _mm_packs_epi16(_mm_srai_epi16(((__m128i *)y)[j],3),_mm_srai_epi16(((__m128i *)y)[j+1],3));
898
899
900
  else
    for (i=0,j=0; i<(3*(n2>>4))+1; i++,j+=2)
      ((__m128i *)y8)[i] = _mm_packs_epi16(_mm_srai_epi16(((__m128i *)y)[j],3),_mm_srai_epi16(((__m128i *)y)[j+1],4));
901

902
  yp128 = (__m128i *)y8;
903
904
905
906
#elif defined(__arm__)
  int32x4_t avg=vdupq_n_s32(0);

  for (i=0; i<(3*(n>>4))+1; i++) {
907
908
    int16x8_t tmp=vabsq_s16(((int16x8_t *)y)[i]);
    avg = vqaddq_s32(avg,vaddl_s16(((int16x4_t *)&tmp)[0],((int16x4_t *)&tmp)[1]));
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
  }

  int32_t round_avg=(vgetq_lane_s32(avg,0)+vgetq_lane_s32(avg,1)+vgetq_lane_s32(avg,2)+vgetq_lane_s32(avg,3))/(n*3);

  //printf("avg input turbo: %d sum %d taille bloc %d\n",round_avg,round_sum,n);

  if (round_avg < 16 )
    for (i=0,j=0; i<(3*(n2>>3))+1; i++,j+=2)
      ((int8x8_t *)y8)[i] = vqmovn_s16(((int16x8_t *)y)[j]);
  else if (round_avg < 32)
    for (i=0,j=0; i<(3*(n2>>3))+1; i++,j+=2)
      ((int8x8_t *)y8)[i] = vqmovn_s16(vshrq_n_s16(((int16x8_t *)y)[j],1));
  else if (round_avg < 64 )
    for (i=0,j=0; i<(3*(n2>>3))+1; i++,j+=2)
      ((int8x8_t *)y8)[i] = vqmovn_s16(vshrq_n_s16(((int16x8_t *)y)[j],2));
  else
    for (i=0,j=0; i<(3*(n2>>3))+1; i++,j+=2)
      ((int8x8_t *)y8)[i] = vqmovn_s16(vshrq_n_s16(((int16x8_t *)y)[j],3));

928
  yp128 = (int8x16_t *)y8;
929
#endif
930
931
932
933
934
935
  s1 = systematic1;
  s2 = systematic2;
  yp1 = yparity1;
  yp2 = yparity2;
  yp=y8;
#if 1
936

937
  for (i=0; i<16 ; i++ )
938
    for (j=0; j<n2; j+=16) {
939
      int k=i+j;
Thomas Laurent's avatar
Thomas Laurent committed
940
      systematic0[k]=*yp++;
941
942
      yp1[k]=*yp++;
      yp2[k]=*yp++;
943
944
    }

945
#endif
946
  yp=(llr_t *)yp128;
947
948
949

  if (n2>n) {
    /*
Thomas Laurent's avatar
Thomas Laurent committed
950
951
    systematic0[n]=0;systematic0[n+1]=0;systematic0[n+2]=0;systematic0[n+3]=0;
    systematic0[n+4]=0;s[n+5]=0;s[n+6]=0;s[n+7]=0;
952
953
954
955
    s1[n]=0;s1[n+1]=0;s1[n+2]=0;s1[n+3]=0;
    s1[n+4]=0;s1[n+5]=0;s1[n+6]=0;s1[n+7]=0;
    s2[n]=0;s2[n+1]=0;s2[n+2]=0;s2[n+3]=0;
    s2[n+4]=0;s2[n+5]=0;s2[n+6]=0;s2[n+7]=0;*/
956
    yp=(llr_t *)(y8+n);
957
958
959
960
961
  }

  //  printf("n=%d,n2=%d\n",n,n2);

  // Termination
962
  for (i=n2; i<n2+3; i++) {
Thomas Laurent's avatar
Thomas Laurent committed
963
964
965
    systematic0[i]= *yp;
    s1[i] = systematic0[i] ;
    s2[i] = systematic0[i];
966
967
968
    yp++;
    yp1[i] = *yp;
    yp++;
969
#ifdef DEBUG_LOGMAP
Thomas Laurent's avatar
Thomas Laurent committed
970
    printf("Term 1 (%u): %d %d\n",i,systematic0[i],yp1[i]);
971
972
973
#endif //DEBUG_LOGMAP
  }

974
  for (i=n2+16; i<n2+19; i++) {
Thomas Laurent's avatar
Thomas Laurent committed
975
976
977
    systematic0[i]= *yp;
    s1[i] = systematic0[i] ;
    s2[i] = systematic0[i];
978
979
980
    yp++;
    yp2[i-16] = *yp;
    yp++;
981
#ifdef DEBUG_LOGMAP
Thomas Laurent's avatar
Thomas Laurent committed
982
    printf("Term 2 (%u): %d %d\n",i-16,systematic0[i],yp2[i-16]);
983
984
985
986
#endif //DEBUG_LOGMAP
  }

#ifdef DEBUG_LOGMAP
987
  printf("\n");
988
#endif //DEBUG_LOGMAP
989

990
  if (init_stats) stop_meas(init_stats);
991
992
993

  // do log_map from first parity bit
  log_map8(systematic0,yparity1,m11,m10,alpha,beta,ext,n2,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
994

995
  while (iteration_cnt++ < max_iterations) {
996
#ifdef DEBUG_LOGMAP
997
    printf("\n*******************ITERATION %d (n %d, n2 %d), ext %p\n\n",iteration_cnt,n,n2,ext);
998
#endif //DEBUG_LOGMAP
999

1000
    if (intl1_stats) start_meas(intl1_stats);
1001

1002
    pi4_p=pi4tab8[iind];
1003
1004

    for (i=0; i<(n2>>4); i++) { // steady-state portion
1005
#if defined(__x86_64__) || defined(__i386__)
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
      tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],0);
      tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],1);
      tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],2);
      tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],3);
      tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],4);
      tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],5);
      tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],6);
      tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],7);
      tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],8);
      tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],9);
      tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],10);
      tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],11);
      tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],12);
      tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],13);
      tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],14);
      ((__m128i *)systematic2)[i]=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],15);
1022
#elif defined(__arm__)
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
      tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,0);
      tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,1);
      tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,2);
      tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,3);
      tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,4);
      tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,5);
      tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,6);
      tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,7);
      tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,8);
      tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,9);
      tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,10);
      tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,11);
      tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,12);
      tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,13);
      tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,14);
      ((int8x16_t *)systematic2)[i]=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,15);
1039
#endif
1040
1041
    }

1042
    if (intl1_stats) stop_meas(intl1_stats);
1043
1044

    // do log_map from second parity bit
1045
1046
    log_map8(systematic2,yparity2,m11,m10,alpha,beta,ext2,n2,1,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
    pi5_p=pi5tab8[iind];
gauthier's avatar
gauthier committed
1047
    uint16_t decoded_bytes_interl[6144/16] __attribute__((aligned(16)));
1048

1049
    if ((n2&0x7f) == 0) {  // n2 is a multiple of 128 bits
1050
      for (i=0; i<(n2>>4); i++) {
1051
#if defined(__x86_64__) || defined(__i386__)
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],0);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],1);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],2);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],3);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],4);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],5);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],6);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],7);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],8);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],9);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],10);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],11);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],12);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],13);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],14);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],15);
        decoded_bytes_interl[i]=(uint16_t) _mm_movemask_epi8(_mm_cmpgt_epi8(tmp,zeros));
1069
        ((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i *)ext)[i]),((__m128i *)systematic0)[i]);
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
#elif defined(__arm__)
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,0);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,1);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,2);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,3);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,4);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,5);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,6);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,7);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,8);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,9);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,10);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,11);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,12);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,13);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,14);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,15);
1087
1088
1089
1090
        uint64x2_t Mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(vcgtq_s8(tmp,zeros), Powers))));
        vst1q_lane_u8(&((uint8_t *)&decoded_bytes[i])[0], (uint8x16_t)Mask, 0);
        vst1q_lane_u8(&((uint8_t *)&decoded_bytes[i])[1], (uint8x16_t)Mask, 8);
        ((int8x16_t *)systematic1)[i] = vqaddq_s8(vqsubq_s8(tmp,((int8x16_t *)ext)[i]),((int8x16_t *)systematic0)[i]);