/*******************************************************************************
    OpenAirInterface
    Copyright(c) 1999 - 2014 Eurecom

    OpenAirInterface is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.


    OpenAirInterface is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with OpenAirInterface. The full GNU General Public License is
    included in this distribution in the file called "COPYING". If not,
    see <http://www.gnu.org/licenses/>.

  Contact Information
  OpenAirInterface Admin: openair_admin@eurecom.fr
  OpenAirInterface Tech : openair_tech@eurecom.fr
  OpenAirInterface Dev  : openair4g-devel@lists.eurecom.fr

  Address      : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE

 *******************************************************************************/
#include "defs.h"
//#include "MAC_INTERFACE/extern.h"
#ifdef USER_MODE
#include <stdio.h>
#endif

35 36
#if defined(__x86_64__) || defined(__i386__)
int16_t conjug[8]__attribute__((aligned(16))) = {-1,1,-1,1,-1,1,-1,1} ;
Elena Lukashova's avatar
Elena Lukashova committed
37 38
int16_t conjug2[8]__attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1} ;

39 40
#define simd_q15_t __m128i
#define simdshort_q15_t __m64
Elena Lukashova's avatar
Elena Lukashova committed
41 42
#define set1_int16(a) _mm_set1_epi16(a)
#define setr_int16(a0, a1, a2, a3, a4, a5, a6, a7) _mm_setr_epi16(a0, a1, a2, a3, a4, a5, a6, a7 )
43 44 45 46 47 48 49
#elif defined(__arm__)
int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1} ;
#define simd_q15_t int16x8_t
#define simdshort_q15_t int16x4_t
#define _mm_empty()
#define _m_empty()
#endif
50

51 52 53 54 55
int mult_cpx_conj_vector(int16_t *x1,
                         int16_t *x2,
                         int16_t *y,
                         uint32_t N,
                         int output_shift)
56
{
57
  // Multiply elementwise the complex conjugate of x1 with x2. 
58
  // x1       - input 1    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
59 60
  //            We assume x1 with a dinamic of 15 bit maximum
  //
61
  // x2       - input 2    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
62 63
  //            We assume x2 with a dinamic of 14 bit maximum
  ///
64
  // y        - output     in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
65 66 67
  //
  // N        - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
  //
68
  // output_shift  - shift to be applied to generate output
69

gauthier's avatar
gauthier committed
70
  uint32_t i;                 // loop counter
71

72 73 74 75 76 77
  simd_q15_t *x1_128;
  simd_q15_t *x2_128;
  simd_q15_t *y_128;
#if defined(__x86_64__) || defined(__i386__)
  simd_q15_t tmp_re,tmp_im;
  simd_q15_t tmpy0,tmpy1;
Elena Lukashova's avatar
Elena Lukashova committed
78
  
79 80 81 82 83
#elif defined(__arm__)
  int32x4_t tmp_re,tmp_im;
  int32x4_t tmp_re1,tmp_im1;
  int16x4x2_t tmpy; 
  int32x4_t shift = vdupq_n_s32(-output_shift); 
84
#endif
85

86 87 88
  x1_128 = (simd_q15_t *)&x1[0];
  x2_128 = (simd_q15_t *)&x2[0];
  y_128  = (simd_q15_t *)&y[0];
89

Elena Lukashova's avatar
Elena Lukashova committed
90
 
91
  // we compute 4 cpx multiply for each loop
92 93 94 95 96 97
  for(i=0; i<(N>>2); i++) {
  #if defined(__x86_64__) || defined(__i386__)
    tmp_re = _mm_madd_epi16(*x1_128,*x2_128);
    tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_sign_epi16(tmp_im,*(__m128i*)&conjug[0]);
98
    tmp_im = _mm_madd_epi16(tmp_im,*x2_128);
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
    tmp_re = _mm_srai_epi32(tmp_re,output_shift);
    tmp_im = _mm_srai_epi32(tmp_im,output_shift);
    tmpy0  = _mm_unpacklo_epi32(tmp_re,tmp_im);
    tmpy1  = _mm_unpackhi_epi32(tmp_re,tmp_im);
    *y_128 = _mm_packs_epi32(tmpy0,tmpy1);
#elif defined(__arm__)

    tmp_re  = vmull_s16(((simdshort_q15_t *)x1_128)[0], ((simdshort_q15_t*)x2_128)[0]);
    //tmp_re = [Re(x1[0])Re(x2[0]) Im(x1[0])Im(x2[0]) Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1])] 
    tmp_re1 = vmull_s16(((simdshort_q15_t *)x1_128)[1], ((simdshort_q15_t*)x2_128)[1]);
    //tmp_re1 = [Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1]) Re(x1[1])Re(x2[2]) Im(x1[1])Im(x2[2])] 
    tmp_re  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
                           vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
    //tmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2]) Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] 

    tmp_im  = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[0],*(simdshort_q15_t*)conjug)), ((simdshort_q15_t*)x1_128)[0]);
    //tmp_im = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])]
    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[1],*(simdshort_q15_t*)conjug)), ((simdshort_q15_t*)x1_128)[1]);
    //tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
    tmp_im  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
                           vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
    //tmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]

    tmp_re = vqshlq_s32(tmp_re,shift);
    tmp_im = vqshlq_s32(tmp_im,shift);
    tmpy   = vzip_s16(vmovn_s32(tmp_re),vmovn_s32(tmp_im));
    *y_128 = vcombine_s16(tmpy.val[0],tmpy.val[1]);
126
#endif
127 128 129
    x1_128++;
    x2_128++;
    y_128++;
130 131 132 133 134 135 136 137
  }


  _mm_empty();
  _m_empty();

  return(0);
}
138

Elena Lukashova's avatar
Elena Lukashova committed
139

140 141
int mult_cpx_vector(int16_t *x1, //Q15
                    int16_t *x2,//Q13
Elena Lukashova's avatar
Elena Lukashova committed
142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
                    int16_t *y,
                    uint32_t N,
                    int output_shift)
{
  // Multiply elementwise x1 with x2. 
  // x1       - input 1    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
  //            We assume x1 with a dinamic of 15 bit maximum
  //
  // x2       - input 2    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
  //            We assume x2 with a dinamic of 14 bit maximum
  ///
  // y        - output     in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
  //
  // N        - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
  //
  // output_shift  - shift to be applied to generate output

  uint32_t i;                 // loop counter

  simd_q15_t *x1_128;
  simd_q15_t *x2_128;
  simd_q15_t *y_128;
  simd_q15_t tmp_re,tmp_im;
  simd_q15_t tmpy0,tmpy1;


  x1_128 = (simd_q15_t *)&x1[0];
  x2_128 = (simd_q15_t *)&x2[0];
  y_128  = (simd_q15_t *)&y[0];
  //print_shorts("x1_128:",&x1_128[0]);
172
 // print_shorts("x2_128:",&x2_128[0]);
Elena Lukashova's avatar
Elena Lukashova committed
173 174 175 176
  
  //right shift by 13 while p_a * x0 and 15 while  
  // we compute 4 cpx multiply for each loop
  for(i=0; i<(N>>2); i++) {
177
    tmp_re = _mm_sign_epi16(*x1_128,*(__m128i*)&conjug2[0]);// Q15
Elena Lukashova's avatar
Elena Lukashova committed
178
    //print_shorts("tmp_re1:",&tmp_re[i]);
179
    tmp_re = _mm_madd_epi16(tmp_re,*x2_128); //Q28
Elena Lukashova's avatar
Elena Lukashova committed
180 181
    //print_ints("tmp_re2:",&tmp_re[i]);

182
    tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1)); //Q15
Elena Lukashova's avatar
Elena Lukashova committed
183
    //print_shorts("tmp_im1:",&tmp_im[i]);
184
    tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1)); //Q15
Elena Lukashova's avatar
Elena Lukashova committed
185
    //print_shorts("tmp_im2:",&tmp_im[i]);
186
    tmp_im = _mm_madd_epi16(tmp_im, *x2_128); //Q28
Elena Lukashova's avatar
Elena Lukashova committed
187
    //print_ints("tmp_im3:",&tmp_im[i]);
188
    tmp_re = _mm_srai_epi32(tmp_re,output_shift);//Q(28-shift)
Elena Lukashova's avatar
Elena Lukashova committed
189
    //print_ints("tmp_re shifted:",&tmp_re[i]);
190
    tmp_im = _mm_srai_epi32(tmp_im,output_shift); //Q(28-shift)
Elena Lukashova's avatar
Elena Lukashova committed
191
    //print_ints("tmp_im shifted:",&tmp_im[i]);
192
    tmpy0  = _mm_unpacklo_epi32(tmp_re,tmp_im); //Q(28-shift)
Elena Lukashova's avatar
Elena Lukashova committed
193
    //print_ints("unpack lo :",&tmpy0[i]);
194
    tmpy1  = _mm_unpackhi_epi32(tmp_re,tmp_im); //Q(28-shift)
Elena Lukashova's avatar
Elena Lukashova committed
195
    //print_ints("mrc rho0:",&tmpy1[i]);
196
    *y_128 = _mm_packs_epi32(tmpy0,tmpy1); //must be Q15 
Elena Lukashova's avatar
Elena Lukashova committed
197 198 199 200 201 202 203 204 205 206 207 208 209
    //print_shorts("*y_128:",&y_128[i]);

    x1_128++;
    x2_128++;
    y_128++;
  }


  _mm_empty();
  _m_empty();

  return(0);
}