Something went wrong on our end
-
Laurent Thomas authoredLaurent Thomas authored
cmult_sv.c 10.92 KiB
/*
* Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The OpenAirInterface Software Alliance licenses this file to You under
* the OAI Public License, Version 1.1 (the "License"); you may not use this file
* except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.openairinterface.org/?page_id=698
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org
*/
#include "PHY/sse_intrin.h"
#include "tools_defs.h"
#if defined(__x86_64__) || defined(__i386__)
#define simd_q15_t __m128i
#define simdshort_q15_t __m64
#define shiftright_int16(a,shift) _mm_srai_epi16(a,shift)
#define set1_int16(a) _mm_set1_epi16(a)
#define mulhi_int16(a,b) _mm_slli_epi16(_mm_mulhi_epi16(a,b),1)
#define mulhi_s1_int16(a,b) _mm_slli_epi16(_mm_mulhi_epi16(a,b),2)
#define adds_int16(a,b) _mm_adds_epi16(a,b)
#define mullo_int16(a,b) _mm_mullo_epi16(a,b)
#elif defined(__arm__)
#define simd_q15_t int16x8_t
#define simdshort_q15_t int16x4_t
#define shiftright_int16(a,shift) vshrq_n_s16(a,shift)
#define set1_int16(a) vdupq_n_s16(a)
#define mulhi_int16(a,b) vqdmulhq_s16(a,b)
#define mulhi_s1_int16(a,b) vshlq_n_s16(vqdmulhq_s16(a,b),1)
#define adds_int16(a,b) vqaddq_s16(a,b)
#define mullo_int16(a,b) vmulq_s16(a,b)
#define _mm_empty()
#define _m_empty()
#endif
void multadd_complex_vector_real_scalar(int16_t *x,
int16_t alpha,
int16_t *y,
uint8_t zero_flag,
uint32_t N)
{
simd_q15_t alpha_128,*x_128=(simd_q15_t *)x,*y_128=(simd_q15_t*)y;
int n;
alpha_128 = set1_int16(alpha);
if (zero_flag == 1)
for (n=0; n<N>>2; n++) {
// print_shorts("x_128[n]=", &x_128[n]);
// print_shorts("alpha_128", &alpha_128);
y_128[n] = mulhi_int16(x_128[n],alpha_128);
// print_shorts("y_128[n]=", &y_128[n]);
}
else
for (n=0; n<N>>2; n++) {
y_128[n] = adds_int16(y_128[n],mulhi_int16(x_128[n],alpha_128));
}
_mm_empty();
_m_empty();
}
void multadd_real_vector_complex_scalar(int16_t *x,
int16_t *alpha,
int16_t *y,
uint32_t N)
{
uint32_t i;
// do 8 multiplications at a time
simd_q15_t alpha_r_128,alpha_i_128,yr,yi,*x_128=(simd_q15_t*)x,*y_128=(simd_q15_t*)y;
int j;
// printf("alpha = %d,%d\n",alpha[0],alpha[1]);
alpha_r_128 = set1_int16(alpha[0]);
alpha_i_128 = set1_int16(alpha[1]);
j=0;
for (i=0; i<N>>3; i++) {
yr = mulhi_s1_int16(alpha_r_128,x_128[i]);
yi = mulhi_s1_int16(alpha_i_128,x_128[i]);
#if defined(__x86_64__) || defined(__i386__)
y_128[j] = _mm_adds_epi16(y_128[j],_mm_unpacklo_epi16(yr,yi));
j++;
y_128[j] = _mm_adds_epi16(y_128[j],_mm_unpackhi_epi16(yr,yi));
j++;
#elif defined(__arm__)
int16x8x2_t yint;
yint = vzipq_s16(yr,yi);
y_128[j] = adds_int16(y_128[j],yint.val[0]);
j++;
y_128[j] = adds_int16(y_128[j],yint.val[1]);
j++;
#endif
}
_mm_empty();
_m_empty();
}
void multadd_real_four_symbols_vector_complex_scalar(int16_t *x,
int16_t *alpha,
int16_t *y)
{
// do 8 multiplications at a time
simd_q15_t alpha_r_128,alpha_i_128,yr,yi,*x_128=(simd_q15_t*)x;
simd_q15_t y_128;
y_128 = _mm_loadu_si128((simd_q15_t*)y);
alpha_r_128 = set1_int16(alpha[0]);
alpha_i_128 = set1_int16(alpha[1]);
yr = mulhi_s1_int16(alpha_r_128,x_128[0]);
yi = mulhi_s1_int16(alpha_i_128,x_128[0]);
y_128 = _mm_adds_epi16(y_128,_mm_unpacklo_epi16(yr,yi));
y_128 = _mm_adds_epi16(y_128,_mm_unpackhi_epi16(yr,yi));
_mm_storeu_si128((simd_q15_t*)y, y_128);
_mm_empty();
_m_empty();
}
#ifdef __AVX2__
void rotate_cpx_vector(c16_t *x,
c16_t *alpha,
c16_t *y,
uint32_t N,
uint16_t output_shift)
{
// Multiply elementwise two complex vectors of N elements
// x - input 1 in the format |Re0 Im0 |,......,|Re(N-1) Im(N-1)|
// We assume x1 with a dynamic of 15 bit maximum
//
// alpha - input 2 in the format |Re0 Im0|
// We assume x2 with a dynamic of 15 bit maximum
//
// y - output in the format |Re0 Im0|,......,|Re(N-1) Im(N-1)|
//
// N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
//
// log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0)
// WARNING: log2_amp>0 can cause overflow!!
uint32_t i; // loop counter
simd_q15_t *y_128,alpha_128;
int32_t *xd=(int32_t *)x;
#if defined(__x86_64__) || defined(__i386__)
__m128i shift = _mm_cvtsi32_si128(output_shift);
register simd_q15_t m0,m1,m2,m3;
((int16_t *)&alpha_128)[0] = alpha->r;
((int16_t *)&alpha_128)[1] = -alpha->i;
((int16_t *)&alpha_128)[2] = alpha->i;
((int16_t *)&alpha_128)[3] = alpha->r;
((int16_t *)&alpha_128)[4] = alpha->r;
((int16_t *)&alpha_128)[5] = -alpha->i;
((int16_t *)&alpha_128)[6] = alpha->i;
((int16_t *)&alpha_128)[7] = alpha->r;
#elif defined(__arm__)
int32x4_t shift;
int32x4_t ab_re0,ab_re1,ab_im0,ab_im1,re32,im32;
int16_t reflip[8] __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1};
int32x4x2_t xtmp;
((int16_t *)&alpha_128)[0] = alpha->r;
((int16_t *)&alpha_128)[1] = alpha->i;
((int16_t *)&alpha_128)[2] = alpha->r;
((int16_t *)&alpha_128)[3] = alpha->i;
((int16_t *)&alpha_128)[4] = alpha->r;
((int16_t *)&alpha_128)[5] = alpha->i;
((int16_t *)&alpha_128)[6] = alpha->r;
((int16_t *)&alpha_128)[7] = alpha->i;
int16x8_t bflip = vrev32q_s16(alpha_128);
int16x8_t bconj = vmulq_s16(alpha_128,*(int16x8_t *)reflip);
shift = vdupq_n_s32(-output_shift);
#endif
y_128 = (simd_q15_t *) y;
for(i=0; i<N>>2; i++) {
#if defined(__x86_64__) || defined(__i386__)
m0 = _mm_setr_epi32(xd[0],xd[0],xd[1],xd[1]);
m1 = _mm_setr_epi32(xd[2],xd[2],xd[3],xd[3]);
m2 = _mm_madd_epi16(m0,alpha_128); //complex multiply. result is 32bit [Re Im Re Im]
m3 = _mm_madd_epi16(m1,alpha_128); //complex multiply. result is 32bit [Re Im Re Im]
m2 = _mm_sra_epi32(m2,shift); // shift right by shift in order to compensate for the input amplitude
m3 = _mm_sra_epi32(m3,shift); // shift right by shift in order to compensate for the input amplitude
y_128[0] = _mm_packs_epi32(m2,m3); // pack in 16bit integers with saturation [re im re im re im re im]
//print_ints("y_128[0]=", &y_128[0]);
#elif defined(__arm__)
ab_re0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bconj)[0]);
ab_re1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bconj)[1]);
ab_im0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bflip)[0]);
ab_im1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bflip)[1]);
re32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_re0)[0],((int32x2_t*)&ab_re0)[1]),
vpadd_s32(((int32x2_t*)&ab_re1)[0],((int32x2_t*)&ab_re1)[1])),
shift);
im32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_im0)[0],((int32x2_t*)&ab_im0)[1]),
vpadd_s32(((int32x2_t*)&ab_im1)[0],((int32x2_t*)&ab_im1)[1])),
shift);
xtmp = vzipq_s32(re32,im32);
y_128[0] = vcombine_s16(vmovn_s32(xtmp.val[0]),vmovn_s32(xtmp.val[1]));
#endif
xd+=4;
y_128+=1;
}
_mm_empty();
_m_empty();
return;
}
#endif
/*
int mult_vector32_scalar(int16_t *x1,
int x2,
int16_t *y,
uint32_t N)
{
// Multiply elementwise two real vectors of N elements
// x1 - input 1 in the format |Re(0) xxx Re(1) xxx|,......,|Re(N-2) xxx Re(N-1) xxx|
// We assume x1 with a dinamic of 31 bit maximum
//
// x1 - input 2
//
// y - output in the format |Re0 (64bit) |,......,|Re(N-1) (64bit)|
//
// N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
//
uint32_t i; // loop counter
__m128i *x1_128;
__m128i x2_128;
__m128i *y_128;
x1_128 = (__m128i *)&x1[0];
x2_128 = _mm_setr_epi32(x2,0,x2,0);
y_128 = (__m128i *)&y[0];
// we compute 4 cpx multiply for each loop
for(i=0; i<(N>>3); i++) {
y_128[0] = _mm_mul_epu32(x1_128[0],x2_128);
y_128[1] = _mm_mul_epu32(x1_128[1],x2_128);
y_128[2] = _mm_mul_epu32(x1_128[2],x2_128);
y_128[3] = _mm_mul_epu32(x1_128[3],x2_128);
x1_128+=4;
y_128 +=4;
}
_mm_empty();
_m_empty();
return(0);
}
*/
int complex_conjugate(int16_t *x1,
int16_t *y,
uint32_t N)
{
uint32_t i; // loop counter
simd_q15_t *x1_128;
simd_q15_t *y_128;
int16_t x2[8] __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1};
simd_q15_t *x2_128 = (simd_q15_t*)&x2[0];
x1_128 = (simd_q15_t *)&x1[0];
y_128 = (simd_q15_t *)&y[0];
// we compute 4 cpx multiply for each loop
for(i=0; i<(N>>3); i++) {
y_128[0] = mullo_int16(x1_128[0],*x2_128);
y_128[1] = mullo_int16(x1_128[1],*x2_128);
y_128[2] = mullo_int16(x1_128[2],*x2_128);
y_128[3] = mullo_int16(x1_128[3],*x2_128);
x1_128+=4;
y_128 +=4;
}
_mm_empty();
_m_empty();
return(0);
}
#ifdef MAIN
#define L 8
main ()
{
int16_t input[256] __attribute__((aligned(16)));
int16_t input2[256] __attribute__((aligned(16)));
int16_t output[256] __attribute__((aligned(16)));
c16_t alpha;
int i;
input[0] = 100;
input[1] = 200;
input[2] = -200;
input[3] = 100;
input[4] = 1000;
input[5] = 2000;
input[6] = -2000;
input[7] = 1000;
input[8] = 100;
input[9] = 200;
input[10] = -200;
input[11] = 100;
input[12] = 1000;
input[13] = 2000;
input[14] = -2000;
input[15] = 1000;
input2[0] = 1;
input2[1] = 2;
input2[2] = 1;
input2[3] = 2;
input2[4] = 10;
input2[5] = 20;
input2[6] = 10;
input2[7] = 20;
input2[8] = 1;
input2[9] = 2;
input2[10] = 1;
input2[11] = 2;
input2[12] = 1000;
input2[13] = 2000;
input2[14] = 1000;
input2[15] = 2000;
alpha->r=32767;
alpha->i=0;
//mult_cpx_vector(input,input2,output,L,0);
rotate_cpx_vector_norep(input,alpha,input,L,15);
for(i=0; i<L<<1; i++)
printf("output[i]=%d\n",input[i]);
}
#endif //MAIN