Commit 89849444 authored by Florian Kaltenberger's avatar Florian Kaltenberger
Browse files

fixing a bug in cmult_vv, removing unused code.

parent 78913e26
/*******************************************************************************
OpenAirInterface
Copyright(c) 1999 - 2014 Eurecom
OpenAirInterface is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OpenAirInterface is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OpenAirInterface.The full GNU General Public License is
included in this distribution in the file called "COPYING". If not,
see <http://www.gnu.org/licenses/>.
Contact Information
OpenAirInterface Admin: openair_admin@eurecom.fr
OpenAirInterface Tech : openair_tech@eurecom.fr
OpenAirInterface Dev : openair4g-devel@lists.eurecom.fr
Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE
*******************************************************************************/
#include "defs.h"
int mult_cpx_matrix_h(short *x1[2][2],
short *x2[2][2],
short *y[2][2],
unsigned int N,
unsigned short output_shift,
short hermitian)
{
if (hermitian) {
// this computes x1^H*x2 and stores it in y
mult_cpx_vector_h(x2[0][0],x1[0][0],y[0][0],N,output_shift,1);
mult_cpx_vector_h(x2[0][1],x1[0][1],y[0][0],N,output_shift,1);
mult_cpx_vector_h(x2[0][0],x1[1][0],y[1][0],N,output_shift,1);
mult_cpx_vector_h(x2[0][1],x1[1][1],y[1][0],N,output_shift,1);
mult_cpx_vector_h(x2[1][0],x1[0][0],y[0][1],N,output_shift,1);
mult_cpx_vector_h(x2[1][1],x1[0][1],y[0][1],N,output_shift,1);
mult_cpx_vector_h(x2[1][0],x1[1][0],y[1][1],N,output_shift,1);
mult_cpx_vector_h(x2[1][1],x1[1][1],y[1][1],N,output_shift,1);
} else {
// this computes x1*x2^H and stores it in y
mult_cpx_vector_h(x1[0][0],x2[0][0],y[0][0],N,output_shift,1);
mult_cpx_vector_h(x1[0][1],x2[0][1],y[0][0],N,output_shift,1);
mult_cpx_vector_h(x1[0][0],x2[1][0],y[0][1],N,output_shift,1);
mult_cpx_vector_h(x1[0][1],x2[1][1],y[0][1],N,output_shift,1);
mult_cpx_vector_h(x1[1][0],x2[0][0],y[1][0],N,output_shift,1);
mult_cpx_vector_h(x1[1][1],x2[0][1],y[1][0],N,output_shift,1);
mult_cpx_vector_h(x1[1][0],x2[1][0],y[1][1],N,output_shift,1);
mult_cpx_vector_h(x1[1][1],x2[1][1],y[1][1],N,output_shift,1);
}
}
int mult_cpx_matrix_vector(int *x1[2][2],
int *x2[2],
int *y[2],
unsigned int N,
unsigned short output_shift)
{
Zero_Buffer(y[0],N*8);
Zero_Buffer(y[1],N*8);
// this computes x1*x2 and stores it in y (32 bit)
mult_cpx_vector_add32((short*)x2[0],(short*)x1[0][0],(short*)y[0],N);
mult_cpx_vector_add32((short*)x2[1],(short*)x1[0][1],(short*)y[0],N);
mult_cpx_vector_add32((short*)x2[0],(short*)x1[1][0],(short*)y[1],N);
mult_cpx_vector_add32((short*)x2[1],(short*)x1[1][1],(short*)y[1],N);
// shift and pack
shift_and_pack((short*)y[0],N,output_shift);
shift_and_pack((short*)y[1],N,output_shift);
}
#ifdef MAIN_MM
#include <stdio.h>
#include <stdlib.h>
main ()
{
short x1_00[256] __attribute__((aligned(16)));
short x1_10[256] __attribute__((aligned(16)));
short x1_01[256] __attribute__((aligned(16)));
short x1_11[256] __attribute__((aligned(16)));
short x2_0[256] __attribute__((aligned(16)));
short x2_1[256] __attribute__((aligned(16)));
short y_0[256] __attribute__((aligned(16)));
short y_1[256] __attribute__((aligned(16)));
int *x1[2][2];
int *x2[2];
int *y[2];
int i,m,n;
x1[0][0] = (int*)x1_00;
x1[0][1] = (int*)x1_01;
x1[1][0] = (int*)x1_10;
x1[1][1] = (int*)x1_11;
x2[0] = (int*)x2_0;
x2[1] = (int*)x2_1;
y[0] = (int*)y_0;
y[1] = (int*)y_1;
for(m=0; m<2; m++) {
for(n=0; n<2; n++) {
for(i=0; i<256; i+=4) {
((short*)x1[m][n])[i] = ((short) rand())/4;
((short*)x1[m][n])[i+1] = ((short) rand())/4;
((short*)x1[m][n])[i+2] = -((short*)x1[m][n])[i+1];
((short*)x1[m][n])[i+3] = ((short*)x1[m][n])[i];
}
}
for(i=0; i<256; i+=4) {
((short*)x2[m])[i] = ((short) rand())/4;
((short*)x2[m])[i+1] = ((short) rand())/4;
((short*)x2[m])[i+2] = ((short*)x2[m])[i];
((short*)x2[m])[i+3] = ((short*)x2[m])[i+1];
}
Zero_Buffer(y[m],512);
}
/*
input[0] = 100;
input[1] = 200;
input[2] = -200;
input[3] = 100;
input[4] = 1000;
input[5] = 2000;
input[6] = -2000;
input[7] = 1000;
input[8] = 100;
input[9] = 200;
input[10] = -200;
input[11] = 100;
input[12] = 1000;
input[13] = 2000;
input[14] = -2000;
input[15] = 1000;
input2[0] = 2;
input2[1] = 1;
input2[2] = 2;
input2[3] = 1;
input2[4] = 20;
input2[5] = 10;
input2[6] = 20;
input2[7] = 10;
input2[8] = 2;
input2[9] = 1;
input2[10] = 2;
input2[11] = 1;
input2[12] = 2000;
input2[13] = 1000;
input2[14] = 2000;
input2[15] = 1000;
x1[0][0] = (int*)input;
x1[0][1] = (int*)input;
x1[1][0] = (int*)input;
x1[1][1] = (int*)input;
x2[0] = (int*)input2;
x2[1] = (int*)input2;
y[0] = (int*)output;
y[1] = (int*)output2;
*/
mult_cpx_matrix_vector(x1,x2,y,64,15);
//mult_cpx_vector_add32(x2[0],x1[0][0],y[0],64);
for (i=0; i<128; i+=2)
printf("i=%d, x1 = [%d+1i*%d %d+1i*%d; %d+1i*%d %d+1i*%d]; x2 = [%d+1i*%d; %d+1i*%d]; y = [%d+1i*%d; %d+1i*%d]; y_m= round(x1*x2./pow2(15)); y-y_m \n",
i,
((short*)x1[0][0])[2*i], ((short*)x1[0][0])[2*i+2],
((short*)x1[0][1])[2*i], ((short*)x1[0][1])[2*i+2],
((short*)x1[1][0])[2*i], ((short*)x1[1][0])[2*i+2],
((short*)x1[1][1])[2*i], ((short*)x1[1][1])[2*i+2],
((short*)x2[0])[2*i], ((short*)x2[0])[2*i+1],
((short*)x2[1])[2*i], ((short*)x2[1])[2*i+1],
((short*)y[0])[2*i], ((short*)y[0])[2*i+1],
((short*)y[1])[2*i], ((short*)y[1])[2*i+1]);
//((int*)y[0])[i], ((int*)y[0])[i+1],
//((int*)y[1])[i], ((int*)y[1])[i+1]);
}
#endif
......@@ -50,7 +50,7 @@ int mult_cpx_conj_vector(int16_t *x1,
uint32_t N,
int output_shift)
{
// Multiply elementwise two complex vectors of N elements with repeated formatted output
// Multiply elementwise the complex conjugate of x1 with x2.
// x1 - input 1 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
// We assume x1 with a dinamic of 15 bit maximum
//
......@@ -90,7 +90,7 @@ int mult_cpx_conj_vector(int16_t *x1,
tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1));
tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1));
tmp_im = _mm_sign_epi16(tmp_im,*(__m128i*)&conjug[0]);
tmp_im = _mm_madd_epi16(tmp_im,*x1_128);
tmp_im = _mm_madd_epi16(tmp_im,*x2_128);
tmp_re = _mm_srai_epi32(tmp_re,output_shift);
tmp_im = _mm_srai_epi32(tmp_im,output_shift);
tmpy0 = _mm_unpacklo_epi32(tmp_re,tmp_im);
......@@ -130,3 +130,4 @@ int mult_cpx_conj_vector(int16_t *x1,
return(0);
}
/*******************************************************************************
OpenAirInterface
Copyright(c) 1999 - 2014 Eurecom
OpenAirInterface is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OpenAirInterface is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OpenAirInterface.The full GNU General Public License is
included in this distribution in the file called "COPYING". If not,
see <http://www.gnu.org/licenses/>.
Contact Information
OpenAirInterface Admin: openair_admin@eurecom.fr
OpenAirInterface Tech : openair_tech@eurecom.fr
OpenAirInterface Dev : openair4g-devel@lists.eurecom.fr
Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE
*******************************************************************************/
#include "defs.h"
static __m128i shift __attribute__ ((aligned(16)));
int mult_cpx_vector_h(short *x1,
short *x2,
short *y,
unsigned int N,
unsigned short output_shift,
short sign)
{
// Multiply elementwise the complex vector x1 with the complex conjugate of the complex vecotr x2 of N elements and adds it to the vector y.
// x1 - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
// We assume x1 with a dinamic of 15 bit maximum
//
// x2 - input 2 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
// We assume x2 with a dinamic of 14 bit maximum
//
// y - output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
//
// N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
//
// log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0)
// WARNING: log2_amp>0 can cause overflow!!
// sign - +1..add, -1..substract
unsigned int i; // loop counter
register __m128i m0,m1,m2;
short *temps;
int *tempd;
__m128i *x1_128;
__m128i *x2_128;
__m128i *y_128;
__m128i mask;
__m128i temp;
shift = _mm_cvtsi32_si128(output_shift);
x1_128 = (__m128i *)&x1[0];
x2_128 = (__m128i *)&x2[0];
y_128 = (__m128i *)&y[0];
if (sign == -1)
mask = (__m128i) _mm_set_epi16 (-1,1,-1,-1,-1,1,-1,-1);
else
mask = (__m128i) _mm_set_epi16 (1,-1,1,1,1,-1,1,1);
// we compute 2*4 cpx multiply for each loop
for(i=0; i<(N>>3); i++) {
// printf("i=%d\n",i);
// unroll 1
// temps = (short *)x1_128;
// printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
m1 = x1_128[0];
m2 = x2_128[0];
// temps = (short *)&x2_128[0];
// printf("x2 : %x,%x,%x,%x,%x,%x,%x,%x\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
// bring x2 in conjugate form
// the first two instructions might be replaced with a single one in SSE3
m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
m2 = _mm_mullo_epi16(m2, mask);
// temp = m2;
// temps = (short *)&temp;
// printf("x2 conj : %x,%x,%x,%x,%x,%x,%x,%x\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
m0 = _mm_madd_epi16(m1,m2); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
// temp = m0;
// tempd = &temp;
// printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]);
m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
// temp = m0;
// tempd = (int *)&temp;
// printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]);
m0 = _mm_packs_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im]
m0 = _mm_unpacklo_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im]
y_128[0] = _mm_add_epi16(m0,y_128[0]);
// temps = (short *)&y_128[0];
// printf("y0 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
// unroll 2
m1 = x1_128[1];
m2 = x2_128[1];
m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
m2 = _mm_mullo_epi16(m2, mask);
m0 = _mm_madd_epi16(m1,m2); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
m0 = _mm_packs_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im]
m0 = _mm_unpacklo_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im]
y_128[1] = _mm_add_epi16(m0,y_128[1]);
// unroll 3
m1 = x1_128[2];
m2 = x2_128[2];
m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
m2 = _mm_mullo_epi16(m2, mask);
m0 = _mm_madd_epi16(m1,m2); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
m0 = _mm_packs_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im]
m0 = _mm_unpacklo_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im]
y_128[2] = _mm_add_epi16(m0,y_128[2]);
// unroll 4
m1 = x1_128[3];
m2 = x2_128[3];
m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
m2 = _mm_mullo_epi16(m2, mask);
m0 = _mm_madd_epi16(m1,m2); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
m0 = _mm_packs_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im]
m0 = _mm_unpacklo_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im]
y_128[3] = _mm_add_epi16(m0,y_128[3]);
x1_128+=4;
x2_128+=4;
y_128 +=4;
// printf("x1_128 = %p, x2_128 =%p, y_128=%p\n",x1_128,x2_128,y_128);
}
_mm_empty();
_m_empty();
return(0);
}
int mult_cpx_vector_h_add32(short *x1,
short *x2,
short *y,
unsigned int N,
short sign)
{
// Multiply elementwise the complex vector x1 with the complex conjugate of the complex vecotr x2 of N elements and adds it to the vector y.
// x1 - input 1 in 16bit format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
// We assume x1 with a dinamic of 15 bit maximum
//
// x2 - input 2 in 16bit format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
// We assume x2 with a dinamic of 14 bit maximum
//
// y - output in 32bit format |Re0 Im0|,......,|Re(N-1) Im(N-1)|
//
// N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
//
// sign - +1..add, -1..substract
unsigned int i; // loop counter
register __m128i m0,m1,m2;
short *temps;
int *tempd;
__m128i *x1_128;
__m128i *x2_128;
__m128i *y_128;
__m128i mask;
__m128i temp;
x1_128 = (__m128i *)&x1[0];
x2_128 = (__m128i *)&x2[0];
y_128 = (__m128i *)&y[0];
if (sign == -1)
mask = (__m128i) _mm_set_epi16 (-1,1,-1,-1,-1,1,-1,-1);
else
mask = (__m128i) _mm_set_epi16 (1,-1,1,1,1,-1,1,1);
// we compute 2*4 cpx multiply for each loop
for(i=0; i<(N>>3); i++) {
m1 = x1_128[0];
m2 = x2_128[0];
// bring x2 in conjugate form
// the first two instructions might be replaced with a single one in SSE3
m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
m2 = _mm_mullo_epi16(m2, mask);
m0 = _mm_madd_epi16(m1,m2); // 1- compute x1[0]*x2[0], result is 32bit
y_128[0] = _mm_add_epi32(m0,y_128[0]);
// unroll 2
m1 = x1_128[1];
m2 = x2_128[1];
m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
m2 = _mm_mullo_epi16(m2, mask);
m0 = _mm_madd_epi16(m1,m2);
y_128[1] = _mm_add_epi32(m0,y_128[1]);
// unroll 3
m1 = x1_128[2];
m2 = x2_128[2];
m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
m2 = _mm_mullo_epi16(m2, mask);
m0 = _mm_madd_epi16(m1,m2);
y_128[2] = _mm_add_epi32(m0,y_128[2]);
// unroll 4
m1 = x1_128[3];
m2 = x2_128[3];
m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
m2 = _mm_mullo_epi16(m2, mask);
m0 = _mm_madd_epi16(m1,m2);
y_128[3] = _mm_add_epi32(m0,y_128[3]);
x1_128+=4;
x2_128+=4;
y_128 +=4;
}
_mm_empty();
_m_empty();
return(0);
}
#ifdef MAIN
#define L 16
main ()
{
short input[256] __attribute__((aligned(16)));
short input2[256] __attribute__((aligned(16)));
short output[256] __attribute__((aligned(16)));
int i;
Zero_Buffer(output,256*2);
for (i=0; i<16; i+=2)
printf("output[%d] = %d + %d i\n",i,output[i],output[i+1]);
input[0] = 100;
input[1] = 200;
input[2] = 100;
input[3] = 200;
input[4] = 1234;
input[5] = -1234;
input[6] = 1234;
input[7] = -1234;
input[8] = 100;
input[9] = 200;
input[10] = 100;
input[11] = 200;
input[12] = 1000;
input[13] = 2000;
input[14] = 1000;
input[15] = 2000;
input2[0] = 1;
input2[1] = 2;
input2[2] = 1;
input2[3] = 2;
input2[4] = 10;
input2[5] = 20;
input2[6] = 10;
input2[7] = 20;
input2[8] = 1;
input2[9] = 2;
input2[10] = 1;
input2[11] = 2;
input2[12] = 1000;
input2[13] = 2000;
input2[14] = 1000;
input2[15] = 2000;
mult_cpx_vector_h(input2,input2,output,8,0,1);
for (i=0; i<16; i+=2)
printf("output[%d] = %d + %d i\n",i,output[i],output[i+1]);
Zero_Buffer(output,256*2);
mult_cpx_vector_h(input2,input2,output,8,0,-1);
for (i=0; i<16; i+=2)
printf("output[%d] = %d + %d i\n",i,output[i],output[i+1]);
}
#endif //MAIN
......@@ -64,6 +64,8 @@ struct complex32 {
int32_t i;
};
//cmult_sv.h
/*!\fn void multadd_real_vector_complex_scalar(int16_t *x,int16_t *alpha,int16_t *y,uint32_t N)
This function performs componentwise multiplication and accumulation of a complex scalar and a real vector.
@param x Vector input (Q1.15)
......@@ -95,192 +97,14 @@ void multadd_complex_vector_real_scalar(int16_t *x,
uint8_t zero_flag,
uint32_t N);
/*!\fn int32_t mult_cpx_vector(int16_t *x1,int16_t *x2,int16_t *y,uint32_t N,int32_t output_shift)
This function performs optimized componentwise multiplication of two Q1.15 vectors in repeated format.
@param x1 Input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
@param x2 Input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)|
@param y Output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|