/* 16-bit signed integer dot product
 * Altivec-assisted version
 * Copyright 2004 Phil Karn
 * May be used under the terms of the GNU Lesser General Public License (LGPL)
 */
|
||
|
#include <stdint.h>
#include <stdlib.h>
#include "fec.h"
|
||
|
|
||
|
/* Descriptor holding one set of dot product coefficients */
struct dotprod {
  int len; /* Number of coefficients */

  /* On an Altivec machine, these hold 8 copies of the coefficients,
   * preshifted by 0,1,..7 16-bit elements to meet all possible input data
   * alignments. coeffs[k] is the copy shifted by k elements.
   */
  signed short *coeffs[8];
};
|
||
|
|
||
|
/* Create and return a descriptor for use with the dot product function */
|
||
|
void *initdp_av(signed short coeffs[], int len)
|
||
|
{
|
||
|
struct dotprod *dp;
|
||
|
int i, j;
|
||
|
|
||
|
if (len == 0) {
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
dp = (struct dotprod *)calloc(1, sizeof(struct dotprod));
|
||
|
dp->len = len;
|
||
|
|
||
|
/* Make 8 copies of coefficients, one for each data alignment,
|
||
|
* each aligned to 16-byte boundary
|
||
|
*/
|
||
|
for (i = 0; i < 8; i++) {
|
||
|
dp->coeffs[i] = calloc(1 + (len + i - 1) / 8, sizeof(vector signed short));
|
||
|
for (j = 0; j < len; j++) {
|
||
|
dp->coeffs[i][j + i] = coeffs[j];
|
||
|
}
|
||
|
}
|
||
|
return (void *)dp;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* Free a dot product descriptor created earlier */
|
||
|
void freedp_av(void *p)
|
||
|
{
|
||
|
struct dotprod *dp = (struct dotprod *)p;
|
||
|
int i;
|
||
|
|
||
|
for (i = 0; i < 8; i++)
|
||
|
if (dp->coeffs[i] != NULL) {
|
||
|
free(dp->coeffs[i]);
|
||
|
}
|
||
|
free(dp);
|
||
|
}
|
||
|
|
||
|
/* Compute a dot product given a descriptor and an input array
|
||
|
* The length is taken from the descriptor
|
||
|
*/
|
||
|
long dotprod_av(void *p, signed short a[])
|
||
|
{
|
||
|
struct dotprod *dp = (struct dotprod *)p;
|
||
|
int al;
|
||
|
vector signed short *ar, *d;
|
||
|
vector signed int sums0, sums1, sums2, sums3;
|
||
|
union {
|
||
|
vector signed int v;
|
||
|
signed int w[4];
|
||
|
} s;
|
||
|
int nblocks;
|
||
|
|
||
|
/* round ar down to beginning of 16-byte block containing 0th element of
|
||
|
* input buffer. Then set d to one of 8 sets of shifted coefficients
|
||
|
*/
|
||
|
ar = (vector signed short *)((int)a & ~15);
|
||
|
al = ((int)a & 15) / sizeof(signed short);
|
||
|
d = (vector signed short *)dp->coeffs[al];
|
||
|
|
||
|
nblocks = (dp->len + al - 1) / 8 + 1;
|
||
|
|
||
|
/* Sum into four vectors each holding four 32-bit partial sums */
|
||
|
sums3 = sums2 = sums1 = sums0 = (vector signed int)(0);
|
||
|
while (nblocks >= 4) {
|
||
|
sums0 = vec_msums(ar[nblocks - 1], d[nblocks - 1], sums0);
|
||
|
sums1 = vec_msums(ar[nblocks - 2], d[nblocks - 2], sums1);
|
||
|
sums2 = vec_msums(ar[nblocks - 3], d[nblocks - 3], sums2);
|
||
|
sums3 = vec_msums(ar[nblocks - 4], d[nblocks - 4], sums3);
|
||
|
nblocks -= 4;
|
||
|
}
|
||
|
sums0 = vec_adds(sums0, sums1);
|
||
|
sums2 = vec_adds(sums2, sums3);
|
||
|
sums0 = vec_adds(sums0, sums2);
|
||
|
while (nblocks-- > 0) {
|
||
|
sums0 = vec_msums(ar[nblocks], d[nblocks], sums0);
|
||
|
}
|
||
|
/* Sum 4 partial sums into final result */
|
||
|
s.v = vec_sums(sums0, (vector signed int)(0));
|
||
|
|
||
|
return s.w[3];
|
||
|
}
|
||
|
|
||
|
|