Blas1 single missing kernels implemented with vector builtinstags/v0.3.6^2
@@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
#SMINKERNEL = ../arm/min.c | |||
#DMINKERNEL = ../arm/min.c | |||
# | |||
#ISAMAXKERNEL = ../arm/iamax.c | |||
ISAMAXKERNEL = isamax.c | |||
IDAMAXKERNEL = idamax.c | |||
#ICAMAXKERNEL = ../arm/izamax.c | |||
IZAMAXKERNEL = izamax.c | |||
ICAMAXKERNEL = icamax.c | |||
IZAMAXKERNEL = izamax.c | |||
# | |||
#ISAMINKERNEL = ../arm/iamin.c | |||
IDAMINKERNEL = idamin.c | |||
#ICAMINKERNEL = ../arm/izamin.c | |||
ISAMINKERNEL = isamin.c | |||
IDAMINKERNEL = idamin.c | |||
ICAMINKERNEL = icamin.c | |||
IZAMINKERNEL = izamin.c | |||
# | |||
#ISMAXKERNEL = ../arm/imax.c | |||
@@ -110,9 +110,9 @@ DASUMKERNEL = dasum.c | |||
CASUMKERNEL = casum.c | |||
ZASUMKERNEL = zasum.c | |||
# | |||
#SAXPYKERNEL = ../arm/axpy.c | |||
SAXPYKERNEL = saxpy.c | |||
DAXPYKERNEL = daxpy.c | |||
#CAXPYKERNEL = ../arm/zaxpy.c | |||
CAXPYKERNEL = caxpy.c | |||
ZAXPYKERNEL = zaxpy.c | |||
# | |||
SCOPYKERNEL = scopy.c | |||
@@ -123,7 +123,7 @@ ZCOPYKERNEL = zcopy.c | |||
SDOTKERNEL = sdot.c | |||
DDOTKERNEL = ddot.c | |||
DSDOTKERNEL = sdot.c | |||
#CDOTKERNEL = ../arm/zdot.c | |||
CDOTKERNEL = cdot.c | |||
ZDOTKERNEL = zdot.c | |||
# | |||
SNRM2KERNEL = ../arm/nrm2.c | |||
@@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c | |||
# | |||
SROTKERNEL = srot.c | |||
DROTKERNEL = drot.c | |||
CROTKERNEL = zrot.c | |||
CROTKERNEL = crot.c | |||
ZROTKERNEL = zrot.c | |||
# | |||
SSCALKERNEL = sscal.c | |||
@@ -0,0 +1,145 @@ | |||
/* | |||
Copyright (c) 2013-2018, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#ifndef HAVE_ASM_KERNEL | |||
#include <altivec.h> | |||
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) | |||
{ | |||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
register __vector float valpha_r = {alpha_r, alpha_r,alpha_r, alpha_r}; | |||
register __vector float valpha_i = {-alpha_i, alpha_i,-alpha_i, alpha_i}; | |||
#else | |||
register __vector float valpha_r = {alpha_r, -alpha_r,alpha_r, -alpha_r}; | |||
register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i}; | |||
#endif | |||
__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||
register __vector float *vy = (__vector float *) y; | |||
register __vector float *vx = (__vector float *) x; | |||
BLASLONG i=0; | |||
for (; i < n/2; i += 8) { | |||
register __vector float vy_0 = vy[i]; | |||
register __vector float vy_1 = vy[i + 1]; | |||
register __vector float vy_2 = vy[i + 2]; | |||
register __vector float vy_3 = vy[i + 3]; | |||
register __vector float vy_4 = vy[i + 4]; | |||
register __vector float vy_5 = vy[i + 5]; | |||
register __vector float vy_6 = vy[i + 6]; | |||
register __vector float vy_7 = vy[i + 7]; | |||
register __vector float vx_0 = vx[i]; | |||
register __vector float vx_1 = vx[i + 1]; | |||
register __vector float vx_2 = vx[i + 2]; | |||
register __vector float vx_3 = vx[i + 3]; | |||
register __vector float vx_4 = vx[i + 4]; | |||
register __vector float vx_5 = vx[i + 5]; | |||
register __vector float vx_6 = vx[i + 6]; | |||
register __vector float vx_7 = vx[i + 7]; | |||
vy_0 += vx_0*valpha_r; | |||
vy_1 += vx_1*valpha_r; | |||
vy_2 += vx_2*valpha_r; | |||
vy_3 += vx_3*valpha_r; | |||
vy_4 += vx_4*valpha_r; | |||
vy_5 += vx_5*valpha_r; | |||
vy_6 += vx_6*valpha_r; | |||
vy_7 += vx_7*valpha_r; | |||
vx_0 = vec_perm(vx_0, vx_0, swap_mask); | |||
vx_1 = vec_perm(vx_1, vx_1, swap_mask); | |||
vx_2 = vec_perm(vx_2, vx_2, swap_mask); | |||
vx_3 = vec_perm(vx_3, vx_3, swap_mask); | |||
vx_4 = vec_perm(vx_4, vx_4, swap_mask); | |||
vx_5 = vec_perm(vx_5, vx_5, swap_mask); | |||
vx_6 = vec_perm(vx_6, vx_6, swap_mask); | |||
vx_7 = vec_perm(vx_7, vx_7, swap_mask); | |||
vy_0 += vx_0*valpha_i; | |||
vy_1 += vx_1*valpha_i; | |||
vy_2 += vx_2*valpha_i; | |||
vy_3 += vx_3*valpha_i; | |||
vy_4 += vx_4*valpha_i; | |||
vy_5 += vx_5*valpha_i; | |||
vy_6 += vx_6*valpha_i; | |||
vy_7 += vx_7*valpha_i; | |||
vy[i] = vy_0; | |||
vy[i + 1] = vy_1; | |||
vy[i + 2] = vy_2; | |||
vy[i + 3] = vy_3; | |||
vy[i + 4] = vy_4; | |||
vy[i + 5] = vy_5 ; | |||
vy[i + 6] = vy_6 ; | |||
vy[i + 7] = vy_7 ; | |||
} | |||
} | |||
#endif | |||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { | |||
BLASLONG i = 0; | |||
BLASLONG ix = 0, iy = 0; | |||
if (n <= 0) return (0); | |||
if ((inc_x == 1) && (inc_y == 1)) { | |||
BLASLONG n1 = n & -16; | |||
if (n1) { | |||
caxpy_kernel_16(n1, x, y, da_r,da_i); | |||
ix = 2 * n1; | |||
} | |||
i = n1; | |||
while (i < n) { | |||
#if !defined(CONJ) | |||
y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); | |||
y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); | |||
#else | |||
y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); | |||
y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); | |||
#endif | |||
i++; | |||
ix += 2; | |||
} | |||
return (0); | |||
} | |||
inc_x *= 2; | |||
inc_y *= 2; | |||
while (i < n) { | |||
#if !defined(CONJ) | |||
y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); | |||
y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); | |||
#else | |||
y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); | |||
y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); | |||
#endif | |||
ix += inc_x; | |||
iy += inc_y; | |||
i++; | |||
} | |||
return (0); | |||
} | |||
@@ -0,0 +1,164 @@ | |||
/*Copyright (c) 2013-201\n8, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#ifndef HAVE_KERNEL_8 | |||
#include <altivec.h> | |||
static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) | |||
{ | |||
__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||
register __vector float *vy = (__vector float *) y; | |||
register __vector float *vx = (__vector float *) x; | |||
BLASLONG i = 0; | |||
register __vector float vd_0 = { 0 }; | |||
register __vector float vd_1 = { 0 }; | |||
register __vector float vd_2 = { 0 }; | |||
register __vector float vd_3 = { 0 }; | |||
register __vector float vdd_0 = { 0 }; | |||
register __vector float vdd_1 = { 0 }; | |||
register __vector float vdd_2 = { 0 }; | |||
register __vector float vdd_3 = { 0 }; | |||
for (; i < n/2; i += 4) { | |||
register __vector float vyy_0 ; | |||
register __vector float vyy_1 ; | |||
register __vector float vyy_2 ; | |||
register __vector float vyy_3 ; | |||
register __vector float vy_0 = vy[i]; | |||
register __vector float vy_1 = vy[i + 1]; | |||
register __vector float vy_2 = vy[i + 2]; | |||
register __vector float vy_3 = vy[i + 3]; | |||
register __vector float vx_0= vx[i]; | |||
register __vector float vx_1 = vx[i + 1]; | |||
register __vector float vx_2 = vx[i + 2]; | |||
register __vector float vx_3 = vx[i + 3]; | |||
vyy_0 = vec_perm(vy_0, vy_0, swap_mask); | |||
vyy_1 = vec_perm(vy_1, vy_1, swap_mask); | |||
vyy_2 = vec_perm(vy_2, vy_2, swap_mask); | |||
vyy_3 = vec_perm(vy_3, vy_3, swap_mask); | |||
vd_0 += vx_0 * vy_0; | |||
vd_1 += vx_1 * vy_1; | |||
vd_2 += vx_2 * vy_2; | |||
vd_3 += vx_3 * vy_3; | |||
vdd_0 += vx_0 * vyy_0; | |||
vdd_1 += vx_1 * vyy_1; | |||
vdd_2 += vx_2 * vyy_2; | |||
vdd_3 += vx_3 * vyy_3; | |||
} | |||
//aggregate | |||
vd_0 = vd_0 + vd_1 +vd_2 +vd_3; | |||
vdd_0= vdd_0 + vdd_1 +vdd_2 +vdd_3; | |||
//reverse and aggregate | |||
vd_1=vec_xxpermdi(vd_0,vd_0,2) ; | |||
vdd_1=vec_xxpermdi(vdd_0,vdd_0,2); | |||
vd_2=vd_0+vd_1; | |||
vdd_2=vdd_0+vdd_1; | |||
dot[0]=vd_2[0]; | |||
dot[1]=vd_2[1]; | |||
dot[2]=vdd_2[0]; | |||
dot[3]=vdd_2[1]; | |||
} | |||
#endif | |||
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | |||
BLASLONG i = 0; | |||
BLASLONG ix=0, iy=0; | |||
OPENBLAS_COMPLEX_FLOAT result; | |||
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; | |||
if (n <= 0) { | |||
CREAL(result) = 0.0; | |||
CIMAG(result) = 0.0; | |||
return (result); | |||
} | |||
if ((inc_x == 1) && (inc_y == 1)) { | |||
BLASLONG n1 = n & -8; | |||
BLASLONG j=0; | |||
if (n1){ | |||
cdot_kernel_8(n1, x, y, dot); | |||
i = n1; | |||
j = n1 <<1; | |||
} | |||
while (i < n) { | |||
dot[0] += x[j] * y[j]; | |||
dot[1] += x[j + 1] * y[j + 1]; | |||
dot[2] += x[j] * y[j + 1]; | |||
dot[3] += x[j + 1] * y[j]; | |||
j += 2; | |||
i++; | |||
} | |||
} else { | |||
i = 0; | |||
ix = 0; | |||
iy = 0; | |||
inc_x <<= 1; | |||
inc_y <<= 1; | |||
while (i < n) { | |||
dot[0] += x[ix] * y[iy]; | |||
dot[1] += x[ix + 1] * y[iy + 1]; | |||
dot[2] += x[ix] * y[iy + 1]; | |||
dot[3] += x[ix + 1] * y[iy]; | |||
ix += inc_x; | |||
iy += inc_y; | |||
i++; | |||
} | |||
} | |||
#if !defined(CONJ) | |||
CREAL(result) = dot[0] - dot[1]; | |||
CIMAG(result) = dot[2] + dot[3]; | |||
#else | |||
CREAL(result) = dot[0] + dot[1]; | |||
CIMAG(result) = dot[2] - dot[3]; | |||
#endif | |||
return (result); | |||
} |
@@ -0,0 +1,213 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013-2018, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#if defined(POWER8) | |||
static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | |||
{ | |||
__vector float t0; | |||
__vector float t1; | |||
__vector float t2; | |||
__vector float t3; | |||
__vector float t4; | |||
__vector float t5; | |||
__vector float t6; | |||
__vector float t7; | |||
__asm__ | |||
( | |||
"xscvdpspn 36, %x[cos] \n\t" // load c to all words | |||
"xxspltw 36, 36, 0 \n\t" | |||
"xscvdpspn 37, %x[sin] \n\t" // load s to all words | |||
"xxspltw 37, 37, 0 \n\t" | |||
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x | |||
"lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||
"lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||
"lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||
"lxvd2x 48, 0, %[y_ptr] \n\t" // load y | |||
"lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||
"lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||
"lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||
"addi %[x_ptr], %[x_ptr], 64 \n\t" | |||
"addi %[y_ptr], %[y_ptr], 64 \n\t" | |||
"addic. %[temp_n], %[temp_n], -16 \n\t" | |||
"ble 2f \n\t" | |||
".p2align 5 \n\t" | |||
"1: \n\t" | |||
"xvmulsp 40, 32, 36 \n\t" // c * x | |||
"xvmulsp 41, 33, 36 \n\t" | |||
"xvmulsp 42, 34, 36 \n\t" | |||
"xvmulsp 43, 35, 36 \n\t" | |||
"xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||
"xvmulsp %x[x2], 49, 36 \n\t" | |||
"xvmulsp %x[x1], 50, 36 \n\t" | |||
"xvmulsp %x[x3], 51, 36 \n\t" | |||
"xvmulsp 44, 32, 37 \n\t" // s * x | |||
"xvmulsp 45, 33, 37 \n\t" | |||
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x | |||
"lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||
"xvmulsp 46, 34, 37 \n\t" | |||
"xvmulsp 47, 35, 37 \n\t" | |||
"lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||
"lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||
"xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||
"xvmulsp %x[x5], 49, 37 \n\t" | |||
"lxvd2x 48, 0, %[y_ptr] \n\t" // load y | |||
"lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||
"xvmulsp %x[x6], 50, 37 \n\t" | |||
"xvmulsp %x[x7], 51, 37 \n\t" | |||
"lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||
"lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||
"xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||
"xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||
"addi %[x_ptr], %[x_ptr], -64 \n\t" | |||
"addi %[y_ptr], %[y_ptr], -64 \n\t" | |||
"xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||
"xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||
"xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||
"xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||
"xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||
"xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||
"stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||
"stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||
"stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||
"stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||
"stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||
"stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||
"stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||
"stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" | |||
"addi %[x_ptr], %[x_ptr], 128 \n\t" | |||
"addi %[y_ptr], %[y_ptr], 128 \n\t" | |||
"addic. %[temp_n], %[temp_n], -16 \n\t" | |||
"bgt 1b \n\t" | |||
"2: \n\t" | |||
"xvmulsp 40, 32, 36 \n\t" // c * x | |||
"xvmulsp 41, 33, 36 \n\t" | |||
"xvmulsp 42, 34, 36 \n\t" | |||
"xvmulsp 43, 35, 36 \n\t" | |||
"xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||
"xvmulsp %x[x2], 49, 36 \n\t" | |||
"xvmulsp %x[x1], 50, 36 \n\t" | |||
"xvmulsp %x[x3], 51, 36 \n\t" | |||
"xvmulsp 44, 32, 37 \n\t" // s * x | |||
"xvmulsp 45, 33, 37 \n\t" | |||
"xvmulsp 46, 34, 37 \n\t" | |||
"xvmulsp 47, 35, 37 \n\t" | |||
"xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||
"xvmulsp %x[x5], 49, 37 \n\t" | |||
"xvmulsp %x[x6], 50, 37 \n\t" | |||
"xvmulsp %x[x7], 51, 37 \n\t" | |||
"addi %[x_ptr], %[x_ptr], -64 \n\t" | |||
"addi %[y_ptr], %[y_ptr], -64 \n\t" | |||
"xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||
"xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||
"xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||
"xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||
"xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||
"xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||
"xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||
"xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||
"stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||
"stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||
"stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||
"stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||
"stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||
"stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||
"stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||
"stxvd2x %x[x3], %[i48], %[y_ptr] " | |||
: | |||
[mem_x] "+m" (*(float (*)[2*n])x), | |||
[mem_y] "+m" (*(float (*)[2*n])y), | |||
[temp_n] "+r" (n), | |||
[x_ptr] "+&b" (x), | |||
[y_ptr] "+&b" (y), | |||
[x0] "=wa" (t0), | |||
[x1] "=wa" (t2), | |||
[x2] "=wa" (t1), | |||
[x3] "=wa" (t3), | |||
[x4] "=wa" (t4), | |||
[x5] "=wa" (t5), | |||
[x6] "=wa" (t6), | |||
[x7] "=wa" (t7) | |||
: | |||
[cos] "f" (c), | |||
[sin] "f" (s), | |||
[i16] "b" (16), | |||
[i32] "b" (32), | |||
[i48] "b" (48) | |||
: | |||
"cr0", | |||
"vs32","vs33","vs34","vs35","vs36","vs37", | |||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
"vs48","vs49","vs50","vs51" | |||
); | |||
} | |||
#endif | |||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
{ | |||
BLASLONG i=0; | |||
BLASLONG ix=0,iy=0; | |||
FLOAT *x1=x; | |||
FLOAT *y1=y; | |||
FLOAT temp; | |||
if ( n <= 0 ) return(0); | |||
if ( (inc_x == 1) && (inc_y == 1) ) | |||
{ | |||
BLASLONG n1 = n & -8; | |||
if ( n1 > 0 ) | |||
{ | |||
crot_kernel_8(n1, x1, y1, c, s); | |||
i=n1; | |||
} | |||
while(i < n) | |||
{ | |||
temp = c*x[i] + s*y[i] ; | |||
y[i] = c*y[i] - s*x[i] ; | |||
x[i] = temp ; | |||
i++ ; | |||
} | |||
} | |||
else | |||
{ | |||
while(i < n) | |||
{ | |||
temp = c*x[ix] + s*y[iy] ; | |||
y[iy] = c*y[iy] - s*x[ix] ; | |||
x[ix] = temp ; | |||
ix += inc_x ; | |||
iy += inc_y ; | |||
i++ ; | |||
} | |||
} | |||
return(0); | |||
} | |||
@@ -0,0 +1,261 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <math.h> | |||
#include <altivec.h> | |||
#if defined(DOUBLE) | |||
#define ABS fabs | |||
#else | |||
#define ABS fabsf | |||
#endif | |||
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | |||
/** | |||
* Find maximum index | |||
* Warning: requirements n>0 and n % 32 == 0 | |||
* @param n | |||
* @param x pointer to the vector | |||
* @param maxf (out) maximum absolute value .( only for output ) | |||
* @return index | |||
*/ | |||
static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
BLASLONG index; | |||
BLASLONG i; | |||
register __vector unsigned int static_index0 = {0,1,2,3}; | |||
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
temp0=vec_xor(temp0,temp0); | |||
temp1=temp1 <<1 ; //{16,16,16,16} | |||
register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} | |||
register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
register __vector float quadruple_values={0,0,0,0}; | |||
register __vector float * v_ptrx=(__vector float *)x; | |||
register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; | |||
register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; | |||
for(; i<n; i+=32){ | |||
//absolute temporary complex vectors | |||
register __vector float v0=vec_abs(v_ptrx[0]); | |||
register __vector float v1=vec_abs(v_ptrx[1]); | |||
register __vector float v2=vec_abs(v_ptrx[2]); | |||
register __vector float v3=vec_abs(v_ptrx[3]); | |||
register __vector float v4=vec_abs(v_ptrx[4]); | |||
register __vector float v5=vec_abs(v_ptrx[5]); | |||
register __vector float v6=vec_abs(v_ptrx[6]); | |||
register __vector float v7=vec_abs(v_ptrx[7]); | |||
//pack complex real and imaginary parts together to sum real+image | |||
register __vector float t1=vec_perm(v0,v1,real_pack_mask); | |||
register __vector float ti=vec_perm(v0,v1,image_pack_mask); | |||
v0=t1+ti; //sum quadruple real with quadruple image | |||
register __vector float t2=vec_perm(v2,v3,real_pack_mask); | |||
register __vector float ti2=vec_perm(v2,v3,image_pack_mask); | |||
v1=t2+ti2; | |||
t1=vec_perm(v4,v5,real_pack_mask); | |||
ti=vec_perm(v4,v5,image_pack_mask); | |||
v2=t1+ti; //sum | |||
t2=vec_perm(v6,v7,real_pack_mask); | |||
ti2=vec_perm(v6,v7,image_pack_mask); | |||
v3=t2+ti2; | |||
// now we have 16 summed elements . lets compare them | |||
v_ptrx+=8; | |||
register __vector bool int r1=vec_cmpgt(v1,v0); | |||
register __vector bool int r2=vec_cmpgt(v3,v2); | |||
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1); | |||
v0=vec_sel(v0,v1,r1); | |||
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2); | |||
v1=vec_sel(v2,v3,r2); | |||
//final cmp and select index and value for first 16 values | |||
r1=vec_cmpgt(v1,v0); | |||
register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1); | |||
register __vector float vf0= vec_sel(v0,v1,r1); | |||
//absolute temporary complex vectors | |||
v0=vec_abs(v_ptrx[0]); | |||
v1=vec_abs(v_ptrx[1]); | |||
v2=vec_abs(v_ptrx[2]); | |||
v3=vec_abs(v_ptrx[3]); | |||
v4=vec_abs(v_ptrx[4]); | |||
v5=vec_abs(v_ptrx[5]); | |||
v6=vec_abs(v_ptrx[6]); | |||
v7=vec_abs(v_ptrx[7]); | |||
//pack complex real and imaginary parts together to sum real+image | |||
t1=vec_perm(v0,v1,real_pack_mask); | |||
ti=vec_perm(v0,v1,image_pack_mask); | |||
v0=t1+ti; //sum quadruple real with quadruple image | |||
t2=vec_perm(v2,v3,real_pack_mask); | |||
ti2=vec_perm(v2,v3,image_pack_mask); | |||
v1=t2+ti2; | |||
t1=vec_perm(v4,v5,real_pack_mask); | |||
ti=vec_perm(v4,v5,image_pack_mask); | |||
v2=t1+ti; //sum | |||
t2=vec_perm(v6,v7,real_pack_mask); | |||
ti2=vec_perm(v6,v7,image_pack_mask); | |||
v3=t2+ti2; | |||
// now we have 16 summed elements {from 16 to 31} . lets compare them | |||
v_ptrx+=8; | |||
r1=vec_cmpgt(v1,v0); | |||
r2=vec_cmpgt(v3,v2); | |||
ind2= vec_sel(static_index0,static_index1,r1); | |||
v0=vec_sel(v0,v1,r1); | |||
ind3= vec_sel(static_index2,static_index3,r2); | |||
v1=vec_sel(v2,v3,r2); | |||
//final cmp and select index and value for the second 16 values | |||
r1=vec_cmpgt(v1,v0); | |||
register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1); | |||
register __vector float vv0= vec_sel(v0,v1,r1); | |||
indv0+=temp1; //make index from 16->31 | |||
//find final quadruple from 32 elements | |||
r2=vec_cmpgt(vv0,vf0); | |||
ind2 = vec_sel( indf0,indv0,r2); | |||
vv0= vec_sel(vf0,vv0,r2); | |||
//get asbolute index | |||
ind2+=temp0; | |||
//compare with old quadruple and update | |||
r1=vec_cmpgt(vv0,quadruple_values); | |||
quadruple_indices = vec_sel( quadruple_indices,ind2,r1); | |||
quadruple_values= vec_sel(quadruple_values,vv0,r1); | |||
temp0+=temp_add; | |||
} | |||
//now we have to chose from 4 values and 4 different indices | |||
// we will compare pairwise if pairs are exactly the same we will choose minimum between index | |||
// otherwise we will assign index of the maximum value | |||
float a1,a2,a3,a4; | |||
unsigned int i1,i2,i3,i4; | |||
a1=vec_extract(quadruple_values,0); | |||
a2=vec_extract(quadruple_values,1); | |||
a3=vec_extract(quadruple_values,2); | |||
a4=vec_extract(quadruple_values,3); | |||
i1=vec_extract(quadruple_indices,0); | |||
i2=vec_extract(quadruple_indices,1); | |||
i3=vec_extract(quadruple_indices,2); | |||
i4=vec_extract(quadruple_indices,3); | |||
if(a1==a2){ | |||
index=i1>i2?i2:i1; | |||
}else if(a2>a1){ | |||
index=i2; | |||
a1=a2; | |||
}else{ | |||
index= i1; | |||
} | |||
if(a4==a3){ | |||
i1=i3>i4?i4:i3; | |||
}else if(a4>a3){ | |||
i1=i4; | |||
a3=a4; | |||
}else{ | |||
i1= i3; | |||
} | |||
if(a1==a3){ | |||
index=i1>index?index:i1; | |||
*maxf=a1; | |||
}else if(a3>a1){ | |||
index=i1; | |||
*maxf=a3; | |||
}else{ | |||
*maxf=a1; | |||
} | |||
return index; | |||
} | |||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
{ | |||
BLASLONG i = 0; | |||
BLASLONG ix = 0; | |||
FLOAT maxf = 0; | |||
BLASLONG max = 0; | |||
BLASLONG inc_x2; | |||
if (n <= 0 || inc_x <= 0) return(max); | |||
if (inc_x == 1) { | |||
BLASLONG n1 = n & -32; | |||
if (n1 > 0) { | |||
max = ciamax_kernel_32(n1, x, &maxf); | |||
i = n1; | |||
ix = n1 << 1; | |||
} | |||
while(i < n) | |||
{ | |||
if( CABS1(x,ix) > maxf ) | |||
{ | |||
max = i; | |||
maxf = CABS1(x,ix); | |||
} | |||
ix += 2; | |||
i++; | |||
} | |||
return (max + 1); | |||
} else { | |||
inc_x2 = 2 * inc_x; | |||
maxf = CABS1(x,0); | |||
ix += inc_x2; | |||
i++; | |||
while(i < n) | |||
{ | |||
if( CABS1(x,ix) > maxf ) | |||
{ | |||
max = i; | |||
maxf = CABS1(x,ix); | |||
} | |||
ix += inc_x2; | |||
i++; | |||
} | |||
return (max + 1); | |||
} | |||
} | |||
@@ -0,0 +1,266 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <math.h> | |||
#include <altivec.h> | |||
#if defined(DOUBLE) | |||
#define ABS fabs | |||
#else | |||
#define ABS fabsf | |||
#endif | |||
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | |||
/** | |||
* Find minimum index | |||
* Warning: requirements n>0 and n % 32 == 0 | |||
* @param n | |||
* @param x pointer to the vector | |||
* @param minf (out) minimum absolute value .( only for output ) | |||
* @return index | |||
*/ | |||
static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
BLASLONG index; | |||
BLASLONG i; | |||
register __vector unsigned int static_index0 = {0,1,2,3}; | |||
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
temp0=vec_xor(temp0,temp0); | |||
temp1=temp1 <<1 ; //{16,16,16,16} | |||
register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} | |||
register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
float first_min=CABS1(x,0); | |||
register __vector float quadruple_values={first_min,first_min,first_min,first_min}; | |||
register __vector float * v_ptrx=(__vector float *)x; | |||
register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; | |||
register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; | |||
for(; i<n; i+=32){ | |||
//absolute temporary complex vectors | |||
register __vector float v0=vec_abs(v_ptrx[0]); | |||
register __vector float v1=vec_abs(v_ptrx[1]); | |||
register __vector float v2=vec_abs(v_ptrx[2]); | |||
register __vector float v3=vec_abs(v_ptrx[3]); | |||
register __vector float v4=vec_abs(v_ptrx[4]); | |||
register __vector float v5=vec_abs(v_ptrx[5]); | |||
register __vector float v6=vec_abs(v_ptrx[6]); | |||
register __vector float v7=vec_abs(v_ptrx[7]); | |||
//pack complex real and imaginary parts together to sum real+image | |||
register __vector float t1=vec_perm(v0,v1,real_pack_mask); | |||
register __vector float ti=vec_perm(v0,v1,image_pack_mask); | |||
v0=t1+ti; //sum quadruple real with quadruple image | |||
register __vector float t2=vec_perm(v2,v3,real_pack_mask); | |||
register __vector float ti2=vec_perm(v2,v3,image_pack_mask); | |||
v1=t2+ti2; | |||
t1=vec_perm(v4,v5,real_pack_mask); | |||
ti=vec_perm(v4,v5,image_pack_mask); | |||
v2=t1+ti; //sum | |||
t2=vec_perm(v6,v7,real_pack_mask); | |||
ti2=vec_perm(v6,v7,image_pack_mask); | |||
v3=t2+ti2; | |||
// now we have 16 summed elements . lets compare them | |||
v_ptrx+=8; | |||
register __vector bool int r1=vec_cmpgt(v0,v1); | |||
register __vector bool int r2=vec_cmpgt(v2,v3); | |||
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1); | |||
v0=vec_sel(v0,v1,r1); | |||
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2); | |||
v1=vec_sel(v2,v3,r2); | |||
//final cmp and select index and value for first 16 values | |||
r1=vec_cmpgt(v0,v1); | |||
register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1); | |||
register __vector float vf0= vec_sel(v0,v1,r1); | |||
//absolute temporary complex vectors | |||
v0=vec_abs(v_ptrx[0]); | |||
v1=vec_abs(v_ptrx[1]); | |||
v2=vec_abs(v_ptrx[2]); | |||
v3=vec_abs(v_ptrx[3]); | |||
v4=vec_abs(v_ptrx[4]); | |||
v5=vec_abs(v_ptrx[5]); | |||
v6=vec_abs(v_ptrx[6]); | |||
v7=vec_abs(v_ptrx[7]); | |||
//pack complex real and imaginary parts together to sum real+image | |||
t1=vec_perm(v0,v1,real_pack_mask); | |||
ti=vec_perm(v0,v1,image_pack_mask); | |||
v0=t1+ti; //sum quadruple real with quadruple image | |||
t2=vec_perm(v2,v3,real_pack_mask); | |||
ti2=vec_perm(v2,v3,image_pack_mask); | |||
v1=t2+ti2; | |||
t1=vec_perm(v4,v5,real_pack_mask); | |||
ti=vec_perm(v4,v5,image_pack_mask); | |||
v2=t1+ti; //sum | |||
t2=vec_perm(v6,v7,real_pack_mask); | |||
ti2=vec_perm(v6,v7,image_pack_mask); | |||
v3=t2+ti2; | |||
// now we have 16 summed elements {from 16 to 31} . lets compare them | |||
v_ptrx+=8; | |||
r1=vec_cmpgt(v0,v1); | |||
r2=vec_cmpgt(v2,v3); | |||
ind2= vec_sel(static_index0,static_index1,r1); | |||
v0=vec_sel(v0,v1,r1); | |||
ind3= vec_sel(static_index2,static_index3,r2); | |||
v1=vec_sel(v2,v3,r2); | |||
//final cmp and select index and value for the second 16 values | |||
r1=vec_cmpgt(v0,v1); | |||
register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1); | |||
register __vector float vv0= vec_sel(v0,v1,r1); | |||
indv0+=temp1; //make index from 16->31 | |||
//find final quadruple from 32 elements | |||
r2=vec_cmpgt(vf0,vv0); | |||
ind2 = vec_sel( indf0,indv0,r2); | |||
vv0= vec_sel(vf0,vv0,r2); | |||
//get asbolute index | |||
ind2+=temp0; | |||
//compare with old quadruple and update | |||
r1=vec_cmpgt(quadruple_values,vv0); | |||
quadruple_indices = vec_sel( quadruple_indices,ind2,r1); | |||
quadruple_values= vec_sel(quadruple_values,vv0,r1); | |||
temp0+=temp_add; | |||
} | |||
//now we have to chose from 4 values and 4 different indices | |||
// we will compare pairwise if pairs are exactly the same we will choose minimum between index | |||
// otherwise we will assign index of the minimum value | |||
float a1,a2,a3,a4; | |||
unsigned int i1,i2,i3,i4; | |||
a1=vec_extract(quadruple_values,0); | |||
a2=vec_extract(quadruple_values,1); | |||
a3=vec_extract(quadruple_values,2); | |||
a4=vec_extract(quadruple_values,3); | |||
i1=vec_extract(quadruple_indices,0); | |||
i2=vec_extract(quadruple_indices,1); | |||
i3=vec_extract(quadruple_indices,2); | |||
i4=vec_extract(quadruple_indices,3); | |||
if(a1==a2){ | |||
index=i1>i2?i2:i1; | |||
}else if(a2<a1){ | |||
index=i2; | |||
a1=a2; | |||
}else{ | |||
index= i1; | |||
} | |||
if(a4==a3){ | |||
i1=i3>i4?i4:i3; | |||
}else if(a4<a3){ | |||
i1=i4; | |||
a3=a4; | |||
}else{ | |||
i1= i3; | |||
} | |||
if(a1==a3){ | |||
index=i1>index?index:i1; | |||
*minf=a1; | |||
}else if(a3<a1){ | |||
index=i1; | |||
*minf=a3; | |||
}else{ | |||
*minf=a1; | |||
} | |||
return index; | |||
} | |||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
{ | |||
BLASLONG i=0; | |||
BLASLONG ix=0; | |||
FLOAT minf; | |||
BLASLONG min=0; | |||
BLASLONG inc_x2; | |||
if (n <= 0 || inc_x <= 0) return(min); | |||
if (inc_x == 1) { | |||
minf = CABS1(x,0); //index will not be incremented | |||
BLASLONG n1 = n & -32; | |||
if (n1 > 0) { | |||
min = ciamin_kernel_32(n1, x, &minf); | |||
i = n1; | |||
ix = n1 << 1; | |||
} | |||
while(i < n) | |||
{ | |||
if( CABS1(x,ix) < minf ) | |||
{ | |||
min = i; | |||
minf = CABS1(x,ix); | |||
} | |||
ix += 2; | |||
i++; | |||
} | |||
return (min + 1); | |||
} else { | |||
inc_x2 = 2 * inc_x; | |||
minf = CABS1(x,0); | |||
ix += inc_x2; | |||
i++; | |||
while(i < n) | |||
{ | |||
if( CABS1(x,ix) < minf ) | |||
{ | |||
min = i; | |||
minf = CABS1(x,ix); | |||
} | |||
ix += inc_x2; | |||
i++; | |||
} | |||
return (min + 1); | |||
} | |||
} | |||
@@ -89,10 +89,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
".p2align 5 \n\t" | |||
"1: \n\t" | |||
"xvcmpgedp 2,44,45 \n\t " | |||
"xvcmpgedp 3,46,47 \n\t " | |||
"xvcmpgedp 4,48,49 \n\t " | |||
"xvcmpgedp 5,50,51 \n\t" | |||
"xvcmpgtdp 2,44,45 \n\t " | |||
"xvcmpgtdp 3,46,47 \n\t " | |||
"xvcmpgtdp 4,48,49 \n\t " | |||
"xvcmpgtdp 5,50,51 \n\t" | |||
"xxsel 32,40,41,2 \n\t" | |||
"xxsel 0,44,45,2 \n\t" | |||
@@ -103,8 +103,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
"xxsel 35,42,43,5 \n\t" | |||
"xxsel 47,50,51,5 \n\t" | |||
"xvcmpgedp 2,0, 1 \n\t" | |||
"xvcmpgedp 3, 45,47 \n\t" | |||
"xvcmpgtdp 2,0, 1 \n\t" | |||
"xvcmpgtdp 3, 45,47 \n\t" | |||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" | |||
@@ -125,7 +125,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t" | |||
//choose smaller from first and second part | |||
"xvcmpgedp 4, 0,5 \n\t" | |||
"xvcmpgtdp 4, 0,5 \n\t" | |||
"xxsel 3, 0,5,4 \n\t" | |||
"xxsel 33,32,34,4 \n\t" | |||
@@ -139,7 +139,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" | |||
//compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) | |||
"xvcmpgedp 2,39, 3 \n\t" | |||
"xvcmpgtdp 2,39, 3 \n\t" | |||
"xxsel 39,39,3,2 \n\t" | |||
"xxsel 38,38,33,2 \n\t" | |||
@@ -162,10 +162,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
//<-----------jump here from first load | |||
"2: \n\t" | |||
"xvcmpgedp 2,44,45 \n\t " | |||
"xvcmpgedp 3,46,47 \n\t " | |||
"xvcmpgedp 4,48,49 \n\t " | |||
"xvcmpgedp 5,50,51 \n\t" | |||
"xvcmpgtdp 2,44,45 \n\t " | |||
"xvcmpgtdp 3,46,47 \n\t " | |||
"xvcmpgtdp 4,48,49 \n\t " | |||
"xvcmpgtdp 5,50,51 \n\t" | |||
"xxsel 32,40,41,2 \n\t" | |||
"xxsel 0,44,45,2 \n\t" | |||
@@ -176,8 +176,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
"xxsel 35,42,43,5 \n\t" | |||
"xxsel 47,50,51,5 \n\t" | |||
"xvcmpgedp 2,0, 1 \n\t" | |||
"xvcmpgedp 3, 45,47 \n\t" | |||
"xvcmpgtdp 2,0, 1 \n\t" | |||
"xvcmpgtdp 3, 45,47 \n\t" | |||
"xxsel 32,32,33,2 \n\t" | |||
"xxsel 0 ,0,1,2 \n\t" | |||
"xxsel 34,34,35,3 \n\t" | |||
@@ -194,7 +194,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t" | |||
//choose smaller from first and second part | |||
"xvcmpgedp 4, 0,5 \n\t" | |||
"xvcmpgtdp 4, 0,5 \n\t" | |||
"xxsel 3, 0,5,4 \n\t" | |||
"xxsel 33,32,34,4 \n\t" | |||
@@ -210,7 +210,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
//compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) | |||
"xvcmpgedp 2,39, 3 \n\t" | |||
"xvcmpgtdp 2,39, 3 \n\t" | |||
"xxsel 39,39,3,2 \n\t" | |||
"xxsel 38,38,33,2 \n\t" | |||
@@ -238,10 +238,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
//============================================================================== | |||
"xvcmpgedp 2,44,45 \n\t " | |||
"xvcmpgedp 3,46,47 \n\t " | |||
"xvcmpgedp 4,48,49 \n\t " | |||
"xvcmpgedp 5,50,51 \n\t" | |||
"xvcmpgtdp 2,44,45 \n\t " | |||
"xvcmpgtdp 3,46,47 \n\t " | |||
"xvcmpgtdp 4,48,49 \n\t " | |||
"xvcmpgtdp 5,50,51 \n\t" | |||
"xxsel 32,40,41,2 \n\t" | |||
"xxsel 0,44,45,2 \n\t" | |||
@@ -252,8 +252,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
"xxsel 35,42,43,5 \n\t" | |||
"xxsel 47,50,51,5 \n\t" | |||
"xvcmpgedp 2,0, 1 \n\t" | |||
"xvcmpgedp 3, 45,47 \n\t" | |||
"xvcmpgtdp 2,0, 1 \n\t" | |||
"xvcmpgtdp 3, 45,47 \n\t" | |||
"xxsel 32,32,33,2 \n\t" | |||
@@ -264,14 +264,14 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
// for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16} | |||
"vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} | |||
//choose smaller from first and second part | |||
"xvcmpgedp 4, 0,5 \n\t" | |||
"xvcmpgtdp 4, 0,5 \n\t" | |||
"xxsel 3, 0,5,4 \n\t" | |||
"xxsel 33,32,34,4 \n\t" | |||
"vaddudm 1,1,5 \n\t" // get real index for first smaller | |||
//compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) | |||
"xvcmpgedp 2,39, 3 \n\t" | |||
"xvcmpgtdp 2,39, 3 \n\t" | |||
"xxsel 39,39,3,2 \n\t" | |||
"xxsel 38,38,33,2 \n\t" | |||
@@ -284,7 +284,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely | |||
//0b001110=14 | |||
"bc 14,24, 3f \n\t" | |||
"xvcmpgedp 4,39, 40 \n\t" | |||
"xvcmpgtdp 4,39, 40 \n\t" | |||
"xxsel 0,39,40,4 \n\t" | |||
"xxsel 1,38,32,4 \n\t" | |||
"stxsdx 0,0,%[ptr_minf] \n\t" | |||
@@ -0,0 +1,288 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013-2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <math.h> | |||
#include <altivec.h> | |||
#if defined(DOUBLE) | |||
#define ABS fabs | |||
#else | |||
#define ABS fabsf | |||
#endif | |||
/** | |||
* Find maximum index | |||
* Warning: requirements n>0 and n % 64 == 0 | |||
* @param n | |||
* @param x pointer to the vector | |||
* @param maxf (out) maximum absolute value .( only for output ) | |||
* @return index | |||
*/ | |||
static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
BLASLONG index; | |||
BLASLONG i=0; | |||
register __vector unsigned int static_index0 = {0,1,2,3}; | |||
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
temp0=vec_xor(temp0,temp0); | |||
temp1=temp1 <<1 ; //{16,16,16,16} | |||
register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
register __vector float quadruple_values={0,0,0,0}; | |||
register __vector float * v_ptrx=(__vector float *)x; | |||
for(; i<n; i+=64){ | |||
//absolute temporary vectors | |||
register __vector float v0=vec_abs(v_ptrx[0]); | |||
register __vector float v1=vec_abs(v_ptrx[1]); | |||
register __vector float v2=vec_abs(v_ptrx[2]); | |||
register __vector float v3=vec_abs(v_ptrx[3]); | |||
register __vector float v4=vec_abs(v_ptrx[4]); | |||
register __vector float v5=vec_abs(v_ptrx[5]); | |||
register __vector float v6=vec_abs(v_ptrx[6]); | |||
register __vector float v7=vec_abs(v_ptrx[7]); | |||
//cmp quadruple pairs | |||
register __vector bool int r1=vec_cmpgt(v1,v0); | |||
register __vector bool int r2=vec_cmpgt(v3,v2); | |||
register __vector bool int r3=vec_cmpgt(v5,v4); | |||
register __vector bool int r4=vec_cmpgt(v7,v6); | |||
//select | |||
register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); | |||
register __vector float vf0= vec_sel(v0,v1,r1); | |||
register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||
register __vector float vf1= vec_sel(v2,v3,r2); | |||
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); | |||
v0=vec_sel(v4,v5,r3); | |||
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||
v1=vec_sel(v6,v7,r4); | |||
// cmp selected | |||
r1=vec_cmpgt(vf1,vf0); | |||
r2=vec_cmpgt(v1,v0); | |||
v_ptrx+=8; | |||
//select from above | |||
ind0_first= vec_sel(ind0_first,ind1,r1); | |||
vf0= vec_sel(vf0,vf1,r1) ; | |||
ind2= vec_sel(ind2,ind3,r2); | |||
vf1= vec_sel(v0,v1,r2); | |||
//second indices actually should be within [16,31] so ind2+16 | |||
ind2 +=temp1; | |||
//final cmp and select index and value for the first 32 values | |||
r1=vec_cmpgt(vf1,vf0); | |||
ind0_first = vec_sel(ind0_first,ind2,r1); | |||
vf0= vec_sel(vf0,vf1,r1); | |||
ind0_first+=temp0; //get absolute index | |||
temp0+=temp1; | |||
temp0+=temp1; //temp0+32 | |||
//second part of 32 | |||
// absolute temporary vectors | |||
v0=vec_abs(v_ptrx[0]); | |||
v1=vec_abs(v_ptrx[1]); | |||
v2=vec_abs(v_ptrx[2]); | |||
v3=vec_abs(v_ptrx[3]); | |||
v4=vec_abs(v_ptrx[4]); | |||
v5=vec_abs(v_ptrx[5]); | |||
v6=vec_abs(v_ptrx[6]); | |||
v7=vec_abs(v_ptrx[7]); | |||
//cmp quadruple pairs | |||
r1=vec_cmpgt(v1,v0); | |||
r2=vec_cmpgt(v3,v2); | |||
r3=vec_cmpgt(v5,v4); | |||
r4=vec_cmpgt(v7,v6); | |||
//select | |||
register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||
register __vector float vv0= vec_sel(v0,v1,r1); | |||
ind1= vec_sel(static_index2,static_index3,r2); | |||
register __vector float vv1= vec_sel(v2,v3,r2); | |||
ind2= vec_sel(static_index0,static_index1,r3); | |||
v0=vec_sel(v4,v5,r3); | |||
ind3= vec_sel(static_index2,static_index3,r4); | |||
v1=vec_sel(v6,v7,r4); | |||
// cmp selected | |||
r1=vec_cmpgt(vv1,vv0); | |||
r2=vec_cmpgt(v1,v0); | |||
v_ptrx+=8; | |||
//select from above | |||
ind0_second= vec_sel(ind0_second,ind1,r1); | |||
vv0= vec_sel(vv0,vv1,r1) ; | |||
ind2= vec_sel(ind2,ind3,r2); | |||
vv1= vec_sel(v0,v1,r2) ; | |||
//second indices actually should be within [16,31] so ind2+16 | |||
ind2 +=temp1; | |||
//final cmp and select index and value for the second 32 values | |||
r1=vec_cmpgt(vv1,vv0); | |||
ind0_second = vec_sel(ind0_second,ind2,r1); | |||
vv0= vec_sel(vv0,vv1,r1); | |||
ind0_second+=temp0; //get absolute index | |||
//find final quadruple from 64 elements | |||
r2=vec_cmpgt(vv0,vf0); | |||
ind2 = vec_sel( ind0_first,ind0_second,r2); | |||
vv0= vec_sel(vf0,vv0,r2); | |||
//compare with old quadruple and update | |||
r3=vec_cmpgt(vv0,quadruple_values); | |||
quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||
quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||
temp0+=temp1; | |||
temp0+=temp1; //temp0+32 | |||
} | |||
//now we have to chose from 4 values and 4 different indices | |||
// we will compare pairwise if pairs are exactly the same we will choose minimum between index | |||
// otherwise we will assign index of the maximum value | |||
float a1,a2,a3,a4; | |||
unsigned int i1,i2,i3,i4; | |||
a1=vec_extract(quadruple_values,0); | |||
a2=vec_extract(quadruple_values,1); | |||
a3=vec_extract(quadruple_values,2); | |||
a4=vec_extract(quadruple_values,3); | |||
i1=vec_extract(quadruple_indices,0); | |||
i2=vec_extract(quadruple_indices,1); | |||
i3=vec_extract(quadruple_indices,2); | |||
i4=vec_extract(quadruple_indices,3); | |||
if(a1==a2){ | |||
index=i1>i2?i2:i1; | |||
}else if(a2>a1){ | |||
index=i2; | |||
a1=a2; | |||
}else{ | |||
index= i1; | |||
} | |||
if(a4==a3){ | |||
i1=i3>i4?i4:i3; | |||
}else if(a4>a3){ | |||
i1=i4; | |||
a3=a4; | |||
}else{ | |||
i1= i3; | |||
} | |||
if(a1==a3){ | |||
index=i1>index?index:i1; | |||
*maxf=a1; | |||
}else if(a3>a1){ | |||
index=i1; | |||
*maxf=a3; | |||
}else{ | |||
*maxf=a1; | |||
} | |||
return index; | |||
} | |||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
BLASLONG i = 0; | |||
BLASLONG j = 0; | |||
FLOAT maxf = 0.0; | |||
BLASLONG max = 0; | |||
if (n <= 0 || inc_x <= 0) return (max); | |||
if (inc_x == 1) { | |||
BLASLONG n1 = n & -64; | |||
if (n1 > 0) { | |||
max = siamax_kernel_64(n1, x, &maxf); | |||
i = n1; | |||
} | |||
while (i < n) { | |||
if (ABS(x[i]) > maxf) { | |||
max = i; | |||
maxf = ABS(x[i]); | |||
} | |||
i++; | |||
} | |||
return (max + 1); | |||
} else { | |||
BLASLONG n1 = n & -4; | |||
while (j < n1) { | |||
if (ABS(x[i]) > maxf) { | |||
max = j; | |||
maxf = ABS(x[i]); | |||
} | |||
if (ABS(x[i + inc_x]) > maxf) { | |||
max = j + 1; | |||
maxf = ABS(x[i + inc_x]); | |||
} | |||
if (ABS(x[i + 2 * inc_x]) > maxf) { | |||
max = j + 2; | |||
maxf = ABS(x[i + 2 * inc_x]); | |||
} | |||
if (ABS(x[i + 3 * inc_x]) > maxf) { | |||
max = j + 3; | |||
maxf = ABS(x[i + 3 * inc_x]); | |||
} | |||
i += inc_x * 4; | |||
j += 4; | |||
} | |||
while (j < n) { | |||
if (ABS(x[i]) > maxf) { | |||
max = j; | |||
maxf = ABS(x[i]); | |||
} | |||
i += inc_x; | |||
j++; | |||
} | |||
return (max + 1); | |||
} | |||
} |
@@ -0,0 +1,288 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013-2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <math.h> | |||
#include <altivec.h> | |||
#if defined(DOUBLE) | |||
#define ABS fabs | |||
#else | |||
#define ABS fabsf | |||
#endif | |||
/** | |||
* Find minimum index | |||
* Warning: requirements n>0 and n % 64 == 0 | |||
* @param n | |||
* @param x pointer to the vector | |||
* @param minf (out) minimum absolute value .( only for output ) | |||
* @return index | |||
*/ | |||
static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
BLASLONG index; | |||
BLASLONG i=0; | |||
register __vector unsigned int static_index0 = {0,1,2,3}; | |||
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
temp0=vec_xor(temp0,temp0); | |||
temp1=temp1 <<1 ; //{16,16,16,16} | |||
register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; | |||
register __vector float * v_ptrx=(__vector float *)x; | |||
register __vector float quadruple_values=vec_abs(v_ptrx[0]); | |||
for(; i<n; i+=64){ | |||
//absolute temporary vectors | |||
register __vector float v0=vec_abs(v_ptrx[0]); | |||
register __vector float v1=vec_abs(v_ptrx[1]); | |||
register __vector float v2=vec_abs(v_ptrx[2]); | |||
register __vector float v3=vec_abs(v_ptrx[3]); | |||
register __vector float v4=vec_abs(v_ptrx[4]); | |||
register __vector float v5=vec_abs(v_ptrx[5]); | |||
register __vector float v6=vec_abs(v_ptrx[6]); | |||
register __vector float v7=vec_abs(v_ptrx[7]); | |||
//cmp quadruple pairs | |||
register __vector bool int r1=vec_cmpgt(v0,v1); | |||
register __vector bool int r2=vec_cmpgt(v2,v3); | |||
register __vector bool int r3=vec_cmpgt(v4,v5); | |||
register __vector bool int r4=vec_cmpgt(v6,v7); | |||
//select | |||
register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); | |||
register __vector float vf0= vec_sel(v0,v1,r1); | |||
register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||
register __vector float vf1= vec_sel(v2,v3,r2); | |||
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); | |||
v0=vec_sel(v4,v5,r3); | |||
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||
v1=vec_sel(v6,v7,r4); | |||
// cmp selected | |||
r1=vec_cmpgt(vf0,vf1); | |||
r2=vec_cmpgt(v0,v1); | |||
v_ptrx+=8; | |||
//select from above | |||
ind0_first= vec_sel(ind0_first,ind1,r1); | |||
vf0= vec_sel(vf0,vf1,r1) ; | |||
ind2= vec_sel(ind2,ind3,r2); | |||
vf1= vec_sel(v0,v1,r2); | |||
//second indices actually should be within [16,31] so ind2+16 | |||
ind2 +=temp1; | |||
//final cmp and select index and value for the first 32 values | |||
r1=vec_cmpgt(vf0,vf1); | |||
ind0_first = vec_sel(ind0_first,ind2,r1); | |||
vf0= vec_sel(vf0,vf1,r1); | |||
ind0_first+=temp0; //get absolute index | |||
temp0+=temp1; | |||
temp0+=temp1; //temp0+32 | |||
//second part of 32 | |||
// absolute temporary vectors | |||
v0=vec_abs(v_ptrx[0]); | |||
v1=vec_abs(v_ptrx[1]); | |||
v2=vec_abs(v_ptrx[2]); | |||
v3=vec_abs(v_ptrx[3]); | |||
v4=vec_abs(v_ptrx[4]); | |||
v5=vec_abs(v_ptrx[5]); | |||
v6=vec_abs(v_ptrx[6]); | |||
v7=vec_abs(v_ptrx[7]); | |||
//cmp quadruple pairs | |||
r1=vec_cmpgt(v0,v1); | |||
r2=vec_cmpgt(v2,v3); | |||
r3=vec_cmpgt(v4,v5); | |||
r4=vec_cmpgt(v6,v7); | |||
//select | |||
register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||
register __vector float vv0= vec_sel(v0,v1,r1); | |||
ind1= vec_sel(static_index2,static_index3,r2); | |||
register __vector float vv1= vec_sel(v2,v3,r2); | |||
ind2= vec_sel(static_index0,static_index1,r3); | |||
v0=vec_sel(v4,v5,r3); | |||
ind3= vec_sel(static_index2,static_index3,r4); | |||
v1=vec_sel(v6,v7,r4); | |||
// cmp selected | |||
r1=vec_cmpgt(vv0,vv1); | |||
r2=vec_cmpgt(v0,v1); | |||
v_ptrx+=8; | |||
//select from above | |||
ind0_second= vec_sel(ind0_second,ind1,r1); | |||
vv0= vec_sel(vv0,vv1,r1) ; | |||
ind2= vec_sel(ind2,ind3,r2); | |||
vv1= vec_sel(v0,v1,r2) ; | |||
//second indices actually should be within [16,31] so ind2+16 | |||
ind2 +=temp1; | |||
//final cmp and select index and value for the second 32 values | |||
r1=vec_cmpgt(vv0,vv1); | |||
ind0_second = vec_sel(ind0_second,ind2,r1); | |||
vv0= vec_sel(vv0,vv1,r1); | |||
ind0_second+=temp0; //get absolute index | |||
//find final quadruple from 64 elements | |||
r2=vec_cmpgt(vf0,vv0); | |||
ind2 = vec_sel( ind0_first,ind0_second,r2); | |||
vv0= vec_sel(vf0,vv0,r2); | |||
//compare with old quadruple and update | |||
r3=vec_cmpgt( quadruple_values,vv0); | |||
quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||
quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||
temp0+=temp1; | |||
temp0+=temp1; //temp0+32 | |||
} | |||
//now we have to chose from 4 values and 4 different indices | |||
// we will compare pairwise if pairs are exactly the same we will choose minimum between index | |||
// otherwise we will assign index of the minimum value | |||
float a1,a2,a3,a4; | |||
unsigned int i1,i2,i3,i4; | |||
a1=vec_extract(quadruple_values,0); | |||
a2=vec_extract(quadruple_values,1); | |||
a3=vec_extract(quadruple_values,2); | |||
a4=vec_extract(quadruple_values,3); | |||
i1=vec_extract(quadruple_indices,0); | |||
i2=vec_extract(quadruple_indices,1); | |||
i3=vec_extract(quadruple_indices,2); | |||
i4=vec_extract(quadruple_indices,3); | |||
if(a1==a2){ | |||
index=i1>i2?i2:i1; | |||
}else if(a2<a1){ | |||
index=i2; | |||
a1=a2; | |||
}else{ | |||
index= i1; | |||
} | |||
if(a4==a3){ | |||
i1=i3>i4?i4:i3; | |||
}else if(a4<a3){ | |||
i1=i4; | |||
a3=a4; | |||
}else{ | |||
i1= i3; | |||
} | |||
if(a1==a3){ | |||
index=i1>index?index:i1; | |||
*minf=a1; | |||
}else if(a3<a1){ | |||
index=i1; | |||
*minf=a3; | |||
}else{ | |||
*minf=a1; | |||
} | |||
return index; | |||
} | |||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
BLASLONG i = 0; | |||
BLASLONG j = 0; | |||
BLASLONG min = 0; | |||
FLOAT minf = 0.0; | |||
if (n <= 0 || inc_x <= 0) return (min); | |||
minf = ABS(x[0]); //index's not incremented | |||
if (inc_x == 1) { | |||
BLASLONG n1 = n & -64; | |||
if (n1 > 0) { | |||
min = siamin_kernel_64(n1, x, &minf); | |||
i = n1; | |||
} | |||
while (i < n) { | |||
if (ABS(x[i]) < minf) { | |||
min = i; | |||
minf = ABS(x[i]); | |||
} | |||
i++; | |||
} | |||
return (min + 1); | |||
} else { | |||
BLASLONG n1 = n & -4; | |||
while (j < n1) { | |||
if (ABS(x[i]) < minf) { | |||
min = j; | |||
minf = ABS(x[i]); | |||
} | |||
if (ABS(x[i + inc_x]) < minf) { | |||
min = j + 1; | |||
minf = ABS(x[i + inc_x]); | |||
} | |||
if (ABS(x[i + 2 * inc_x]) < minf) { | |||
min = j + 2; | |||
minf = ABS(x[i + 2 * inc_x]); | |||
} | |||
if (ABS(x[i + 3 * inc_x]) < minf) { | |||
min = j + 3; | |||
minf = ABS(x[i + 3 * inc_x]); | |||
} | |||
i += inc_x * 4; | |||
j += 4; | |||
} | |||
while (j < n) { | |||
if (ABS(x[i]) < minf) { | |||
min = j; | |||
minf = ABS(x[i]); | |||
} | |||
i += inc_x; | |||
j++; | |||
} | |||
return (min + 1); | |||
} | |||
} |
@@ -101,8 +101,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
"xvcmpgedp 50,46,47 \n\t " | |||
"xvcmpgedp 51,48,49 \n\t " | |||
"xvcmpgtdp 50,46,47 \n\t " | |||
"xvcmpgtdp 51,48,49 \n\t " | |||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" | |||
@@ -114,7 +114,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
"lxvd2x 44, 0,%[ptr_tmp] \n\t" | |||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t" | |||
"xvcmpgedp 2,0,1 \n\t " | |||
"xvcmpgtdp 2,0,1 \n\t " | |||
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t" | |||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t" | |||
@@ -126,7 +126,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
//cmp with previous | |||
"xvcmpgedp 4,39,3 \n\t " | |||
"xvcmpgtdp 4,39,3 \n\t " | |||
"vaddudm 5,5,4 \n\t" | |||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" | |||
@@ -166,8 +166,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
"xvadddp 48, 4,5 \n\t" | |||
"xvadddp 49, 44,45 \n\t" | |||
"xvcmpgedp 50,46,47 \n\t " | |||
"xvcmpgedp 51,48,49 \n\t " | |||
"xvcmpgtdp 50,46,47 \n\t " | |||
"xvcmpgtdp 51,48,49 \n\t " | |||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" | |||
@@ -179,7 +179,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
"lxvd2x 44, 0,%[ptr_tmp] \n\t" | |||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t" | |||
"xvcmpgedp 2,0,1 \n\t " | |||
"xvcmpgtdp 2,0,1 \n\t " | |||
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t" | |||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t" | |||
@@ -191,7 +191,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
//cmp with previous | |||
"xvcmpgedp 4,39,3 \n\t " | |||
"xvcmpgtdp 4,39,3 \n\t " | |||
"vaddudm 5,5,4 \n\t" | |||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" | |||
@@ -235,15 +235,15 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
"xvcmpgedp 50,46,47 \n\t " | |||
"xvcmpgedp 51,48,49 \n\t " | |||
"xvcmpgtdp 50,46,47 \n\t " | |||
"xvcmpgtdp 51,48,49 \n\t " | |||
"xxsel 32,40,41,50 \n\t" | |||
"xxsel 0,46,47,50 \n\t" | |||
"xxsel 33,42,43,51 \n\t" | |||
"xxsel 1,48,49,51 \n\t" | |||
"xvcmpgedp 2,0,1 \n\t " | |||
"xvcmpgtdp 2,0,1 \n\t " | |||
"xxsel 32,32,33,2 \n\t" | |||
"xxsel 3,0,1,2 \n\t" | |||
@@ -252,7 +252,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" | |||
//cmp with previous | |||
"xvcmpgedp 4,39,3 \n\t " | |||
"xvcmpgtdp 4,39,3 \n\t " | |||
"vaddudm 5,5,4 \n\t" | |||
"xxsel 38,38,32,4 \n\t" | |||
"xxsel 39,39,3,4 \n\t" | |||
@@ -267,7 +267,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely | |||
//0b001110=14 | |||
"bc 14,24, 3f \n\t" | |||
"xvcmpgedp 4,39, 40 \n\t" | |||
"xvcmpgtdp 4,39, 40 \n\t" | |||
"xxsel 0,39,40,4 \n\t" | |||
"xxsel 1,38,32,4 \n\t" | |||
"stxsdx 0,0,%[ptr_minf] \n\t" | |||
@@ -0,0 +1,129 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013-2018, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#ifndef HAVE_KERNEL_8 | |||
#include <altivec.h> | |||
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) | |||
{ | |||
BLASLONG i = 0; | |||
__vector float v_a = {alpha,alpha,alpha,alpha}; | |||
__vector float * v_y=(__vector float *)y; | |||
__vector float * v_x=(__vector float *)x; | |||
for(; i<n/4; i+=16){ | |||
v_y[i] += v_a * v_x[i]; | |||
v_y[i+1] += v_a * v_x[i+1]; | |||
v_y[i+2] += v_a * v_x[i+2]; | |||
v_y[i+3] += v_a * v_x[i+3]; | |||
v_y[i+4] += v_a * v_x[i+4]; | |||
v_y[i+5] += v_a * v_x[i+5]; | |||
v_y[i+6] += v_a * v_x[i+6]; | |||
v_y[i+7] += v_a * v_x[i+7]; | |||
v_y[i+8] += v_a * v_x[i+8]; | |||
v_y[i+9] += v_a * v_x[i+9]; | |||
v_y[i+10] += v_a * v_x[i+10]; | |||
v_y[i+11] += v_a * v_x[i+11]; | |||
v_y[i+12] += v_a * v_x[i+12]; | |||
v_y[i+13] += v_a * v_x[i+13]; | |||
v_y[i+14] += v_a * v_x[i+14]; | |||
v_y[i+15] += v_a * v_x[i+15]; | |||
} | |||
} | |||
#endif | |||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
{ | |||
BLASLONG i=0; | |||
BLASLONG ix=0,iy=0; | |||
if ( n <= 0 ) return(0); | |||
if ( (inc_x == 1) && (inc_y == 1) ) | |||
{ | |||
BLASLONG n1 = n & -64; | |||
if ( n1 ) | |||
saxpy_kernel_64(n1, x, y, da); | |||
i = n1; | |||
while(i < n) | |||
{ | |||
y[i] += da * x[i] ; | |||
i++ ; | |||
} | |||
return(0); | |||
} | |||
BLASLONG n1 = n & -4; | |||
while(i < n1) | |||
{ | |||
FLOAT m1 = da * x[ix] ; | |||
FLOAT m2 = da * x[ix+inc_x] ; | |||
FLOAT m3 = da * x[ix+2*inc_x] ; | |||
FLOAT m4 = da * x[ix+3*inc_x] ; | |||
y[iy] += m1 ; | |||
y[iy+inc_y] += m2 ; | |||
y[iy+2*inc_y] += m3 ; | |||
y[iy+3*inc_y] += m4 ; | |||
ix += inc_x*4 ; | |||
iy += inc_y*4 ; | |||
i+=4 ; | |||
} | |||
while(i < n) | |||
{ | |||
y[iy] += da * x[ix] ; | |||
ix += inc_x ; | |||
iy += inc_y ; | |||
i++ ; | |||
} | |||
return(0); | |||
} | |||