| @@ -166,5 +166,5 @@ In chronological order: | |||||
| * [2017-01-01] dgemm and dtrmm kernels for IBM z13 | * [2017-01-01] dgemm and dtrmm kernels for IBM z13 | ||||
| * [2017-02-26] ztrmm kernel for IBM z13 | * [2017-02-26] ztrmm kernel for IBM z13 | ||||
| * [2017-03-13] strmm and ctrmm kernel for IBM z13 | * [2017-03-13] strmm and ctrmm kernel for IBM z13 | ||||
| * [2017-09-01] initial BLAS Level-1,2 (double precision) for IBM z13 | |||||
| @@ -107,7 +107,7 @@ Please read GotoBLAS_01Readme.txt | |||||
| - **ARM Cortex-A57**: Experimental | - **ARM Cortex-A57**: Experimental | ||||
| #### IBM zEnterprise System: | #### IBM zEnterprise System: | ||||
| - **Z13**: Optimized Level-3 BLAS | |||||
| - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) | |||||
| ### Support OS: | ### Support OS: | ||||
| @@ -40,8 +40,12 @@ | |||||
| #include "common.h" | #include "common.h" | ||||
| #ifdef FUNCTION_PROFILE | #ifdef FUNCTION_PROFILE | ||||
| #include "functable.h" | #include "functable.h" | ||||
| #endif | |||||
| #if defined(Z13) | |||||
| #define MULTI_THREAD_MINIMAL 200000 | |||||
| #else | |||||
| #define MULTI_THREAD_MINIMAL 10000 | |||||
| #endif | #endif | ||||
| #ifndef CBLAS | #ifndef CBLAS | ||||
| void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ | void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ | ||||
| @@ -88,7 +92,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc | |||||
| //Temporarily work-around the low performance issue with small input size & | //Temporarily work-around the low performance issue with small input size & | ||||
| //multithreads. | //multithreads. | ||||
| if (n <= 10000) | |||||
| if (n <= MULTI_THREAD_MINIMAL) | |||||
| nthreads = 1; | nthreads = 1; | ||||
| if (nthreads == 1) { | if (nthreads == 1) { | ||||
| @@ -15,14 +15,14 @@ SMINKERNEL = ../arm/min.c | |||||
| DMINKERNEL = ../arm/min.c | DMINKERNEL = ../arm/min.c | ||||
| ISAMAXKERNEL = ../arm/iamax.c | ISAMAXKERNEL = ../arm/iamax.c | ||||
| IDAMAXKERNEL = ../arm/iamax.c | |||||
| IDAMAXKERNEL = idamax.c | |||||
| ICAMAXKERNEL = ../arm/izamax.c | ICAMAXKERNEL = ../arm/izamax.c | ||||
| IZAMAXKERNEL = ../arm/izamax.c | |||||
| IZAMAXKERNEL = izamax.c | |||||
| ISAMINKERNEL = ../arm/iamin.c | ISAMINKERNEL = ../arm/iamin.c | ||||
| IDAMINKERNEL = ../arm/iamin.c | |||||
| IDAMINKERNEL = idamin.c | |||||
| ICAMINKERNEL = ../arm/izamin.c | ICAMINKERNEL = ../arm/izamin.c | ||||
| IZAMINKERNEL = ../arm/izamin.c | |||||
| IZAMINKERNEL = izamin.c | |||||
| ISMAXKERNEL = ../arm/imax.c | ISMAXKERNEL = ../arm/imax.c | ||||
| IDMAXKERNEL = ../arm/imax.c | IDMAXKERNEL = ../arm/imax.c | ||||
| @@ -31,24 +31,24 @@ ISMINKERNEL = ../arm/imin.c | |||||
| IDMINKERNEL = ../arm/imin.c | IDMINKERNEL = ../arm/imin.c | ||||
| SASUMKERNEL = ../arm/asum.c | SASUMKERNEL = ../arm/asum.c | ||||
| DASUMKERNEL = ../arm/asum.c | |||||
| DASUMKERNEL = dasum.c | |||||
| CASUMKERNEL = ../arm/zasum.c | CASUMKERNEL = ../arm/zasum.c | ||||
| ZASUMKERNEL = ../arm/zasum.c | |||||
| ZASUMKERNEL = zasum.c | |||||
| SAXPYKERNEL = ../arm/axpy.c | SAXPYKERNEL = ../arm/axpy.c | ||||
| DAXPYKERNEL = ../arm/axpy.c | |||||
| DAXPYKERNEL = daxpy.c | |||||
| CAXPYKERNEL = ../arm/zaxpy.c | CAXPYKERNEL = ../arm/zaxpy.c | ||||
| ZAXPYKERNEL = ../arm/zaxpy.c | |||||
| ZAXPYKERNEL = zaxpy.c | |||||
| SCOPYKERNEL = ../arm/copy.c | SCOPYKERNEL = ../arm/copy.c | ||||
| DCOPYKERNEL = ../arm/copy.c | |||||
| DCOPYKERNEL = dcopy.c | |||||
| CCOPYKERNEL = ../arm/zcopy.c | CCOPYKERNEL = ../arm/zcopy.c | ||||
| ZCOPYKERNEL = ../arm/zcopy.c | |||||
| ZCOPYKERNEL = zcopy.c | |||||
| SDOTKERNEL = ../arm/dot.c | SDOTKERNEL = ../arm/dot.c | ||||
| DDOTKERNEL = ../arm/dot.c | |||||
| DDOTKERNEL = ddot.c | |||||
| CDOTKERNEL = ../arm/zdot.c | CDOTKERNEL = ../arm/zdot.c | ||||
| ZDOTKERNEL = ../arm/zdot.c | |||||
| ZDOTKERNEL = zdot.c | |||||
| SNRM2KERNEL = ../arm/nrm2.c | SNRM2KERNEL = ../arm/nrm2.c | ||||
| DNRM2KERNEL = ../arm/nrm2.c | DNRM2KERNEL = ../arm/nrm2.c | ||||
| @@ -56,29 +56,29 @@ CNRM2KERNEL = ../arm/znrm2.c | |||||
| ZNRM2KERNEL = ../arm/znrm2.c | ZNRM2KERNEL = ../arm/znrm2.c | ||||
| SROTKERNEL = ../arm/rot.c | SROTKERNEL = ../arm/rot.c | ||||
| DROTKERNEL = ../arm/rot.c | |||||
| DROTKERNEL = drot.c | |||||
| CROTKERNEL = ../arm/zrot.c | CROTKERNEL = ../arm/zrot.c | ||||
| ZROTKERNEL = ../arm/zrot.c | |||||
| ZROTKERNEL = zrot.c | |||||
| SSCALKERNEL = ../arm/scal.c | SSCALKERNEL = ../arm/scal.c | ||||
| DSCALKERNEL = ../arm/scal.c | |||||
| DSCALKERNEL = dscal.c | |||||
| CSCALKERNEL = ../arm/zscal.c | CSCALKERNEL = ../arm/zscal.c | ||||
| ZSCALKERNEL = ../arm/zscal.c | |||||
| ZSCALKERNEL = zscal.c | |||||
| SSWAPKERNEL = ../arm/swap.c | SSWAPKERNEL = ../arm/swap.c | ||||
| DSWAPKERNEL = ../arm/swap.c | |||||
| DSWAPKERNEL = dswap.c | |||||
| CSWAPKERNEL = ../arm/zswap.c | CSWAPKERNEL = ../arm/zswap.c | ||||
| ZSWAPKERNEL = ../arm/zswap.c | |||||
| ZSWAPKERNEL = zswap.c | |||||
| SGEMVNKERNEL = ../arm/gemv_n.c | SGEMVNKERNEL = ../arm/gemv_n.c | ||||
| DGEMVNKERNEL = ../arm/gemv_n.c | |||||
| DGEMVNKERNEL = dgemv_n_4.c | |||||
| CGEMVNKERNEL = ../arm/zgemv_n.c | CGEMVNKERNEL = ../arm/zgemv_n.c | ||||
| ZGEMVNKERNEL = ../arm/zgemv_n.c | |||||
| ZGEMVNKERNEL = zgemv_n_4.c | |||||
| SGEMVTKERNEL = ../arm/gemv_t.c | SGEMVTKERNEL = ../arm/gemv_t.c | ||||
| DGEMVTKERNEL = ../arm/gemv_t.c | |||||
| DGEMVTKERNEL = dgemv_t_4.c | |||||
| CGEMVTKERNEL = ../arm/zgemv_t.c | CGEMVTKERNEL = ../arm/zgemv_t.c | ||||
| ZGEMVTKERNEL = ../arm/zgemv_t.c | |||||
| ZGEMVTKERNEL = zgemv_t_4.c | |||||
| STRMMKERNEL = strmm8x4V.S | STRMMKERNEL = strmm8x4V.S | ||||
| DTRMMKERNEL = trmm8x4V.S | DTRMMKERNEL = trmm8x4V.S | ||||
| @@ -0,0 +1,159 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| static FLOAT __attribute__ ((noinline)) dasum_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| __asm__ ( | |||||
| "pfd 1, 0(%1) \n\t" | |||||
| "sllg %%r0,%0,3 \n\t" | |||||
| "agr %%r0,%1 \n\t" | |||||
| "vzero %%v0 \n\t" | |||||
| "vzero %%v1 \n\t" | |||||
| "vzero %%v2 \n\t" | |||||
| "vzero %%v3 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%1 ) \n\t" | |||||
| "vlm %%v24,%%v31, 0(%1 ) \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | |||||
| "vflpdb %%v25, %%v25 \n\t" | |||||
| "vflpdb %%v26, %%v26 \n\t" | |||||
| "vflpdb %%v27, %%v27 \n\t" | |||||
| "vflpdb %%v28, %%v28 \n\t" | |||||
| "vflpdb %%v29, %%v29 \n\t" | |||||
| "vflpdb %%v30, %%v30 \n\t" | |||||
| "vflpdb %%v31, %%v31 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v24 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v25 \n\t" | |||||
| "vfadb %%v2,%%v2,%%v26 \n\t" | |||||
| "vfadb %%v3,%%v3,%%v27 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v28 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v29 \n\t" | |||||
| "vfadb %%v2,%%v2,%%v30 \n\t" | |||||
| "vfadb %%v3,%%v3,%%v31 \n\t" | |||||
| "vlm %%v24,%%v31, 128(%1) \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | |||||
| "vflpdb %%v25, %%v25 \n\t" | |||||
| "vflpdb %%v26, %%v26 \n\t" | |||||
| "vflpdb %%v27, %%v27 \n\t" | |||||
| "vflpdb %%v28, %%v28 \n\t" | |||||
| "vflpdb %%v29, %%v29 \n\t" | |||||
| "vflpdb %%v30, %%v30 \n\t" | |||||
| "vflpdb %%v31, %%v31 \n\t" | |||||
| "la %1,256(%1) \n\t" | |||||
| "vfadb %%v0,%%v0,%%v24 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v25 \n\t" | |||||
| "vfadb %%v2,%%v2,%%v26 \n\t" | |||||
| "vfadb %%v3,%%v3,%%v27 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v28 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v29 \n\t" | |||||
| "vfadb %%v2,%%v2,%%v30 \n\t" | |||||
| "vfadb %%v3,%%v3,%%v31 \n\t" | |||||
| "clgrjl %1,%%r0,1b \n\t" | |||||
| "vfadb %%v24,%%v0,%%v1 \n\t" | |||||
| "vfadb %%v25,%%v2,%%v3 \n\t" | |||||
| "vfadb %%v0,%%v25,%%v24 \n\t" | |||||
| "vrepg %%v1,%%v0,1 \n\t" | |||||
| "adbr %%f0,%%f1 \n\t" | |||||
| : | |||||
| : "r"(n), "a"(x) | |||||
| : "cc", "memory","r0","f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| } | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT sumf = 0.0; | |||||
| BLASLONG n1; | |||||
| if (n <= 0 || inc_x <= 0) return sumf; | |||||
| if (inc_x == 1) { | |||||
| n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| sumf = dasum_kernel_32(n1, x); | |||||
| i = n1; | |||||
| } | |||||
| while (i < n) { | |||||
| sumf += ABS(x[i]); | |||||
| i++; | |||||
| } | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| register FLOAT sum1, sum2; | |||||
| sum1 = 0.0; | |||||
| sum2 = 0.0; | |||||
| while (j < n1) { | |||||
| sum1 += ABS(x[i]); | |||||
| sum2 += ABS(x[i + inc_x]); | |||||
| sum1 += ABS(x[i + 2 * inc_x]); | |||||
| sum2 += ABS(x[i + 3 * inc_x]); | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| } | |||||
| sumf = sum1 + sum2; | |||||
| while (j < n) { | |||||
| sumf += ABS(x[i]); | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| return sumf; | |||||
| } | |||||
| @@ -0,0 +1,386 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #define Z13_D 1 | |||||
| #define PREFETCH_INS 1 | |||||
| #if defined(Z13_A) | |||||
| #include <vecintrin.h> | |||||
| static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| BLASLONG i = 0; | |||||
| __vector double v_a = {*alpha,*alpha}; | |||||
| __vector double * v_y=(__vector double *)y; | |||||
| __vector double * v_x=(__vector double *)x; | |||||
| for(; i<n/2; i+=16){ | |||||
| v_y[i] += v_a * v_x[i]; | |||||
| v_y[i+1] += v_a * v_x[i+1]; | |||||
| v_y[i+2] += v_a * v_x[i+2]; | |||||
| v_y[i+3] += v_a * v_x[i+3]; | |||||
| v_y[i+4] += v_a * v_x[i+4]; | |||||
| v_y[i+5] += v_a * v_x[i+5]; | |||||
| v_y[i+6] += v_a * v_x[i+6]; | |||||
| v_y[i+7] += v_a * v_x[i+7]; | |||||
| v_y[i+8] += v_a * v_x[i+8]; | |||||
| v_y[i+9] += v_a * v_x[i+9]; | |||||
| v_y[i+10] += v_a * v_x[i+10]; | |||||
| v_y[i+11] += v_a * v_x[i+11]; | |||||
| v_y[i+12] += v_a * v_x[i+12]; | |||||
| v_y[i+13] += v_a * v_x[i+13]; | |||||
| v_y[i+14] += v_a * v_x[i+14]; | |||||
| v_y[i+15] += v_a * v_x[i+15]; | |||||
| } | |||||
| } | |||||
| #elif defined(Z13_B) | |||||
| static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| __asm__ volatile( | |||||
| #if defined(PREFETCH_INS) | |||||
| "pfd 1, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| #endif | |||||
| "vlrepg %%v0 , 0(%3) \n\t" | |||||
| "srlg %3,%0,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "vlr %%v1,%%v0 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| #if defined(PREFETCH_INS) | |||||
| "pfd 1, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| #endif | |||||
| "vl %%v24, 0(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%1) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" | |||||
| "vst %%v16, 0(%%r1,%2) \n\t" | |||||
| "vl %%v25, 16(%%r1,%2) \n\t" | |||||
| "vl %%v17, 16(%%r1,%1) \n\t" | |||||
| "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t" | |||||
| "vst %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v26, 32(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%1) \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" | |||||
| "vst %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v27, 48(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%1) \n\t" | |||||
| "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t" | |||||
| "vst %%v19, 48(%%r1,%2) \n\t" | |||||
| "vl %%v24,( 0+64)(%%r1,%2) \n\t" | |||||
| "vl %%v16,( 0+64)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" | |||||
| "vst %%v16,( 0+64)(%%r1,%2) \n\t" | |||||
| "vl %%v25, (16+64)(%%r1,%2) \n\t" | |||||
| "vl %%v17, (16+64)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t" | |||||
| "vst %%v17, (16+64)(%%r1,%2) \n\t" | |||||
| "vl %%v26, (32+64)(%%r1,%2) \n\t" | |||||
| "vl %%v18, (32+64)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" | |||||
| "vst %%v18, (32+64)(%%r1,%2) \n\t" | |||||
| "vl %%v27, (48+64)(%%r1,%2) \n\t" | |||||
| "vl %%v19, (48+64)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t" | |||||
| "vst %%v19, (48+64)(%%r1,%2) \n\t" | |||||
| "vl %%v24,( 0+128)(%%r1,%2) \n\t" | |||||
| "vl %%v16,( 0+128)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" | |||||
| "vst %%v16,( 0+128)(%%r1,%2) \n\t" | |||||
| "vl %%v25, (16+128)(%%r1,%2) \n\t" | |||||
| "vl %%v17, (16+128)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t" | |||||
| "vst %%v17, (16+128)(%%r1,%2) \n\t" | |||||
| "vl %%v26, (32+128)(%%r1,%2) \n\t" | |||||
| "vl %%v18, (32+128)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" | |||||
| "vst %%v18, (32+128)(%%r1,%2) \n\t" | |||||
| "vl %%v27, (48+128)(%%r1,%2) \n\t" | |||||
| "vl %%v19, (48+128)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t" | |||||
| "vst %%v19, (48+128)(%%r1,%2) \n\t" | |||||
| "vl %%v24,( 0+192)(%%r1,%2) \n\t" | |||||
| "vl %%v16,( 0+192)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" | |||||
| "vst %%v16,( 0+192)(%%r1,%2) \n\t" | |||||
| "vl %%v25, (16+192)(%%r1,%2) \n\t" | |||||
| "vl %%v17, (16+192)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t" | |||||
| "vst %%v17, (16+192)(%%r1,%2) \n\t" | |||||
| "vl %%v26, (32+192)(%%r1,%2) \n\t" | |||||
| "vl %%v18, (32+192)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" | |||||
| "vst %%v18, (32+192)(%%r1,%2) \n\t" | |||||
| "vl %%v27, (48+192)(%%r1,%2) \n\t" | |||||
| "vl %%v19, (48+192)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t" | |||||
| "vst %%v19, (48+192)(%%r1,%2) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %3,1b" | |||||
| : | |||||
| :"r"(n),"a"(x),"a"(y),"a"(alpha) | |||||
| :"cc", "memory", "r1" ,"v0" ,"v16","v17","v18","v19", "v24","v25","v26","v27" | |||||
| ); | |||||
| } | |||||
| #elif defined(Z13_C) | |||||
| static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| __asm__ volatile( | |||||
| #if defined(PREFETCH_INS) | |||||
| "pfd 1, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| #endif | |||||
| "vlrepg %%v0 , 0(%3) \n\t" | |||||
| "srlg %3,%0,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "vlr %%v1,%%v0 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| #if defined(PREFETCH_INS) | |||||
| "pfd 1, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| #endif | |||||
| "vl %%v16, 0(%%r1,%1) \n\t" | |||||
| "vl %%v17, 16(%%r1,%1) \n\t" | |||||
| "vl %%v18, 32(%%r1,%1) \n\t" | |||||
| "vl %%v19, 48(%%r1,%1) \n\t" | |||||
| "vl %%v24, 0(%%r1,%2) \n\t" | |||||
| "vl %%v25, 16(%%r1,%2) \n\t" | |||||
| "vl %%v26, 32(%%r1,%2) \n\t" | |||||
| "vl %%v27, 48(%%r1,%2) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" | |||||
| "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" | |||||
| "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t" | |||||
| "vst %%v16, 0(%%r1,%2) \n\t" | |||||
| "vst %%v17, 16(%%r1,%2) \n\t" | |||||
| "vst %%v18, 32(%%r1,%2) \n\t" | |||||
| "vst %%v19, 48(%%r1,%2) \n\t" | |||||
| "vl %%v24, 64(%%r1,%1) \n\t" | |||||
| "vl %%v25, 80(%%r1,%1) \n\t" | |||||
| "vl %%v26, 96(%%r1,%1) \n\t" | |||||
| "vl %%v27, 112(%%r1,%1) \n\t" | |||||
| "vl %%v16, 64(%%r1,%2) \n\t" | |||||
| "vl %%v17, 80(%%r1,%2) \n\t" | |||||
| "vl %%v18, 96(%%r1,%2) \n\t" | |||||
| "vl %%v19, 112(%%r1,%2) \n\t" | |||||
| "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t" | |||||
| "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t" | |||||
| "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t" | |||||
| "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t" | |||||
| "vst %%v24, 64(%%r1,%2) \n\t" | |||||
| "vst %%v25, 80(%%r1,%2) \n\t" | |||||
| "vst %%v26, 96(%%r1,%2) \n\t" | |||||
| "vst %%v27, 112(%%r1,%2) \n\t" | |||||
| "vl %%v16, (0+128)(%%r1,%1) \n\t" | |||||
| "vl %%v17, (16+128)(%%r1,%1) \n\t" | |||||
| "vl %%v18, (32+128)(%%r1,%1) \n\t" | |||||
| "vl %%v19, (48+128)(%%r1,%1) \n\t" | |||||
| "vl %%v24, (0+128)(%%r1,%2) \n\t" | |||||
| "vl %%v25, (16+128)(%%r1,%2) \n\t" | |||||
| "vl %%v26, (32+128)(%%r1,%2) \n\t" | |||||
| "vl %%v27, (48+128)(%%r1,%2) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" | |||||
| "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" | |||||
| "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t" | |||||
| "vst %%v16, (0+128)(%%r1,%2) \n\t" | |||||
| "vst %%v17, (16+128)(%%r1,%2) \n\t" | |||||
| "vst %%v18, (32+128)(%%r1,%2) \n\t" | |||||
| "vst %%v19, (48+128)(%%r1,%2) \n\t" | |||||
| "vl %%v24, (64+128)(%%r1,%1) \n\t" | |||||
| "vl %%v25, (80+128)(%%r1,%1) \n\t" | |||||
| "vl %%v26, (96+128)(%%r1,%1) \n\t" | |||||
| "vl %%v27, (112+128)(%%r1,%1) \n\t" | |||||
| "vl %%v16, (64+128)(%%r1,%2) \n\t" | |||||
| "vl %%v17, (80+128)(%%r1,%2) \n\t" | |||||
| "vl %%v18, (96+128)(%%r1,%2) \n\t" | |||||
| "vl %%v19, (112+128)(%%r1,%2) \n\t" | |||||
| "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t" | |||||
| "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t" | |||||
| "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t" | |||||
| "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t" | |||||
| "vst %%v24, (64+128)(%%r1,%2) \n\t" | |||||
| "vst %%v25, (80+128)(%%r1,%2) \n\t" | |||||
| "vst %%v26, (96+128)(%%r1,%2) \n\t" | |||||
| "vst %%v27, (112+128)(%%r1,%2) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %3,1b" | |||||
| : | |||||
| :"r"(n),"a"(x),"a"(y),"a"(alpha) | |||||
| :"cc", "memory", "r1" ,"v0","v1","v16","v17","v18","v19", "v24","v25","v26","v27" | |||||
| ); | |||||
| } | |||||
| #elif defined(Z13_D) | |||||
| static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| __asm__ volatile( | |||||
| #if defined(PREFETCH_INS) | |||||
| "pfd 1, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| #endif | |||||
| "vlrepg %%v0 , 0(%3) \n\t" | |||||
| "srlg %3,%0,5 \n\t" | |||||
| "vlr %%v1,%%v0 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| #if defined(PREFETCH_INS) | |||||
| "pfd 1, 256(%1) \n\t" | |||||
| "pfd 2, 256(%2) \n\t" | |||||
| #endif | |||||
| "vlm %%v16,%%v23, 0(%1) \n\t" | |||||
| "vlm %%v24, %%v31, 0(%2) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" | |||||
| "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" | |||||
| "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t" | |||||
| "vfmadb %%v20,%%v0,%%v20,%%v28 \n\t" | |||||
| "vfmadb %%v21,%%v1,%%v21,%%v29 \n\t" | |||||
| "vfmadb %%v22,%%v0,%%v22,%%v30 \n\t" | |||||
| "vfmadb %%v23,%%v1,%%v23,%%v31 \n\t" | |||||
| "vstm %%v16,%%v23, 0(%2) \n\t" | |||||
| "vlm %%v24,%%v31, 128(%1) \n\t" | |||||
| "vlm %%v16,%%v23, 128(%2) \n\t" | |||||
| "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t" | |||||
| "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t" | |||||
| "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t" | |||||
| "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t" | |||||
| "vfmadb %%v28,%%v0,%%v28,%%v20 \n\t" | |||||
| "vfmadb %%v29,%%v1,%%v29,%%v21 \n\t" | |||||
| "vfmadb %%v30,%%v0,%%v30,%%v22 \n\t" | |||||
| "vfmadb %%v31,%%v1,%%v31,%%v23 \n\t" | |||||
| "la %1,256(%1) \n\t" | |||||
| "vstm %%v24, %%v31, 128(%2) \n\t" | |||||
| "la %2,256(%2) \n\t" | |||||
| "brctg %3,1b" | |||||
| : | |||||
| :"r"(n),"a"(x),"a"(y),"a"(alpha) | |||||
| :"cc", "memory", "v0","v1","v16","v17","v18","v19","v20","v21", | |||||
| "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| } | |||||
| #endif | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| if ( n <= 0 ) return 0 ; | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| BLASLONG n1 = n & -32; | |||||
| if ( n1 ) | |||||
| daxpy_kernel_32(n1, x, y , &da ); | |||||
| i = n1; | |||||
| while(i < n) | |||||
| { | |||||
| y[i] += da * x[i] ; | |||||
| i++ ; | |||||
| } | |||||
| return 0 ; | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while(i < n1) | |||||
| { | |||||
| FLOAT m1 = da * x[ix] ; | |||||
| FLOAT m2 = da * x[ix+inc_x] ; | |||||
| FLOAT m3 = da * x[ix+2*inc_x] ; | |||||
| FLOAT m4 = da * x[ix+3*inc_x] ; | |||||
| y[iy] += m1 ; | |||||
| y[iy+inc_y] += m2 ; | |||||
| y[iy+2*inc_y] += m3 ; | |||||
| y[iy+3*inc_y] += m4 ; | |||||
| ix += inc_x*4 ; | |||||
| iy += inc_y*4 ; | |||||
| i+=4 ; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] += da * x[ix] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| return 0 ; | |||||
| } | |||||
| @@ -0,0 +1,169 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if defined(Z13mvc) | |||||
| static void __attribute__ ((noinline)) dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| __asm__ volatile( | |||||
| "pfd 1, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "mvc 0(256,%2),0(%1) \n\t" | |||||
| "la %1,256(%1) \n\t" | |||||
| "la %2,256(%2) \n\t" | |||||
| "brctg %%r0,1b" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y) | |||||
| : "cc", "memory","r0" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| #else | |||||
| static void __attribute__ ((noinline)) dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| __asm__ volatile( | |||||
| "pfd 1, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" | |||||
| "vst %%v24, 0(%%r1,%2) \n\t" | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vst %%v25, 16(%%r1,%2) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vst %%v26, 32(%%r1,%2) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vst %%v27, 48(%%r1,%2) \n\t" | |||||
| "vl %%v24, 64(%%r1,%1) \n\t" | |||||
| "vst %%v24, 64(%%r1,%2) \n\t" | |||||
| "vl %%v25, 80(%%r1,%1) \n\t" | |||||
| "vst %%v25, 80(%%r1,%2) \n\t" | |||||
| "vl %%v26, 96(%%r1,%1) \n\t" | |||||
| "vst %%v26, 96(%%r1,%2) \n\t" | |||||
| "vl %%v27, 112(%%r1,%1) \n\t" | |||||
| "vst %%v27, 112(%%r1,%2) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vst %%v24, 128(%%r1,%2) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vst %%v25, 144(%%r1,%2) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vst %%v26, 160(%%r1,%2) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vst %%v27, 176(%%r1,%2) \n\t" | |||||
| "vl %%v24, 192(%%r1,%1) \n\t" | |||||
| "vst %%v24, 192(%%r1,%2) \n\t" | |||||
| "vl %%v25, 208(%%r1,%1) \n\t" | |||||
| "vst %%v25, 208(%%r1,%2) \n\t" | |||||
| "vl %%v26, 224(%%r1,%1) \n\t" | |||||
| "vst %%v26, 224(%%r1,%2) \n\t" | |||||
| "vl %%v27, 240(%%r1,%1) \n\t" | |||||
| "vst %%v27, 240(%%r1,%2) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %%r0,1b" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y) | |||||
| : "cc", "memory","r0","r1", "v24","v25","v26","v27" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| #endif | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| if (n <= 0) return 0; | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| dcopy_kernel_32(n1, x, y); | |||||
| i = n1; | |||||
| } | |||||
| while (i < n) { | |||||
| y[i] = x[i]; | |||||
| i++; | |||||
| } | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| y[iy] = x[ix]; | |||||
| y[iy + inc_y] = x[ix + inc_x]; | |||||
| y[iy + 2 * inc_y] = x[ix + 2 * inc_x]; | |||||
| y[iy + 3 * inc_y] = x[ix + 3 * inc_x]; | |||||
| ix += inc_x * 4; | |||||
| iy += inc_y * 4; | |||||
| i += 4; | |||||
| } | |||||
| while (i < n) { | |||||
| y[iy] = x[ix]; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,194 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if defined(Z13) | |||||
| static void __attribute__ ((noinline)) ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "pfd 1, 0(%1) \n\t" | |||||
| "pfd 1, 0(%2) \n\t" | |||||
| "vzero %%v24 \n\t" | |||||
| "vzero %%v25 \n\t" | |||||
| "vzero %%v26 \n\t" | |||||
| "vzero %%v27 \n\t" | |||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%%r1,%1) \n\t" | |||||
| "pfd 1, 256(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%1) \n\t" | |||||
| "vl %%v17, 16(%%r1,%1) \n\t" | |||||
| "vl %%v18, 32(%%r1,%1) \n\t" | |||||
| "vl %%v19, 48(%%r1,%1) \n\t" | |||||
| "vl %%v28, 0(%%r1,%2) \n\t" | |||||
| "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" | |||||
| "vl %%v29, 16(%%r1,%2) \n\t" | |||||
| "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" | |||||
| "vl %%v30, 32(%%r1,%2) \n\t" | |||||
| "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" | |||||
| "vl %%v31, 48(%%r1,%2) \n\t" | |||||
| "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" | |||||
| "vl %%v16, 64(%%r1,%1) \n\t" | |||||
| "vl %%v17, 80(%%r1,%1) \n\t" | |||||
| "vl %%v18, 96(%%r1,%1) \n\t" | |||||
| "vl %%v19, 112(%%r1,%1) \n\t" | |||||
| "vl %%v28, 64(%%r1,%2) \n\t" | |||||
| "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" | |||||
| "vl %%v29, 80(%%r1,%2) \n\t" | |||||
| "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" | |||||
| "vl %%v30, 96(%%r1,%2) \n\t" | |||||
| "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" | |||||
| "vl %%v31, 112(%%r1,%2) \n\t" | |||||
| "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" | |||||
| "la %%r1,128(%%r1) \n\t" | |||||
| "brctg %%r0,1b \n\t" | |||||
| "vfadb %%v24,%%v25,%%v24 \n\t" | |||||
| "vfadb %%v24,%%v26,%%v24 \n\t" | |||||
| "vfadb %%v24,%%v27,%%v24 \n\t" | |||||
| "vrepg %%v1,%%v24,1 \n\t" | |||||
| "vfadb %%v1,%%v24,%%v1 \n\t" | |||||
| " std %%f1,0(%3) \n\t" | |||||
| : | |||||
| :"r"(n),"a"(x),"a"(y),"a"(d) | |||||
| :"cc" , "memory" ,"r0","r1","v16", "v17","v18","v19","v20","v21","v22","v23", | |||||
| "v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| } | |||||
| #else | |||||
| static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||||
| { | |||||
| BLASLONG register i = 0; | |||||
| FLOAT dot = 0.0; | |||||
| while(i < n) | |||||
| { | |||||
| dot += y[i] * x[i] | |||||
| + y[i+1] * x[i+1] | |||||
| + y[i+2] * x[i+2] | |||||
| + y[i+3] * x[i+3] | |||||
| + y[i+4] * x[i+4] | |||||
| + y[i+5] * x[i+5] | |||||
| + y[i+6] * x[i+6] | |||||
| + y[i+7] * x[i+7] ; | |||||
| i+=8 ; | |||||
| } | |||||
| *d += dot; | |||||
| } | |||||
| #endif | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT dot = 0.0 ; | |||||
| if ( n <= 0 ) return(dot); | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| BLASLONG n1 = n & -16; | |||||
| if ( n1 ) | |||||
| ddot_kernel_8(n1, x, y , &dot ); | |||||
| i = n1; | |||||
| while(i < n) | |||||
| { | |||||
| dot += y[i] * x[i] ; | |||||
| i++ ; | |||||
| } | |||||
| return(dot); | |||||
| } | |||||
| FLOAT temp1 = 0.0; | |||||
| FLOAT temp2 = 0.0; | |||||
| BLASLONG n1 = n & -4; | |||||
| while(i < n1) | |||||
| { | |||||
| FLOAT m1 = y[iy] * x[ix] ; | |||||
| FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; | |||||
| FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; | |||||
| FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; | |||||
| ix += inc_x*4 ; | |||||
| iy += inc_y*4 ; | |||||
| temp1 += m1+m3; | |||||
| temp2 += m2+m4; | |||||
| i+=4 ; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| temp1 += y[iy] * x[ix] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| dot = temp1 + temp2; | |||||
| return(dot); | |||||
| } | |||||
| @@ -0,0 +1,487 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #define NBMAX 2048 | |||||
| #define HAVE_KERNEL_4x4_VEC 1 | |||||
| #define HAVE_KERNEL_4x2_VEC 1 | |||||
| #define HAVE_KERNEL_4x1_VEC 1 | |||||
| #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) | |||||
| #include <vecintrin.h> | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_4x4 | |||||
| #elif HAVE_KERNEL_4x4_VEC | |||||
| static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT x0,x1,x2,x3; | |||||
| x0 = xo[0] * *alpha; | |||||
| x1 = xo[1] * *alpha; | |||||
| x2 = xo[2] * *alpha; | |||||
| x3 = xo[3] * *alpha; | |||||
| __vector double v_x0 = {x0,x0}; | |||||
| __vector double v_x1 = {x1,x1}; | |||||
| __vector double v_x2 = {x2,x2}; | |||||
| __vector double v_x3 = {x3,x3}; | |||||
| __vector double* v_y =(__vector double*)y; | |||||
| __vector double* va0 = (__vector double*)ap[0]; | |||||
| __vector double* va1 = (__vector double*)ap[1]; | |||||
| __vector double* va2 = (__vector double*)ap[2]; | |||||
| __vector double* va3 = (__vector double*)ap[3]; | |||||
| for ( i=0; i< n/2; i+=2 ) | |||||
| { | |||||
| v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; | |||||
| v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] + v_x2 * va2[i+1] + v_x3 * va3[i+1] ; | |||||
| } | |||||
| } | |||||
| #else | |||||
| static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *a0,*a1,*a2,*a3; | |||||
| FLOAT x[4] __attribute__ ((aligned (16))); | |||||
| a0 = ap[0]; | |||||
| a1 = ap[1]; | |||||
| a2 = ap[2]; | |||||
| a3 = ap[3]; | |||||
| for ( i=0; i<4; i++) | |||||
| x[i] = xo[i] * *alpha; | |||||
| for ( i=0; i< n; i+=4 ) | |||||
| { | |||||
| y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; | |||||
| y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; | |||||
| y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; | |||||
| y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; | |||||
| } | |||||
| } | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_4x2 | |||||
| #elif HAVE_KERNEL_4x2_VEC | |||||
| static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT x0,x1; | |||||
| x0 = xo[0] * *alpha; | |||||
| x1 = xo[1] * *alpha; | |||||
| __vector double v_x0 = {x0,x0}; | |||||
| __vector double v_x1 = {x1,x1}; | |||||
| __vector double* v_y =(__vector double*)y; | |||||
| __vector double* va0 = (__vector double*)ap[0]; | |||||
| __vector double* va1 = (__vector double*)ap[1]; | |||||
| for ( i=0; i< n/2; i+=2 ) | |||||
| { | |||||
| v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; | |||||
| v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; | |||||
| } | |||||
| } | |||||
| #else | |||||
| static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *a0,*a1; | |||||
| FLOAT x[4] __attribute__ ((aligned (16))); | |||||
| a0 = ap[0]; | |||||
| a1 = ap[1]; | |||||
| for ( i=0; i<2; i++) | |||||
| x[i] = xo[i] * *alpha; | |||||
| for ( i=0; i< n; i+=4 ) | |||||
| { | |||||
| y[i] += a0[i]*x[0] + a1[i]*x[1]; | |||||
| y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; | |||||
| y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; | |||||
| y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; | |||||
| } | |||||
| } | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_4x1 | |||||
| #elif HAVE_KERNEL_4x1_VEC | |||||
| static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT x0; | |||||
| x0 = xo[0] * *alpha; | |||||
| __vector double v_x0 = {x0,x0}; | |||||
| __vector double* v_y =(__vector double*)y; | |||||
| __vector double* va0 = (__vector double*)ap; | |||||
| for ( i=0; i< n/2; i+=2 ) | |||||
| { | |||||
| v_y[i] += v_x0 * va0[i] ; | |||||
| v_y[i+1] += v_x0 * va0[i+1] ; | |||||
| } | |||||
| } | |||||
| #else | |||||
| static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *a0; | |||||
| FLOAT x[4] __attribute__ ((aligned (16))); | |||||
| a0 = ap; | |||||
| for ( i=0; i<1; i++) | |||||
| x[i] = xo[i] * *alpha; | |||||
| for ( i=0; i< n; i+=4 ) | |||||
| { | |||||
| y[i] += a0[i]*x[0]; | |||||
| y[i+1] += a0[i+1]*x[0]; | |||||
| y[i+2] += a0[i+2]*x[0]; | |||||
| y[i+3] += a0[i+3]*x[0]; | |||||
| } | |||||
| } | |||||
| #endif | |||||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); | |||||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||||
| { | |||||
| BLASLONG i; | |||||
| for ( i=0; i<n; i++ ){ | |||||
| *dest += *src; | |||||
| src++; | |||||
| dest += inc_dest; | |||||
| } | |||||
| return; | |||||
| } | |||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||||
| { | |||||
| BLASLONG i; | |||||
| BLASLONG j; | |||||
| FLOAT *a_ptr; | |||||
| FLOAT *x_ptr; | |||||
| FLOAT *y_ptr; | |||||
| FLOAT *ap[4]; | |||||
| BLASLONG n1; | |||||
| BLASLONG m1; | |||||
| BLASLONG m2; | |||||
| BLASLONG m3; | |||||
| BLASLONG n2; | |||||
| BLASLONG lda4 = lda << 2; | |||||
| FLOAT xbuffer[8],*ybuffer; | |||||
| if ( m < 1 ) return(0); | |||||
| if ( n < 1 ) return(0); | |||||
| ybuffer = buffer; | |||||
| n1 = n >> 2 ; | |||||
| n2 = n & 3 ; | |||||
| m3 = m & 3 ; | |||||
| m1 = m & -4 ; | |||||
| m2 = (m & (NBMAX-1)) - m3 ; | |||||
| y_ptr = y; | |||||
| BLASLONG NB = NBMAX; | |||||
| while ( NB == NBMAX ) | |||||
| { | |||||
| m1 -= NB; | |||||
| if ( m1 < 0) | |||||
| { | |||||
| if ( m2 == 0 ) break; | |||||
| NB = m2; | |||||
| } | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| ap[0] = a_ptr; | |||||
| ap[1] = a_ptr + lda; | |||||
| ap[2] = ap[1] + lda; | |||||
| ap[3] = ap[2] + lda; | |||||
| if ( inc_y != 1 ) | |||||
| memset(ybuffer,0,NB*8); | |||||
| else | |||||
| ybuffer = y_ptr; | |||||
| if ( inc_x == 1 ) | |||||
| { | |||||
| for( i = 0; i < n1 ; i++) | |||||
| { | |||||
| dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); | |||||
| ap[0] += lda4; | |||||
| ap[1] += lda4; | |||||
| ap[2] += lda4; | |||||
| ap[3] += lda4; | |||||
| a_ptr += lda4; | |||||
| x_ptr += 4; | |||||
| } | |||||
| if ( n2 & 2 ) | |||||
| { | |||||
| dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); | |||||
| a_ptr += lda*2; | |||||
| x_ptr += 2; | |||||
| } | |||||
| if ( n2 & 1 ) | |||||
| { | |||||
| dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); | |||||
| a_ptr += lda; | |||||
| x_ptr += 1; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| for( i = 0; i < n1 ; i++) | |||||
| { | |||||
| xbuffer[0] = x_ptr[0]; | |||||
| x_ptr += inc_x; | |||||
| xbuffer[1] = x_ptr[0]; | |||||
| x_ptr += inc_x; | |||||
| xbuffer[2] = x_ptr[0]; | |||||
| x_ptr += inc_x; | |||||
| xbuffer[3] = x_ptr[0]; | |||||
| x_ptr += inc_x; | |||||
| dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); | |||||
| ap[0] += lda4; | |||||
| ap[1] += lda4; | |||||
| ap[2] += lda4; | |||||
| ap[3] += lda4; | |||||
| a_ptr += lda4; | |||||
| } | |||||
| for( i = 0; i < n2 ; i++) | |||||
| { | |||||
| xbuffer[0] = x_ptr[0]; | |||||
| x_ptr += inc_x; | |||||
| dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); | |||||
| a_ptr += lda; | |||||
| } | |||||
| } | |||||
| a += NB; | |||||
| if ( inc_y != 1 ) | |||||
| { | |||||
| add_y(NB,ybuffer,y_ptr,inc_y); | |||||
| y_ptr += NB * inc_y; | |||||
| } | |||||
| else | |||||
| y_ptr += NB ; | |||||
| } | |||||
| if ( m3 == 0 ) return(0); | |||||
| if ( m3 == 3 ) | |||||
| { | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| FLOAT temp0 = 0.0; | |||||
| FLOAT temp1 = 0.0; | |||||
| FLOAT temp2 = 0.0; | |||||
| if ( lda == 3 && inc_x ==1 ) | |||||
| { | |||||
| for( i = 0; i < ( n & -4 ); i+=4 ) | |||||
| { | |||||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||||
| temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||||
| temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||||
| temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||||
| temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||||
| a_ptr += 12; | |||||
| x_ptr += 4; | |||||
| } | |||||
| for( ; i < n; i++ ) | |||||
| { | |||||
| temp0 += a_ptr[0] * x_ptr[0]; | |||||
| temp1 += a_ptr[1] * x_ptr[0]; | |||||
| temp2 += a_ptr[2] * x_ptr[0]; | |||||
| a_ptr += 3; | |||||
| x_ptr ++; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| for( i = 0; i < n; i++ ) | |||||
| { | |||||
| temp0 += a_ptr[0] * x_ptr[0]; | |||||
| temp1 += a_ptr[1] * x_ptr[0]; | |||||
| temp2 += a_ptr[2] * x_ptr[0]; | |||||
| a_ptr += lda; | |||||
| x_ptr += inc_x; | |||||
| } | |||||
| } | |||||
| y_ptr[0] += alpha * temp0; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha * temp1; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha * temp2; | |||||
| return(0); | |||||
| } | |||||
| if ( m3 == 2 ) | |||||
| { | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| FLOAT temp0 = 0.0; | |||||
| FLOAT temp1 = 0.0; | |||||
| if ( lda == 2 && inc_x ==1 ) | |||||
| { | |||||
| for( i = 0; i < (n & -4) ; i+=4 ) | |||||
| { | |||||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
| temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||||
| temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||||
| a_ptr += 8; | |||||
| x_ptr += 4; | |||||
| } | |||||
| for( ; i < n; i++ ) | |||||
| { | |||||
| temp0 += a_ptr[0] * x_ptr[0]; | |||||
| temp1 += a_ptr[1] * x_ptr[0]; | |||||
| a_ptr += 2; | |||||
| x_ptr ++; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| for( i = 0; i < n; i++ ) | |||||
| { | |||||
| temp0 += a_ptr[0] * x_ptr[0]; | |||||
| temp1 += a_ptr[1] * x_ptr[0]; | |||||
| a_ptr += lda; | |||||
| x_ptr += inc_x; | |||||
| } | |||||
| } | |||||
| y_ptr[0] += alpha * temp0; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha * temp1; | |||||
| return(0); | |||||
| } | |||||
| if ( m3 == 1 ) | |||||
| { | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| FLOAT temp = 0.0; | |||||
| if ( lda == 1 && inc_x ==1 ) | |||||
| { | |||||
| for( i = 0; i < (n & -4); i+=4 ) | |||||
| { | |||||
| temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; | |||||
| } | |||||
| for( ; i < n; i++ ) | |||||
| { | |||||
| temp += a_ptr[i] * x_ptr[i]; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| for( i = 0; i < n; i++ ) | |||||
| { | |||||
| temp += a_ptr[0] * x_ptr[0]; | |||||
| a_ptr += lda; | |||||
| x_ptr += inc_x; | |||||
| } | |||||
| } | |||||
| y_ptr[0] += alpha * temp; | |||||
| return(0); | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| @@ -0,0 +1,541 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #define HAVE_KERNEL_4x4_VEC 1 | |||||
| #define HAVE_KERNEL_4x2_VEC 1 | |||||
| #define HAVE_KERNEL_4x1_VEC 1 | |||||
| #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) | |||||
| #include <vecintrin.h> | |||||
| #endif | |||||
| #define NBMAX 2048 | |||||
| #ifdef HAVE_KERNEL_4x4 | |||||
| #elif HAVE_KERNEL_4x4_VEC | |||||
| static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| BLASLONG i; | |||||
| __vector double* va0 = (__vector double*)ap[0]; | |||||
| __vector double* va1 = (__vector double*)ap[1]; | |||||
| __vector double* va2 = (__vector double*)ap[2]; | |||||
| __vector double* va3 = (__vector double*)ap[3]; | |||||
| __vector double* v_x =(__vector double*)x; | |||||
| __vector double temp0 = {0,0}; | |||||
| __vector double temp1 = {0,0}; | |||||
| __vector double temp2 = {0,0}; | |||||
| __vector double temp3 = {0,0}; | |||||
| for ( i=0; i< n/2; i+=2 ) | |||||
| { | |||||
| temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; | |||||
| temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; | |||||
| temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1] ; | |||||
| temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1] ; | |||||
| } | |||||
| y[0] = temp0[0] + temp0[1]; | |||||
| y[1] = temp1[0] + temp1[1]; | |||||
| y[2] = temp2[0] + temp2[1]; | |||||
| y[3] = temp3[0] + temp3[1];; | |||||
| } | |||||
| #else | |||||
| static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *a0,*a1,*a2,*a3; | |||||
| a0 = ap[0]; | |||||
| a1 = ap[1]; | |||||
| a2 = ap[2]; | |||||
| a3 = ap[3]; | |||||
| FLOAT temp0 = 0.0; | |||||
| FLOAT temp1 = 0.0; | |||||
| FLOAT temp2 = 0.0; | |||||
| FLOAT temp3 = 0.0; | |||||
| for ( i=0; i< n; i+=4 ) | |||||
| { | |||||
| temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; | |||||
| temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; | |||||
| temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; | |||||
| temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; | |||||
| } | |||||
| y[0] = temp0; | |||||
| y[1] = temp1; | |||||
| y[2] = temp2; | |||||
| y[3] = temp3; | |||||
| } | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_4x2 | |||||
| #elif HAVE_KERNEL_4x2_VEC | |||||
| static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| BLASLONG i; | |||||
| __vector double* va0 = (__vector double*)ap[0]; | |||||
| __vector double* va1 = (__vector double*)ap[1]; | |||||
| __vector double* v_x =(__vector double*)x; | |||||
| __vector double temp0 = {0,0}; | |||||
| __vector double temp1 = {0,0}; | |||||
| for ( i=0; i< n/2; i+=2 ) | |||||
| { | |||||
| temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; | |||||
| temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; | |||||
| } | |||||
| y[0] = temp0[0] + temp0[1]; | |||||
| y[1] = temp1[0] + temp1[1]; | |||||
| } | |||||
| #else | |||||
| static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *a0,*a1; | |||||
| a0 = ap[0]; | |||||
| a1 = ap[1]; | |||||
| FLOAT temp0 = 0.0; | |||||
| FLOAT temp1 = 0.0; | |||||
| for ( i=0; i< n; i+=4 ) | |||||
| { | |||||
| temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; | |||||
| temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; | |||||
| } | |||||
| y[0] = temp0; | |||||
| y[1] = temp1; | |||||
| } | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_4x1 | |||||
| #elif HAVE_KERNEL_4x1_VEC | |||||
| static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| BLASLONG i; | |||||
| __vector double* va0 = (__vector double*)a0; | |||||
| __vector double* v_x =(__vector double*)x; | |||||
| __vector double temp0 = {0,0}; | |||||
| for ( i=0; i< n/2; i+=2 ) | |||||
| { | |||||
| temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; | |||||
| } | |||||
| y[0] = temp0[0] + temp0[1]; | |||||
| } | |||||
| #else | |||||
| static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT temp0 = 0.0; | |||||
| for ( i=0; i< n; i+=4 ) | |||||
| { | |||||
| temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; | |||||
| } | |||||
| y[0] = temp0; | |||||
| } | |||||
| #endif | |||||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) | |||||
| { | |||||
| BLASLONG i; | |||||
| for ( i=0; i<n; i++ ) | |||||
| { | |||||
| *dest = *src; | |||||
| dest++; | |||||
| src += inc_src; | |||||
| } | |||||
| } | |||||
| static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||||
| { | |||||
| BLASLONG i; | |||||
| for ( i=0; i<n; i++ ) | |||||
| { | |||||
| *dest += src[i] * da; | |||||
| dest += inc_dest; | |||||
| } | |||||
| return; | |||||
| } | |||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||||
| { | |||||
| BLASLONG register i; | |||||
| BLASLONG register j; | |||||
| FLOAT *a_ptr; | |||||
| FLOAT *x_ptr; | |||||
| FLOAT *y_ptr; | |||||
| BLASLONG n0; | |||||
| BLASLONG n1; | |||||
| BLASLONG m1; | |||||
| BLASLONG m2; | |||||
| BLASLONG m3; | |||||
| BLASLONG n2; | |||||
| FLOAT ybuffer[4],*xbuffer; | |||||
| FLOAT *ytemp; | |||||
| if ( m < 1 ) return(0); | |||||
| if ( n < 1 ) return(0); | |||||
| xbuffer = buffer; | |||||
| ytemp = buffer + (m < NBMAX ? m : NBMAX); | |||||
| n0 = n / NBMAX; | |||||
| n1 = (n % NBMAX) >> 2 ; | |||||
| n2 = n & 3 ; | |||||
| m3 = m & 3 ; | |||||
| m1 = m & -4 ; | |||||
| m2 = (m & (NBMAX-1)) - m3 ; | |||||
| BLASLONG NB = NBMAX; | |||||
| while ( NB == NBMAX ) | |||||
| { | |||||
| m1 -= NB; | |||||
| if ( m1 < 0) | |||||
| { | |||||
| if ( m2 == 0 ) break; | |||||
| NB = m2; | |||||
| } | |||||
| y_ptr = y; | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| if ( inc_x == 1 ) | |||||
| xbuffer = x_ptr; | |||||
| else | |||||
| copy_x(NB,x_ptr,xbuffer,inc_x); | |||||
| FLOAT *ap[4]; | |||||
| FLOAT *yp; | |||||
| BLASLONG register lda4 = 4 * lda; | |||||
| ap[0] = a_ptr; | |||||
| ap[1] = a_ptr + lda; | |||||
| ap[2] = ap[1] + lda; | |||||
| ap[3] = ap[2] + lda; | |||||
| if ( n0 > 0 ) | |||||
| { | |||||
| BLASLONG nb1 = NBMAX / 4; | |||||
| for( j=0; j<n0; j++) | |||||
| { | |||||
| yp = ytemp; | |||||
| for( i = 0; i < nb1 ; i++) | |||||
| { | |||||
| dgemv_kernel_4x4(NB,ap,xbuffer,yp); | |||||
| ap[0] += lda4 ; | |||||
| ap[1] += lda4 ; | |||||
| ap[2] += lda4 ; | |||||
| ap[3] += lda4 ; | |||||
| yp += 4; | |||||
| } | |||||
| add_y(nb1*4, alpha, ytemp, y_ptr, inc_y ); | |||||
| y_ptr += nb1 * inc_y * 4; | |||||
| a_ptr += nb1 * lda4 ; | |||||
| } | |||||
| } | |||||
| yp = ytemp; | |||||
| for( i = 0; i < n1 ; i++) | |||||
| { | |||||
| dgemv_kernel_4x4(NB,ap,xbuffer,yp); | |||||
| ap[0] += lda4 ; | |||||
| ap[1] += lda4 ; | |||||
| ap[2] += lda4 ; | |||||
| ap[3] += lda4 ; | |||||
| yp += 4; | |||||
| } | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); | |||||
| y_ptr += n1 * inc_y * 4; | |||||
| a_ptr += n1 * lda4 ; | |||||
| } | |||||
| if ( n2 & 2 ) | |||||
| { | |||||
| dgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); | |||||
| a_ptr += lda * 2; | |||||
| *y_ptr += ybuffer[0] * alpha; | |||||
| y_ptr += inc_y; | |||||
| *y_ptr += ybuffer[1] * alpha; | |||||
| y_ptr += inc_y; | |||||
| } | |||||
| if ( n2 & 1 ) | |||||
| { | |||||
| dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); | |||||
| a_ptr += lda; | |||||
| *y_ptr += ybuffer[0] * alpha; | |||||
| y_ptr += inc_y; | |||||
| } | |||||
| a += NB; | |||||
| x += NB * inc_x; | |||||
| } | |||||
| if ( m3 == 0 ) return(0); | |||||
| x_ptr = x; | |||||
| a_ptr = a; | |||||
| if ( m3 == 3 ) | |||||
| { | |||||
| FLOAT xtemp0 = *x_ptr * alpha; | |||||
| x_ptr += inc_x; | |||||
| FLOAT xtemp1 = *x_ptr * alpha; | |||||
| x_ptr += inc_x; | |||||
| FLOAT xtemp2 = *x_ptr * alpha; | |||||
| FLOAT *aj = a_ptr; | |||||
| y_ptr = y; | |||||
| if ( lda == 3 && inc_y == 1 ) | |||||
| { | |||||
| for ( j=0; j< ( n & -4) ; j+=4 ) | |||||
| { | |||||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||||
| y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; | |||||
| y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; | |||||
| y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; | |||||
| aj += 12; | |||||
| } | |||||
| for ( ; j<n; j++ ) | |||||
| { | |||||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||||
| aj += 3; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| if ( inc_y == 1 ) | |||||
| { | |||||
| BLASLONG register lda2 = lda << 1; | |||||
| BLASLONG register lda4 = lda << 2; | |||||
| BLASLONG register lda3 = lda2 + lda; | |||||
| for ( j=0; j< ( n & -4 ); j+=4 ) | |||||
| { | |||||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2; | |||||
| y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2; | |||||
| y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2; | |||||
| y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2; | |||||
| aj += lda4; | |||||
| } | |||||
| for ( ; j< n ; j++ ) | |||||
| { | |||||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ; | |||||
| aj += lda; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| for ( j=0; j<n; j++ ) | |||||
| { | |||||
| *y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2; | |||||
| y_ptr += inc_y; | |||||
| aj += lda; | |||||
| } | |||||
| } | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| if ( m3 == 2 ) | |||||
| { | |||||
| FLOAT xtemp0 = *x_ptr * alpha; | |||||
| x_ptr += inc_x; | |||||
| FLOAT xtemp1 = *x_ptr * alpha; | |||||
| FLOAT *aj = a_ptr; | |||||
| y_ptr = y; | |||||
| if ( lda == 2 && inc_y == 1 ) | |||||
| { | |||||
| for ( j=0; j< ( n & -4) ; j+=4 ) | |||||
| { | |||||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ; | |||||
| y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ; | |||||
| y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ; | |||||
| y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ; | |||||
| aj += 8; | |||||
| } | |||||
| for ( ; j<n; j++ ) | |||||
| { | |||||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ; | |||||
| aj += 2; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| if ( inc_y == 1 ) | |||||
| { | |||||
| BLASLONG register lda2 = lda << 1; | |||||
| BLASLONG register lda4 = lda << 2; | |||||
| BLASLONG register lda3 = lda2 + lda; | |||||
| for ( j=0; j< ( n & -4 ); j+=4 ) | |||||
| { | |||||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ; | |||||
| y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 ; | |||||
| y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ; | |||||
| y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ; | |||||
| aj += lda4; | |||||
| } | |||||
| for ( ; j< n ; j++ ) | |||||
| { | |||||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ; | |||||
| aj += lda; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| for ( j=0; j<n; j++ ) | |||||
| { | |||||
| *y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ; | |||||
| y_ptr += inc_y; | |||||
| aj += lda; | |||||
| } | |||||
| } | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| FLOAT xtemp = *x_ptr * alpha; | |||||
| FLOAT *aj = a_ptr; | |||||
| y_ptr = y; | |||||
| if ( lda == 1 && inc_y == 1 ) | |||||
| { | |||||
| for ( j=0; j< ( n & -4) ; j+=4 ) | |||||
| { | |||||
| y_ptr[j] += aj[j] * xtemp; | |||||
| y_ptr[j+1] += aj[j+1] * xtemp; | |||||
| y_ptr[j+2] += aj[j+2] * xtemp; | |||||
| y_ptr[j+3] += aj[j+3] * xtemp; | |||||
| } | |||||
| for ( ; j<n ; j++ ) | |||||
| { | |||||
| y_ptr[j] += aj[j] * xtemp; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| if ( inc_y == 1 ) | |||||
| { | |||||
| BLASLONG register lda2 = lda << 1; | |||||
| BLASLONG register lda4 = lda << 2; | |||||
| BLASLONG register lda3 = lda2 + lda; | |||||
| for ( j=0; j< ( n & -4 ); j+=4 ) | |||||
| { | |||||
| y_ptr[j] += *aj * xtemp; | |||||
| y_ptr[j+1] += *(aj+lda) * xtemp; | |||||
| y_ptr[j+2] += *(aj+lda2) * xtemp; | |||||
| y_ptr[j+3] += *(aj+lda3) * xtemp; | |||||
| aj += lda4 ; | |||||
| } | |||||
| for ( ; j<n; j++ ) | |||||
| { | |||||
| y_ptr[j] += *aj * xtemp; | |||||
| aj += lda; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| for ( j=0; j<n; j++ ) | |||||
| { | |||||
| *y_ptr += *aj * xtemp; | |||||
| y_ptr += inc_y; | |||||
| aj += lda; | |||||
| } | |||||
| } | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| @@ -0,0 +1,270 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| /* Apply a plane rotation to n contiguous doubles of x and y with z13 vector code: | |||||
| * x[i] becomes c*x[i] + s*y[i] and y[i] becomes c*y[i] - s*x[i]. | |||||
| * c and s are passed by address so that vlrepg can broadcast them into v0 and v1. | |||||
| * Each loop pass handles 256 bytes (32 doubles) of each array and the pass count is n >> 5. | |||||
| * The loop body runs before the count is tested, so n must be a positive multiple of 32; | |||||
| * the caller in this file passes n & -32 and skips the call when that is 0. | |||||
| */ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) | |||||
| { | |||||
| __asm__ ( | |||||
| "pfd 2, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "vlrepg %%v0,0(%3) \n\t" /* v0 = {c,c} */ | |||||
| "vlrepg %%v1,0(%4) \n\t" /* v1 = {s,s} */ | |||||
| "srlg %%r0,%0,5 \n\t" /* r0 = n >> 5 = number of loop passes */ | |||||
| "xgr %%r1,%%r1 \n\t" /* r1 = running byte offset, starts at 0 */ | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" /* v24..v27 = 8 doubles of x */ | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" /* v16..v19 = 8 doubles of y */ | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" /* xn=x*c */ | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" /* xn=x*c */ | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" /* xn=x*c */ | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" /* xn=x*c */ | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" /* xn=y*s+xn */ | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" /* xn=y*s+xn */ | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" /* xn=y*s+xn */ | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" /* xn=y*s+xn */ | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 0(%%r1,%1) \n\t" /* write rotated x back */ | |||||
| "vst %%v29, 16(%%r1,%1) \n\t" | |||||
| "vst %%v30, 32(%%r1,%1) \n\t" | |||||
| "vst %%v31, 48(%%r1,%1) \n\t" | |||||
| "vst %%v20, 0(%%r1,%2) \n\t" /* write rotated y back */ | |||||
| "vst %%v21, 16(%%r1,%2) \n\t" | |||||
| "vst %%v22, 32(%%r1,%2) \n\t" | |||||
| "vst %%v23, 48(%%r1,%2) \n\t" | |||||
| "vl %%v24, 64(%%r1,%1) \n\t" /* same sequence for bytes 64..127 */ | |||||
| "vl %%v25, 80(%%r1,%1) \n\t" | |||||
| "vl %%v26, 96(%%r1,%1) \n\t" | |||||
| "vl %%v27, 112(%%r1,%1) \n\t" | |||||
| "vl %%v16, 64(%%r1,%2) \n\t" | |||||
| "vl %%v17, 80(%%r1,%2) \n\t" | |||||
| "vl %%v18, 96(%%r1,%2) \n\t" | |||||
| "vl %%v19, 112(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 64(%%r1,%1) \n\t" | |||||
| "vst %%v29, 80(%%r1,%1) \n\t" | |||||
| "vst %%v30, 96(%%r1,%1) \n\t" | |||||
| "vst %%v31, 112(%%r1,%1) \n\t" | |||||
| "vst %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%2) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" /* same sequence for bytes 128..191 */ | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v16, 128(%%r1,%2) \n\t" | |||||
| "vl %%v17, 144(%%r1,%2) \n\t" | |||||
| "vl %%v18, 160(%%r1,%2) \n\t" | |||||
| "vl %%v19, 176(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 128(%%r1,%1) \n\t" | |||||
| "vst %%v29, 144(%%r1,%1) \n\t" | |||||
| "vst %%v30, 160(%%r1,%1) \n\t" | |||||
| "vst %%v31, 176(%%r1,%1) \n\t" | |||||
| "vst %%v20, 128(%%r1,%2) \n\t" | |||||
| "vst %%v21, 144(%%r1,%2) \n\t" | |||||
| "vst %%v22, 160(%%r1,%2) \n\t" | |||||
| "vst %%v23, 176(%%r1,%2) \n\t" | |||||
| "vl %%v24, 192(%%r1,%1) \n\t" /* same sequence for bytes 192..255 */ | |||||
| "vl %%v25, 208(%%r1,%1) \n\t" | |||||
| "vl %%v26, 224(%%r1,%1) \n\t" | |||||
| "vl %%v27, 240(%%r1,%1) \n\t" | |||||
| "vl %%v16, 192(%%r1,%2) \n\t" | |||||
| "vl %%v17, 208(%%r1,%2) \n\t" | |||||
| "vl %%v18, 224(%%r1,%2) \n\t" | |||||
| "vl %%v19, 240(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 192(%%r1,%1) \n\t" | |||||
| "vst %%v29, 208(%%r1,%1) \n\t" | |||||
| "vst %%v30, 224(%%r1,%1) \n\t" | |||||
| "vst %%v31, 240(%%r1,%1) \n\t" | |||||
| "vst %%v20, 192(%%r1,%2) \n\t" | |||||
| "vst %%v21, 208(%%r1,%2) \n\t" | |||||
| "vst %%v22, 224(%%r1,%2) \n\t" | |||||
| "vst %%v23, 240(%%r1,%2) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" /* advance the byte offset by one 256-byte pass */ | |||||
| "brctg %%r0,1b" /* decrement the pass counter and loop while it is non-zero */ | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y),"a"(c),"a"(s) | |||||
| : "cc", "memory","r0","r1" ,"v0","v1","v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| /* Plane rotation driver: for each of the n element pairs, | |||||
|  * x becomes c*x + s*y and y becomes c*y - s*x. | |||||
|  * With unit strides, the leading multiple-of-32 part goes through drot_kernel_32 | |||||
|  * and the remaining elements are rotated by the scalar loop. | |||||
|  * Any other stride combination uses the scalar strided loop. | |||||
|  * Always returns 0; n <= 0 is a no-op. | |||||
|  */ | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||||
| { | |||||
|     BLASLONG k = 0; | |||||
|     BLASLONG xpos = 0, ypos = 0; | |||||
|     BLASLONG bulk; | |||||
|     FLOAT rotated_x; | |||||
|     if ( n <= 0 ) return(0); | |||||
|     if ( (inc_x != 1) || (inc_y != 1) ) | |||||
|     { | |||||
|         /* strided vectors: plain scalar rotation */ | |||||
|         for ( ; k < n; k++ ) | |||||
|         { | |||||
|             rotated_x = c*x[xpos] + s*y[ypos] ; | |||||
|             y[ypos] = c*y[ypos] - s*x[xpos] ; | |||||
|             x[xpos] = rotated_x ; | |||||
|             xpos += inc_x ; | |||||
|             ypos += inc_y ; | |||||
|         } | |||||
|         return(0); | |||||
|     } | |||||
|     /* contiguous vectors: vector kernel for the multiple-of-32 prefix */ | |||||
|     bulk = n & -32; | |||||
|     if ( bulk > 0 ) | |||||
|     { | |||||
|         /* the kernel reads c and s through pointers, so give it addressable copies */ | |||||
|         FLOAT cos_copy = c; | |||||
|         FLOAT sin_copy = s; | |||||
|         drot_kernel_32(bulk, x, y, &cos_copy, &sin_copy); | |||||
|         k = bulk; | |||||
|     } | |||||
|     /* scalar tail */ | |||||
|     for ( ; k < n; k++ ) | |||||
|     { | |||||
|         rotated_x = c*x[k] + s*y[k] ; | |||||
|         y[k] = c*y[k] - s*x[k] ; | |||||
|         x[k] = rotated_x ; | |||||
|     } | |||||
|     return(0); | |||||
| } | |||||
| @@ -0,0 +1,210 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if defined(Z13) | |||||
| /* Scale n contiguous doubles of x in place by da using z13 vector code. | |||||
|  * Despite the _8 suffix, each loop pass handles 128 bytes (16 doubles). | |||||
|  * The loop runs until the byte offset reaches n*8 and executes the body before testing, | |||||
|  * so n must be a positive multiple of 16 (the caller passes n & -16 and skips the call when 0). | |||||
|  * da is broadcast from memory with vlrepg through an explicit address operand, the same way | |||||
|  * drot_kernel_32 loads c and s. The previous code read %v0 directly and never referenced | |||||
|  * its da operand, relying on the calling convention to leave da in that register. | |||||
|  */ | |||||
| static void __attribute__ ((noinline)) dscal_kernel_8( BLASLONG n, FLOAT da , FLOAT *x ) | |||||
| { | |||||
| __asm__ ("pfd 2, 0(%1) \n\t" | |||||
| "vlrepg %%v0,0(%2) \n\t" /* v0 = {da,da}, loaded through the address operand */ | |||||
| "sllg %%r0,%0,3 \n\t" /* r0 = n*8 = total byte count */ | |||||
| "xgr %%r1,%%r1 \n\t" /* r1 = running byte offset, starts at 0 */ | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%1) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" | |||||
| "vfmdb %%v24,%%v24,%%v0 \n\t" | |||||
| "vst %%v24, 0(%%r1,%1) \n\t" | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vfmdb %%v25,%%v25,%%v0 \n\t" | |||||
| "vst %%v25, 16(%%r1,%1) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vfmdb %%v26,%%v26,%%v0 \n\t" | |||||
| "vst %%v26, 32(%%r1,%1) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vfmdb %%v27,%%v27,%%v0 \n\t" | |||||
| "vst %%v27, 48(%%r1,%1) \n\t" | |||||
| "vl %%v24, 64(%%r1,%1) \n\t" | |||||
| "vfmdb %%v24,%%v24,%%v0 \n\t" | |||||
| "vst %%v24, 64(%%r1,%1) \n\t" | |||||
| "vl %%v25, 80(%%r1,%1) \n\t" | |||||
| "vfmdb %%v25,%%v25,%%v0 \n\t" | |||||
| "vst %%v25, 80(%%r1,%1) \n\t" | |||||
| "vl %%v26, 96(%%r1,%1) \n\t" | |||||
| "vfmdb %%v26,%%v26,%%v0 \n\t" | |||||
| "vst %%v26, 96(%%r1,%1) \n\t" | |||||
| "vl %%v27, 112(%%r1,%1) \n\t" | |||||
| "vfmdb %%v27,%%v27,%%v0 \n\t" | |||||
| "vst %%v27, 112(%%r1,%1) \n\t" | |||||
| "la %%r1,128(%%r1) \n\t" /* advance by one 128-byte pass */ | |||||
| "clgrjl %%r1,%%r0,1b \n\t" /* loop while offset < total bytes */ | |||||
| : | |||||
| :"r"(n),"a"(x),"a"(&da) | |||||
| :"cc" , "memory" ,"r0","r1","v0","v24","v25","v26","v27" | |||||
| ); | |||||
| } | |||||
| /* Store zeros over n contiguous doubles of x using z13 vector code. | |||||
| * Each loop pass writes 128 bytes (16 doubles) and the loop runs until the byte offset | |||||
| * reaches n*8, testing after the body, so n must be a positive multiple of 16. | |||||
| * da is listed as an input operand but the assembly text never references it. | |||||
| */ static void __attribute__ ((noinline)) dscal_kernel_8_zero( BLASLONG n, FLOAT da , FLOAT *x ) | |||||
| { | |||||
| __asm__ ("pfd 2, 0(%1) \n\t" | |||||
| "vzero %%v0 \n\t" /* v0 = all zero bits */ | |||||
| "sllg %%r0,%0,3 \n\t" /* r0 = n*8 = total byte count */ | |||||
| "xgr %%r1,%%r1 \n\t" /* r1 = running byte offset, starts at 0 */ | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%1) \n\t" | |||||
| "vst %%v0, 0(%%r1,%1) \n\t" | |||||
| "vst %%v0, 16(%%r1,%1) \n\t" | |||||
| "vst %%v0, 32(%%r1,%1) \n\t" | |||||
| "vst %%v0, 48(%%r1,%1) \n\t" | |||||
| "vst %%v0, 64(%%r1,%1) \n\t" | |||||
| "vst %%v0, 80(%%r1,%1) \n\t" | |||||
| "vst %%v0, 96(%%r1,%1) \n\t" | |||||
| "vst %%v0, 112(%%r1,%1) \n\t" | |||||
| "la %%r1,128(%%r1) \n\t" /* advance by one 128-byte pass */ | |||||
| "clgrjl %%r1,%%r0,1b \n\t" /* loop while offset < total bytes */ | |||||
| : | |||||
| :"r"(n),"a"(x),"f"(da) | |||||
| :"cc" , "memory" ,"r0","r1","v0" | |||||
| ); | |||||
| } | |||||
| #endif | |||||
| /* Scaling driver: multiplies n elements of x (stride inc_x) by da in place. | |||||
|  * da == 0.0 is special-cased: the elements are overwritten with 0.0 instead of multiplied. | |||||
|  * With inc_x == 1 the leading multiple-of-16 part goes through the vector kernels and the | |||||
|  * rest through a scalar loop; other positive strides use a 4-way unrolled scalar loop. | |||||
|  * Returns 0; n <= 0 or inc_x <= 0 is a no-op. y, inc_y and the dummy arguments are not used. | |||||
|  */ | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
|     BLASLONG pos = 0, done = 0; | |||||
|     BLASLONG bulk; | |||||
|     if ( n <= 0 || inc_x <=0 ) | |||||
|         return(0); | |||||
|     if ( inc_x == 1 ) | |||||
|     { | |||||
|         /* contiguous vector: kernel on the multiple-of-16 prefix, scalar tail */ | |||||
|         bulk = n & -16; | |||||
|         if ( da == 0.0 ) | |||||
|         { | |||||
|             if ( bulk > 0 ) | |||||
|             { | |||||
|                 dscal_kernel_8_zero(bulk , da , x); | |||||
|                 done = bulk; | |||||
|             } | |||||
|             for ( ; done < n; done++ ) | |||||
|                 x[done]=0.0; | |||||
|         } | |||||
|         else | |||||
|         { | |||||
|             if ( bulk > 0 ) | |||||
|             { | |||||
|                 dscal_kernel_8(bulk , da , x); | |||||
|                 done = bulk; | |||||
|             } | |||||
|             for ( ; done < n; done++ ) | |||||
|                 x[done] = da * x[done] ; | |||||
|         } | |||||
|         return 0; | |||||
|     } | |||||
|     /* strided vector: four elements per pass, then the leftovers one by one */ | |||||
|     bulk = n & -4; | |||||
|     if ( da == 0.0 ) | |||||
|     { | |||||
|         for ( ; done < bulk; done += 4, pos += inc_x * 4 ) | |||||
|         { | |||||
|             x[pos]=0.0; | |||||
|             x[pos + inc_x]=0.0; | |||||
|             x[pos + 2 * inc_x]=0.0; | |||||
|             x[pos + 3 * inc_x]=0.0; | |||||
|         } | |||||
|         for ( ; done < n; done++, pos += inc_x ) | |||||
|             x[pos]=0.0; | |||||
|     } | |||||
|     else | |||||
|     { | |||||
|         for ( ; done < bulk; done += 4, pos += inc_x * 4 ) | |||||
|         { | |||||
|             x[pos] = da * x[pos] ; | |||||
|             x[pos + inc_x] = da * x[pos + inc_x]; | |||||
|             x[pos + 2 * inc_x] = da * x[pos + 2 * inc_x]; | |||||
|             x[pos + 3 * inc_x] = da * x[pos + 3 * inc_x]; | |||||
|         } | |||||
|         for ( ; done < n; done++, pos += inc_x ) | |||||
|             x[pos] = da * x[pos] ; | |||||
|     } | |||||
|     return 0; | |||||
| } | |||||
| @@ -0,0 +1,382 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #define Z13_SWAP_C 1 | |||||
| #if defined(Z13_SWAP_A) | |||||
| /* Swap n contiguous doubles between x and y (variant compiled when Z13_SWAP_A is defined). | |||||
| * Works one 16-byte vector at a time: load from x, load from y, store each to the other array. | |||||
| * One loop pass covers 256 bytes (32 doubles) per array and the pass count is n >> 5; | |||||
| * the body runs before the count is tested, so n must be a positive multiple of 32. | |||||
| */ static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "pfd 1, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" /* r0 = n >> 5 = number of loop passes */ | |||||
| "xgr %%r1,%%r1 \n\t" /* r1 = running byte offset, starts at 0 */ | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" /* x vector */ | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" /* y vector */ | |||||
| "vst %%v24, 0(%%r1,%2) \n\t" /* old x goes to y */ | |||||
| "vst %%v16, 0(%%r1,%1) \n\t" /* old y goes to x */ | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vst %%v25, 16(%%r1,%2) \n\t" | |||||
| "vst %%v17, 16(%%r1,%1) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vst %%v26, 32(%%r1,%2) \n\t" | |||||
| "vst %%v18, 32(%%r1,%1) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vst %%v27, 48(%%r1,%2) \n\t" | |||||
| "vst %%v19, 48(%%r1,%1) \n\t" | |||||
| "vl %%v28, 64(%%r1,%1) \n\t" | |||||
| "vl %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v28, 64(%%r1,%2) \n\t" | |||||
| "vst %%v20, 64(%%r1,%1) \n\t" | |||||
| "vl %%v29, 80(%%r1,%1) \n\t" | |||||
| "vl %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v29, 80(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%1) \n\t" | |||||
| "vl %%v30, 96(%%r1,%1) \n\t" | |||||
| "vl %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v30, 96(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%1) \n\t" | |||||
| "vl %%v31, 112(%%r1,%1) \n\t" | |||||
| "vl %%v23, 112(%%r1,%2) \n\t" | |||||
| "vst %%v31, 112(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%1) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" /* second 128 bytes of the pass */ | |||||
| "vl %%v16, 128(%%r1,%2) \n\t" | |||||
| "vst %%v24, 128(%%r1,%2) \n\t" | |||||
| "vst %%v16, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v17, 144(%%r1,%2) \n\t" | |||||
| "vst %%v25, 144(%%r1,%2) \n\t" | |||||
| "vst %%v17, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v18, 160(%%r1,%2) \n\t" | |||||
| "vst %%v26, 160(%%r1,%2) \n\t" | |||||
| "vst %%v18, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v19, 176(%%r1,%2) \n\t" | |||||
| "vst %%v27, 176(%%r1,%2) \n\t" | |||||
| "vst %%v19, 176(%%r1,%1) \n\t" | |||||
| "vl %%v28, 192(%%r1,%1) \n\t" | |||||
| "vl %%v20, 192(%%r1,%2) \n\t" | |||||
| "vst %%v28, 192(%%r1,%2) \n\t" | |||||
| "vst %%v20, 192(%%r1,%1) \n\t" | |||||
| "vl %%v29, 208(%%r1,%1) \n\t" | |||||
| "vl %%v21, 208(%%r1,%2) \n\t" | |||||
| "vst %%v29, 208(%%r1,%2) \n\t" | |||||
| "vst %%v21, 208(%%r1,%1) \n\t" | |||||
| "vl %%v30, 224(%%r1,%1) \n\t" | |||||
| "vl %%v22, 224(%%r1,%2) \n\t" | |||||
| "vst %%v30, 224(%%r1,%2) \n\t" | |||||
| "vst %%v22, 224(%%r1,%1) \n\t" | |||||
| "vl %%v31, 240(%%r1,%1) \n\t" | |||||
| "vl %%v23, 240(%%r1,%2) \n\t" | |||||
| "vst %%v31, 240(%%r1,%2) \n\t" | |||||
| "vst %%v23, 240(%%r1,%1) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" /* advance the byte offset by one 256-byte pass */ | |||||
| "brctg %%r0,1b" /* decrement the pass counter and loop while it is non-zero */ | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y) | |||||
| : "cc", "memory" ,"r0","r1", "v16","v17","v18","v19","v20","v21","v22","v23" | |||||
| ,"v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| #elif defined(Z13_SWAP_B) | |||||
| /* Swap n contiguous doubles between x and y (variant compiled when Z13_SWAP_B is defined). | |||||
| * Works in 128-byte halves: load eight vectors from x and eight from y, then store each | |||||
| * group to the other array. One loop pass covers two halves, i.e. 256 bytes (32 doubles) | |||||
| * per array, and the pass count is n >> 5; the body runs before the count is tested, | |||||
| * so n must be a positive multiple of 32. | |||||
| */ static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "pfd 2, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" /* r0 = n >> 5 = number of loop passes */ | |||||
| "xgr %%r1,%%r1 \n\t" /* r1 = running byte offset, starts at 0 */ | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" /* v24..v31 = first 128 bytes of x */ | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vl %%v28, 64(%%r1,%1) \n\t" | |||||
| "vl %%v29, 80(%%r1,%1) \n\t" | |||||
| "vl %%v30, 96(%%r1,%1) \n\t" | |||||
| "vl %%v31, 112(%%r1,%1) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" /* v16..v23 = first 128 bytes of y */ | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vl %%v20, 64(%%r1,%2) \n\t" | |||||
| "vl %%v21, 80(%%r1,%2) \n\t" | |||||
| "vl %%v22, 96(%%r1,%2) \n\t" | |||||
| "vl %%v23, 112(%%r1,%2) \n\t" | |||||
| "vst %%v24, 0(%%r1,%2) \n\t" /* old x goes to y */ | |||||
| "vst %%v25, 16(%%r1,%2) \n\t" | |||||
| "vst %%v26, 32(%%r1,%2) \n\t" | |||||
| "vst %%v27, 48(%%r1,%2) \n\t" | |||||
| "vst %%v28, 64(%%r1,%2) \n\t" | |||||
| "vst %%v29, 80(%%r1,%2) \n\t" | |||||
| "vst %%v30, 96(%%r1,%2) \n\t" | |||||
| "vst %%v31, 112(%%r1,%2)\n\t" | |||||
| "vst %%v16, 0(%%r1,%1) \n\t" /* old y goes to x */ | |||||
| "vst %%v17, 16(%%r1,%1) \n\t" | |||||
| "vst %%v18, 32(%%r1,%1) \n\t" | |||||
| "vst %%v19, 48(%%r1,%1) \n\t" | |||||
| "vst %%v20, 64(%%r1,%1) \n\t" | |||||
| "vst %%v21, 80(%%r1,%1) \n\t" | |||||
| "vst %%v22, 96(%%r1,%1) \n\t" | |||||
| "vst %%v23, 112(%%r1,%1)\n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" /* second 128-byte half, same sequence */ | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v28, 192(%%r1,%1) \n\t" | |||||
| "vl %%v29, 208(%%r1,%1) \n\t" | |||||
| "vl %%v30, 224(%%r1,%1) \n\t" | |||||
| "vl %%v31, 240(%%r1,%1) \n\t" | |||||
| "vl %%v16, 128(%%r1,%2) \n\t" | |||||
| "vl %%v17, 144(%%r1,%2) \n\t" | |||||
| "vl %%v18, 160(%%r1,%2) \n\t" | |||||
| "vl %%v19, 176(%%r1,%2) \n\t" | |||||
| "vl %%v20, 192(%%r1,%2) \n\t" | |||||
| "vl %%v21, 208(%%r1,%2) \n\t" | |||||
| "vl %%v22, 224(%%r1,%2) \n\t" | |||||
| "vl %%v23, 240(%%r1,%2) \n\t" | |||||
| "vst %%v24, 128(%%r1,%2) \n\t" | |||||
| "vst %%v25, 144(%%r1,%2) \n\t" | |||||
| "vst %%v26, 160(%%r1,%2) \n\t" | |||||
| "vst %%v27, 176(%%r1,%2) \n\t" | |||||
| "vst %%v28, 192(%%r1,%2) \n\t" | |||||
| "vst %%v29, 208(%%r1,%2) \n\t" | |||||
| "vst %%v30, 224(%%r1,%2) \n\t" | |||||
| "vst %%v31, 240(%%r1,%2) \n\t" | |||||
| "vst %%v16, 128(%%r1,%1) \n\t" | |||||
| "vst %%v17, 144(%%r1,%1) \n\t" | |||||
| "vst %%v18, 160(%%r1,%1) \n\t" | |||||
| "vst %%v19, 176(%%r1,%1) \n\t" | |||||
| "vst %%v20, 192(%%r1,%1) \n\t" | |||||
| "vst %%v21, 208(%%r1,%1) \n\t" | |||||
| "vst %%v22, 224(%%r1,%1) \n\t" | |||||
| "vst %%v23, 240(%%r1,%1) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" /* advance the byte offset by one 256-byte pass */ | |||||
| "brctg %%r0,1b" /* decrement the pass counter and loop while it is non-zero */ | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y) | |||||
| : "cc", "memory","r0","r1", "v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| #elif defined(Z13_SWAP_C) | |||||
| /* Swap n contiguous doubles between x and y (variant compiled when Z13_SWAP_C is defined). | |||||
| * Per pass: hold all 256 bytes of x in v16..v31, copy y into x through v0..v7 in two | |||||
| * 128-byte halves, then store the held x vectors into y. | |||||
| * One loop pass covers 256 bytes (32 doubles) per array and the pass count is n >> 5; | |||||
| * the body runs before the count is tested, so n must be a positive multiple of 32. | |||||
| */ static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "pfd 2, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" /* r0 = n >> 5 = number of loop passes */ | |||||
| "xgr %%r1,%%r1 \n\t" /* r1 = running byte offset, starts at 0 */ | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%1) \n\t" /* v16..v31 = 256 bytes of x */ | |||||
| "vl %%v17, 16(%%r1,%1) \n\t" | |||||
| "vl %%v18, 32(%%r1,%1) \n\t" | |||||
| "vl %%v19, 48(%%r1,%1) \n\t" | |||||
| "vl %%v20, 64(%%r1,%1) \n\t" | |||||
| "vl %%v21, 80(%%r1,%1) \n\t" | |||||
| "vl %%v22, 96(%%r1,%1) \n\t" | |||||
| "vl %%v23, 112(%%r1,%1) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v28, 192(%%r1,%1) \n\t" | |||||
| "vl %%v29, 208(%%r1,%1) \n\t" | |||||
| "vl %%v30, 224(%%r1,%1) \n\t" | |||||
| "vl %%v31, 240(%%r1,%1) \n\t" | |||||
| "vl %%v0, 0(%%r1,%2) \n\t" /* first 128 bytes of y into v0..v7 */ | |||||
| "vl %%v1, 16(%%r1,%2) \n\t" | |||||
| "vl %%v2, 32(%%r1,%2) \n\t" | |||||
| "vl %%v3, 48(%%r1,%2) \n\t" | |||||
| "vl %%v4, 64(%%r1,%2) \n\t" | |||||
| "vl %%v5, 80(%%r1,%2) \n\t" | |||||
| "vl %%v6, 96(%%r1,%2) \n\t" | |||||
| "vl %%v7, 112(%%r1,%2) \n\t" | |||||
| "vst %%v0, 0(%%r1,%1) \n\t" /* and on into x */ | |||||
| "vst %%v1, 16(%%r1,%1) \n\t" | |||||
| "vst %%v2, 32(%%r1,%1) \n\t" | |||||
| "vst %%v3, 48(%%r1,%1) \n\t" | |||||
| "vst %%v4, 64(%%r1,%1) \n\t" | |||||
| "vst %%v5, 80(%%r1,%1) \n\t" | |||||
| "vst %%v6, 96(%%r1,%1) \n\t" | |||||
| "vst %%v7, 112(%%r1,%1) \n\t" | |||||
| "vl %%v0, 128(%%r1,%2) \n\t" /* second 128 bytes of y, same route */ | |||||
| "vl %%v1, 144(%%r1,%2) \n\t" | |||||
| "vl %%v2, 160(%%r1,%2) \n\t" | |||||
| "vl %%v3, 176(%%r1,%2) \n\t" | |||||
| "vl %%v4, 192(%%r1,%2) \n\t" | |||||
| "vl %%v5, 208(%%r1,%2) \n\t" | |||||
| "vl %%v6, 224(%%r1,%2) \n\t" | |||||
| "vl %%v7, 240(%%r1,%2) \n\t" | |||||
| "vst %%v0, 128(%%r1,%1) \n\t" | |||||
| "vst %%v1, 144(%%r1,%1) \n\t" | |||||
| "vst %%v2, 160(%%r1,%1) \n\t" | |||||
| "vst %%v3, 176(%%r1,%1) \n\t" | |||||
| "vst %%v4, 192(%%r1,%1) \n\t" | |||||
| "vst %%v5, 208(%%r1,%1) \n\t" | |||||
| "vst %%v6, 224(%%r1,%1) \n\t" | |||||
| "vst %%v7, 240(%%r1,%1) \n\t" | |||||
| "vst %%v16, 0(%%r1,%2) \n\t" /* held x vectors go to y */ | |||||
| "vst %%v17, 16(%%r1,%2) \n\t" | |||||
| "vst %%v18, 32(%%r1,%2) \n\t" | |||||
| "vst %%v19, 48(%%r1,%2) \n\t" | |||||
| "vst %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%2) \n\t" | |||||
| "vst %%v24, 128(%%r1,%2) \n\t" | |||||
| "vst %%v25, 144(%%r1,%2) \n\t" | |||||
| "vst %%v26, 160(%%r1,%2) \n\t" | |||||
| "vst %%v27, 176(%%r1,%2) \n\t" | |||||
| "vst %%v28, 192(%%r1,%2) \n\t" | |||||
| "vst %%v29, 208(%%r1,%2) \n\t" | |||||
| "vst %%v30, 224(%%r1,%2) \n\t" | |||||
| "vst %%v31, 240(%%r1,%2) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" /* advance the byte offset by one 256-byte pass */ | |||||
| "brctg %%r0,1b" /* decrement the pass counter and loop while it is non-zero */ | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y) | |||||
| : "cc", "memory","r0","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| #endif | |||||
| /* Swap driver: exchanges n elements between x (stride inc_x) and y (stride inc_y). | |||||
|  * With unit strides, the leading multiple-of-32 part goes through dswap_kernel_32 and the | |||||
|  * remaining elements are swapped by the scalar loop; any other stride combination uses | |||||
|  * the scalar strided loop. Always returns 0; n <= 0 is a no-op. | |||||
|  * The dummy arguments are not used. | |||||
|  */ | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
|     BLASLONG k = 0; | |||||
|     BLASLONG xpos = 0, ypos = 0; | |||||
|     BLASLONG bulk; | |||||
|     FLOAT held; | |||||
|     if ( n <= 0 ) return(0); | |||||
|     if ( (inc_x != 1) || (inc_y != 1) ) | |||||
|     { | |||||
|         /* strided vectors: element-by-element exchange */ | |||||
|         for ( ; k < n; k++ ) | |||||
|         { | |||||
|             held = y[ypos]; | |||||
|             y[ypos] = x[xpos] ; | |||||
|             x[xpos] = held; | |||||
|             xpos += inc_x ; | |||||
|             ypos += inc_y ; | |||||
|         } | |||||
|         return(0); | |||||
|     } | |||||
|     /* contiguous vectors: vector kernel for the multiple-of-32 prefix */ | |||||
|     bulk = n & -32; | |||||
|     if ( bulk > 0 ) | |||||
|     { | |||||
|         dswap_kernel_32(bulk, x, y); | |||||
|         k = bulk; | |||||
|     } | |||||
|     /* scalar tail */ | |||||
|     for ( ; k < n; k++ ) | |||||
|     { | |||||
|         held = y[k]; | |||||
|         y[k] = x[k] ; | |||||
|         x[k] = held; | |||||
|     } | |||||
|     return(0); | |||||
| } | |||||
| @@ -0,0 +1,249 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| /* Find the element of largest absolute value among n contiguous doubles of x (z13 vector code). | |||||
|  * Returns the zero-based index of that element and stores its absolute value in *maxf. | |||||
|  * Each loop pass handles 256 bytes (32 doubles) in two 128-byte halves and the loop runs | |||||
|  * until the advancing x pointer reaches x + n*8, testing after the body, so n must be a | |||||
|  * positive multiple of 32. | |||||
|  * Register roles: v20..v23 hold the lane indices 0..7, v4 = {8,8} is the index step, | |||||
|  * v5 is the running index base, v18/v19 hold the running per-lane maxima and their indices. | |||||
|  * The index is returned through a real output operand and x is a read-write operand; | |||||
|  * the previous code left the result in a hard-coded %r2, returned with br %r14 from inside | |||||
|  * the asm, had no C return statement and modified an input-only operand. | |||||
|  */ | |||||
| static BLASLONG __attribute__((noinline)) diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||||
| BLASLONG index; | |||||
| __asm__ volatile( | |||||
| "pfd 1, 0(%1) \n\t" | |||||
| "sllg %%r0,%2,3 \n\t" /* r0 = n*8 bytes */ | |||||
| "agr %%r0,%1 \n\t" /* r0 = end address x + n*8 */ | |||||
| "VLEIG %%v20,0,0 \n\t" | |||||
| "VLEIG %%v20,1,1 \n\t" | |||||
| "VLEIG %%v21,2,0 \n\t" | |||||
| "VLEIG %%v21,3,1 \n\t" | |||||
| "VLEIG %%v22,4,0 \n\t" | |||||
| "VLEIG %%v22,5,1 \n\t" | |||||
| "VLEIG %%v23,6,0 \n\t" | |||||
| "VLEIG %%v23,7,1 \n\t" | |||||
| "VREPIG %%v4,8 \n\t" | |||||
| "vzero %%v5 \n\t" | |||||
| "vzero %%v18 \n\t" | |||||
| "vzero %%v19 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%1 ) \n\t" | |||||
| "vlm %%v24,%%v31, 0(%1 ) \n\t" /* load 16 doubles */ | |||||
| "vflpdb %%v24, %%v24 \n\t" /* absolute values */ | |||||
| "vflpdb %%v25, %%v25 \n\t" | |||||
| "vflpdb %%v26, %%v26 \n\t" | |||||
| "vflpdb %%v27, %%v27 \n\t" | |||||
| "vflpdb %%v28, %%v28 \n\t" | |||||
| "vflpdb %%v29, %%v29 \n\t" | |||||
| "vflpdb %%v30, %%v30 \n\t" | |||||
| "vflpdb %%v31, %%v31 \n\t" | |||||
| "vfchdb %%v16,%%v25,%%v24 \n\t " | |||||
| "vfchdb %%v17,%%v27,%%v26 \n\t " | |||||
| "vsel %%v1,%%v21,%%v20,%%v16 \n\t" | |||||
| "vsel %%v0,%%v25,%%v24,%%v16 \n\t" | |||||
| "vsel %%v2,%%v23,%%v22,%%v17 \n\t" | |||||
| "vsel %%v3,%%v27,%%v26,%%v17 \n\t" | |||||
| "vfchdb %%v16,%%v29,%%v28 \n\t " | |||||
| "vfchdb %%v17,%%v31,%%v30 \n\t" | |||||
| "vsel %%v24,%%v21,%%v20,%%v16 \n\t" | |||||
| "vsel %%v25,%%v29,%%v28,%%v16 \n\t" | |||||
| "vsel %%v26,%%v23,%%v22,%%v17 \n\t" | |||||
| "vsel %%v27,%%v31,%%v30,%%v17 \n\t" | |||||
| "vfchdb %%v28, %%v3,%%v0 \n\t" | |||||
| "vfchdb %%v29,%%v27, %%v25 \n\t" | |||||
| "vsel %%v1,%%v2,%%v1,%%v28 \n\t" | |||||
| "vsel %%v0,%%v3,%%v0,%%v28 \n\t" | |||||
| "vsel %%v24,%%v26,%%v24,%%v29 \n\t" | |||||
| "vsel %%v25,%%v27,%%v25,%%v29 \n\t" | |||||
| "VAG %%v1,%%v1,%%v5 \n\t" | |||||
| "VAG %%v24,%%v24,%%v5 \n\t" | |||||
| "VAG %%v24,%%v24,%%v4 \n\t" | |||||
| "vfchdb %%v16,%%v25 , %%v0 \n\t" | |||||
| "VAG %%v5,%%v5,%%v4 \n\t" | |||||
| "vsel %%v29,%%v25,%%v0,%%v16 \n\t" | |||||
| "vsel %%v28,%%v24,%%v1,%%v16 \n\t" | |||||
| "vfchdb %%v17, %%v29,%%v18 \n\t" | |||||
| "vsel %%v19,%%v28,%%v19,%%v17 \n\t" | |||||
| "vsel %%v18,%%v29,%%v18,%%v17 \n\t" | |||||
| "VAG %%v5,%%v5,%%v4 \n\t" | |||||
| "vlm %%v24,%%v31,128(%1 ) \n\t" /* second 16 doubles of the pass */ | |||||
| "vflpdb %%v24, %%v24 \n\t" | |||||
| "vflpdb %%v25, %%v25 \n\t" | |||||
| "vflpdb %%v26, %%v26 \n\t" | |||||
| "vflpdb %%v27, %%v27 \n\t" | |||||
| "vflpdb %%v28, %%v28 \n\t" | |||||
| "vflpdb %%v29, %%v29 \n\t" | |||||
| "vflpdb %%v30, %%v30 \n\t" | |||||
| "vflpdb %%v31, %%v31 \n\t" | |||||
| "vfchdb %%v16,%%v25,%%v24 \n\t " | |||||
| "vfchdb %%v17,%%v27,%%v26 \n\t " | |||||
| "vsel %%v1,%%v21,%%v20,%%v16 \n\t" | |||||
| "vsel %%v0,%%v25,%%v24,%%v16 \n\t" | |||||
| "vsel %%v2,%%v23,%%v22,%%v17 \n\t" | |||||
| "vsel %%v3,%%v27,%%v26,%%v17 \n\t" | |||||
| "vfchdb %%v16,%%v29,%%v28 \n\t " | |||||
| "vfchdb %%v17,%%v31,%%v30 \n\t" | |||||
| "vsel %%v24,%%v21,%%v20,%%v16 \n\t" | |||||
| "vsel %%v25,%%v29,%%v28,%%v16 \n\t" | |||||
| "vsel %%v26,%%v23,%%v22,%%v17 \n\t" | |||||
| "vsel %%v27,%%v31,%%v30,%%v17 \n\t" | |||||
| "vfchdb %%v28, %%v3,%%v0 \n\t" | |||||
| "vfchdb %%v29,%%v27, %%v25 \n\t" | |||||
| "vsel %%v1,%%v2,%%v1,%%v28 \n\t" | |||||
| "vsel %%v0,%%v3,%%v0,%%v28 \n\t" | |||||
| "vsel %%v24,%%v26,%%v24,%%v29 \n\t" | |||||
| "vsel %%v25,%%v27,%%v25,%%v29 \n\t" | |||||
| "VAG %%v1,%%v1,%%v5 \n\t" | |||||
| "VAG %%v24,%%v24,%%v5 \n\t" | |||||
| "la %1,256(%1) \n\t" /* advance x by one 256-byte pass (x is a read-write operand) */ | |||||
| "VAG %%v24,%%v24,%%v4 \n\t" | |||||
| "vfchdb %%v16,%%v25 , %%v0 \n\t" | |||||
| "VAG %%v5,%%v5,%%v4 \n\t" | |||||
| "vsel %%v29,%%v25,%%v0,%%v16 \n\t" | |||||
| "vsel %%v28,%%v24,%%v1,%%v16 \n\t" | |||||
| "vfchdb %%v17, %%v29,%%v18 \n\t" | |||||
| "vsel %%v19,%%v28,%%v19,%%v17 \n\t" | |||||
| "vsel %%v18,%%v29,%%v18,%%v17 \n\t" | |||||
| "VAG %%v5,%%v5,%%v4 \n\t" | |||||
| "clgrjl %1,%%r0,1b \n\t" /* loop while x < end address */ | |||||
| "vrepg %%v26,%%v18,1 \n\t" /* bring lane 1 maximum and index next to lane 0 */ | |||||
| "vrepg %%v5,%%v19,1 \n\t" | |||||
| "wfcdb %%v26,%%v18 \n\t" | |||||
| "jne 2f \n\t" | |||||
| "VSTEG %%v18,0(%3),0 \n\t" /* lanes tie: store the value, take the smaller index */ | |||||
| "VMNLG %%v1,%%v5,%%v19 \n\t" | |||||
| "VLGVG %0,%%v1,0 \n\t" | |||||
| "j 3f \n\t" /* skip the other case instead of returning from inside the asm */ | |||||
| "2: \n\t" | |||||
| "wfchdb %%v16,%%v26,%%v18 \n\t" /* lanes differ: pick the larger value and its index */ | |||||
| "vsel %%v1,%%v5,%%v19,%%v16 \n\t" | |||||
| "vsel %%v0,%%v26,%%v18,%%v16 \n\t" | |||||
| "VLGVG %0,%%v1,0 \n\t" | |||||
| "std %%f0,0(%3) \n\t" | |||||
| "3: \n\t" | |||||
| : "=&r"(index), "+&a"(x) | |||||
| : "r"(n), "a"(maxf) | |||||
| : "cc", "memory","r0","r1","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return index; | |||||
| } | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG max = 0; | |||||
| if (n <= 0 || inc_x <= 0) return (max); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| max = diamax_kernel_32_TUNED(n1, x, &maxf); | |||||
| i = n1; | |||||
| } | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = i; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (max + 1); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = j; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) > maxf) { | |||||
| max = j + 1; | |||||
| maxf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) > maxf) { | |||||
| max = j + 2; | |||||
| maxf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) > maxf) { | |||||
| max = j + 3; | |||||
| maxf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = j; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (max + 1); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,249 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
/* ABS(v): absolute value in the precision chosen at compile time
 * (fabs when DOUBLE is defined, fabsf otherwise). */
#ifndef DOUBLE
#define ABS fabsf
#else
#define ABS fabs
#endif
| static BLASLONG __attribute__((noinline)) diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||||
| __asm__( | |||||
| "pfd 1, 0(%1) \n\t" | |||||
| "sllg %%r0,%0,3 \n\t" | |||||
| "agr %%r0,%1 \n\t" | |||||
| "VLEIG %%v20,0,0 \n\t" | |||||
| "VLEIG %%v20,1,1 \n\t" | |||||
| "VLEIG %%v21,2,0 \n\t" | |||||
| "VLEIG %%v21,3,1 \n\t" | |||||
| "VLEIG %%v22,4,0 \n\t" | |||||
| "VLEIG %%v22,5,1 \n\t" | |||||
| "VLEIG %%v23,6,0 \n\t" | |||||
| "VLEIG %%v23,7,1 \n\t" | |||||
| "VREPIG %%v4,8 \n\t" | |||||
| "vzero %%v5 \n\t" | |||||
| "vlrepg %%v18,0(%1) \n\t" | |||||
| "vzero %%v19 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%1 ) \n\t" | |||||
| "vlm %%v24,%%v31, 0(%1 ) \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | |||||
| "vflpdb %%v25, %%v25 \n\t" | |||||
| "vflpdb %%v26, %%v26 \n\t" | |||||
| "vflpdb %%v27, %%v27 \n\t" | |||||
| "vflpdb %%v28, %%v28 \n\t" | |||||
| "vflpdb %%v29, %%v29 \n\t" | |||||
| "vflpdb %%v30, %%v30 \n\t" | |||||
| "vflpdb %%v31, %%v31 \n\t" | |||||
| "vfchdb %%v16,%%v24,%%v25 \n\t " | |||||
| "vfchdb %%v17,%%v26 ,%%v27 \n\t " | |||||
| "vsel %%v1,%%v21,%%v20,%%v16 \n\t" | |||||
| "vsel %%v0,%%v25,%%v24,%%v16 \n\t" | |||||
| "vsel %%v2,%%v23,%%v22,%%v17 \n\t" | |||||
| "vsel %%v3,%%v27,%%v26,%%v17 \n\t" | |||||
| "vfchdb %%v16,%%v28, %%v29 \n\t " | |||||
| "vfchdb %%v17,%%v30,%%v31 \n\t" | |||||
| "vsel %%v24,%%v21,%%v20,%%v16 \n\t" | |||||
| "vsel %%v25,%%v29,%%v28,%%v16 \n\t" | |||||
| "vsel %%v26,%%v23,%%v22,%%v17 \n\t" | |||||
| "vsel %%v27,%%v31,%%v30,%%v17 \n\t" | |||||
| "vfchdb %%v28,%%v0 , %%v3 \n\t" | |||||
| "vfchdb %%v29, %%v25,%%v27 \n\t" | |||||
| "vsel %%v1,%%v2,%%v1,%%v28 \n\t" | |||||
| "vsel %%v0,%%v3,%%v0,%%v28 \n\t" | |||||
| "vsel %%v24,%%v26,%%v24,%%v29 \n\t" | |||||
| "vsel %%v25,%%v27,%%v25,%%v29 \n\t" | |||||
| "VAG %%v1,%%v1,%%v5 \n\t" | |||||
| "VAG %%v24,%%v24,%%v5 \n\t" | |||||
| "VAG %%v24,%%v24,%%v4 \n\t" | |||||
| "vfchdb %%v16, %%v0,%%v25 \n\t" | |||||
| "VAG %%v5,%%v5,%%v4 \n\t" | |||||
| "vsel %%v29,%%v25,%%v0,%%v16 \n\t" | |||||
| "vsel %%v28,%%v24,%%v1,%%v16 \n\t" | |||||
| "vfchdb %%v17,%%v18, %%v29 \n\t" | |||||
| "vsel %%v19,%%v28,%%v19,%%v17 \n\t" | |||||
| "vsel %%v18,%%v29,%%v18,%%v17 \n\t" | |||||
| "VAG %%v5,%%v5,%%v4 \n\t" | |||||
| "vlm %%v24,%%v31,128(%1 ) \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | |||||
| "vflpdb %%v25, %%v25 \n\t" | |||||
| "vflpdb %%v26, %%v26 \n\t" | |||||
| "vflpdb %%v27, %%v27 \n\t" | |||||
| "vflpdb %%v28, %%v28 \n\t" | |||||
| "vflpdb %%v29, %%v29 \n\t" | |||||
| "vflpdb %%v30, %%v30 \n\t" | |||||
| "vflpdb %%v31, %%v31 \n\t" | |||||
| "vfchdb %%v16,%%v24,%%v25 \n\t" | |||||
| "vfchdb %%v17,%%v26 ,%%v27 \n\t" | |||||
| "vsel %%v1,%%v21,%%v20,%%v16 \n\t" | |||||
| "vsel %%v0,%%v25,%%v24,%%v16 \n\t" | |||||
| "vsel %%v2,%%v23,%%v22,%%v17 \n\t" | |||||
| "vsel %%v3,%%v27,%%v26,%%v17 \n\t" | |||||
| "vfchdb %%v16,%%v28 ,%%v29 \n\t" | |||||
| "vfchdb %%v17,%%v30,%%v31 \n\t" | |||||
| "vsel %%v24,%%v21,%%v20,%%v16 \n\t" | |||||
| "vsel %%v25,%%v29,%%v28,%%v16 \n\t" | |||||
| "vsel %%v26,%%v23,%%v22,%%v17 \n\t" | |||||
| "vsel %%v27,%%v31,%%v30,%%v17 \n\t" | |||||
| "vfchdb %%v28,%%v0 , %%v3 \n\t" | |||||
| "vfchdb %%v29, %%v25,%%v27 \n\t" | |||||
| "vsel %%v1,%%v2,%%v1,%%v28 \n\t" | |||||
| "vsel %%v0,%%v3,%%v0,%%v28 \n\t" | |||||
| "vsel %%v24,%%v26,%%v24,%%v29 \n\t" | |||||
| "vsel %%v25,%%v27,%%v25,%%v29 \n\t" | |||||
| "VAG %%v1,%%v1,%%v5 \n\t" | |||||
| "VAG %%v24,%%v24,%%v5 \n\t" | |||||
| "la %1,256(%1) \n\t" | |||||
| "VAG %%v24,%%v24,%%v4 \n\t" | |||||
| "vfchdb %%v16, %%v0,%%v25 \n\t" | |||||
| "VAG %%v5,%%v5,%%v4 \n\t" | |||||
| "vsel %%v29,%%v25,%%v0,%%v16 \n\t" | |||||
| "vsel %%v28,%%v24,%%v1,%%v16 \n\t" | |||||
| "vfchdb %%v17,%%v18, %%v29 \n\t" | |||||
| "vsel %%v19,%%v28,%%v19,%%v17 \n\t" | |||||
| "vsel %%v18,%%v29,%%v18,%%v17 \n\t" | |||||
| "VAG %%v5,%%v5,%%v4 \n\t" | |||||
| "clgrjl %1,%%r0,1b \n\t" | |||||
| "vrepg %%v26,%%v18,1 \n\t" | |||||
| "vrepg %%v5,%%v19,1 \n\t" | |||||
| "wfcdb %%v26,%%v18 \n\t" | |||||
| "jne 2f \n\t" | |||||
| "VSTEG %%v18,0(%2),0 \n\t" | |||||
| "VMNLG %%v1,%%v5,%%v19 \n\t" | |||||
| "VLGVG %%r2,%%v1,0 \n\t" | |||||
| "br %%r14 \n\t" | |||||
| "2: \n\t" | |||||
| "wfchdb %%v16,%%v18 ,%%v26 \n\t " | |||||
| "vsel %%v1,%%v5,%%v19,%%v16 \n\t" | |||||
| "vsel %%v0,%%v26,%%v18,%%v16 \n\t" | |||||
| "VLGVG %%r2,%%v1,0 \n\t" | |||||
| "std %%f0,0(%2) \n\t" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(maxf) | |||||
| : "cc", "memory","r0","r1","r2","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| } | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG min = 0; | |||||
| if (n <= 0 || inc_x <= 0) return (min); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| min = diamin_kernel_32(n1, x, &minf); | |||||
| i = n1; | |||||
| } | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = i; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (min + 1); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = j; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) < minf) { | |||||
| min = j + 1; | |||||
| minf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) < minf) { | |||||
| min = j + 2; | |||||
| minf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) < minf) { | |||||
| min = j + 3; | |||||
| minf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = j; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (min + 1); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,257 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
/* Absolute value of a double. */
#define ABS fabs
/* |re| + |im| of the complex number stored at x[i], x[i+1].
 * Fully parenthesized so the expansion is safe inside larger expressions. */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
/*
 * ziamax_kernel_8_TUNED: z13 vector-assembly scan over n contiguous complex
 * doubles (interleaved re, im) for the entry with the largest |re| + |im|.
 *
 *   n     complex element count; the end address is x + 16*n bytes
 *   x     first element
 *   maxf  receives the largest |re| + |im| found
 *
 * The 0-based index of that entry is left in r2.
 *
 * Register roles, as set up below:
 *   v16..v23  constant lane indices 0..15
 *   v4        index step {16,16};  v5 running index base
 *   v6/v7     running maximum value / its index (both start at zero)
 *
 * NOTE(review): despite the "_8" in the name, each loop pass reads 256 bytes,
 * i.e. 16 complex elements; an n that is not a multiple of 16 makes the last
 * pass read beyond x + 16*n.
 * NOTE(review): there is no C return statement; the result is handed back by
 * loading r2 inside the asm, and one path leaves through "br %r14" directly.
 * This appears to depend on noinline and on the compiler not needing an
 * epilogue here - confirm.
 * NOTE(review): operand %1 is declared as an input but is advanced with
 * "la %1,256(%1)".
 */
static BLASLONG __attribute__((noinline)) ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
    __asm__(
        "pfd 1, 0(%1) \n\t"
        /* lane index constants 0..15 */
        "VLEIG %%v16,0,0 \n\t"
        "VLEIG %%v16,1,1 \n\t"
        "VLEIG %%v17,2,0 \n\t"
        "VLEIG %%v17,3,1 \n\t"
        "VLEIG %%v18,4,0 \n\t"
        "VLEIG %%v18,5,1 \n\t"
        "VLEIG %%v19,6,0 \n\t"
        "VLEIG %%v19,7,1 \n\t"
        "VLEIG %%v20,8,0 \n\t"
        "VLEIG %%v20,9,1 \n\t"
        "VLEIG %%v21,10,0 \n\t"
        "VLEIG %%v21,11,1 \n\t"
        "VLEIG %%v22,12,0 \n\t"
        "VLEIG %%v22,13,1 \n\t"
        "VLEIG %%v23,14,0 \n\t"
        "VLEIG %%v23,15,1 \n\t"
        /* r0 = x + n * 16: loop end address */
        "sllg %%r0,%0,4 \n\t"
        "agr %%r0,%1 \n\t"
        "vzero %%v6 \n\t"
        "vzero %%v7 \n\t"
        "VREPIG %%v4,16 \n\t"
        "vzero %%v5 \n\t"
        ".align 16 \n\t"
        "1: \n\t"
        "pfd 1, 256(%1 ) \n\t"
        /* elements 0..7 of this pass: even registers collect the doubles at
         * offsets 0 and 16 of each 32-byte pair, odd registers those at 8, 24 */
        "vleg %%v24 , 0( %1),0 \n\t"
        "vleg %%v25 , 8( %1),0 \n\t"
        "vleg %%v24 , 16( %1),1 \n\t"
        "vleg %%v25 , 24( %1),1 \n\t"
        "vleg %%v26 , 32( %1),0 \n\t"
        "vleg %%v27 , 40( %1),0 \n\t"
        "vleg %%v26 , 48( %1),1 \n\t"
        "vleg %%v27 , 56( %1),1 \n\t"
        "vleg %%v28 , 64( %1),0 \n\t"
        "vleg %%v29 , 72( %1),0 \n\t"
        "vleg %%v28 , 80( %1),1 \n\t"
        "vleg %%v29 , 88( %1),1 \n\t"
        "vleg %%v30 , 96( %1),0 \n\t"
        "vleg %%v31 ,104( %1),0 \n\t"
        "vleg %%v30 ,112( %1),1 \n\t"
        "vleg %%v31 ,120( %1),1 \n\t"
        "vflpdb %%v24, %%v24 \n\t"
        "vflpdb %%v25, %%v25 \n\t"
        "vflpdb %%v26, %%v26 \n\t"
        "vflpdb %%v27, %%v27 \n\t"
        "vflpdb %%v28, %%v28 \n\t"
        "vflpdb %%v29, %%v29 \n\t"
        "vflpdb %%v30, %%v30 \n\t"
        "vflpdb %%v31, %%v31 \n\t"
        /* v0..v3 = sums of the two absolute values for elements 0..7 */
        "vfadb %%v0,%%v24,%%v25 \n\t"
        "vfadb %%v1,%%v26,%%v27 \n\t"
        "vfadb %%v2,%%v28,%%v29 \n\t"
        "vfadb %%v3,%%v30,%%v31 \n\t"
        /* elements 8..15 of this pass */
        "vleg %%v24 , 128( %1),0 \n\t"
        "vleg %%v25 , 136( %1),0 \n\t"
        "vleg %%v24 , 144( %1),1 \n\t"
        "vleg %%v25 , 152( %1),1 \n\t"
        "vleg %%v26 , 160( %1),0 \n\t"
        "vleg %%v27 , 168( %1),0 \n\t"
        "vleg %%v26 , 176( %1),1 \n\t"
        "vleg %%v27 , 184( %1),1 \n\t"
        "vleg %%v28 , 192( %1),0 \n\t"
        "vleg %%v29 , 200( %1),0 \n\t"
        "vleg %%v28 , 208( %1),1 \n\t"
        "vleg %%v29 , 216( %1),1 \n\t"
        "vleg %%v30 , 224( %1),0 \n\t"
        "vleg %%v31 , 232( %1),0 \n\t"
        "vleg %%v30 , 240( %1),1 \n\t"
        "vleg %%v31 , 248( %1),1 \n\t"
        "vflpdb %%v24, %%v24 \n\t"
        "vflpdb %%v25, %%v25 \n\t"
        "vflpdb %%v26, %%v26 \n\t"
        "vflpdb %%v27, %%v27 \n\t"
        "vflpdb %%v28, %%v28 \n\t"
        "vflpdb %%v29, %%v29 \n\t"
        "vflpdb %%v30, %%v30 \n\t"
        "vflpdb %%v31, %%v31 \n\t"
        "vfadb %%v24,%%v24,%%v25 \n\t"
        "vfadb %%v26,%%v26,%%v27 \n\t"
        "vfadb %%v28,%%v28,%%v29 \n\t"
        "vfadb %%v30,%%v30,%%v31 \n\t"
        /* pairwise reduction: keep the larger value and its lane index */
        "vfchdb %%v25,%%v1,%%v0 \n\t"
        "vsel %%v29,%%v17,%%v16,%%v25 \n\t"
        "vsel %%v31,%%v1,%%v0,%%v25 \n\t"
        "vfchdb %%v27,%%v3,%%v2 \n\t "
        "vsel %%v0,%%v19,%%v18,%%v27 \n\t"
        "vsel %%v1,%%v3,%%v2,%%v27 \n\t"
        "vfchdb %%v25,%%v26,%%v24 \n\t "
        "vsel %%v2,%%v21,%%v20,%%v25 \n\t"
        "vsel %%v3,%%v26,%%v24,%%v25 \n\t"
        "vfchdb %%v27,%%v30,%%v28 \n\t "
        "vsel %%v25,%%v23,%%v22,%%v27 \n\t"
        "vsel %%v27,%%v30,%%v28,%%v27 \n\t"
        "vfchdb %%v24, %%v1,%%v31 \n\t"
        "vsel %%v26,%%v0,%%v29,%%v24 \n\t"
        "vsel %%v28,%%v1,%%v31,%%v24 \n\t"
        "vfchdb %%v30, %%v27,%%v3 \n\t"
        "vsel %%v29,%%v25,%%v2,%%v30 \n\t"
        "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
        "la %1,256(%1) \n\t"
        "vfchdb %%v0, %%v31,%%v28 \n\t"
        "vsel %%v25,%%v29,%%v26,%%v0 \n\t"
        "vsel %%v27,%%v31,%%v28,%%v0 \n\t"
        /* lane index -> absolute index */
        "VAG %%v25,%%v25,%%v5 \n\t"
        /* compare the block maximum with the running maximum v6/v7 */
        "vfchdb %%v30, %%v27,%%v6 \n\t"
        "vsel %%v7,%%v25,%%v7,%%v30 \n\t"
        "vsel %%v6,%%v27,%%v6,%%v30 \n\t"
        "VAG %%v5,%%v5,%%v4 \n\t"
        "clgrjl %1,%%r0,1b \n\t"
        /* extract the index: combine the two lanes of v6/v7 */
        "vrepg %%v26,%%v6,1 \n\t"
        "vrepg %%v5,%%v7,1 \n\t"
        "wfcdb %%v26,%%v6 \n\t"
        "jne 2f \n\t"
        /* lanes hold equal values: store it, take the smaller index */
        "VSTEG %%v6,0(%2),0 \n\t"
        "VMNLG %%v1,%%v5,%%v7 \n\t"
        "VLGVG %%r2,%%v1,0 \n\t"
        "br %%r14 \n\t"
        "2: \n\t"
        /* lanes differ: keep the larger value together with its index */
        "wfchdb %%v16,%%v26,%%v6 \n\t"
        "vsel %%v1,%%v5,%%v7,%%v16 \n\t"
        "vsel %%v0,%%v26,%%v6,%%v16 \n\t"
        "VLGVG %%r2,%%v1,0 \n\t"
        "std %%f0,0(%2) \n\t"
        :
        : "r"(n), "a"(x), "a"(maxf)
        : "cc", "memory","r0","r1","r2","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
          "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT maxf = 0; | |||||
| BLASLONG max = 0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) return(max); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -8; | |||||
| if (n1 > 0) { | |||||
| max = ziamax_kernel_8_TUNED(n1, x, &maxf); | |||||
| i = n1; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) > maxf ) | |||||
| { | |||||
| max = i; | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (max + 1); | |||||
| } else { | |||||
| inc_x2 = 2 * inc_x; | |||||
| maxf = CABS1(x,0); | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) > maxf ) | |||||
| { | |||||
| max = i; | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | |||||
| return (max + 1); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,259 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
/* Absolute value of a double. */
#define ABS fabs
/* |re| + |im| of the complex number stored at x[i], x[i+1].
 * Fully parenthesized so the expansion is safe inside larger expressions. */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
/*
 * ziamin_kernel_8_TUNED: z13 vector-assembly scan over n contiguous complex
 * doubles (interleaved re, im) for the entry with the smallest |re| + |im|.
 *
 *   n     complex element count; the end address is x + 16*n bytes
 *   x     first element
 *   minf  receives the smallest |re| + |im| found
 *
 * The 0-based index of that entry is left in r2.
 *
 * Register roles, as set up below:
 *   v16..v23  constant lane indices 0..15
 *   v4        index step {16,16};  v5 running index base
 *   v6/v7     running minimum value / its index; v6 is seeded with
 *             |x[0]| + |x[1]| in both lanes, v7 with zero
 *
 * NOTE(review): despite the "_8" in the name, each loop pass reads 256 bytes,
 * i.e. 16 complex elements; an n that is not a multiple of 16 makes the last
 * pass read beyond x + 16*n.
 * NOTE(review): there is no C return statement; the result is handed back by
 * loading r2 inside the asm, and one path leaves through "br %r14" directly.
 * This appears to depend on noinline and on the compiler not needing an
 * epilogue here - confirm.
 * NOTE(review): operand %1 is declared as an input but is advanced with
 * "la %1,256(%1)".
 */
static BLASLONG __attribute__((noinline)) ziamin_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
    __asm__(
        "pfd 1, 0(%1) \n\t"
        /* lane index constants 0..15 */
        "VLEIG %%v16,0,0 \n\t"
        "VLEIG %%v16,1,1 \n\t"
        "VLEIG %%v17,2,0 \n\t"
        "VLEIG %%v17,3,1 \n\t"
        "VLEIG %%v18,4,0 \n\t"
        "VLEIG %%v18,5,1 \n\t"
        "VLEIG %%v19,6,0 \n\t"
        "VLEIG %%v19,7,1 \n\t"
        "VLEIG %%v20,8,0 \n\t"
        "VLEIG %%v20,9,1 \n\t"
        "VLEIG %%v21,10,0 \n\t"
        "VLEIG %%v21,11,1 \n\t"
        "VLEIG %%v22,12,0 \n\t"
        "VLEIG %%v22,13,1 \n\t"
        "VLEIG %%v23,14,0 \n\t"
        "VLEIG %%v23,15,1 \n\t"
        /* f6 = |x[0]| + |x[1]|: value of the first complex element */
        "ld %%f6,0(%1) \n\t"
        "lpdbr %%f6,%%f6 \n\t"
        "ld %%f7,8(%1) \n\t"
        "lpdbr %%f7,%%f7 \n\t"
        "adbr %%f6,%%f7 \n\t"
        /* r0 = x + n * 16: loop end address */
        "sllg %%r0,%0,4 \n\t"
        "agr %%r0,%1 \n\t"
        /* replicate the seed into both lanes of v6 */
        "vrepg %%v6,%%v6,0 \n\t"
        "vzero %%v7 \n\t"
        "VREPIG %%v4,16 \n\t"
        "vzero %%v5 \n\t"
        ".align 16 \n\t"
        "1: \n\t"
        "pfd 1, 256(%1 ) \n\t"
        /* elements 0..7 of this pass: even registers collect the doubles at
         * offsets 0 and 16 of each 32-byte pair, odd registers those at 8, 24 */
        "vleg %%v24 , 0( %1),0 \n\t"
        "vleg %%v25 , 8( %1),0 \n\t"
        "vleg %%v24 , 16( %1),1 \n\t"
        "vleg %%v25 , 24( %1),1 \n\t"
        "vleg %%v26 , 32( %1),0 \n\t"
        "vleg %%v27 , 40( %1),0 \n\t"
        "vleg %%v26 , 48( %1),1 \n\t"
        "vleg %%v27 , 56( %1),1 \n\t"
        "vleg %%v28 , 64( %1),0 \n\t"
        "vleg %%v29 , 72( %1),0 \n\t"
        "vleg %%v28 , 80( %1),1 \n\t"
        "vleg %%v29 , 88( %1),1 \n\t"
        "vleg %%v30 , 96( %1),0 \n\t"
        "vleg %%v31 ,104( %1),0 \n\t"
        "vleg %%v30 ,112( %1),1 \n\t"
        "vleg %%v31 ,120( %1),1 \n\t"
        "vflpdb %%v24, %%v24 \n\t"
        "vflpdb %%v25, %%v25 \n\t"
        "vflpdb %%v26, %%v26 \n\t"
        "vflpdb %%v27, %%v27 \n\t"
        "vflpdb %%v28, %%v28 \n\t"
        "vflpdb %%v29, %%v29 \n\t"
        "vflpdb %%v30, %%v30 \n\t"
        "vflpdb %%v31, %%v31 \n\t"
        /* v0..v3 = sums of the two absolute values for elements 0..7 */
        "vfadb %%v0,%%v24,%%v25 \n\t"
        "vfadb %%v1,%%v26,%%v27 \n\t"
        "vfadb %%v2,%%v28,%%v29 \n\t"
        "vfadb %%v3,%%v30,%%v31 \n\t"
        /* elements 8..15 of this pass */
        "vleg %%v24 ,128( %1),0 \n\t"
        "vleg %%v25 ,136( %1),0 \n\t"
        "vleg %%v24 ,144( %1),1 \n\t"
        "vleg %%v25 ,152( %1),1 \n\t"
        "vleg %%v26 ,160( %1),0 \n\t"
        "vleg %%v27 ,168( %1),0 \n\t"
        "vleg %%v26 ,176( %1),1 \n\t"
        "vleg %%v27 ,184( %1),1 \n\t"
        "vleg %%v28 ,192( %1),0 \n\t"
        "vleg %%v29 ,200( %1),0 \n\t"
        "vleg %%v28 ,208( %1),1 \n\t"
        "vleg %%v29 ,216( %1),1 \n\t"
        "vleg %%v30 ,224( %1),0 \n\t"
        "vleg %%v31 ,232( %1),0 \n\t"
        "vleg %%v30 ,240( %1),1 \n\t"
        "vleg %%v31 ,248( %1),1 \n\t"
        "vflpdb %%v24, %%v24 \n\t"
        "vflpdb %%v25, %%v25 \n\t"
        "vflpdb %%v26, %%v26 \n\t"
        "vflpdb %%v27, %%v27 \n\t"
        "vflpdb %%v28, %%v28 \n\t"
        "vflpdb %%v29, %%v29 \n\t"
        "vflpdb %%v30, %%v30 \n\t"
        "vflpdb %%v31, %%v31 \n\t"
        "vfadb %%v24,%%v24,%%v25 \n\t"
        "vfadb %%v26,%%v26,%%v27 \n\t"
        "vfadb %%v28,%%v28,%%v29 \n\t"
        "vfadb %%v30,%%v30,%%v31 \n\t"
        /* pairwise reduction: keep the smaller value and its lane index */
        "vfchdb %%v25,%%v0 ,%%v1 \n\t"
        "vsel %%v29,%%v17,%%v16,%%v25 \n\t"
        "vsel %%v31,%%v1,%%v0,%%v25 \n\t"
        "vfchdb %%v27,%%v2,%%v3 \n\t"
        "vsel %%v0,%%v19,%%v18,%%v27 \n\t"
        "vsel %%v1,%%v3,%%v2,%%v27 \n\t"
        "vfchdb %%v25,%%v24,%%v26 \n\t"
        "vsel %%v2,%%v21,%%v20,%%v25 \n\t"
        "vsel %%v3,%%v26,%%v24,%%v25 \n\t"
        "vfchdb %%v27,%%v28,%%v30 \n\t"
        "vsel %%v25,%%v23,%%v22,%%v27 \n\t"
        "vsel %%v27,%%v30,%%v28,%%v27 \n\t"
        "vfchdb %%v24,%%v31, %%v1 \n\t"
        "vsel %%v26,%%v0,%%v29,%%v24 \n\t"
        "vsel %%v28,%%v1,%%v31,%%v24 \n\t"
        "vfchdb %%v30,%%v3, %%v27 \n\t"
        "vsel %%v29,%%v25,%%v2,%%v30 \n\t"
        "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
        "la %1,256(%1) \n\t"
        "vfchdb %%v0,%%v28, %%v31 \n\t"
        "vsel %%v25,%%v29,%%v26,%%v0 \n\t"
        "vsel %%v27,%%v31,%%v28,%%v0 \n\t"
        /* lane index -> absolute index */
        "VAG %%v25,%%v25,%%v5 \n\t"
        /* compare the block minimum with the running minimum v6/v7 */
        "vfchdb %%v30,%%v6 , %%v27 \n\t"
        "vsel %%v7,%%v25,%%v7,%%v30 \n\t"
        "vsel %%v6,%%v27,%%v6,%%v30 \n\t"
        "VAG %%v5,%%v5,%%v4 \n\t"
        "clgrjl %1,%%r0,1b \n\t"
        /* extract the index: combine the two lanes of v6/v7 */
        "vrepg %%v26,%%v6,1 \n\t"
        "vrepg %%v5,%%v7,1 \n\t"
        "wfcdb %%v26,%%v6 \n\t"
        "jne 2f \n\t"
        /* lanes hold equal values: store it, take the smaller index */
        "VSTEG %%v6,0(%2),0 \n\t"
        "VMNLG %%v1,%%v5,%%v7 \n\t"
        "VLGVG %%r2,%%v1,0 \n\t"
        "br %%r14 \n\t"
        "2: \n\t"
        /* lanes differ: keep the smaller value together with its index */
        "wfchdb %%v16,%%v6 ,%%v26 \n\t"
        "vsel %%v1,%%v5,%%v7,%%v16 \n\t"
        "vsel %%v0,%%v26,%%v6,%%v16 \n\t"
        "VLGVG %%r2,%%v1,0 \n\t"
        "std %%f0,0(%2) \n\t"
        :
        : "r"(n), "a"(x), "a"(minf)
        : "cc", "memory","r0","r1","r2","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
          "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0; | |||||
| FLOAT minf; | |||||
| BLASLONG min=0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) return(min); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -8; | |||||
| if (n1 > 0) { | |||||
| min = ziamin_kernel_8_TUNED(n1, x, &minf); | |||||
| i = n1; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) < minf ) | |||||
| { | |||||
| min = i; | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (min + 1); | |||||
| } else { | |||||
| inc_x2 = 2 * inc_x; | |||||
| minf = CABS1(x,0); | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) < minf ) | |||||
| { | |||||
| min = i; | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | |||||
| return (min + 1); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,156 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
/* ABS(v): absolute value in the precision chosen at compile time
 * (fabs when DOUBLE is defined, fabsf otherwise). */
#ifndef DOUBLE
#define ABS fabsf
#else
#define ABS fabs
#endif
/*
 * zasum_kernel_16: z13 vector-assembly sum of absolute values over n
 * contiguous complex doubles (2*n doubles in total).
 *
 *   n  complex element count; the end address is x + 16*n bytes.  Each loop
 *      pass consumes 256 bytes (16 complex elements), so the caller is
 *      expected to pass a positive multiple of 16.
 *   x  first element
 *
 * Four vector accumulators (v0, v1, v22, v23) are summed at the end and the
 * two lanes of the total are added into f0.
 *
 * NOTE(review): there is no C return statement; the value is left in f0 by
 * the asm, which appears to rely on f0 being the floating-point return
 * register and on noinline - confirm.
 * NOTE(review): operand %1 is declared as an input but is advanced with
 * "la %1,256(%1)".
 */
static FLOAT __attribute__ ((noinline)) zasum_kernel_16(BLASLONG n, FLOAT *x) {
    __asm__ (
        "pfd 1, 0(%1) \n\t"
        /* r0 = x + n * 16: loop end address */
        "sllg %%r0,%0,4 \n\t"
        "agr %%r0,%1 \n\t"
        /* clear the four accumulators */
        "vzero %%v0 \n\t"
        "vzero %%v1 \n\t"
        "vzero %%v22 \n\t"
        "vzero %%v23 \n\t"
        ".align 16 \n\t"
        "1: \n\t"
        "pfd 1, 256(%1 ) \n\t"
        /* first 128 bytes of this pass: load, take absolute values, accumulate */
        "vlm %%v24,%%v31,0(%1) \n\t"
        "vflpdb %%v24, %%v24 \n\t"
        "vflpdb %%v25, %%v25 \n\t"
        "vflpdb %%v26, %%v26 \n\t"
        "vflpdb %%v27, %%v27 \n\t"
        "vflpdb %%v28, %%v28 \n\t"
        "vflpdb %%v29, %%v29 \n\t"
        "vflpdb %%v30, %%v30 \n\t"
        "vflpdb %%v31, %%v31 \n\t"
        "vfadb %%v0,%%v0,%%v24 \n\t"
        "vfadb %%v1,%%v1,%%v25 \n\t"
        "vfadb %%v23,%%v23,%%v26 \n\t"
        "vfadb %%v22,%%v22,%%v27 \n\t"
        "vfadb %%v0,%%v0,%%v28 \n\t"
        "vfadb %%v1,%%v1,%%v29 \n\t"
        "vfadb %%v23,%%v23,%%v30 \n\t"
        "vfadb %%v22,%%v22,%%v31 \n\t"
        /* second 128 bytes of this pass */
        "vlm %%v24,%%v31, 128(%1 ) \n\t"
        "vflpdb %%v24, %%v24 \n\t"
        "vflpdb %%v25, %%v25 \n\t"
        "vflpdb %%v26, %%v26 \n\t"
        "vflpdb %%v27, %%v27 \n\t"
        "vflpdb %%v28, %%v28 \n\t"
        "vflpdb %%v29, %%v29 \n\t"
        "vflpdb %%v30, %%v30 \n\t"
        "vflpdb %%v31, %%v31 \n\t"
        "la %1,256(%1) \n\t"
        "vfadb %%v0,%%v0,%%v24 \n\t"
        "vfadb %%v1,%%v1,%%v25 \n\t"
        "vfadb %%v23,%%v23,%%v26 \n\t"
        "vfadb %%v22,%%v22,%%v27 \n\t"
        "vfadb %%v0,%%v0,%%v28 \n\t"
        "vfadb %%v1,%%v1,%%v29 \n\t"
        "vfadb %%v23,%%v23,%%v30 \n\t"
        "vfadb %%v22,%%v22,%%v31 \n\t"
        "clgrjl %1,%%r0,1b \n\t"
        /* fold the four accumulators, then add the two lanes into f0 */
        "vfadb %%v24,%%v0,%%v1 \n\t"
        "vfadb %%v25,%%v23,%%v22 \n\t"
        "vfadb %%v0,%%v25,%%v24 \n\t"
        "vrepg %%v1,%%v0,1 \n\t"
        "adbr %%f0,%%f1 \n\t"
        :
        : "r"(n), "a"(x)
        : "cc", "memory","r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ip=0; | |||||
| FLOAT sumf = 0.0; | |||||
| BLASLONG n1; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||||
| if ( inc_x == 1 ) | |||||
| { | |||||
| n1 = n & -16; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| sumf=zasum_kernel_16(n1, x ); | |||||
| i=n1; | |||||
| ip=2*n1; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| sumf += ABS(x[ip]) + ABS(x[ip+1]); | |||||
| i++; | |||||
| ip+=2; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| inc_x2 = 2* inc_x; | |||||
| while(i < n) | |||||
| { | |||||
| sumf += ABS(x[ip]) + ABS(x[ip+1]); | |||||
| ip+=inc_x2; | |||||
| i++; | |||||
| } | |||||
| } | |||||
| return(sumf); | |||||
| } | |||||
| @@ -0,0 +1,207 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { | |||||
| __asm__ ("pfd 1, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "vlrepg %%v28 , 0(%3) \n\t" | |||||
| "vlrepg %%v29, 8(%3) \n\t" | |||||
| "srlg %3,%0,3 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "vleg %%v16 , 0(%%r1,%2),0 \n\t" | |||||
| "vleg %%v17 , 8(%%r1,%2),0 \n\t" | |||||
| "vleg %%v16 , 16(%%r1,%2),1 \n\t" | |||||
| "vleg %%v17 , 24(%%r1,%2),1 \n\t" | |||||
| "vleg %%v18 , 32(%%r1,%2),0 \n\t" | |||||
| "vleg %%v19 , 40(%%r1,%2),0 \n\t" | |||||
| "vleg %%v18 , 48(%%r1,%2),1 \n\t" | |||||
| "vleg %%v19 , 56(%%r1,%2),1 \n\t" | |||||
| "vleg %%v24 , 0(%%r1,%1),0 \n\t" | |||||
| "vleg %%v25 , 8(%%r1,%1),0 \n\t" | |||||
| "vleg %%v24 , 16(%%r1,%1),1 \n\t" | |||||
| "vleg %%v25 , 24(%%r1,%1),1 \n\t" | |||||
| "vleg %%v26 , 32(%%r1,%1),0 \n\t" | |||||
| "vleg %%v27 , 40(%%r1,%1),0 \n\t" | |||||
| "vleg %%v26 , 48(%%r1,%1),1 \n\t" | |||||
| "vleg %%v27 , 56(%%r1,%1),1 \n\t" | |||||
| #if !defined(CONJ) | |||||
| "vfmsdb %%v16, %%v25, %%v29,%%v16 \n\t" | |||||
| "vfmadb %%v17, %%v24, %%v29, %%v17 \n\t" | |||||
| "vfmsdb %%v18, %%v27, %%v29, %%v18 \n\t" | |||||
| "vfmadb %%v19, %%v26, %%v29, %%v19 \n\t" | |||||
| "vfmsdb %%v16, %%v24, %%v28 ,%%v16 \n\t" | |||||
| "vfmadb %%v17, %%v25, %%v28, %%v17 \n\t" | |||||
| "vfmsdb %%v18, %%v26, %%v28, %%v18 \n\t" | |||||
| "vfmadb %%v19, %%v27, %%v28, %%v19 \n\t" | |||||
| #else | |||||
| "vfmadb %%v16, %%v25, %%v29, %%v16 \n\t" | |||||
| "vfmsdb %%v17, %%v25, %%v28, %%v17 \n\t" | |||||
| "vfmadb %%v18, %%v27, %%v29, %%v18 \n\t" | |||||
| "vfmsdb %%v19, %%v27, %%v28, %%v19 \n\t" | |||||
| "vfmadb %%v16, %%v24, %%v28, %%v16 \n\t" | |||||
| "vfmsdb %%v17, %%v24, %%v29, %%v17 \n\t" | |||||
| "vfmadb %%v18, %%v26, %%v28, %%v18 \n\t" | |||||
| "vfmsdb %%v19, %%v26, %%v29, %%v19 \n\t" | |||||
| #endif | |||||
| "vsteg %%v16 , 0(%%r1,%2),0 \n\t" | |||||
| "vsteg %%v17 , 8(%%r1,%2),0 \n\t" | |||||
| "vsteg %%v16 , 16(%%r1,%2),1 \n\t" | |||||
| "vsteg %%v17 , 24(%%r1,%2),1 \n\t" | |||||
| "vsteg %%v18 , 32(%%r1,%2),0 \n\t" | |||||
| "vsteg %%v19 , 40(%%r1,%2),0 \n\t" | |||||
| "vsteg %%v18 , 48(%%r1,%2),1 \n\t" | |||||
| "vsteg %%v19 , 56(%%r1,%2),1 \n\t" | |||||
| "vleg %%v20 , 64(%%r1,%2),0 \n\t" | |||||
| "vleg %%v21 , 72(%%r1,%2),0 \n\t" | |||||
| "vleg %%v20 , 80(%%r1,%2),1 \n\t" | |||||
| "vleg %%v21 , 88(%%r1,%2),1 \n\t" | |||||
| "vleg %%v22 , 96(%%r1,%2),0 \n\t" | |||||
| "vleg %%v23 , 104(%%r1,%2),0 \n\t" | |||||
| "vleg %%v22 , 112(%%r1,%2),1 \n\t" | |||||
| "vleg %%v23 , 120(%%r1,%2),1 \n\t" | |||||
| "vleg %%v24 , 64(%%r1,%1),0 \n\t" | |||||
| "vleg %%v25 , 72(%%r1,%1),0 \n\t" | |||||
| "vleg %%v24 , 80(%%r1,%1),1 \n\t" | |||||
| "vleg %%v25 , 88(%%r1,%1),1 \n\t" | |||||
| "vleg %%v26 , 96(%%r1,%1),0 \n\t" | |||||
| "vleg %%v27 , 104(%%r1,%1),0 \n\t" | |||||
| "vleg %%v26 , 112(%%r1,%1),1 \n\t" | |||||
| "vleg %%v27 , 120(%%r1,%1),1 \n\t" | |||||
| #if !defined(CONJ) | |||||
| "vfmsdb %%v20, %%v25, %%v29,%%v20 \n\t" | |||||
| "vfmadb %%v21, %%v24, %%v29, %%v21 \n\t" | |||||
| "vfmsdb %%v22, %%v27, %%v29, %%v22 \n\t" | |||||
| "vfmadb %%v23, %%v26, %%v29, %%v23 \n\t" | |||||
| "vfmsdb %%v20, %%v24, %%v28 ,%%v20 \n\t" | |||||
| "vfmadb %%v21, %%v25, %%v28, %%v21 \n\t" | |||||
| "vfmsdb %%v22, %%v26, %%v28, %%v22 \n\t" | |||||
| "vfmadb %%v23, %%v27, %%v28, %%v23 \n\t" | |||||
| #else | |||||
| "vfmadb %%v20, %%v25, %%v29, %%v20 \n\t" | |||||
| "vfmsdb %%v21, %%v25, %%v28, %%v21 \n\t" | |||||
| "vfmadb %%v22, %%v27, %%v29, %%v22 \n\t" | |||||
| "vfmsdb %%v23, %%v27, %%v28, %%v23 \n\t" | |||||
| "vfmadb %%v20, %%v24, %%v28, %%v20 \n\t" | |||||
| "vfmsdb %%v21, %%v24, %%v29, %%v21 \n\t" | |||||
| "vfmadb %%v22, %%v26, %%v28, %%v22 \n\t" | |||||
| "vfmsdb %%v23, %%v26, %%v29, %%v23 \n\t" | |||||
| #endif | |||||
| "vsteg %%v20 , 64(%%r1,%2),0 \n\t" | |||||
| "vsteg %%v21 , 72(%%r1,%2),0 \n\t" | |||||
| "vsteg %%v20 , 80(%%r1,%2),1 \n\t" | |||||
| "vsteg %%v21 , 88(%%r1,%2),1 \n\t" | |||||
| "vsteg %%v22 , 96(%%r1,%2),0 \n\t" | |||||
| "vsteg %%v23 , 104(%%r1,%2),0 \n\t" | |||||
| "vsteg %%v22 , 112(%%r1,%2),1 \n\t" | |||||
| "vsteg %%v23 , 120(%%r1,%2),1 \n\t" | |||||
| "la %%r1,128(%%r1) \n\t" | |||||
| "brctg %3,1b" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y), "a"(alpha) | |||||
| : "cc", "memory", "r1","v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29" | |||||
| ); | |||||
| } | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| FLOAT da[2]; | |||||
| if (n <= 0) return (0); | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -8; | |||||
| if (n1) { | |||||
| da[0] = da_r; | |||||
| da[1] = da_i; | |||||
| zaxpy_kernel_8(n1, x, y, da); | |||||
| ix = 2 * n1; | |||||
| } | |||||
| i = n1; | |||||
| while (i < n) { | |||||
| #if !defined(CONJ) | |||||
| y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); | |||||
| y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); | |||||
| #else | |||||
| y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); | |||||
| y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); | |||||
| #endif | |||||
| i++; | |||||
| ix += 2; | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| inc_x *= 2; | |||||
| inc_y *= 2; | |||||
| while (i < n) { | |||||
| #if !defined(CONJ) | |||||
| y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); | |||||
| y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); | |||||
| #else | |||||
| y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); | |||||
| y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); | |||||
| #endif | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,145 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| static void __attribute__ ((noinline)) zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| __asm__ volatile( | |||||
| "pfd 1, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" | |||||
| "vst %%v24, 0(%%r1,%2) \n\t" | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vst %%v25, 16(%%r1,%2) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vst %%v26, 32(%%r1,%2) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vst %%v27, 48(%%r1,%2) \n\t" | |||||
| "vl %%v28, 64(%%r1,%1) \n\t" | |||||
| "vst %%v28, 64(%%r1,%2) \n\t" | |||||
| "vl %%v29, 80(%%r1,%1) \n\t" | |||||
| "vst %%v29, 80(%%r1,%2) \n\t" | |||||
| "vl %%v30, 96(%%r1,%1) \n\t" | |||||
| "vst %%v30, 96(%%r1,%2) \n\t" | |||||
| "vl %%v31,112(%%r1,%1) \n\t" | |||||
| "vst %%v31,112(%%r1,%2) \n\t" | |||||
| "vl %%v24,128(%%r1,%1) \n\t" | |||||
| "vst %%v24,128(%%r1,%2) \n\t" | |||||
| "vl %%v25,144(%%r1,%1) \n\t" | |||||
| "vst %%v25,144(%%r1,%2) \n\t" | |||||
| "vl %%v26,160(%%r1,%1) \n\t" | |||||
| "vst %%v26,160(%%r1,%2) \n\t" | |||||
| "vl %%v27,176(%%r1,%1) \n\t" | |||||
| "vst %%v27,176(%%r1,%2) \n\t" | |||||
| "vl %%v28, 192(%%r1,%1) \n\t" | |||||
| "vst %%v28, 192(%%r1,%2) \n\t" | |||||
| "vl %%v29, 208(%%r1,%1) \n\t" | |||||
| "vst %%v29, 208(%%r1,%2) \n\t" | |||||
| "vl %%v30, 224(%%r1,%1) \n\t" | |||||
| "vst %%v30, 224(%%r1,%2) \n\t" | |||||
| "vl %%v31, 240(%%r1,%1) \n\t" | |||||
| "vst %%v31, 240(%%r1,%2) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %%r0,1b" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y) | |||||
| : "cc", "memory","r0","r1","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| if ( n <= 0 ) return(0); | |||||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||||
| { | |||||
| BLASLONG n1 = n & -16; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| zcopy_kernel_16(n1, x, y); | |||||
| i=n1; | |||||
| ix=n1*2; | |||||
| iy=n1*2; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] = x[iy] ; | |||||
| y[iy+1] = x[ix+1] ; | |||||
| ix+=2; | |||||
| iy+=2; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| BLASLONG inc_x2 = 2 * inc_x; | |||||
| BLASLONG inc_y2 = 2 * inc_y; | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] = x[ix] ; | |||||
| y[iy+1] = x[ix+1] ; | |||||
| ix += inc_x2 ; | |||||
| iy += inc_y2 ; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| @@ -0,0 +1,216 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| static void __attribute__ ((noinline)) zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { | |||||
| __asm__ volatile( | |||||
| "pfd 1, 0(%1) \n\t" | |||||
| "pfd 1, 0(%2) \n\t" | |||||
| "vzero %%v24 \n\t" | |||||
| "vzero %%v25 \n\t" | |||||
| "vzero %%v26 \n\t" | |||||
| "vzero %%v27 \n\t" | |||||
| "srlg %%r0,%0,3 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%%r1,%1) \n\t" | |||||
| "pfd 1, 256(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%1) \n\t" | |||||
| "vl %%v17, 16(%%r1,%1) \n\t" | |||||
| "vl %%v18, 32(%%r1,%1) \n\t" | |||||
| "vl %%v19, 48(%%r1,%1) \n\t" | |||||
| "vl %%v28, 0(%%r1,%2) \n\t" | |||||
| "vl %%v29, 16(%%r1,%2) \n\t" | |||||
| "vl %%v30, 32(%%r1,%2) \n\t" | |||||
| "vl %%v31, 48(%%r1,%2) \n\t" | |||||
| "vpdi %%v20,%%v16,%%v16,4 \n\t" | |||||
| "vpdi %%v21,%%v17,%%v17,4 \n\t" | |||||
| "vpdi %%v22,%%v18,%%v18,4 \n\t" | |||||
| "vpdi %%v23,%%v19,%%v19,4 \n\t" | |||||
| "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" | |||||
| "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" | |||||
| "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" | |||||
| "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" | |||||
| "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" | |||||
| "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" | |||||
| "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" | |||||
| "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" | |||||
| "vl %%v16, 64(%%r1,%1) \n\t" | |||||
| "vl %%v17, 80(%%r1,%1) \n\t" | |||||
| "vl %%v18, 96(%%r1,%1) \n\t" | |||||
| "vl %%v19,112(%%r1,%1) \n\t" | |||||
| "vl %%v28, 64(%%r1,%2) \n\t" | |||||
| "vl %%v29, 80(%%r1,%2) \n\t" | |||||
| "vl %%v30, 96(%%r1,%2) \n\t" | |||||
| "vl %%v31,112(%%r1,%2) \n\t" | |||||
| "vpdi %%v20,%%v16,%%v16,4 \n\t" | |||||
| "vpdi %%v21,%%v17,%%v17,4 \n\t" | |||||
| "vpdi %%v22,%%v18,%%v18,4 \n\t" | |||||
| "vpdi %%v23,%%v19,%%v19,4 \n\t" | |||||
| "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" | |||||
| "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" | |||||
| "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" | |||||
| "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" | |||||
| "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" | |||||
| "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" | |||||
| "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" | |||||
| "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" | |||||
| "la %%r1,128(%%r1) \n\t" | |||||
| "brctg %%r0,1b \n\t" | |||||
| "vfadb %%v24,%%v26,%%v24 \n\t" | |||||
| "vfadb %%v25,%%v25,%%v27 \n\t" | |||||
| "vsteg %%v24,0(%3),0 \n\t" | |||||
| "vsteg %%v24,8(%3),1 \n\t" | |||||
| "vsteg %%v25,16(%3),1 \n\t" | |||||
| "vsteg %%v25,24(%3),0 \n\t" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y), "a"(d) | |||||
| : "cc", "memory","r0","r1","v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| } | |||||
| static __attribute__ ((noinline)) void zdot_kernel_8n(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { | |||||
| BLASLONG register i = 0; | |||||
| FLOAT dot[4] = {0.0, 0.0, 0.0, 0.0}; | |||||
| BLASLONG j = 0; | |||||
| while (i < n) { | |||||
| dot[0] += x[j] * y[j]; | |||||
| dot[1] += x[j + 1] * y[j + 1]; | |||||
| dot[2] += x[j] * y[j + 1]; | |||||
| dot[3] += x[j + 1] * y[j]; | |||||
| dot[0] += x[j + 2] * y[j + 2]; | |||||
| dot[1] += x[j + 3] * y[j + 3]; | |||||
| dot[2] += x[j + 2] * y[j + 3]; | |||||
| dot[3] += x[j + 3] * y[j + 2]; | |||||
| dot[0] += x[j + 4] * y[j + 4]; | |||||
| dot[1] += x[j + 5] * y[j + 5]; | |||||
| dot[2] += x[j + 4] * y[j + 5]; | |||||
| dot[3] += x[j + 5] * y[j + 4]; | |||||
| dot[0] += x[j + 6] * y[j + 6]; | |||||
| dot[1] += x[j + 7] * y[j + 7]; | |||||
| dot[2] += x[j + 6] * y[j + 7]; | |||||
| dot[3] += x[j + 7] * y[j + 6]; | |||||
| j += 8; | |||||
| i += 4; | |||||
| } | |||||
| d[0] = dot[0]; | |||||
| d[1] = dot[1]; | |||||
| d[2] = dot[2]; | |||||
| d[3] = dot[3]; | |||||
| } | |||||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | |||||
| BLASLONG i; | |||||
| BLASLONG ix, iy; | |||||
| OPENBLAS_COMPLEX_FLOAT result; | |||||
| FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; | |||||
| if (n <= 0) { | |||||
| CREAL(result) = 0.0; | |||||
| CIMAG(result) = 0.0; | |||||
| return (result); | |||||
| } | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1) | |||||
| zdot_kernel_8(n1, x, y, dot); | |||||
| i = n1; | |||||
| BLASLONG j = i * 2; | |||||
| while (i < n) { | |||||
| dot[0] += x[j] * y[j]; | |||||
| dot[1] += x[j + 1] * y[j + 1]; | |||||
| dot[2] += x[j] * y[j + 1]; | |||||
| dot[3] += x[j + 1] * y[j]; | |||||
| j += 2; | |||||
| i++; | |||||
| } | |||||
| } else { | |||||
| i = 0; | |||||
| ix = 0; | |||||
| iy = 0; | |||||
| inc_x <<= 1; | |||||
| inc_y <<= 1; | |||||
| while (i < n) { | |||||
| dot[0] += x[ix] * y[iy]; | |||||
| dot[1] += x[ix + 1] * y[iy + 1]; | |||||
| dot[2] += x[ix] * y[iy + 1]; | |||||
| dot[3] += x[ix + 1] * y[iy]; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| } | |||||
| } | |||||
| #if !defined(CONJ) | |||||
| CREAL(result) = dot[0] - dot[1]; | |||||
| CIMAG(result) = dot[2] + dot[3]; | |||||
| #else | |||||
| CREAL(result) = dot[0] + dot[1]; | |||||
| CIMAG(result) = dot[2] - dot[3]; | |||||
| #endif | |||||
| return (result); | |||||
| } | |||||
| @@ -0,0 +1,919 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include <stdlib.h> | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| #define HAVE_KERNEL_4x4_VEC 1 | |||||
| #define HAVE_KERNEL_4x2_VEC 1 | |||||
| #define HAVE_KERNEL_4x1_VEC 1 | |||||
| #define HAVE_KERNEL_ADDY 1 | |||||
| #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) | |||||
| #include <vecintrin.h> | |||||
| #endif | |||||
| /** | |||||
| * If IGNORE_TEMP_PERM is defined, ybuffer is stored and used in split form: {real,real} {imag,imag}. | |||||
| * If not, it is loaded and stored in the normal interleaved way. | |||||
| */ | |||||
| #if (defined(HAVE_KERNEL_4x4_VEC_ASM) || defined(HAVE_KERNEL_4x4_VEC) ) && defined(HAVE_KERNEL_4x2_VEC) && defined(HAVE_KERNEL_4x1_VEC) && defined(HAVE_KERNEL_ADDY) | |||||
| // #define IGNORE_TEMP_PERM 1 | |||||
| #endif | |||||
| #define NBMAX 1024 | |||||
| #ifdef HAVE_KERNEL_4x4_VEC_ASM | |||||
| #elif HAVE_KERNEL_4x4_VEC | |||||
| static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *a0,*a1,*a2,*a3; | |||||
| a0 = ap[0]; | |||||
| a1 = ap[1]; | |||||
| a2 = ap[2]; | |||||
| a3 = ap[3]; | |||||
| register __vector double vx0_r = {x[0],x[0]}; | |||||
| register __vector double vx0_i = {x[1],x[1]}; | |||||
| register __vector double vx1_r = {x[2],x[2]}; | |||||
| register __vector double vx1_i = {x[3],x[3]}; | |||||
| register __vector double vx2_r = {x[4],x[4]}; | |||||
| register __vector double vx2_i = {x[5],x[5]}; | |||||
| register __vector double vx3_r = {x[6],x[6]}; | |||||
| register __vector double vx3_i = {x[7],x[7]}; | |||||
| #ifdef IGNORE_TEMP_PERM | |||||
| register __vector double *vy = (__vector double *)y; | |||||
| register BLASLONG j=0; | |||||
| #endif | |||||
| for ( i=0; i< 2*n; i+=4 ) | |||||
| { | |||||
| #ifdef IGNORE_TEMP_PERM | |||||
| register __vector double vresult_r = vy[j]; | |||||
| register __vector double vresult_i = vy[j+1]; | |||||
| #else | |||||
| register __vector double vresult_r = {y[i],y[i+2]}; | |||||
| register __vector double vresult_i = {y[i+1],y[i+3]}; | |||||
| #endif | |||||
| register __vector double va0_r= {a0[i],a0[i+2]}; | |||||
| register __vector double va0_i= {a0[i+1],a0[i+3]}; | |||||
| register __vector double va1_r= {a1[i],a1[i+2]}; | |||||
| register __vector double va1_i= {a1[i+1],a1[i+3]}; | |||||
| register __vector double va2_r= {a2[i],a2[i+2]}; | |||||
| register __vector double va2_i= {a2[i+1],a2[i+3]}; | |||||
| register __vector double va3_r= {a3[i],a3[i+2]}; | |||||
| register __vector double va3_i= {a3[i+1],a3[i+3]}; | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| vresult_r = va0_r * vx0_r - (va0_i*vx0_i -vresult_r) ; | |||||
| vresult_i = vresult_i + va0_r * vx0_i + va0_i * vx0_r ; | |||||
| vresult_r = va1_r * vx1_r - (va1_i*vx1_i -vresult_r) ; | |||||
| vresult_i = vresult_i + va1_r * vx1_i + va1_i * vx1_r ; | |||||
| vresult_r = va2_r * vx2_r - (va2_i*vx2_i -vresult_r) ; | |||||
| vresult_i = vresult_i + va2_r * vx2_i + va2_i * vx2_r ; | |||||
| vresult_r = va3_r * vx3_r - (va3_i*vx3_i -vresult_r) ; | |||||
| vresult_i = vresult_i + va3_r * vx3_i + va3_i * vx3_r ; | |||||
| #else | |||||
| vresult_r = vresult_r + va0_r * vx0_r + va0_i*vx0_i ; | |||||
| vresult_i = va0_r * vx0_i - ( va0_i * vx0_r - vresult_i) ; | |||||
| vresult_r = vresult_r + va1_r * vx1_r + va1_i*vx1_i ; | |||||
| vresult_i = va1_r * vx1_i - ( va1_i * vx1_r - vresult_i) ; | |||||
| vresult_r = vresult_r + va2_r * vx2_r + va2_i*vx2_i ; | |||||
| vresult_i = va2_r * vx2_i - ( va2_i * vx2_r - vresult_i) ; | |||||
| vresult_r = vresult_r + va3_r * vx3_r + va3_i*vx3_i ; | |||||
| vresult_i = va3_r * vx3_i - ( va3_i * vx3_r - vresult_i) ; | |||||
| #endif | |||||
| #ifdef IGNORE_TEMP_PERM | |||||
| vy[j] = vresult_r ; | |||||
| vy[j+1] = vresult_i ; | |||||
| j+=2; | |||||
| #else | |||||
| y[i] = vresult_r[0]; | |||||
| y[i+1] = vresult_i[0]; | |||||
| y[i +2 ] = vresult_r[1]; | |||||
| y[i + 3 ] = vresult_i[1]; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| #else | |||||
| static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *a0,*a1,*a2,*a3; | |||||
| a0 = ap[0]; | |||||
| a1 = ap[1]; | |||||
| a2 = ap[2]; | |||||
| a3 = ap[3]; | |||||
| for ( i=0; i< 2*n; i+=2 ) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| y[i] += a0[i]*x[0] - a0[i+1] * x[1]; | |||||
| y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; | |||||
| y[i] += a1[i]*x[2] - a1[i+1] * x[3]; | |||||
| y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; | |||||
| y[i] += a2[i]*x[4] - a2[i+1] * x[5]; | |||||
| y[i+1] += a2[i]*x[5] + a2[i+1] * x[4]; | |||||
| y[i] += a3[i]*x[6] - a3[i+1] * x[7]; | |||||
| y[i+1] += a3[i]*x[7] + a3[i+1] * x[6]; | |||||
| #else | |||||
| y[i] += a0[i]*x[0] + a0[i+1] * x[1]; | |||||
| y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; | |||||
| y[i] += a1[i]*x[2] + a1[i+1] * x[3]; | |||||
| y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; | |||||
| y[i] += a2[i]*x[4] + a2[i+1] * x[5]; | |||||
| y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; | |||||
| y[i] += a3[i]*x[6] + a3[i+1] * x[7]; | |||||
| y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_4x2_VEC | |||||
| static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *a0,*a1; | |||||
| a0 = ap[0]; | |||||
| a1 = ap[1]; | |||||
| register __vector double vx0_r = {x[0],x[0]}; | |||||
| register __vector double vx0_i = {x[1],x[1]}; | |||||
| register __vector double vx1_r = {x[2],x[2]}; | |||||
| register __vector double vx1_i = {x[3],x[3]}; | |||||
| #ifdef IGNORE_TEMP_PERM | |||||
| register __vector double *vy = (__vector double *)y; | |||||
| register BLASLONG j=0; | |||||
| #endif | |||||
| for ( i=0; i< 2*n; i+=4 ) | |||||
| { | |||||
| #ifdef IGNORE_TEMP_PERM | |||||
| register __vector double vresult_r = vy[j]; | |||||
| register __vector double vresult_i = vy[j+1]; | |||||
| #else | |||||
| register __vector double vresult_r = {y[i],y[i+2]}; | |||||
| register __vector double vresult_i = {y[i+1],y[i+3]}; | |||||
| #endif | |||||
| register __vector double va0_r= {a0[i],a0[i+2]}; | |||||
| register __vector double va0_i= {a0[i+1],a0[i+3]}; | |||||
| register __vector double va1_r= {a1[i],a1[i+2]}; | |||||
| register __vector double va1_i= {a1[i+1],a1[i+3]}; | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| vresult_r = va0_r * vx0_r - (va0_i*vx0_i -vresult_r) ; | |||||
| vresult_i = vresult_i + va0_r * vx0_i + va0_i * vx0_r ; | |||||
| vresult_r = va1_r * vx1_r - (va1_i*vx1_i -vresult_r) ; | |||||
| vresult_i = vresult_i + va1_r * vx1_i + va1_i * vx1_r ; | |||||
| #else | |||||
| vresult_r = vresult_r + va0_r * vx0_r + va0_i*vx0_i ; | |||||
| vresult_i = va0_r * vx0_i - ( va0_i * vx0_r - vresult_i) ; | |||||
| vresult_r = vresult_r + va1_r * vx1_r + va1_i*vx1_i ; | |||||
| vresult_i = va1_r * vx1_i - ( va1_i * vx1_r - vresult_i) ; | |||||
| #endif | |||||
| #ifdef IGNORE_TEMP_PERM | |||||
| vy[j] = vresult_r ; | |||||
| vy[j+1] = vresult_i ; | |||||
| j+=2; | |||||
| #else | |||||
| y[i] = vresult_r[0]; | |||||
| y[i+1] = vresult_i[0]; | |||||
| y[i +2 ] = vresult_r[1]; | |||||
| y[i + 3 ] = vresult_i[1]; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| #else | |||||
| static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *a0,*a1; | |||||
| a0 = ap[0]; | |||||
| a1 = ap[1]; | |||||
| for ( i=0; i< 2*n; i+=2 ) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| y[i] += a0[i]*x[0] - a0[i+1] * x[1]; | |||||
| y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; | |||||
| y[i] += a1[i]*x[2] - a1[i+1] * x[3]; | |||||
| y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; | |||||
| #else | |||||
| y[i] += a0[i]*x[0] + a0[i+1] * x[1]; | |||||
| y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; | |||||
| y[i] += a1[i]*x[2] + a1[i+1] * x[3]; | |||||
| y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_4x1_VEC | |||||
| static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *a0; | |||||
| a0 = ap; | |||||
| register __vector double vx_r = {x[0],x[0]}; | |||||
| register __vector double vx_i = {x[1],x[1]}; | |||||
| #ifdef IGNORE_TEMP_PERM | |||||
| register __vector double *vy = (__vector double *)y; | |||||
| register BLASLONG j=0; | |||||
| #endif | |||||
| for ( i=0; i< 2*n; i+=4 ) | |||||
| { | |||||
| #ifdef IGNORE_TEMP_PERM | |||||
| register __vector double vresult_r = vy[j]; | |||||
| register __vector double vresult_i = vy[j+1]; | |||||
| #else | |||||
| register __vector double vresult_r = {y[i],y[i+2]}; | |||||
| register __vector double vresult_i = {y[i+1],y[i+3]}; | |||||
| #endif | |||||
| register __vector double va0_r= {a0[i],a0[i+2]}; | |||||
| register __vector double va0_i= {a0[i+1],a0[i+3]}; | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| vresult_r = va0_r * vx_r - (va0_i*vx_i -vresult_r) ; | |||||
| vresult_i = vresult_i + va0_r * vx_i + va0_i * vx_r ; | |||||
| #else | |||||
| vresult_r = vresult_r + va0_r * vx_r + va0_i*vx_i ; | |||||
| vresult_i = va0_r * vx_i - ( va0_i * vx_r - vresult_i) ; | |||||
| // y[i] += a0[i]*x[0] + a0[i+1] * x[1]; | |||||
| // y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; | |||||
| #endif | |||||
| #ifndef IGNORE_TEMP_PERM | |||||
| y[i] = vresult_r[0]; | |||||
| y[i+1] = vresult_i[0]; | |||||
| y[i +2 ] = vresult_r[1]; | |||||
| y[i + 3 ] = vresult_i[1]; | |||||
| #else | |||||
| vy[j] = vresult_r ; | |||||
| vy[j+1] = vresult_i ; | |||||
| j+=2; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| #else | |||||
| static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *a0; | |||||
| a0 = ap; | |||||
| for ( i=0; i< 2*n; i+=2 ) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| y[i] += a0[i]*x[0] - a0[i+1] * x[1]; | |||||
| y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; | |||||
| #else | |||||
| y[i] += a0[i]*x[0] + a0[i+1] * x[1]; | |||||
| y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_ADDY | |||||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) | |||||
| { | |||||
| BLASLONG i; | |||||
| #ifdef IGNORE_TEMP_PERM | |||||
| register __vector double *src_vec = (__vector double *)src; | |||||
| #endif | |||||
| register __vector double valpha_r = {alpha_r,alpha_r}; | |||||
| register __vector double valpha_i = {alpha_i,alpha_i}; | |||||
| register __vector double vresult_r; | |||||
| register __vector double vresult_i; | |||||
| if ( inc_dest != 2 ) | |||||
| { | |||||
| for ( i=0; i<n; i+=2 ) | |||||
| { | |||||
| #ifdef IGNORE_TEMP_PERM | |||||
| register __vector double vsrc_r= src_vec[i]; | |||||
| register __vector double vsrc_i= src_vec[i+1]; | |||||
| #else | |||||
| register __vector double vsrc_r= {src[0],src[2]}; | |||||
| register __vector double vsrc_i= {src[1],src[3]}; | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| vresult_r = vsrc_r * valpha_r; | |||||
| vresult_r-= vsrc_i*valpha_i ; | |||||
| vresult_i = vsrc_r * valpha_i + vsrc_i * valpha_r ; | |||||
| #else | |||||
| vresult_r = vsrc_r * valpha_r + vsrc_i*valpha_i ; | |||||
| vresult_i = vsrc_r * valpha_i ; | |||||
| vresult_i -= vsrc_i * valpha_r ; | |||||
| #endif | |||||
| *dest += vresult_r[0]; | |||||
| *(dest+1) += vresult_i[0]; | |||||
| *(dest + inc_dest) += vresult_r[1]; | |||||
| *(dest+inc_dest+1) += vresult_i[1]; | |||||
| #ifndef IGNORE_TEMP_PERM | |||||
| src+=4; | |||||
| #endif | |||||
| dest += 2*inc_dest; | |||||
| } | |||||
| return; | |||||
| } | |||||
| for ( i=0; i<n; i+=2 ) | |||||
| { | |||||
| #ifdef IGNORE_TEMP_PERM | |||||
| register __vector double vsrc_r= src_vec[i]; | |||||
| register __vector double vsrc_i= src_vec[i+1]; | |||||
| #else | |||||
| register __vector double vsrc_r= {src[0],src[2]}; | |||||
| register __vector double vsrc_i= {src[1],src[3]}; | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| vresult_r = vsrc_r * valpha_r; | |||||
| vresult_r-= vsrc_i*valpha_i ; | |||||
| vresult_i = vsrc_r * valpha_i + vsrc_i * valpha_r ; | |||||
| #else | |||||
| vresult_r = vsrc_r * valpha_r + vsrc_i*valpha_i ; | |||||
| vresult_i = vsrc_r * valpha_i ; | |||||
| vresult_i -= vsrc_i * valpha_r ; | |||||
| #endif | |||||
| *dest += vresult_r[0]; | |||||
| *(dest+1) += vresult_i[0]; | |||||
| *(dest + 2) += vresult_r[1]; | |||||
| *(dest+3) += vresult_i[1]; | |||||
| #ifndef IGNORE_TEMP_PERM | |||||
| src+=4; | |||||
| #endif | |||||
| dest += 4; | |||||
| } | |||||
| return; | |||||
| return; | |||||
| } | |||||
| #else | |||||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) | |||||
| { | |||||
| BLASLONG i; | |||||
| if ( inc_dest != 2 ) | |||||
| { | |||||
| FLOAT temp_r; | |||||
| FLOAT temp_i; | |||||
| for ( i=0; i<n; i++ ) | |||||
| { | |||||
| #if !defined(XCONJ) | |||||
| temp_r = alpha_r * src[0] - alpha_i * src[1]; | |||||
| temp_i = alpha_r * src[1] + alpha_i * src[0]; | |||||
| #else | |||||
| temp_r = alpha_r * src[0] + alpha_i * src[1]; | |||||
| temp_i = -alpha_r * src[1] + alpha_i * src[0]; | |||||
| #endif | |||||
| *dest += temp_r; | |||||
| *(dest+1) += temp_i; | |||||
| src+=2; | |||||
| dest += inc_dest; | |||||
| } | |||||
| return; | |||||
| } | |||||
| FLOAT temp_r0; | |||||
| FLOAT temp_i0; | |||||
| FLOAT temp_r1; | |||||
| FLOAT temp_i1; | |||||
| FLOAT temp_r2; | |||||
| FLOAT temp_i2; | |||||
| FLOAT temp_r3; | |||||
| FLOAT temp_i3; | |||||
| for ( i=0; i<n; i+=4 ) | |||||
| { | |||||
| #if !defined(XCONJ) | |||||
| temp_r0 = alpha_r * src[0] - alpha_i * src[1]; | |||||
| temp_i0 = alpha_r * src[1] + alpha_i * src[0]; | |||||
| temp_r1 = alpha_r * src[2] - alpha_i * src[3]; | |||||
| temp_i1 = alpha_r * src[3] + alpha_i * src[2]; | |||||
| temp_r2 = alpha_r * src[4] - alpha_i * src[5]; | |||||
| temp_i2 = alpha_r * src[5] + alpha_i * src[4]; | |||||
| temp_r3 = alpha_r * src[6] - alpha_i * src[7]; | |||||
| temp_i3 = alpha_r * src[7] + alpha_i * src[6]; | |||||
| #else | |||||
| temp_r0 = alpha_r * src[0] + alpha_i * src[1]; | |||||
| temp_i0 = -alpha_r * src[1] + alpha_i * src[0]; | |||||
| temp_r1 = alpha_r * src[2] + alpha_i * src[3]; | |||||
| temp_i1 = -alpha_r * src[3] + alpha_i * src[2]; | |||||
| temp_r2 = alpha_r * src[4] + alpha_i * src[5]; | |||||
| temp_i2 = -alpha_r * src[5] + alpha_i * src[4]; | |||||
| temp_r3 = alpha_r * src[6] + alpha_i * src[7]; | |||||
| temp_i3 = -alpha_r * src[7] + alpha_i * src[6]; | |||||
| #endif | |||||
| dest[0] += temp_r0; | |||||
| dest[1] += temp_i0; | |||||
| dest[2] += temp_r1; | |||||
| dest[3] += temp_i1; | |||||
| dest[4] += temp_r2; | |||||
| dest[5] += temp_i2; | |||||
| dest[6] += temp_r3; | |||||
| dest[7] += temp_i3; | |||||
| src += 8; | |||||
| dest += 8; | |||||
| } | |||||
| return; | |||||
| } | |||||
| #endif | |||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||||
| { | |||||
| BLASLONG i; | |||||
| BLASLONG j; | |||||
| FLOAT *a_ptr; | |||||
| FLOAT *x_ptr; | |||||
| FLOAT *y_ptr; | |||||
| FLOAT *ap[4]; | |||||
| BLASLONG n1; | |||||
| BLASLONG m1; | |||||
| BLASLONG m2; | |||||
| BLASLONG m3; | |||||
| BLASLONG n2; | |||||
| BLASLONG lda4; | |||||
| FLOAT xbuffer[8],*ybuffer; | |||||
| if ( m < 1 ) return(0); | |||||
| if ( n < 1 ) return(0); | |||||
| ybuffer = buffer; | |||||
| inc_x *= 2; | |||||
| inc_y *= 2; | |||||
| lda *= 2; | |||||
| lda4 = 4 * lda; | |||||
| n1 = n / 4 ; | |||||
| n2 = n % 4 ; | |||||
| m3 = m % 4; | |||||
| m1 = m - ( m % 4 ); | |||||
| m2 = (m % NBMAX) - (m % 4) ; | |||||
| y_ptr = y; | |||||
| BLASLONG NB = NBMAX; | |||||
| while ( NB == NBMAX ) | |||||
| { | |||||
| m1 -= NB; | |||||
| if ( m1 < 0) | |||||
| { | |||||
| if ( m2 == 0 ) break; | |||||
| NB = m2; | |||||
| } | |||||
| a_ptr = a; | |||||
| ap[0] = a_ptr; | |||||
| ap[1] = a_ptr + lda; | |||||
| ap[2] = ap[1] + lda; | |||||
| ap[3] = ap[2] + lda; | |||||
| x_ptr = x; | |||||
| //zero_y(NB,ybuffer); | |||||
| memset(ybuffer,0,NB*16); | |||||
| if ( inc_x == 2 ) | |||||
| { | |||||
| for( i = 0; i < n1 ; i++) | |||||
| { | |||||
| zgemv_kernel_4x4(NB,ap,x_ptr,ybuffer); | |||||
| ap[0] += lda4; | |||||
| ap[1] += lda4; | |||||
| ap[2] += lda4; | |||||
| ap[3] += lda4; | |||||
| a_ptr += lda4; | |||||
| x_ptr += 8; | |||||
| } | |||||
| if ( n2 & 2 ) | |||||
| { | |||||
| zgemv_kernel_4x2(NB,ap,x_ptr,ybuffer); | |||||
| x_ptr += 4; | |||||
| a_ptr += 2 * lda; | |||||
| } | |||||
| if ( n2 & 1 ) | |||||
| { | |||||
| zgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer); | |||||
| x_ptr += 2; | |||||
| a_ptr += lda; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| for( i = 0; i < n1 ; i++) | |||||
| { | |||||
| xbuffer[0] = x_ptr[0]; | |||||
| xbuffer[1] = x_ptr[1]; | |||||
| x_ptr += inc_x; | |||||
| xbuffer[2] = x_ptr[0]; | |||||
| xbuffer[3] = x_ptr[1]; | |||||
| x_ptr += inc_x; | |||||
| xbuffer[4] = x_ptr[0]; | |||||
| xbuffer[5] = x_ptr[1]; | |||||
| x_ptr += inc_x; | |||||
| xbuffer[6] = x_ptr[0]; | |||||
| xbuffer[7] = x_ptr[1]; | |||||
| x_ptr += inc_x; | |||||
| zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer); | |||||
| ap[0] += lda4; | |||||
| ap[1] += lda4; | |||||
| ap[2] += lda4; | |||||
| ap[3] += lda4; | |||||
| a_ptr += lda4; | |||||
| } | |||||
| for( i = 0; i < n2 ; i++) | |||||
| { | |||||
| xbuffer[0] = x_ptr[0]; | |||||
| xbuffer[1] = x_ptr[1]; | |||||
| x_ptr += inc_x; | |||||
| zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); | |||||
| a_ptr += 1 * lda; | |||||
| } | |||||
| } | |||||
| add_y(NB,ybuffer,y_ptr,inc_y,alpha_r,alpha_i); | |||||
| a += 2 * NB; | |||||
| y_ptr += NB * inc_y; | |||||
| } | |||||
| if ( m3 == 0 ) return(0); | |||||
| if ( m3 == 1 ) | |||||
| { | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| FLOAT temp_r = 0.0; | |||||
| FLOAT temp_i = 0.0; | |||||
| if ( lda == 2 && inc_x == 2 ) | |||||
| { | |||||
| for( i=0 ; i < (n & -2); i+=2 ) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
| temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
| temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3]; | |||||
| temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2]; | |||||
| #else | |||||
| temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
| temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
| temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3]; | |||||
| temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2]; | |||||
| #endif | |||||
| a_ptr += 4; | |||||
| x_ptr += 4; | |||||
| } | |||||
| for( ; i < n; i++ ) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
| temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
| #else | |||||
| temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
| temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
| #endif | |||||
| a_ptr += 2; | |||||
| x_ptr += 2; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| for( i = 0; i < n; i++ ) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
| temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
| #else | |||||
| temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
| temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
| #endif | |||||
| a_ptr += lda; | |||||
| x_ptr += inc_x; | |||||
| } | |||||
| } | |||||
| #if !defined(XCONJ) | |||||
| y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||||
| y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||||
| #else | |||||
| y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||||
| y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||||
| #endif | |||||
| return(0); | |||||
| } | |||||
| if ( m3 == 2 ) | |||||
| { | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| FLOAT temp_r0 = 0.0; | |||||
| FLOAT temp_i0 = 0.0; | |||||
| FLOAT temp_r1 = 0.0; | |||||
| FLOAT temp_i1 = 0.0; | |||||
| if ( lda == 4 && inc_x == 2 ) | |||||
| { | |||||
| for( i = 0; i < (n & -2); i+=2 ) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||||
| temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3]; | |||||
| temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2]; | |||||
| temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3]; | |||||
| temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2]; | |||||
| #else | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||||
| temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3]; | |||||
| temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2]; | |||||
| temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||||
| temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2]; | |||||
| #endif | |||||
| a_ptr += 8; | |||||
| x_ptr += 4; | |||||
| } | |||||
| for( ; i < n; i++ ) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||||
| #else | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||||
| #endif | |||||
| a_ptr += 4; | |||||
| x_ptr += 2; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| for( i=0 ; i < n; i++ ) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||||
| #else | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||||
| #endif | |||||
| a_ptr += lda; | |||||
| x_ptr += inc_x; | |||||
| } | |||||
| } | |||||
| #if !defined(XCONJ) | |||||
| y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
| y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
| y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
| #else | |||||
| y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
| y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
| y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
| #endif | |||||
| return(0); | |||||
| } | |||||
| if ( m3 == 3 ) | |||||
| { | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| FLOAT temp_r0 = 0.0; | |||||
| FLOAT temp_i0 = 0.0; | |||||
| FLOAT temp_r1 = 0.0; | |||||
| FLOAT temp_i1 = 0.0; | |||||
| FLOAT temp_r2 = 0.0; | |||||
| FLOAT temp_i2 = 0.0; | |||||
| if ( lda == 6 && inc_x == 2 ) | |||||
| { | |||||
| for( i=0 ; i < n; i++ ) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||||
| temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; | |||||
| temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; | |||||
| #else | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||||
| temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||||
| temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; | |||||
| #endif | |||||
| a_ptr += 6; | |||||
| x_ptr += 2; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| for( i = 0; i < n; i++ ) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||||
| temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; | |||||
| temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; | |||||
| #else | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||||
| temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||||
| temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; | |||||
| #endif | |||||
| a_ptr += lda; | |||||
| x_ptr += inc_x; | |||||
| } | |||||
| } | |||||
| #if !defined(XCONJ) | |||||
| y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
| y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
| y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2; | |||||
| y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2; | |||||
| #else | |||||
| y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
| y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
| y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2; | |||||
| y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2; | |||||
| #endif | |||||
| return(0); | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| @@ -0,0 +1,788 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #define NBMAX 1024 | |||||
| #define HAVE_KERNEL_4x4_VEC 1 | |||||
| #define HAVE_KERNEL_4x2_VEC 1 | |||||
| #define HAVE_KERNEL_4x1_VEC 1 | |||||
| #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) | |||||
| #include <vecintrin.h> | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_4x4_VEC_ASM | |||||
| #elif HAVE_KERNEL_4x4_VEC | |||||
| static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *a0,*a1,*a2,*a3; | |||||
| a0 = ap[0]; | |||||
| a1 = ap[1]; | |||||
| a2 = ap[2]; | |||||
| a3 = ap[3]; | |||||
| register __vector double vtemp0_r = {0.0,0.0}; | |||||
| register __vector double vtemp0_i = {0.0,0.0}; | |||||
| register __vector double vtemp1_r = {0.0,0.0}; | |||||
| register __vector double vtemp1_i = {0.0,0.0}; | |||||
| register __vector double vtemp2_r = {0.0,0.0}; | |||||
| register __vector double vtemp2_i = {0.0,0.0}; | |||||
| register __vector double vtemp3_r = {0.0,0.0}; | |||||
| register __vector double vtemp3_i = {0.0,0.0}; | |||||
| for ( i=0; i< 2*n; i+=4 ) | |||||
| { | |||||
| register __vector double vx_r = {x[i],x[i+2]}; | |||||
| register __vector double vx_i = {x[i+1],x[i+3]}; | |||||
| register __vector double va0_r= {a0[i],a0[i+2]}; | |||||
| register __vector double va0_i= {a0[i+1],a0[i+3]}; | |||||
| register __vector double va1_r= {a1[i],a1[i+2]}; | |||||
| register __vector double va1_i= {a1[i+1],a1[i+3]}; | |||||
| register __vector double va2_r= {a2[i],a2[i+2]}; | |||||
| register __vector double va2_i= {a2[i+1],a2[i+3]}; | |||||
| register __vector double va3_r= {a3[i],a3[i+2]}; | |||||
| register __vector double va3_i= {a3[i+1],a3[i+3]}; | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| vtemp0_r = va0_r * vx_r - (va0_i*vx_i -vtemp0_r) ; | |||||
| vtemp0_i = vtemp0_i + va0_r * vx_i + va0_i * vx_r ; | |||||
| vtemp1_r = va1_r * vx_r - (va1_i*vx_i -vtemp1_r) ; | |||||
| vtemp1_i = vtemp1_i + va1_r * vx_i + va1_i * vx_r ; | |||||
| vtemp2_r = va2_r * vx_r - (va2_i*vx_i -vtemp2_r) ; | |||||
| vtemp2_i = vtemp2_i + va2_r * vx_i + va2_i * vx_r ; | |||||
| vtemp3_r = va3_r * vx_r - (va3_i*vx_i -vtemp3_r) ; | |||||
| vtemp3_i = vtemp3_i + va3_r * vx_i + va3_i * vx_r ; | |||||
| #else | |||||
| vtemp0_r = vtemp0_r + va0_r * vx_r + va0_i*vx_i ; | |||||
| vtemp0_i = va0_r * vx_i - ( va0_i * vx_r - vtemp0_i) ; | |||||
| vtemp1_r = vtemp1_r + va1_r * vx_r + va1_i*vx_i ; | |||||
| vtemp1_i = va1_r * vx_i - ( va1_i * vx_r - vtemp1_i); | |||||
| vtemp2_r = vtemp2_r + va2_r * vx_r + va2_i*vx_i ; | |||||
| vtemp2_i = va2_r * vx_i - ( va2_i * vx_r - vtemp2_i) ; | |||||
| vtemp3_r = vtemp3_r + va3_r * vx_r + va3_i*vx_i ; | |||||
| vtemp3_i = va3_r * vx_i - ( va3_i * vx_r - vtemp3_i); | |||||
| #endif | |||||
| } | |||||
| register FLOAT alpha_r = alpha[0] ; | |||||
| register FLOAT alpha_i = alpha[1] ; | |||||
| register FLOAT temp_r0 = vtemp0_r[0]+vtemp0_r[1] ; | |||||
| register FLOAT temp_i0 = vtemp0_i[0]+vtemp0_i[1] ; | |||||
| register FLOAT temp_r1 = vtemp1_r[0]+vtemp1_r[1] ; | |||||
| register FLOAT temp_i1 = vtemp1_i[0]+vtemp1_i[1] ; | |||||
| register FLOAT temp_r2 = vtemp2_r[0]+vtemp2_r[1] ; | |||||
| register FLOAT temp_i2 = vtemp2_i[0]+vtemp2_i[1] ; | |||||
| register FLOAT temp_r3 = vtemp3_r[0]+vtemp3_r[1] ; | |||||
| register FLOAT temp_i3 = vtemp3_i[0]+vtemp3_i[1] ; | |||||
| #if !defined(XCONJ) | |||||
| y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
| y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
| y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
| y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; | |||||
| y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; | |||||
| y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; | |||||
| y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; | |||||
| #else | |||||
| y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
| y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
| y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
| y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; | |||||
| y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; | |||||
| y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; | |||||
| y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; | |||||
| #endif | |||||
| } | |||||
| #else | |||||
| static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *a0,*a1,*a2,*a3; | |||||
| a0 = ap[0]; | |||||
| a1 = ap[1]; | |||||
| a2 = ap[2]; | |||||
| a3 = ap[3]; | |||||
| FLOAT alpha_r = alpha[0]; | |||||
| FLOAT alpha_i = alpha[1]; | |||||
| FLOAT temp_r0 = 0.0; | |||||
| FLOAT temp_r1 = 0.0; | |||||
| FLOAT temp_r2 = 0.0; | |||||
| FLOAT temp_r3 = 0.0; | |||||
| FLOAT temp_i0 = 0.0; | |||||
| FLOAT temp_i1 = 0.0; | |||||
| FLOAT temp_i2 = 0.0; | |||||
| FLOAT temp_i3 = 0.0; | |||||
| for ( i=0; i< 2*n; i+=2 ) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; | |||||
| temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; | |||||
| temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; | |||||
| temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; | |||||
| temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1]; | |||||
| temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i]; | |||||
| temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1]; | |||||
| temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i]; | |||||
| #else | |||||
| temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; | |||||
| temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; | |||||
| temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; | |||||
| temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; | |||||
| temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1]; | |||||
| temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i]; | |||||
| temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1]; | |||||
| temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i]; | |||||
| #endif | |||||
| } | |||||
| #if !defined(XCONJ) | |||||
| y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
| y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
| y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
| y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; | |||||
| y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; | |||||
| y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; | |||||
| y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; | |||||
| #else | |||||
| y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
| y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
| y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
| y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; | |||||
| y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; | |||||
| y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; | |||||
| y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; | |||||
| #endif | |||||
| } | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_4x2_VEC | |||||
| static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *a0,*a1; | |||||
| a0 = ap[0]; | |||||
| a1 = ap[1]; | |||||
| register __vector double vtemp0_r = {0.0,0.0}; | |||||
| register __vector double vtemp0_i = {0.0,0.0}; | |||||
| register __vector double vtemp1_r = {0.0,0.0}; | |||||
| register __vector double vtemp1_i = {0.0,0.0}; | |||||
| for ( i=0; i< 2*n; i+=4 ) | |||||
| { | |||||
| register __vector double vx_r = {x[i],x[i+2]}; | |||||
| register __vector double vx_i = {x[i+1],x[i+3]}; | |||||
| register __vector double va0_r= {a0[i],a0[i+2]}; | |||||
| register __vector double va0_i= {a0[i+1],a0[i+3]}; | |||||
| register __vector double va1_r= {a1[i],a1[i+2]}; | |||||
| register __vector double va1_i= {a1[i+1],a1[i+3]}; | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| vtemp0_r = va0_r * vx_r - (va0_i*vx_i -vtemp0_r) ; | |||||
| vtemp0_i = vtemp0_i + va0_r * vx_i + va0_i * vx_r ; | |||||
| vtemp1_r = va1_r * vx_r - (va1_i*vx_i -vtemp1_r) ; | |||||
| vtemp1_i = vtemp1_i + va1_r * vx_i + va1_i * vx_r ; | |||||
| #else | |||||
| vtemp0_r = vtemp0_r + va0_r * vx_r + va0_i*vx_i ; | |||||
| vtemp0_i = va0_r * vx_i - ( va0_i * vx_r - vtemp0_i) ; | |||||
| vtemp1_r = vtemp1_r + va1_r * vx_r + va1_i*vx_i ; | |||||
| vtemp1_i = va1_r * vx_i - ( va1_i * vx_r - vtemp1_i); | |||||
| #endif | |||||
| } | |||||
| register FLOAT temp_r0 = vtemp0_r[0]+vtemp0_r[1] ; | |||||
| register FLOAT temp_i0 = vtemp0_i[0]+vtemp0_i[1] ; | |||||
| register FLOAT temp_r1 = vtemp1_r[0]+vtemp1_r[1] ; | |||||
| register FLOAT temp_i1 = vtemp1_i[0]+vtemp1_i[1] ; | |||||
| register FLOAT alpha_r = alpha[0] ; | |||||
| register FLOAT alpha_i = alpha[1] ; | |||||
| #if !defined(XCONJ) | |||||
| y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
| y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
| y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
| #else | |||||
| y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
| y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
| y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
| #endif | |||||
| } | |||||
| #else | |||||
| static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *a0,*a1; | |||||
| a0 = ap[0]; | |||||
| a1 = ap[1]; | |||||
| FLOAT alpha_r = alpha[0]; | |||||
| FLOAT alpha_i = alpha[1]; | |||||
| FLOAT temp_r0 = 0.0; | |||||
| FLOAT temp_r1 = 0.0; | |||||
| FLOAT temp_i0 = 0.0; | |||||
| FLOAT temp_i1 = 0.0; | |||||
| for ( i=0; i< 2*n; i+=2 ) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; | |||||
| temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; | |||||
| temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; | |||||
| temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; | |||||
| #else | |||||
| temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; | |||||
| temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; | |||||
| temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; | |||||
| temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; | |||||
| #endif | |||||
| } | |||||
| #if !defined(XCONJ) | |||||
| y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
| y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
| y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
| #else | |||||
| y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
| y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
| y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
| #endif | |||||
| } | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_4x1_VEC | |||||
| /* | |||||
| * Vector-extension variant: accumulate the complex dot product of one column | |||||
| * (ap) with x over n complex entries and add alpha times the result to y[0..1]. | |||||
| * The loop consumes two complex entries (four FLOATs) per iteration and reads | |||||
| * a0[i+3] / x[i+3], so it expects n to be even. | |||||
| * Without CONJ/XCONJ (or with both) the terms are a*x; otherwise conj(a)*x. | |||||
| * With XCONJ the final update adds alpha*conj(sum) instead of alpha*sum. | |||||
| */ | |||||
| static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *a0; | |||||
| a0 = ap; | |||||
| /* Two-lane partial sums: lane 0 takes even entries, lane 1 odd entries. */ | |||||
| register __vector double vtemp_r = {0.0,0.0}; | |||||
| register __vector double vtemp_i = {0.0,0.0}; | |||||
| for ( i=0; i< 2*n; i+=4 ) | |||||
| { | |||||
| /* De-interleave two complex entries into real-part and imaginary-part vectors. */ | |||||
| register __vector double va0_r= {a0[i],a0[i+2]}; | |||||
| register __vector double va0_i= {a0[i+1],a0[i+3]}; | |||||
| register __vector double vx0_r = {x[i],x[i+2]}; | |||||
| register __vector double vx0_i = {x[i+1],x[i+3]}; | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| vtemp_r = va0_r * vx0_r - (va0_i*vx0_i -vtemp_r) ; | |||||
| vtemp_i = vtemp_i + va0_r * vx0_i + va0_i * vx0_r ; | |||||
| #else | |||||
| vtemp_r = vtemp_r + va0_r * vx0_r + va0_i*vx0_i ; | |||||
| vtemp_i = va0_r * vx0_i - ( va0_i * vx0_r - vtemp_i) ; | |||||
| #endif | |||||
| } | |||||
| /* Horizontal reduction of the two lanes into scalar real/imaginary sums. */ | |||||
| register FLOAT temp_r0 = vtemp_r[0]+vtemp_r[1] ; | |||||
| register FLOAT temp_i0 = vtemp_i[0]+vtemp_i[1] ; | |||||
| register FLOAT alpha_r = alpha[0] ; | |||||
| register FLOAT alpha_i = alpha[1] ; | |||||
| #if !defined(XCONJ) | |||||
| y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
| y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
| #else | |||||
| y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
| y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
| #endif | |||||
| } | |||||
| #else | |||||
| /* | |||||
| * Scalar fallback: form the complex dot product of the column at ap with x | |||||
| * over n complex entries, then add alpha times that sum to y[0..1]. | |||||
| * Without CONJ/XCONJ (or with both) the terms are a*x; otherwise conj(a)*x. | |||||
| * With XCONJ the final update adds alpha*conj(sum) instead of alpha*sum. | |||||
| */ | |||||
| static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| const FLOAT scale_re = alpha[0]; | |||||
| const FLOAT scale_im = alpha[1]; | |||||
| FLOAT dot_re = 0.0; | |||||
| FLOAT dot_im = 0.0; | |||||
| BLASLONG k; | |||||
| for ( k = 0; k < n; k++ ) | |||||
| { | |||||
| /* Entry k occupies FLOATs 2k (real) and 2k+1 (imaginary). */ | |||||
| const FLOAT *col = ap + 2*k; | |||||
| const FLOAT *vec = x + 2*k; | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| dot_re += col[0]*vec[0] - col[1]*vec[1]; | |||||
| dot_im += col[0]*vec[1] + col[1]*vec[0]; | |||||
| #else | |||||
| dot_re += col[0]*vec[0] + col[1]*vec[1]; | |||||
| dot_im += col[0]*vec[1] - col[1]*vec[0]; | |||||
| #endif | |||||
| } | |||||
| #if !defined(XCONJ) | |||||
| y[0] += scale_re * dot_re - scale_im * dot_im; | |||||
| y[1] += scale_re * dot_im + scale_im * dot_re; | |||||
| #else | |||||
| y[0] += scale_re * dot_re + scale_im * dot_im; | |||||
| y[1] -= scale_re * dot_im - scale_im * dot_re; | |||||
| #endif | |||||
| } | |||||
| #endif | |||||
| /* | |||||
| * Gather n complex values from src, whose entries are inc_src FLOATs apart, | |||||
| * into dest as a contiguous run of (real, imaginary) pairs. | |||||
| */ | |||||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) | |||||
| { | |||||
| BLASLONG remaining; | |||||
| for ( remaining = n; remaining > 0; remaining-- ) | |||||
| { | |||||
| dest[0] = src[0]; | |||||
| dest[1] = src[1]; | |||||
| dest += 2; | |||||
| src += inc_src; | |||||
| } | |||||
| } | |||||
| /* | |||||
| * Transposed complex matrix-vector driver: for each of the n columns of a, | |||||
| * add alpha times the dot product of that column (length m) with x to the | |||||
| * matching entry of y. Conjugation is selected by CONJ / XCONJ. | |||||
| * | |||||
| * m - length of x and of every column of a (complex entries) | |||||
| * n - number of columns of a / entries of y | |||||
| * dummy1 - unused | |||||
| * alpha_r/i - real and imaginary part of the scale factor | |||||
| * a, lda - matrix; consecutive columns are lda complex entries apart | |||||
| * x, inc_x - input vector and its stride in complex entries | |||||
| * y, inc_y - output vector (updated in place) and its stride | |||||
| * buffer - scratch space used to pack x when inc_x is not 1 | |||||
| * | |||||
| * Always returns 0. | |||||
| */ | |||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||||
| { | |||||
| BLASLONG i; | |||||
| BLASLONG j; | |||||
| FLOAT *a_ptr; | |||||
| FLOAT *x_ptr; | |||||
| FLOAT *y_ptr; | |||||
| FLOAT *ap[8]; | |||||
| BLASLONG n1; | |||||
| BLASLONG m1; | |||||
| BLASLONG m2; | |||||
| BLASLONG m3; | |||||
| BLASLONG n2; | |||||
| BLASLONG lda4; | |||||
| FLOAT ybuffer[8],*xbuffer; | |||||
| FLOAT alpha[2]; | |||||
| if ( m < 1 ) return(0); | |||||
| if ( n < 1 ) return(0); | |||||
| /* Strides are doubled so that from here on they count FLOATs, not complex entries. */ | |||||
| inc_x <<= 1; | |||||
| inc_y <<= 1; | |||||
| lda <<= 1; | |||||
| lda4 = lda << 2; | |||||
| xbuffer = buffer; | |||||
| /* Columns: n1 groups of four plus n2 leftovers. */ | |||||
| n1 = n >> 2 ; | |||||
| n2 = n & 3 ; | |||||
| /* Rows: m3 leftovers (handled after the blocked loop), m1 rows in multiples of four, */ | |||||
| /* of which the final partial block holds m2 rows. */ | |||||
| m3 = m & 3 ; | |||||
| m1 = m - m3; | |||||
| m2 = (m & (NBMAX-1)) - m3 ; | |||||
| alpha[0] = alpha_r; | |||||
| alpha[1] = alpha_i; | |||||
| BLASLONG NB = NBMAX; | |||||
| /* Process row blocks of NBMAX; the last pass shrinks NB to m2, which ends the loop. */ | |||||
| while ( NB == NBMAX ) | |||||
| { | |||||
| m1 -= NB; | |||||
| if ( m1 < 0) | |||||
| { | |||||
| if ( m2 == 0 ) break; | |||||
| NB = m2; | |||||
| } | |||||
| y_ptr = y; | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| ap[0] = a_ptr; | |||||
| ap[1] = a_ptr + lda; | |||||
| ap[2] = ap[1] + lda; | |||||
| ap[3] = ap[2] + lda; | |||||
| /* The kernels need contiguous x: pack strided input, otherwise use it in place. */ | |||||
| if ( inc_x != 2 ) | |||||
| copy_x(NB,x_ptr,xbuffer,inc_x); | |||||
| else | |||||
| xbuffer = x_ptr; | |||||
| if ( inc_y == 2 ) | |||||
| { | |||||
| /* Contiguous y: the kernels accumulate straight into y. */ | |||||
| for( i = 0; i < n1 ; i++) | |||||
| { | |||||
| zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); | |||||
| ap[0] += lda4; | |||||
| ap[1] += lda4; | |||||
| ap[2] += lda4; | |||||
| ap[3] += lda4; | |||||
| a_ptr += lda4; | |||||
| y_ptr += 8; | |||||
| } | |||||
| if ( n2 & 2 ) | |||||
| { | |||||
| zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); | |||||
| a_ptr += lda * 2; | |||||
| y_ptr += 4; | |||||
| } | |||||
| if ( n2 & 1 ) | |||||
| { | |||||
| zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); | |||||
| a_ptr += lda; | |||||
| y_ptr += 2; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| /* Strided y: accumulate into a zeroed scratch buffer, then scatter-add into y. */ | |||||
| for( i = 0; i < n1 ; i++) | |||||
| { | |||||
| /* Size taken from the array itself rather than a hard-coded byte count. */ | |||||
| memset(ybuffer,0,sizeof(ybuffer)); | |||||
| zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); | |||||
| ap[0] += lda4; | |||||
| ap[1] += lda4; | |||||
| ap[2] += lda4; | |||||
| ap[3] += lda4; | |||||
| a_ptr += lda4; | |||||
| y_ptr[0] += ybuffer[0]; | |||||
| y_ptr[1] += ybuffer[1]; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += ybuffer[2]; | |||||
| y_ptr[1] += ybuffer[3]; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += ybuffer[4]; | |||||
| y_ptr[1] += ybuffer[5]; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += ybuffer[6]; | |||||
| y_ptr[1] += ybuffer[7]; | |||||
| y_ptr += inc_y; | |||||
| } | |||||
| for( i = 0; i < n2 ; i++) | |||||
| { | |||||
| memset(ybuffer,0,sizeof(ybuffer)); | |||||
| zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); | |||||
| a_ptr += lda; | |||||
| y_ptr[0] += ybuffer[0]; | |||||
| y_ptr[1] += ybuffer[1]; | |||||
| y_ptr += inc_y; | |||||
| } | |||||
| } | |||||
| /* Advance to the next row block. */ | |||||
| a += 2 * NB; | |||||
| x += NB * inc_x; | |||||
| } | |||||
| /* Remaining m3 (1..3) rows are handled with plain scalar code for every column. */ | |||||
| if ( m3 == 0 ) return(0); | |||||
| x_ptr = x; | |||||
| j=0; | |||||
| a_ptr = a; | |||||
| y_ptr = y; | |||||
| if ( m3 == 3 ) | |||||
| { | |||||
| FLOAT temp_r ; | |||||
| FLOAT temp_i ; | |||||
| FLOAT x0 = x_ptr[0]; | |||||
| FLOAT x1 = x_ptr[1]; | |||||
| x_ptr += inc_x; | |||||
| FLOAT x2 = x_ptr[0]; | |||||
| FLOAT x3 = x_ptr[1]; | |||||
| x_ptr += inc_x; | |||||
| FLOAT x4 = x_ptr[0]; | |||||
| FLOAT x5 = x_ptr[1]; | |||||
| while ( j < n) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||||
| temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; | |||||
| temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; | |||||
| temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; | |||||
| temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; | |||||
| #else | |||||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||||
| temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; | |||||
| temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; | |||||
| temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; | |||||
| temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||||
| y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||||
| #else | |||||
| y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||||
| y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||||
| #endif | |||||
| a_ptr += lda; | |||||
| y_ptr += inc_y; | |||||
| j++; | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| if ( m3 == 2 ) | |||||
| { | |||||
| FLOAT temp_r ; | |||||
| FLOAT temp_i ; | |||||
| FLOAT temp_r1 ; | |||||
| FLOAT temp_i1 ; | |||||
| FLOAT x0 = x_ptr[0]; | |||||
| FLOAT x1 = x_ptr[1]; | |||||
| x_ptr += inc_x; | |||||
| FLOAT x2 = x_ptr[0]; | |||||
| FLOAT x3 = x_ptr[1]; | |||||
| FLOAT ar = alpha[0]; | |||||
| FLOAT ai = alpha[1]; | |||||
| /* Two columns per iteration, then at most one leftover column below. */ | |||||
| while ( j < ( n & -2 )) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||||
| temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; | |||||
| temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; | |||||
| a_ptr += lda; | |||||
| temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; | |||||
| temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; | |||||
| temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; | |||||
| temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; | |||||
| #else | |||||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||||
| temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; | |||||
| temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; | |||||
| a_ptr += lda; | |||||
| temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; | |||||
| temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; | |||||
| temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; | |||||
| temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| y_ptr[0] += ar * temp_r - ai * temp_i; | |||||
| y_ptr[1] += ar * temp_i + ai * temp_r; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += ar * temp_r1 - ai * temp_i1; | |||||
| y_ptr[1] += ar * temp_i1 + ai * temp_r1; | |||||
| #else | |||||
| y_ptr[0] += ar * temp_r + ai * temp_i; | |||||
| y_ptr[1] -= ar * temp_i - ai * temp_r; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += ar * temp_r1 + ai * temp_i1; | |||||
| y_ptr[1] -= ar * temp_i1 - ai * temp_r1; | |||||
| #endif | |||||
| a_ptr += lda; | |||||
| y_ptr += inc_y; | |||||
| j+=2; | |||||
| } | |||||
| while ( j < n) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||||
| temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; | |||||
| temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; | |||||
| #else | |||||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||||
| temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; | |||||
| temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| y_ptr[0] += ar * temp_r - ai * temp_i; | |||||
| y_ptr[1] += ar * temp_i + ai * temp_r; | |||||
| #else | |||||
| y_ptr[0] += ar * temp_r + ai * temp_i; | |||||
| y_ptr[1] -= ar * temp_i - ai * temp_r; | |||||
| #endif | |||||
| a_ptr += lda; | |||||
| y_ptr += inc_y; | |||||
| j++; | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| if ( m3 == 1 ) | |||||
| { | |||||
| FLOAT temp_r ; | |||||
| FLOAT temp_i ; | |||||
| FLOAT temp_r1 ; | |||||
| FLOAT temp_i1 ; | |||||
| FLOAT x0 = x_ptr[0]; | |||||
| FLOAT x1 = x_ptr[1]; | |||||
| FLOAT ar = alpha[0]; | |||||
| FLOAT ai = alpha[1]; | |||||
| /* Two columns per iteration, then at most one leftover column below. */ | |||||
| while ( j < ( n & -2 )) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||||
| a_ptr += lda; | |||||
| temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; | |||||
| temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; | |||||
| #else | |||||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||||
| a_ptr += lda; | |||||
| temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; | |||||
| temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| y_ptr[0] += ar * temp_r - ai * temp_i; | |||||
| y_ptr[1] += ar * temp_i + ai * temp_r; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += ar * temp_r1 - ai * temp_i1; | |||||
| y_ptr[1] += ar * temp_i1 + ai * temp_r1; | |||||
| #else | |||||
| y_ptr[0] += ar * temp_r + ai * temp_i; | |||||
| y_ptr[1] -= ar * temp_i - ai * temp_r; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += ar * temp_r1 + ai * temp_i1; | |||||
| y_ptr[1] -= ar * temp_i1 - ai * temp_r1; | |||||
| #endif | |||||
| a_ptr += lda; | |||||
| y_ptr += inc_y; | |||||
| j+=2; | |||||
| } | |||||
| while ( j < n) | |||||
| { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||||
| #else | |||||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| y_ptr[0] += ar * temp_r - ai * temp_i; | |||||
| y_ptr[1] += ar * temp_i + ai * temp_r; | |||||
| #else | |||||
| y_ptr[0] += ar * temp_r + ai * temp_i; | |||||
| y_ptr[1] -= ar * temp_i - ai * temp_r; | |||||
| #endif | |||||
| a_ptr += lda; | |||||
| y_ptr += inc_y; | |||||
| j++; | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| @@ -0,0 +1,276 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| /* | |||||
| * Apply a plane rotation to contiguous complex vectors x and y in place: | |||||
| * x <- c*x + s*y | |||||
| * y <- c*y - s*x | |||||
| * applied to every double (real and imaginary parts alike). | |||||
| * The loop runs n >> 4 times and covers 256 bytes of each vector per pass; | |||||
| * the caller in this file passes n & -16 and only when that is positive. | |||||
| * c and s are read through pointers and replicated into both vector lanes. | |||||
| */ | |||||
| static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) | |||||
| { | |||||
| __asm__ ( | |||||
| "pfd 2, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "vlrepg %%v0,0(%3) \n\t" | |||||
| "vlrepg %%v1,0(%4) \n\t" | |||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 0(%%r1,%1) \n\t" | |||||
| "vst %%v29, 16(%%r1,%1) \n\t" | |||||
| "vst %%v30, 32(%%r1,%1) \n\t" | |||||
| "vst %%v31, 48(%%r1,%1) \n\t" | |||||
| "vst %%v20, 0(%%r1,%2) \n\t" | |||||
| "vst %%v21, 16(%%r1,%2) \n\t" | |||||
| "vst %%v22, 32(%%r1,%2) \n\t" | |||||
| "vst %%v23, 48(%%r1,%2) \n\t" | |||||
| "vl %%v24, 64(%%r1,%1) \n\t" | |||||
| "vl %%v25, 80(%%r1,%1) \n\t" | |||||
| "vl %%v26, 96(%%r1,%1) \n\t" | |||||
| "vl %%v27,112(%%r1,%1) \n\t" | |||||
| "vl %%v16, 64(%%r1,%2) \n\t" | |||||
| "vl %%v17, 80(%%r1,%2) \n\t" | |||||
| "vl %%v18, 96(%%r1,%2) \n\t" | |||||
| "vl %%v19,112(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 64(%%r1,%1) \n\t" | |||||
| "vst %%v29, 80(%%r1,%1) \n\t" | |||||
| "vst %%v30, 96(%%r1,%1) \n\t" | |||||
| "vst %%v31, 112(%%r1,%1) \n\t" | |||||
| "vst %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%2) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v16, 128(%%r1,%2) \n\t" | |||||
| "vl %%v17, 144(%%r1,%2) \n\t" | |||||
| "vl %%v18, 160(%%r1,%2) \n\t" | |||||
| "vl %%v19, 176(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 128(%%r1,%1) \n\t" | |||||
| "vst %%v29, 144(%%r1,%1) \n\t" | |||||
| "vst %%v30, 160(%%r1,%1) \n\t" | |||||
| "vst %%v31, 176(%%r1,%1) \n\t" | |||||
| "vst %%v20, 128(%%r1,%2) \n\t" | |||||
| "vst %%v21, 144(%%r1,%2) \n\t" | |||||
| "vst %%v22, 160(%%r1,%2) \n\t" | |||||
| "vst %%v23, 176(%%r1,%2) \n\t" | |||||
| "vl %%v24, 192(%%r1,%1) \n\t" | |||||
| "vl %%v25, 208(%%r1,%1) \n\t" | |||||
| "vl %%v26, 224(%%r1,%1) \n\t" | |||||
| "vl %%v27, 240(%%r1,%1) \n\t" | |||||
| "vl %%v16, 192(%%r1,%2) \n\t" | |||||
| "vl %%v17, 208(%%r1,%2) \n\t" | |||||
| "vl %%v18, 224(%%r1,%2) \n\t" | |||||
| "vl %%v19, 240(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 192(%%r1,%1) \n\t" | |||||
| "vst %%v29, 208(%%r1,%1) \n\t" | |||||
| "vst %%v30, 224(%%r1,%1) \n\t" | |||||
| "vst %%v31, 240(%%r1,%1) \n\t" | |||||
| "vst %%v20, 192(%%r1,%2) \n\t" | |||||
| "vst %%v21, 208(%%r1,%2) \n\t" | |||||
| "vst %%v22, 224(%%r1,%2) \n\t" | |||||
| "vst %%v23, 240(%%r1,%2) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %%r0,1b" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y),"a"(c),"a"(s) | |||||
| : "cc", "memory","r0","r1" ,"v0","v1","v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| /* | |||||
| * Plane rotation of two complex vectors, in place: | |||||
| * x <- c*x + s*y and y <- c*y - s*x, | |||||
| * applied to real and imaginary parts separately. | |||||
| * Unit-stride input sends the leading multiple of 16 entries through | |||||
| * zrot_kernel_16 and finishes the tail in C; any other stride uses the | |||||
| * scalar loop for every entry. Returns 0 in all cases. | |||||
| */ | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||||
| { | |||||
| BLASLONG done = 0; | |||||
| FLOAT new_re; | |||||
| FLOAT new_im; | |||||
| if ( n <= 0 ) return(0); | |||||
| if ( (inc_x != 1) || (inc_y != 1) ) | |||||
| { | |||||
| /* Strided case: offsets advance by two FLOATs per complex stride unit. */ | |||||
| const BLASLONG step_x = 2 * inc_x ; | |||||
| const BLASLONG step_y = 2 * inc_y ; | |||||
| BLASLONG px = 0; | |||||
| BLASLONG py = 0; | |||||
| for ( ; done < n; done++ ) | |||||
| { | |||||
| new_re = c*x[px] + s*y[py] ; | |||||
| new_im = c*x[px+1] + s*y[py+1] ; | |||||
| y[py] = c*y[py] - s*x[px] ; | |||||
| y[py+1] = c*y[py+1] - s*x[px+1] ; | |||||
| x[px] = new_re ; | |||||
| x[px+1] = new_im ; | |||||
| px += step_x ; | |||||
| py += step_y ; | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| { | |||||
| /* Contiguous case: hand the largest multiple of 16 entries to the asm kernel. */ | |||||
| const BLASLONG bulk = n & -16; | |||||
| if ( bulk > 0 ) | |||||
| { | |||||
| FLOAT cos_arg = c; | |||||
| FLOAT sin_arg = s; | |||||
| zrot_kernel_16(bulk, x, y, &cos_arg, &sin_arg); | |||||
| done = bulk; | |||||
| } | |||||
| } | |||||
| for ( ; done < n; done++ ) | |||||
| { | |||||
| /* Entry number done starts at FLOAT offset 2*done in both vectors. */ | |||||
| const BLASLONG p = 2 * done; | |||||
| new_re = c*x[p] + s*y[p] ; | |||||
| new_im = c*x[p+1] + s*y[p+1] ; | |||||
| y[p] = c*y[p] - s*x[p] ; | |||||
| y[p+1] = c*y[p+1] - s*x[p+1] ; | |||||
| x[p] = new_re ; | |||||
| x[p+1] = new_im ; | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| @@ -0,0 +1,483 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013 - 2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| /* | |||||
| * Scale n contiguous complex doubles at x by the complex factor | |||||
| * alpha[0] + i*alpha[1], in place. Each loop pass handles 128 bytes | |||||
| * (eight complex entries) and the loop body always runs at least once, | |||||
| * so the caller is expected to pass a positive multiple of 8. | |||||
| * | |||||
| * x is advanced inside the asm, so it is declared as an in/out operand; | |||||
| * modifying an input-only operand is not permitted by GCC. The statement | |||||
| * is volatile so it is kept even though the updated x is not used afterwards. | |||||
| */ | |||||
| static void __attribute__ ((noinline)) zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { | |||||
| __asm__ volatile ( | |||||
| "pfd 1, 0(%[alpha]) \n\t" | |||||
| /* r0 = end address = x + 16*n bytes */ | |||||
| "sllg %%r0,%[n],4 \n\t" | |||||
| "agr %%r0,%[x] \n\t" | |||||
| /* v24 = {da_r, da_r}, v25 = {da_i, da_i} */ | |||||
| "vlrepg %%v24,0(%[alpha]) \n\t" | |||||
| "vlrepg %%v25,8(%[alpha]) \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%[x] ) \n\t" | |||||
| /* De-interleave: v20/v22 hold real parts, v21/v23 imaginary parts. */ | |||||
| "vleg %%v20 , 0(%[x]),0 \n\t" | |||||
| "vleg %%v21 , 8(%[x]),0 \n\t" | |||||
| "vleg %%v20 , 16(%[x]),1 \n\t" | |||||
| "vleg %%v21 , 24(%[x]),1 \n\t" | |||||
| "vleg %%v22 , 32(%[x]),0 \n\t" | |||||
| "vleg %%v23 , 40(%[x]),0 \n\t" | |||||
| "vleg %%v22 , 48(%[x]),1 \n\t" | |||||
| "vleg %%v23 , 56(%[x]),1 \n\t" | |||||
| /* real = re*da_r - im*da_i, imag = im*da_r + re*da_i */ | |||||
| "vfmdb %%v16, %%v21, %%v25 \n\t" | |||||
| "vfmdb %%v17, %%v20, %%v25 \n\t" | |||||
| "vfmdb %%v18, %%v23, %%v25 \n\t" | |||||
| "vfmdb %%v19, %%v22, %%v25 \n\t" | |||||
| "vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t" | |||||
| "vfmadb %%v17, %%v21, %%v24, %%v17 \n\t" | |||||
| "vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t" | |||||
| "vfmadb %%v19, %%v23, %%v24, %%v19 \n\t" | |||||
| "vsteg %%v16 , 0(%[x]),0 \n\t" | |||||
| "vsteg %%v17 , 8(%[x]),0 \n\t" | |||||
| "vsteg %%v16 , 16(%[x]),1 \n\t" | |||||
| "vsteg %%v17 , 24(%[x]),1 \n\t" | |||||
| "vsteg %%v18 , 32(%[x]),0 \n\t" | |||||
| "vsteg %%v19 , 40(%[x]),0 \n\t" | |||||
| "vsteg %%v18 , 48(%[x]),1 \n\t" | |||||
| "vsteg %%v19 , 56(%[x]),1 \n\t" | |||||
| "vleg %%v20 , 64(%[x]),0 \n\t" | |||||
| "vleg %%v21 , 72(%[x]),0 \n\t" | |||||
| "vleg %%v20 , 80(%[x]),1 \n\t" | |||||
| "vleg %%v21 , 88(%[x]),1 \n\t" | |||||
| "vleg %%v22 , 96(%[x]),0 \n\t" | |||||
| "vleg %%v23 , 104(%[x]),0 \n\t" | |||||
| "vleg %%v22 , 112(%[x]),1 \n\t" | |||||
| "vleg %%v23 , 120(%[x]),1 \n\t" | |||||
| "vfmdb %%v16, %%v21, %%v25 \n\t" | |||||
| "vfmdb %%v17, %%v20, %%v25 \n\t" | |||||
| "vfmdb %%v18, %%v23, %%v25 \n\t" | |||||
| "vfmdb %%v19, %%v22, %%v25 \n\t" | |||||
| "vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t" | |||||
| "vfmadb %%v17, %%v21, %%v24, %%v17 \n\t" | |||||
| "vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t" | |||||
| "vfmadb %%v19, %%v23, %%v24, %%v19 \n\t" | |||||
| "vsteg %%v16 , 64(%[x]),0 \n\t" | |||||
| "vsteg %%v17 , 72(%[x]),0 \n\t" | |||||
| "vsteg %%v16 , 80(%[x]),1 \n\t" | |||||
| "vsteg %%v17 , 88(%[x]),1 \n\t" | |||||
| "vsteg %%v18 , 96(%[x]),0 \n\t" | |||||
| "vsteg %%v19 , 104(%[x]),0 \n\t" | |||||
| "vsteg %%v18 , 112(%[x]),1 \n\t" | |||||
| "vsteg %%v19 , 120(%[x]),1 \n\t" | |||||
| "la %[x],128(%[x]) \n\t" | |||||
| "clgrjl %[x],%%r0,1b \n\t" | |||||
| : [x] "+&a"(x) | |||||
| : [n] "r"(n), [alpha] "a"(alpha) | |||||
| : "cc", "memory","r0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25" | |||||
| ); | |||||
| } | |||||
| /* | |||||
| * Scale n contiguous complex doubles at x by the purely imaginary factor | |||||
| * i*alpha[1], in place: (re, im) becomes (-da_i*im, da_i*re). | |||||
| * Each loop pass handles 128 bytes (eight complex entries) and the loop body | |||||
| * always runs at least once, so the caller is expected to pass a positive | |||||
| * multiple of 8. | |||||
| * | |||||
| * Fixes relative to the previous revision: | |||||
| * - the 4th entry of each pass is now stored at offsets 48/56 (it was written | |||||
| * to 40/48, clobbering the 3rd entry and leaving offset 56 unscaled); | |||||
| * - the 7th entry now stores its own result register v30 (v27 was stored); | |||||
| * - v16/v17, which the code writes, are in the clobber list (v18/v19 were | |||||
| * listed instead); | |||||
| * - x is advanced inside the asm, so it is an in/out operand and the | |||||
| * statement is volatile. | |||||
| */ | |||||
| static void __attribute__ ((noinline)) zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { | |||||
| __asm__ volatile ( "pfd 2, 0(%[x]) \n\t" | |||||
| /* Build v0 = {da_i, -da_i}: f0 is the left half of v0, the negated copy */ | |||||
| /* goes through r0 into element 1. */ | |||||
| "ld %%f0,8(%[alpha]) \n\t" | |||||
| "lcdbr %%f1,%%f0 \n\t" | |||||
| "lgdr %%r0,%%f1 \n\t" | |||||
| "vlvgg %%v0,%%r0,1 \n\t" | |||||
| "vlr %%v16,%%v0 \n\t" | |||||
| "vlr %%v17 ,%%v0 \n\t" | |||||
| "vlr %%v1,%%v0 \n\t" | |||||
| /* r0 = end address = x + 16*n bytes */ | |||||
| "sllg %%r0,%[n],4 \n\t" | |||||
| "agr %%r0,%[x] \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| /* Per entry: multiply {re, im} by {da_i, -da_i}, then store the two */ | |||||
| /* elements swapped so that -da_i*im lands in the real slot. */ | |||||
| "vl %%v24, 0(%[x]) \n\t" | |||||
| "vfmdb %%v24,%%v24,%%v0 \n\t" | |||||
| "vsteg %%v24, 0(%[x]),1 \n\t" | |||||
| "vsteg %%v24, 8(%[x]),0 \n\t" | |||||
| "vl %%v25, 16(%[x]) \n\t" | |||||
| "vfmdb %%v25,%%v25,%%v1 \n\t" | |||||
| "vsteg %%v25, 16(%[x]),1 \n\t" | |||||
| "vsteg %%v25, 24(%[x]),0 \n\t" | |||||
| "vl %%v26, 32(%[x]) \n\t" | |||||
| "vfmdb %%v26,%%v26,%%v16 \n\t" | |||||
| "vsteg %%v26, 32(%[x]),1 \n\t" | |||||
| "vsteg %%v26, 40(%[x]),0 \n\t" | |||||
| "vl %%v27, 48(%[x]) \n\t" | |||||
| "vfmdb %%v27,%%v27,%%v17 \n\t" | |||||
| "vsteg %%v27, 48(%[x]),1 \n\t" | |||||
| "vsteg %%v27, 56(%[x]),0 \n\t" | |||||
| "vl %%v28, 64(%[x]) \n\t" | |||||
| "vfmdb %%v28,%%v28,%%v0 \n\t" | |||||
| "vsteg %%v28, 64(%[x]),1 \n\t" | |||||
| "vsteg %%v28, 72(%[x]),0 \n\t" | |||||
| "vl %%v29, 80(%[x]) \n\t" | |||||
| "vfmdb %%v29,%%v29,%%v1 \n\t" | |||||
| "vsteg %%v29, 80(%[x]),1 \n\t" | |||||
| "vsteg %%v29, 88(%[x]),0 \n\t" | |||||
| "vl %%v30, 96(%[x]) \n\t" | |||||
| "vfmdb %%v30,%%v30,%%v16 \n\t" | |||||
| "vsteg %%v30, 96(%[x]),1 \n\t" | |||||
| "vsteg %%v30, 104(%[x]),0 \n\t" | |||||
| "vl %%v31, 112(%[x]) \n\t" | |||||
| "vfmdb %%v31,%%v31,%%v17 \n\t" | |||||
| "vsteg %%v31, 112(%[x]),1 \n\t" | |||||
| "vsteg %%v31, 120(%[x]),0 \n\t" | |||||
| "la %[x],128(%[x]) \n\t" | |||||
| "clgrjl %[x],%%r0,1b \n\t" | |||||
| : [x] "+&a"(x) | |||||
| : [n] "r"(n), [alpha] "a"(alpha) | |||||
| :"cc", "memory","r0","f0", "f1","v0","v1","v16","v17","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| } | |||||
| /* | |||||
| * Scale n contiguous complex doubles at x by the real factor alpha[0], | |||||
| * in place: every double (real and imaginary) is multiplied by da_r. | |||||
| * Each loop pass handles 128 bytes (eight complex entries) and the loop body | |||||
| * always runs at least once, so the caller is expected to pass a positive | |||||
| * multiple of 8. | |||||
| * | |||||
| * x is advanced inside the asm, so it is declared as an in/out operand; | |||||
| * modifying an input-only operand is not permitted by GCC. The statement | |||||
| * is volatile so it is kept even though the updated x is not used afterwards. | |||||
| */ | |||||
| static void __attribute__ ((noinline)) zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { | |||||
| __asm__ volatile ("pfd 2, 0(%[x]) \n\t" | |||||
| /* v16..v19 all hold {da_r, da_r} */ | |||||
| "vlrepg %%v18,0(%[alpha]) \n\t" | |||||
| "vlr %%v19,%%v18 \n\t" | |||||
| "vlr %%v16 ,%%v18 \n\t" | |||||
| "vlr %%v17,%%v18 \n\t" | |||||
| /* r0 = end address = x + 16*n bytes */ | |||||
| "sllg %%r0,%[n],4 \n\t" | |||||
| "agr %%r0,%[x] \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "vl %%v24, 0(%[x]) \n\t" | |||||
| "vfmdb %%v24,%%v24,%%v18 \n\t" | |||||
| "vst %%v24, 0(%[x]) \n\t" | |||||
| "vl %%v25, 16(%[x]) \n\t" | |||||
| "vfmdb %%v25,%%v25,%%v19 \n\t" | |||||
| "vst %%v25, 16(%[x]) \n\t" | |||||
| "vl %%v26, 32(%[x]) \n\t" | |||||
| "vfmdb %%v26,%%v26,%%v16 \n\t" | |||||
| "vst %%v26, 32(%[x]) \n\t" | |||||
| "vl %%v27, 48(%[x]) \n\t" | |||||
| "vfmdb %%v27,%%v27,%%v17 \n\t" | |||||
| "vst %%v27, 48(%[x]) \n\t" | |||||
| "vl %%v28, 64(%[x]) \n\t" | |||||
| "vfmdb %%v28,%%v28,%%v18 \n\t" | |||||
| "vst %%v28, 64(%[x]) \n\t" | |||||
| "vl %%v29, 80(%[x]) \n\t" | |||||
| "vfmdb %%v29,%%v29,%%v19 \n\t" | |||||
| "vst %%v29, 80(%[x]) \n\t" | |||||
| "vl %%v30, 96(%[x]) \n\t" | |||||
| "vfmdb %%v30,%%v30,%%v16 \n\t" | |||||
| "vst %%v30, 96(%[x]) \n\t" | |||||
| "vl %%v31, 112(%[x]) \n\t" | |||||
| "vfmdb %%v31,%%v31,%%v17 \n\t" | |||||
| "vst %%v31, 112(%[x]) \n\t" | |||||
| "la %[x],128(%[x]) \n\t" | |||||
| "clgrjl %[x],%%r0,1b \n\t" | |||||
| : [x] "+&a"(x) | |||||
| : [n] "r"(n), [alpha] "a"(alpha) | |||||
| :"cc", "memory","r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| } | |||||
| /* | |||||
| * Set n contiguous complex doubles at x to zero. | |||||
| * Each loop pass clears 128 bytes (eight complex entries) and the loop body | |||||
| * always runs at least once, so the caller is expected to pass a positive | |||||
| * multiple of 8. | |||||
| * | |||||
| * x is advanced inside the asm, so it is declared as an in/out operand; | |||||
| * modifying an input-only operand is not permitted by GCC. The statement | |||||
| * is volatile so it is kept even though the updated x is not used afterwards. | |||||
| */ | |||||
| static void __attribute__ ((noinline)) zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { | |||||
| __asm__ volatile ( "pfd 2, 0(%[x]) \n\t" | |||||
| "vzero %%v24 \n\t" | |||||
| "vzero %%v25 \n\t" | |||||
| "vzero %%v26 \n\t" | |||||
| "vzero %%v27 \n\t" | |||||
| /* r0 = end address = x + 16*n bytes */ | |||||
| "sllg %%r0,%[n],4 \n\t" | |||||
| "agr %%r0,%[x] \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256( %[x]) \n\t" | |||||
| "vst %%v24, 0( %[x]) \n\t" | |||||
| "vst %%v25, 16( %[x]) \n\t" | |||||
| "vst %%v26, 32( %[x]) \n\t" | |||||
| "vst %%v27, 48( %[x]) \n\t" | |||||
| "vst %%v24, 64( %[x]) \n\t" | |||||
| "vst %%v25, 80( %[x]) \n\t" | |||||
| "vst %%v26, 96( %[x]) \n\t" | |||||
| "vst %%v27,112( %[x]) \n\t" | |||||
| "la %[x],128(%[x]) \n\t" | |||||
| "clgrjl %[x],%%r0,1b \n\t" | |||||
| : [x] "+&a"(x) | |||||
| : [n] "r"(n) | |||||
| :"cc" , "memory" ,"r0","v24","v25","v26","v27" | |||||
| ); | |||||
| } | |||||
| static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) __attribute__ ((noinline)); | |||||
| /* | |||||
| * Scale strided complex entries of x by alpha[0] + i*alpha[1], in place. | |||||
| * inc_x is the distance between consecutive entries measured in FLOATs. | |||||
| * Four entries are updated per pass and the pass count is n/4 rounded up, | |||||
| * so n is expected to be a multiple of 4. | |||||
| */ | |||||
| static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) { | |||||
| const BLASLONG off1 = inc_x; | |||||
| const BLASLONG off2 = 2 * inc_x; | |||||
| const BLASLONG off3 = off2 + inc_x; | |||||
| const FLOAT scale_re = alpha[0]; | |||||
| const FLOAT scale_im = alpha[1]; | |||||
| BLASLONG done; | |||||
| for (done = 0; done < n; done += 4) { | |||||
| /* New real parts are held back until the imaginary parts have been */ | |||||
| /* computed from the old real parts. */ | |||||
| const FLOAT re0 = scale_re * x[0] - scale_im * x[1]; | |||||
| const FLOAT re1 = scale_re * x[off1] - scale_im * x[off1 + 1]; | |||||
| const FLOAT re2 = scale_re * x[off2] - scale_im * x[off2 + 1]; | |||||
| const FLOAT re3 = scale_re * x[off3] - scale_im * x[off3 + 1]; | |||||
| x[1] = scale_im * x[0] + scale_re * x[1]; | |||||
| x[off1 + 1] = scale_im * x[off1] + scale_re * x[off1 + 1]; | |||||
| x[off2 + 1] = scale_im * x[off2] + scale_re * x[off2 + 1]; | |||||
| x[off3 + 1] = scale_im * x[off3] + scale_re * x[off3 + 1]; | |||||
| x[0] = re0; | |||||
| x[off1] = re1; | |||||
| x[off2] = re2; | |||||
| x[off3] = re3; | |||||
| x += 4 * inc_x; | |||||
| } | |||||
| } | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { | |||||
| BLASLONG i = 0, j = 0; | |||||
| FLOAT temp0; | |||||
| FLOAT temp1; | |||||
| FLOAT alpha[2]; | |||||
| if (inc_x != 1) { | |||||
| inc_x <<= 1; | |||||
| if (da_r == 0.0) { | |||||
| BLASLONG n1 = n & -2; | |||||
| if (da_i == 0.0) { | |||||
| while (j < n1) { | |||||
| x[i] = 0.0; | |||||
| x[i + 1] = 0.0; | |||||
| x[i + inc_x] = 0.0; | |||||
| x[i + 1 + inc_x] = 0.0; | |||||
| i += 2 * inc_x; | |||||
| j += 2; | |||||
| } | |||||
| while (j < n) { | |||||
| x[i] = 0.0; | |||||
| x[i + 1] = 0.0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| } else { | |||||
| while (j < n1) { | |||||
| temp0 = -da_i * x[i + 1]; | |||||
| x[i + 1] = da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| temp1 = -da_i * x[i + 1 + inc_x]; | |||||
| x[i + 1 + inc_x] = da_i * x[i + inc_x]; | |||||
| x[i + inc_x] = temp1; | |||||
| i += 2 * inc_x; | |||||
| j += 2; | |||||
| } | |||||
| while (j < n) { | |||||
| temp0 = -da_i * x[i + 1]; | |||||
| x[i + 1] = da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| if (da_i == 0.0) { | |||||
| BLASLONG n1 = n & -2; | |||||
| while (j < n1) { | |||||
| temp0 = da_r * x[i]; | |||||
| x[i + 1] = da_r * x[i + 1]; | |||||
| x[i] = temp0; | |||||
| temp1 = da_r * x[i + inc_x]; | |||||
| x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; | |||||
| x[i + inc_x] = temp1; | |||||
| i += 2 * inc_x; | |||||
| j += 2; | |||||
| } | |||||
| while (j < n) { | |||||
| temp0 = da_r * x[i]; | |||||
| x[i + 1] = da_r * x[i + 1]; | |||||
| x[i] = temp0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| } else { | |||||
| BLASLONG n1 = n & -8; | |||||
| if (n1 > 0) { | |||||
| alpha[0] = da_r; | |||||
| alpha[1] = da_i; | |||||
| zscal_kernel_inc_8(n1, alpha, x, inc_x); | |||||
| j = n1; | |||||
| i = n1 * inc_x; | |||||
| } | |||||
| while (j < n) { | |||||
| temp0 = da_r * x[i] - da_i * x[i + 1]; | |||||
| x[i + 1] = da_r * x[i + 1] + da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| BLASLONG n1 = n & -8; | |||||
| if (n1 > 0) { | |||||
| alpha[0] = da_r; | |||||
| alpha[1] = da_i; | |||||
| if (da_r == 0.0) | |||||
| if (da_i == 0) | |||||
| zscal_kernel_8_zero(n1, x); | |||||
| else | |||||
| zscal_kernel_8_zero_r(n1, alpha, x); | |||||
| else | |||||
| if (da_i == 0) | |||||
| zscal_kernel_8_zero_i(n1, alpha, x); | |||||
| else | |||||
| zscal_kernel_8(n1, alpha, x); | |||||
| i = n1 << 1; | |||||
| j = n1; | |||||
| } | |||||
| if (da_r == 0.0) { | |||||
| if (da_i == 0.0) { | |||||
| while (j < n) { | |||||
| x[i] = 0.0; | |||||
| x[i + 1] = 0.0; | |||||
| i += 2; | |||||
| j++; | |||||
| } | |||||
| } else { | |||||
| while (j < n) { | |||||
| temp0 = -da_i * x[i + 1]; | |||||
| x[i + 1] = da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += 2; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| if (da_i == 0.0) { | |||||
| while (j < n) { | |||||
| temp0 = da_r * x[i]; | |||||
| x[i + 1] = da_r * x[i + 1]; | |||||
| x[i] = temp0; | |||||
| i += 2; | |||||
| j++; | |||||
| } | |||||
| } else { | |||||
| while (j < n) { | |||||
| temp0 = da_r * x[i] - da_i * x[i + 1]; | |||||
| x[i + 1] = da_r * x[i + 1] + da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += 2; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,198 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| static void __attribute__ ((noinline)) zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "pfd 2, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%1) \n\t" | |||||
| "vl %%v17, 16(%%r1,%1) \n\t" | |||||
| "vl %%v18, 32(%%r1,%1) \n\t" | |||||
| "vl %%v19, 48(%%r1,%1) \n\t" | |||||
| "vl %%v20, 64(%%r1,%1) \n\t" | |||||
| "vl %%v21, 80(%%r1,%1) \n\t" | |||||
| "vl %%v22, 96(%%r1,%1) \n\t" | |||||
| "vl %%v23, 112(%%r1,%1) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v28, 192(%%r1,%1) \n\t" | |||||
| "vl %%v29, 208(%%r1,%1) \n\t" | |||||
| "vl %%v30, 224(%%r1,%1) \n\t" | |||||
| "vl %%v31, 240(%%r1,%1) \n\t" | |||||
| "vl %%v0, 0(%%r1,%2) \n\t" | |||||
| "vl %%v1, 16(%%r1,%2) \n\t" | |||||
| "vl %%v2, 32(%%r1,%2) \n\t" | |||||
| "vl %%v3, 48(%%r1,%2) \n\t" | |||||
| "vl %%v4, 64(%%r1,%2) \n\t" | |||||
| "vl %%v5, 80(%%r1,%2) \n\t" | |||||
| "vl %%v6, 96(%%r1,%2) \n\t" | |||||
| "vl %%v7, 112(%%r1,%2) \n\t" | |||||
| "vst %%v0, 0(%%r1,%1) \n\t" | |||||
| "vst %%v1, 16(%%r1,%1) \n\t" | |||||
| "vst %%v2, 32(%%r1,%1) \n\t" | |||||
| "vst %%v3, 48(%%r1,%1) \n\t" | |||||
| "vst %%v4, 64(%%r1,%1) \n\t" | |||||
| "vst %%v5, 80(%%r1,%1) \n\t" | |||||
| "vst %%v6, 96(%%r1,%1) \n\t" | |||||
| "vst %%v7, 112(%%r1,%1) \n\t" | |||||
| "vl %%v0, 128(%%r1,%2) \n\t" | |||||
| "vl %%v1, 144(%%r1,%2) \n\t" | |||||
| "vl %%v2, 160(%%r1,%2) \n\t" | |||||
| "vl %%v3, 176(%%r1,%2) \n\t" | |||||
| "vl %%v4, 192(%%r1,%2) \n\t" | |||||
| "vl %%v5, 208(%%r1,%2) \n\t" | |||||
| "vl %%v6, 224(%%r1,%2) \n\t" | |||||
| "vl %%v7, 240(%%r1,%2) \n\t" | |||||
| "vst %%v0, 128(%%r1,%1) \n\t" | |||||
| "vst %%v1, 144(%%r1,%1) \n\t" | |||||
| "vst %%v2, 160(%%r1,%1) \n\t" | |||||
| "vst %%v3, 176(%%r1,%1) \n\t" | |||||
| "vst %%v4, 192(%%r1,%1) \n\t" | |||||
| "vst %%v5, 208(%%r1,%1) \n\t" | |||||
| "vst %%v6, 224(%%r1,%1) \n\t" | |||||
| "vst %%v7, 240(%%r1,%1) \n\t" | |||||
| "vst %%v16, 0(%%r1,%2) \n\t" | |||||
| "vst %%v17, 16(%%r1,%2) \n\t" | |||||
| "vst %%v18, 32(%%r1,%2) \n\t" | |||||
| "vst %%v19, 48(%%r1,%2) \n\t" | |||||
| "vst %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%2) \n\t" | |||||
| "vst %%v24, 128(%%r1,%2) \n\t" | |||||
| "vst %%v25, 144(%%r1,%2) \n\t" | |||||
| "vst %%v26, 160(%%r1,%2) \n\t" | |||||
| "vst %%v27, 176(%%r1,%2) \n\t" | |||||
| "vst %%v28, 192(%%r1,%2) \n\t" | |||||
| "vst %%v29, 208(%%r1,%2) \n\t" | |||||
| "vst %%v30, 224(%%r1,%2) \n\t" | |||||
| "vst %%v31, 240(%%r1,%2) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %%r0,1b" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y) | |||||
| :"cc", "memory","r0","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT temp[2]; | |||||
| BLASLONG inc_x2, inc_y2; | |||||
| if ( n <= 0 ) return(0); | |||||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||||
| { | |||||
| BLASLONG n1 = n & -16; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| zswap_kernel_16(n1, x, y); | |||||
| i=n1; | |||||
| ix = 2* n1; | |||||
| iy = 2* n1; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| temp[0] = x[ix] ; | |||||
| temp[1] = x[ix+1] ; | |||||
| x[ix] = y[iy] ; | |||||
| x[ix+1] = y[iy+1] ; | |||||
| y[iy] = temp[0] ; | |||||
| y[iy+1] = temp[1] ; | |||||
| ix += 2 ; | |||||
| iy += 2 ; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| inc_x2 = 2 * inc_x; | |||||
| inc_y2 = 2 * inc_y; | |||||
| while(i < n) | |||||
| { | |||||
| temp[0] = x[ix] ; | |||||
| temp[1] = x[ix+1] ; | |||||
| x[ix] = y[iy] ; | |||||
| x[ix+1] = y[iy+1] ; | |||||
| y[iy] = temp[0] ; | |||||
| y[iy+1] = temp[1] ; | |||||
| ix += inc_x2 ; | |||||
| iy += inc_y2 ; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| return(0); | |||||
| } | |||||