z13: improved zgemv_(t|n)_4,zscal,zaxpytags/v0.3.0
| @@ -90,14 +90,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| #DMINKERNEL = ../arm/min.c | #DMINKERNEL = ../arm/min.c | ||||
| # | # | ||||
| #ISAMAXKERNEL = ../arm/iamax.c | #ISAMAXKERNEL = ../arm/iamax.c | ||||
| #IDAMAXKERNEL = ../arm/iamax.c | |||||
| IDAMAXKERNEL = idamax.c | |||||
| #ICAMAXKERNEL = ../arm/izamax.c | #ICAMAXKERNEL = ../arm/izamax.c | ||||
| #IZAMAXKERNEL = ../arm/izamax.c | |||||
| IZAMAXKERNEL = izamax.c | |||||
| # | # | ||||
| #ISAMINKERNEL = ../arm/iamin.c | #ISAMINKERNEL = ../arm/iamin.c | ||||
| #IDAMINKERNEL = ../arm/iamin.c | |||||
| IDAMINKERNEL = idamin.c | |||||
| #ICAMINKERNEL = ../arm/izamin.c | #ICAMINKERNEL = ../arm/izamin.c | ||||
| #IZAMINKERNEL = ../arm/izamin.c | |||||
| IZAMINKERNEL = izamin.c | |||||
| # | # | ||||
| #ISMAXKERNEL = ../arm/imax.c | #ISMAXKERNEL = ../arm/imax.c | ||||
| #IDMAXKERNEL = ../arm/imax.c | #IDMAXKERNEL = ../arm/imax.c | ||||
| @@ -134,7 +134,7 @@ ZNRM2KERNEL = ../arm/znrm2.c | |||||
| SROTKERNEL = srot.c | SROTKERNEL = srot.c | ||||
| DROTKERNEL = drot.c | DROTKERNEL = drot.c | ||||
| #CROTKERNEL = ../arm/zrot.c | #CROTKERNEL = ../arm/zrot.c | ||||
| #ZROTKERNEL = ../arm/zrot.c | |||||
| ZROTKERNEL = zrot.c | |||||
| # | # | ||||
| SSCALKERNEL = sscal.c | SSCALKERNEL = sscal.c | ||||
| DSCALKERNEL = dscal.c | DSCALKERNEL = dscal.c | ||||
| @@ -150,12 +150,12 @@ ZSWAPKERNEL = zswap.c | |||||
| #SGEMVNKERNEL = ../arm/gemv_n.c | #SGEMVNKERNEL = ../arm/gemv_n.c | ||||
| DGEMVNKERNEL = dgemv_n.c | DGEMVNKERNEL = dgemv_n.c | ||||
| #CGEMVNKERNEL = ../arm/zgemv_n.c | #CGEMVNKERNEL = ../arm/zgemv_n.c | ||||
| #ZGEMVNKERNEL = ../arm/zgemv_n.c | |||||
| ZGEMVNKERNEL = zgemv_n_4.c | |||||
| # | # | ||||
| #SGEMVTKERNEL = ../arm/gemv_t.c | #SGEMVTKERNEL = ../arm/gemv_t.c | ||||
| #DGEMVTKERNEL = ../arm/gemv_t.c | |||||
| DGEMVTKERNEL = dgemv_t.c | |||||
| #CGEMVTKERNEL = ../arm/zgemv_t.c | #CGEMVTKERNEL = ../arm/zgemv_t.c | ||||
| #ZGEMVTKERNEL = zgemv_t_4.c | |||||
| ZGEMVTKERNEL = zgemv_t_4.c | |||||
| #SSYMV_U_KERNEL = ../generic/symv_k.c | #SSYMV_U_KERNEL = ../generic/symv_k.c | ||||
| @@ -0,0 +1,886 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2018, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #define NBMAX 8192 | |||||
| #define PREFETCH 1 | |||||
| #include <altivec.h> | |||||
| #define HAVE_KERNEL4x8_ASM 1 | |||||
| #if defined(HAVE_KERNEL4x8_ASM) | |||||
| static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { | |||||
| FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; | |||||
| BLASLONG off2; | |||||
| BLASLONG tempR; | |||||
| __asm__( | |||||
| "sldi %[temp],%[off], 4 \n\t" // lda * sizeof (double) *2 | |||||
| "sldi %[off], %[off], 3 \n\t" // lda * sizeof (double) | |||||
| "xxlxor 34,34,34 \n\t" | |||||
| "xxlxor 35,34,34 \n\t" | |||||
| "add %[a2], %[a0], %[temp] \n\t" | |||||
| "add %[a1], %[a0], %[off] \n\t" | |||||
| "xxlxor 4,34,34 \n\t" | |||||
| "xxlxor 5,34,34 \n\t" | |||||
| "xxlxor 6,34,34 \n\t" | |||||
| "xxlxor 7,34,34 \n\t" | |||||
| "add %[a3], %[a2], %[off] \n\t" | |||||
| "add %[a4], %[a2], %[temp] \n\t" | |||||
| "xxlxor 8,34,34 \n\t" | |||||
| "xxlxor 9,34,34 \n\t" | |||||
| "add %[a5], %[a3], %[temp] \n\t" | |||||
| "li %[off],0 \n\t" | |||||
| "li %[off2],16 \n\t" | |||||
| "add %[a6], %[a4], %[temp] \n\t" | |||||
| "add %[a7], %[a5], %[temp] \n\t" | |||||
| "lxvd2x 32, %[x], %[off] \n\t" | |||||
| "lxvd2x 36, %[a0], %[off] \n\t" | |||||
| "lxvd2x 38, %[a1], %[off] \n\t" | |||||
| "lxvd2x 40, %[a2], %[off] \n\t" | |||||
| "lxvd2x 42, %[a3], %[off] \n\t" | |||||
| "lxvd2x 44, %[a4], %[off] \n\t" | |||||
| "lxvd2x 46, %[a5], %[off] \n\t" | |||||
| "lxvd2x 48, %[a6], %[off] \n\t" | |||||
| "lxvd2x 50, %[a7], %[off] \n\t" | |||||
| "lxvd2x 33, %[x], %[off2] \n\t" | |||||
| "lxvd2x 37, %[a0], %[off2] \n\t" | |||||
| "lxvd2x 39, %[a1], %[off2] \n\t" | |||||
| "lxvd2x 41, %[a2], %[off2] \n\t" | |||||
| "lxvd2x 43, %[a3], %[off2] \n\t" | |||||
| "lxvd2x 45, %[a4], %[off2] \n\t" | |||||
| "lxvd2x 47, %[a5], %[off2] \n\t" | |||||
| "lxvd2x 49, %[a6], %[off2] \n\t" | |||||
| "lxvd2x 51, %[a7], %[off2] \n\t" | |||||
| #if defined(PREFETCH) | |||||
| "li %[temp],896 \n\t" | |||||
| #endif | |||||
| "addic. %[n],%[n],-4 \n\t" | |||||
| "li %[off],32 \n\t" | |||||
| "ble- 2f \n\t" | |||||
| //-------------------------------------------------- | |||||
| ".p2align 5 \n\t" | |||||
| "1: \n\t" | |||||
| "xvmaddadp 34,36,32 \n\t" | |||||
| "xvmaddadp 35,38,32 \n\t" | |||||
| "addi %[off2], %[off2],32 \n\t" | |||||
| "lxvd2x 36, %[a0], %[off] \n\t" | |||||
| "lxvd2x 38, %[a1], %[off] \n\t" | |||||
| "xvmaddadp 4,40,32 \n\t" | |||||
| "xvmaddadp 5,42,32 \n\t" | |||||
| "lxvd2x 40, %[a2], %[off] \n\t" | |||||
| "lxvd2x 42, %[a3], %[off] \n\t" | |||||
| "xvmaddadp 6,44,32 \n\t" | |||||
| "xvmaddadp 7,46,32 \n\t" | |||||
| "lxvd2x 44, %[a4], %[off] \n\t" | |||||
| "lxvd2x 46, %[a5], %[off] \n\t" | |||||
| "xvmaddadp 8,48,32 \n\t" | |||||
| "xvmaddadp 9,50,32 \n\t" | |||||
| "lxvd2x 48, %[a6], %[off] \n\t" | |||||
| "lxvd2x 50, %[a7], %[off] \n\t" | |||||
| "lxvd2x 32, %[x], %[off] \n\t" | |||||
| "xvmaddadp 34,37,33 \n\t" | |||||
| "xvmaddadp 35,39,33 \n\t" | |||||
| "lxvd2x 37, %[a0], %[off2] \n\t" | |||||
| "lxvd2x 39, %[a1], %[off2] \n\t" | |||||
| "xvmaddadp 4,41,33 \n\t" | |||||
| "xvmaddadp 5,43,33 \n\t" | |||||
| "addi %[off], %[off],32 \n\t" | |||||
| "lxvd2x 41, %[a2], %[off2] \n\t" | |||||
| "lxvd2x 43, %[a3], %[off2] \n\t" | |||||
| "xvmaddadp 6,45,33 \n\t" | |||||
| "xvmaddadp 7,47,33 \n\t" | |||||
| "lxvd2x 45, %[a4], %[off2] \n\t" | |||||
| "lxvd2x 47, %[a5], %[off2] \n\t" | |||||
| "xvmaddadp 8,49,33 \n\t" | |||||
| "xvmaddadp 9,51,33 \n\t" | |||||
| "addic. %[n],%[n],-4 \n\t" | |||||
| "lxvd2x 49, %[a6], %[off2] \n\t" | |||||
| "lxvd2x 51, %[a7], %[off2] \n\t" | |||||
| "lxvd2x 33, %[x], %[off2] \n\t" | |||||
| "ble- 2f \n\t" | |||||
| "xvmaddadp 34,36,32 \n\t" | |||||
| "xvmaddadp 35,38,32 \n\t" | |||||
| "addi %[off2], %[off2],32 \n\t" | |||||
| "lxvd2x 36, %[a0], %[off] \n\t" | |||||
| "lxvd2x 38, %[a1], %[off] \n\t" | |||||
| "xvmaddadp 4,40,32 \n\t" | |||||
| "xvmaddadp 5,42,32 \n\t" | |||||
| "lxvd2x 40, %[a2], %[off] \n\t" | |||||
| "lxvd2x 42, %[a3], %[off] \n\t" | |||||
| "xvmaddadp 6,44,32 \n\t" | |||||
| "xvmaddadp 7,46,32 \n\t" | |||||
| "lxvd2x 44, %[a4], %[off] \n\t" | |||||
| "lxvd2x 46, %[a5], %[off] \n\t" | |||||
| "xvmaddadp 8,48,32 \n\t" | |||||
| "xvmaddadp 9,50,32 \n\t" | |||||
| "lxvd2x 48, %[a6], %[off] \n\t" | |||||
| "lxvd2x 50, %[a7], %[off] \n\t" | |||||
| "lxvd2x 32, %[x], %[off] \n\t" | |||||
| "xvmaddadp 34,37,33 \n\t" | |||||
| "xvmaddadp 35,39,33 \n\t" | |||||
| "lxvd2x 37, %[a0], %[off2] \n\t" | |||||
| "lxvd2x 39, %[a1], %[off2] \n\t" | |||||
| "xvmaddadp 4,41,33 \n\t" | |||||
| "xvmaddadp 5,43,33 \n\t" | |||||
| "addi %[off], %[off],32 \n\t" | |||||
| "lxvd2x 41, %[a2], %[off2] \n\t" | |||||
| "lxvd2x 43, %[a3], %[off2] \n\t" | |||||
| "xvmaddadp 6,45,33 \n\t" | |||||
| "xvmaddadp 7,47,33 \n\t" | |||||
| "lxvd2x 45, %[a4], %[off2] \n\t" | |||||
| "lxvd2x 47, %[a5], %[off2] \n\t" | |||||
| "xvmaddadp 8,49,33 \n\t" | |||||
| "xvmaddadp 9,51,33 \n\t" | |||||
| "addic. %[n],%[n],-4 \n\t" | |||||
| "lxvd2x 49, %[a6], %[off2] \n\t" | |||||
| "lxvd2x 51, %[a7], %[off2] \n\t" | |||||
| "lxvd2x 33, %[x], %[off2] \n\t" | |||||
| "ble- 2f \n\t" | |||||
| "xvmaddadp 34,36,32 \n\t" | |||||
| "xvmaddadp 35,38,32 \n\t" | |||||
| #if defined(PREFETCH) | |||||
| "addi %[temp],%[temp],128 \n\t" | |||||
| #endif | |||||
| "addi %[off2], %[off2],32 \n\t" | |||||
| "lxvd2x 36, %[a0], %[off] \n\t" | |||||
| "lxvd2x 38, %[a1], %[off] \n\t" | |||||
| "xvmaddadp 4,40,32 \n\t" | |||||
| "xvmaddadp 5,42,32 \n\t" | |||||
| "lxvd2x 40, %[a2], %[off] \n\t" | |||||
| "lxvd2x 42, %[a3], %[off] \n\t" | |||||
| "xvmaddadp 6,44,32 \n\t" | |||||
| "xvmaddadp 7,46,32 \n\t" | |||||
| "lxvd2x 44, %[a4], %[off] \n\t" | |||||
| "lxvd2x 46, %[a5], %[off] \n\t" | |||||
| "xvmaddadp 8,48,32 \n\t" | |||||
| "xvmaddadp 9,50,32 \n\t" | |||||
| "lxvd2x 48, %[a6], %[off] \n\t" | |||||
| "lxvd2x 50, %[a7], %[off] \n\t" | |||||
| #if defined(PREFETCH) | |||||
| "dcbt %[temp],%[a0] \n\t" | |||||
| #endif | |||||
| "lxvd2x 32, %[x], %[off] \n\t" | |||||
| "xvmaddadp 34,37,33 \n\t" | |||||
| "xvmaddadp 35,39,33 \n\t" | |||||
| "lxvd2x 37, %[a0], %[off2] \n\t" | |||||
| "lxvd2x 39, %[a1], %[off2] \n\t" | |||||
| "xvmaddadp 4,41,33 \n\t" | |||||
| "xvmaddadp 5,43,33 \n\t" | |||||
| #if defined(PREFETCH) | |||||
| "dcbt %[temp],%[a1] \n\t" | |||||
| #endif | |||||
| "lxvd2x 41, %[a2], %[off2] \n\t" | |||||
| "addi %[off], %[off],32 \n\t" | |||||
| "lxvd2x 43, %[a3], %[off2] \n\t" | |||||
| "xvmaddadp 6,45,33 \n\t" | |||||
| "xvmaddadp 7,47,33 \n\t" | |||||
| "lxvd2x 45, %[a4], %[off2] \n\t" | |||||
| "lxvd2x 47, %[a5], %[off2] \n\t" | |||||
| "xvmaddadp 8,49,33 \n\t" | |||||
| "xvmaddadp 9,51,33 \n\t" | |||||
| #if defined(PREFETCH) | |||||
| "dcbt %[temp],%[a3] \n\t" | |||||
| #endif | |||||
| "lxvd2x 49, %[a6], %[off2] \n\t" | |||||
| "lxvd2x 51, %[a7], %[off2] \n\t" | |||||
| "lxvd2x 33, %[x], %[off2] \n\t" | |||||
| "addic. %[n],%[n],-4 \n\t" | |||||
| "ble- 2f \n\t" | |||||
| "addi %[off2], %[off2],32 \n\t" | |||||
| #if defined(PREFETCH) | |||||
| "dcbt %[temp],%[a2] \n\t" | |||||
| #endif | |||||
| "xvmaddadp 34,36,32 \n\t" | |||||
| "xvmaddadp 35,38,32 \n\t" | |||||
| "lxvd2x 36, %[a0], %[off] \n\t" | |||||
| "lxvd2x 38, %[a1], %[off] \n\t" | |||||
| "xvmaddadp 4,40,32 \n\t" | |||||
| "xvmaddadp 5,42,32 \n\t" | |||||
| "lxvd2x 40, %[a2], %[off] \n\t" | |||||
| "lxvd2x 42, %[a3], %[off] \n\t" | |||||
| #if defined(PREFETCH) | |||||
| "dcbt %[temp],%[a4] \n\t" | |||||
| #endif | |||||
| "xvmaddadp 6,44,32 \n\t" | |||||
| "xvmaddadp 7,46,32 \n\t" | |||||
| "lxvd2x 44, %[a4], %[off] \n\t" | |||||
| "lxvd2x 46, %[a5], %[off] \n\t" | |||||
| "xvmaddadp 8,48,32 \n\t" | |||||
| "xvmaddadp 9,50,32 \n\t" | |||||
| "lxvd2x 48, %[a6], %[off] \n\t" | |||||
| "lxvd2x 50, %[a7], %[off] \n\t" | |||||
| "lxvd2x 32, %[x], %[off] \n\t" | |||||
| #if defined(PREFETCH) | |||||
| "dcbt %[temp],%[a5] \n\t" | |||||
| #endif | |||||
| "xvmaddadp 34,37,33 \n\t" | |||||
| "xvmaddadp 35,39,33 \n\t" | |||||
| "lxvd2x 37, %[a0], %[off2] \n\t" | |||||
| "lxvd2x 39, %[a1], %[off2] \n\t" | |||||
| "xvmaddadp 4,41,33 \n\t" | |||||
| "xvmaddadp 5,43,33 \n\t" | |||||
| "addi %[off], %[off],32 \n\t" | |||||
| "lxvd2x 41, %[a2], %[off2] \n\t" | |||||
| "lxvd2x 43, %[a3], %[off2] \n\t" | |||||
| #if defined(PREFETCH) | |||||
| "dcbt %[temp],%[a6] \n\t" | |||||
| #endif | |||||
| "xvmaddadp 6,45,33 \n\t" | |||||
| "xvmaddadp 7,47,33 \n\t" | |||||
| "lxvd2x 45, %[a4], %[off2] \n\t" | |||||
| "lxvd2x 47, %[a5], %[off2] \n\t" | |||||
| "xvmaddadp 8,49,33 \n\t" | |||||
| "xvmaddadp 9,51,33 \n\t" | |||||
| #if defined(PREFETCH) | |||||
| "dcbt %[temp],%[a7] \n\t" | |||||
| #endif | |||||
| "lxvd2x 49, %[a6], %[off2] \n\t" | |||||
| "addic. %[n],%[n],-4 \n\t" | |||||
| "lxvd2x 51, %[a7], %[off2] \n\t" | |||||
| "lxvd2x 33, %[x], %[off2] \n\t" | |||||
| #if defined(PREFETCH) | |||||
| "dcbt %[temp],%[x] \n\t" | |||||
| #endif | |||||
| "bgt+ 1b \n\t" | |||||
| ".p2align 5 \n\t" | |||||
| "2: \n\t" | |||||
| //-------------------------------------------- | |||||
| "xvmaddadp 34,36,32 \n\t" | |||||
| "xvmaddadp 35,38,32 \n\t" | |||||
| "xvmaddadp 4,40,32 \n\t" | |||||
| "xvmaddadp 5,42,32 \n\t" | |||||
| "xvmaddadp 6,44,32 \n\t" | |||||
| "xvmaddadp 7,46,32 \n\t" | |||||
| "xvmaddadp 8,48,32 \n\t" | |||||
| "xvmaddadp 9,50,32 \n\t" | |||||
| "xxspltd 36, %x[alpha], 0 \n\t" | |||||
| "xvmaddadp 34,37,33 \n\t" | |||||
| "xvmaddadp 35,39,33 \n\t" | |||||
| "xvmaddadp 4,41,33 \n\t" | |||||
| "xvmaddadp 5,43,33 \n\t" | |||||
| "xvmaddadp 6,45,33 \n\t" | |||||
| "xvmaddadp 7,47,33 \n\t" | |||||
| "xvmaddadp 8,49,33 \n\t" | |||||
| "xvmaddadp 9,51,33 \n\t" | |||||
| "lxvd2x 37, 0, %[y] \n\t" | |||||
| "li %[off2],16 \n\t" | |||||
| "lxvd2x 38, %[off2], %[y] \n\t" | |||||
| "li %[off2],32 \n\t" | |||||
| "lxvd2x 39, %[off2], %[y] \n\t" | |||||
| "li %[off2],48 \n\t" | |||||
| "lxvd2x 40, %[off2], %[y] \n\t" | |||||
| "xxmrgld 42,34,35 \n\t" | |||||
| "xxmrghd 43,34,35 \n\t" | |||||
| "xxmrgld 44,4,5 \n\t" | |||||
| "xxmrghd 45,4,5 \n\t" | |||||
| "xvadddp 42,42,43 \n\t" | |||||
| "xxmrgld 46,6,7 \n\t" | |||||
| "xxmrghd 47,6,7 \n\t" | |||||
| "xvadddp 44,44,45 \n\t" | |||||
| "xxmrgld 48,8,9 \n\t" | |||||
| "xxmrghd 49,8,9 \n\t" | |||||
| "xvadddp 46,46,47 \n\t" | |||||
| "xvmaddadp 37,42,36 \n\t" | |||||
| "xvmaddadp 38,44,36 \n\t" | |||||
| "xvadddp 48,48,49 \n\t" | |||||
| "xvmaddadp 39,46,36 \n\t" | |||||
| "stxvd2x 37, 0, %[y] \n\t" | |||||
| "li %[off],16 \n\t" | |||||
| "stxvd2x 38, %[off], %[y] \n\t" | |||||
| "xvmaddadp 40,48,36 \n\t" | |||||
| "li %[off],32 \n\t" | |||||
| "stxvd2x 39, %[off], %[y] \n\t" | |||||
| "stxvd2x 40, %[off2], %[y] \n\t" | |||||
| : [memy] "+m" (*(const double (*)[8])y), | |||||
| [n] "+&r" (n), | |||||
| [a0] "=b" (a0), | |||||
| [a1] "=&b" (a1), | |||||
| [a2] "=&b" (a2), | |||||
| [a3] "=&b" (a3), | |||||
| [a4] "=&b" (a4), | |||||
| [a5] "=&b" (a5), | |||||
| [a6] "=&b" (a6), | |||||
| [a7] "=&b" (a7), | |||||
| [off] "+&b" (lda), | |||||
| [off2]"=&b" (off2), | |||||
| [temp] "=&b" (tempR) | |||||
| : [memx] "m" (*(const double (*)[n])x), | |||||
| [mem_ap] "m" (*(const double (*)[]) ap), | |||||
| [alpha] "d" (alpha), | |||||
| "[a0]" (ap), | |||||
| [x] "b" (x), | |||||
| [y] "b" (y) | |||||
| : "cc","vs4","vs5","vs6","vs7","vs8","vs9" ,"vs32","vs33","vs34","vs35", "vs36", "vs37", "vs38", "vs39", | |||||
| "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| #else | |||||
| static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||||
| BLASLONG i; | |||||
| #if defined(PREFETCH) | |||||
| BLASLONG j, c, k; | |||||
| #endif | |||||
| FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; | |||||
| __vector double *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; | |||||
| register __vector double temp0 = {0, 0}; | |||||
| register __vector double temp1 = {0, 0}; | |||||
| register __vector double temp2 = {0, 0}; | |||||
| register __vector double temp3 = {0, 0}; | |||||
| register __vector double temp4 = {0, 0}; | |||||
| register __vector double temp5 = {0, 0}; | |||||
| register __vector double temp6 = {0, 0}; | |||||
| register __vector double temp7 = {0, 0}; | |||||
| a0 = ap; | |||||
| a1 = ap + lda; | |||||
| a2 = a1 + lda; | |||||
| a3 = a2 + lda; | |||||
| a4 = a3 + lda; | |||||
| a5 = a4 + lda; | |||||
| a6 = a5 + lda; | |||||
| a7 = a6 + lda; | |||||
| va0 = (__vector double*) a0; | |||||
| va1 = (__vector double*) a1; | |||||
| va2 = (__vector double*) a2; | |||||
| va3 = (__vector double*) a3; | |||||
| va4 = (__vector double*) a4; | |||||
| va5 = (__vector double*) a5; | |||||
| va6 = (__vector double*) a6; | |||||
| va7 = (__vector double*) a7; | |||||
| v_x = (__vector double*) x; | |||||
| #if defined(PREFETCH) | |||||
| c = n >> 1; | |||||
| for (j = 0; j < c; j += 64) { | |||||
| k = (c - j) > 64 ? 64 : (c - j); | |||||
| __builtin_prefetch(v_x + 64); | |||||
| __builtin_prefetch(va0 + 64); | |||||
| __builtin_prefetch(va1 + 64); | |||||
| __builtin_prefetch(va2 + 64); | |||||
| __builtin_prefetch(va3 + 64); | |||||
| __builtin_prefetch(va4 + 64); | |||||
| __builtin_prefetch(va5 + 64); | |||||
| __builtin_prefetch(va6 + 64); | |||||
| __builtin_prefetch(va7 + 64); | |||||
| for (i = 0; i < k; i += 2) { | |||||
| #else | |||||
| for (i = 0; i < n/2; i += 2) { | |||||
| #endif | |||||
| temp0 += v_x[i] * va0[i]; | |||||
| temp1 += v_x[i] * va1[i]; | |||||
| temp2 += v_x[i] * va2[i]; | |||||
| temp3 += v_x[i] * va3[i]; | |||||
| temp4 += v_x[i] * va4[i]; | |||||
| temp5 += v_x[i] * va5[i]; | |||||
| temp6 += v_x[i] * va6[i]; | |||||
| temp7 += v_x[i] * va7[i]; | |||||
| temp0 += v_x[i + 1] * va0[i + 1]; | |||||
| temp1 += v_x[i + 1] * va1[i + 1]; | |||||
| temp2 += v_x[i + 1] * va2[i + 1]; | |||||
| temp3 += v_x[i + 1] * va3[i + 1]; | |||||
| temp4 += v_x[i + 1] * va4[i + 1]; | |||||
| temp5 += v_x[i + 1] * va5[i + 1]; | |||||
| temp6 += v_x[i + 1] * va6[i + 1]; | |||||
| temp7 += v_x[i + 1] * va7[i + 1]; | |||||
| } | |||||
| #if defined(PREFETCH) | |||||
| va0 += 64; | |||||
| va1 += 64; | |||||
| va2 += 64; | |||||
| va3 += 64; | |||||
| va4 += 64; | |||||
| va5 += 64; | |||||
| va6 += 64; | |||||
| va7 += 64; | |||||
| v_x += 64; | |||||
| } | |||||
| #endif | |||||
| y[0] += alpha * (temp0[0] + temp0[1]); | |||||
| y[1] += alpha * (temp1[0] + temp1[1]); | |||||
| y[2] += alpha * (temp2[0] + temp2[1]); | |||||
| y[3] += alpha * (temp3[0] + temp3[1]); | |||||
| y[4] += alpha * (temp4[0] + temp4[1]); | |||||
| y[5] += alpha * (temp5[0] + temp5[1]); | |||||
| y[6] += alpha * (temp6[0] + temp6[1]); | |||||
| y[7] += alpha * (temp7[0] + temp7[1]); | |||||
| } | |||||
| #endif | |||||
| static void dgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||||
| BLASLONG i = 0; | |||||
| FLOAT *a0, *a1, *a2, *a3; | |||||
| a0 = ap; | |||||
| a1 = ap + lda; | |||||
| a2 = a1 + lda; | |||||
| a3 = a2 + lda; | |||||
| __vector double* va0 = (__vector double*) a0; | |||||
| __vector double* va1 = (__vector double*) a1; | |||||
| __vector double* va2 = (__vector double*) a2; | |||||
| __vector double* va3 = (__vector double*) a3; | |||||
| __vector double* v_x = (__vector double*) x; | |||||
| register __vector double temp0 = {0, 0}; | |||||
| register __vector double temp1 = {0, 0}; | |||||
| register __vector double temp2 = {0, 0}; | |||||
| register __vector double temp3 = {0, 0}; | |||||
| register __vector double temp4 = {0, 0}; | |||||
| register __vector double temp5 = {0, 0}; | |||||
| register __vector double temp6 = {0, 0}; | |||||
| register __vector double temp7 = {0, 0}; | |||||
| for (i = 0; i < n / 2; i += 2) { | |||||
| temp0 += v_x[i] * va0[i]; | |||||
| temp1 += v_x[i] * va1[i]; | |||||
| temp2 += v_x[i] * va2[i]; | |||||
| temp3 += v_x[i] * va3[i]; | |||||
| temp4 += v_x[i + 1] * va0[i + 1]; | |||||
| temp5 += v_x[i + 1] * va1[i + 1]; | |||||
| temp6 += v_x[i + 1] * va2[i + 1]; | |||||
| temp7 += v_x[i + 1] * va3[i + 1]; | |||||
| } | |||||
| temp0 += temp4; | |||||
| temp1 += temp5; | |||||
| temp2 += temp6; | |||||
| temp3 += temp7; | |||||
| y[0] += alpha * (temp0[0] + temp0[1]); | |||||
| y[1] += alpha * (temp1[0] + temp1[1]); | |||||
| y[2] += alpha * (temp2[0] + temp2[1]); | |||||
| y[3] += alpha * (temp3[0] + temp3[1]); | |||||
| } | |||||
| static void dgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { | |||||
| BLASLONG i; | |||||
| FLOAT *a0, *a1; | |||||
| a0 = ap; | |||||
| a1 = ap + lda; | |||||
| __vector double* va0 = (__vector double*) a0; | |||||
| __vector double* va1 = (__vector double*) a1; | |||||
| __vector double* v_x = (__vector double*) x; | |||||
| __vector double temp0 = {0, 0}; | |||||
| __vector double temp1 = {0, 0}; | |||||
| for (i = 0; i < n / 2; i += 2) { | |||||
| temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1]; | |||||
| temp1 += v_x[i] * va1[i] + v_x[i + 1] * va1[i + 1]; | |||||
| } | |||||
| y[0] += alpha * (temp0[0] + temp0[1]); | |||||
| y[inc_y] += alpha * (temp1[0] + temp1[1]); | |||||
| } | |||||
| static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||||
| BLASLONG i; | |||||
| FLOAT *a0; | |||||
| a0 = ap; | |||||
| __vector double* va0 = (__vector double*) a0; | |||||
| __vector double* v_x = (__vector double*) x; | |||||
| __vector double temp0 = {0, 0}; | |||||
| for (i = 0; i < n / 2; i += 2) { | |||||
| temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1]; | |||||
| } | |||||
| *y += alpha * (temp0[0] + temp0[1]); | |||||
| } | |||||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||||
| BLASLONG i; | |||||
| for (i = 0; i < n; i++) { | |||||
| *dest++ = *src; | |||||
| src += inc_src; | |||||
| } | |||||
| } | |||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { | |||||
| BLASLONG i; | |||||
| BLASLONG j; | |||||
| FLOAT *a_ptr; | |||||
| FLOAT *x_ptr; | |||||
| FLOAT *y_ptr; | |||||
| BLASLONG n1; | |||||
| BLASLONG m1; | |||||
| BLASLONG m2; | |||||
| BLASLONG m3; | |||||
| BLASLONG n2; | |||||
| FLOAT ybuffer[8], *xbuffer; | |||||
| if (m < 1) return (0); | |||||
| if (n < 1) return (0); | |||||
| xbuffer = buffer; | |||||
| n1 = n >> 3; | |||||
| n2 = n & 7; | |||||
| m3 = m & 3; | |||||
| m1 = m - m3; | |||||
| m2 = (m & (NBMAX - 1)) - m3; | |||||
| BLASLONG NB = NBMAX; | |||||
| while (NB == NBMAX) { | |||||
| m1 -= NB; | |||||
| if (m1 < 0) { | |||||
| if (m2 == 0) break; | |||||
| NB = m2; | |||||
| } | |||||
| y_ptr = y; | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| if (inc_x != 1) | |||||
| copy_x(NB, x_ptr, xbuffer, inc_x); | |||||
| else | |||||
| xbuffer = x_ptr; | |||||
| BLASLONG lda8 = lda << 3; | |||||
| if (inc_y == 1) { | |||||
| for (i = 0; i < n1; i++) { | |||||
| dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); | |||||
| y_ptr += 8; | |||||
| a_ptr += lda8; | |||||
| #if defined(PREFETCH) | |||||
| __builtin_prefetch(y_ptr+64); | |||||
| #endif | |||||
| } | |||||
| } else { | |||||
| for (i = 0; i < n1; i++) { | |||||
| ybuffer[0] = 0; | |||||
| ybuffer[1] = 0; | |||||
| ybuffer[2] = 0; | |||||
| ybuffer[3] = 0; | |||||
| ybuffer[4] = 0; | |||||
| ybuffer[5] = 0; | |||||
| ybuffer[6] = 0; | |||||
| ybuffer[7] = 0; | |||||
| dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||||
| *y_ptr += ybuffer[0]; | |||||
| y_ptr += inc_y; | |||||
| *y_ptr += ybuffer[1]; | |||||
| y_ptr += inc_y; | |||||
| *y_ptr += ybuffer[2]; | |||||
| y_ptr += inc_y; | |||||
| *y_ptr += ybuffer[3]; | |||||
| y_ptr += inc_y; | |||||
| *y_ptr += ybuffer[4]; | |||||
| y_ptr += inc_y; | |||||
| *y_ptr += ybuffer[5]; | |||||
| y_ptr += inc_y; | |||||
| *y_ptr += ybuffer[6]; | |||||
| y_ptr += inc_y; | |||||
| *y_ptr += ybuffer[7]; | |||||
| y_ptr += inc_y; | |||||
| a_ptr += lda8; | |||||
| } | |||||
| } | |||||
| if (n2 & 4) { | |||||
| ybuffer[0] = 0; | |||||
| ybuffer[1] = 0; | |||||
| ybuffer[2] = 0; | |||||
| ybuffer[3] = 0; | |||||
| dgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||||
| a_ptr += lda<<2; | |||||
| *y_ptr += ybuffer[0]; | |||||
| y_ptr += inc_y; | |||||
| *y_ptr += ybuffer[1]; | |||||
| y_ptr += inc_y; | |||||
| *y_ptr += ybuffer[2]; | |||||
| y_ptr += inc_y; | |||||
| *y_ptr += ybuffer[3]; | |||||
| y_ptr += inc_y; | |||||
| } | |||||
| if (n2 & 2) { | |||||
| dgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); | |||||
| a_ptr += lda << 1; | |||||
| y_ptr += 2 * inc_y; | |||||
| } | |||||
| if (n2 & 1) { | |||||
| dgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); | |||||
| a_ptr += lda; | |||||
| y_ptr += inc_y; | |||||
| } | |||||
| a += NB; | |||||
| x += NB * inc_x; | |||||
| } | |||||
| if (m3 == 0) return (0); | |||||
| x_ptr = x; | |||||
| a_ptr = a; | |||||
| if (m3 == 3) { | |||||
| FLOAT xtemp0 = *x_ptr * alpha; | |||||
| x_ptr += inc_x; | |||||
| FLOAT xtemp1 = *x_ptr * alpha; | |||||
| x_ptr += inc_x; | |||||
| FLOAT xtemp2 = *x_ptr * alpha; | |||||
| FLOAT *aj = a_ptr; | |||||
| y_ptr = y; | |||||
| if (lda == 3 && inc_y == 1) { | |||||
| for (j = 0; j < (n & -4); j += 4) { | |||||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||||
| y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; | |||||
| y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; | |||||
| y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; | |||||
| aj += 12; | |||||
| } | |||||
| for (; j < n; j++) { | |||||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||||
| aj += 3; | |||||
| } | |||||
| } else { | |||||
| if (inc_y == 1) { | |||||
| BLASLONG register lda2 = lda << 1; | |||||
| BLASLONG register lda4 = lda << 2; | |||||
| BLASLONG register lda3 = lda2 + lda; | |||||
| for (j = 0; j < (n & -4); j += 4) { | |||||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; | |||||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; | |||||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; | |||||
| aj += lda4; | |||||
| } | |||||
| for (; j < n; j++) { | |||||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||||
| aj += lda; | |||||
| } | |||||
| } else { | |||||
| for (j = 0; j < n; j++) { | |||||
| *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||||
| y_ptr += inc_y; | |||||
| aj += lda; | |||||
| } | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| if (m3 == 2) { | |||||
| FLOAT xtemp0 = *x_ptr * alpha; | |||||
| x_ptr += inc_x; | |||||
| FLOAT xtemp1 = *x_ptr * alpha; | |||||
| FLOAT *aj = a_ptr; | |||||
| y_ptr = y; | |||||
| if (lda == 2 && inc_y == 1) { | |||||
| for (j = 0; j < (n & -4); j += 4) { | |||||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; | |||||
| y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; | |||||
| y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; | |||||
| y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; | |||||
| aj += 8; | |||||
| } | |||||
| for (; j < n; j++) { | |||||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; | |||||
| aj += 2; | |||||
| } | |||||
| } else { | |||||
| if (inc_y == 1) { | |||||
| BLASLONG register lda2 = lda << 1; | |||||
| BLASLONG register lda4 = lda << 2; | |||||
| BLASLONG register lda3 = lda2 + lda; | |||||
| for (j = 0; j < (n & -4); j += 4) { | |||||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; | |||||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; | |||||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; | |||||
| aj += lda4; | |||||
| } | |||||
| for (; j < n; j++) { | |||||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||||
| aj += lda; | |||||
| } | |||||
| } else { | |||||
| for (j = 0; j < n; j++) { | |||||
| *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||||
| y_ptr += inc_y; | |||||
| aj += lda; | |||||
| } | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| FLOAT xtemp = *x_ptr * alpha; | |||||
| FLOAT *aj = a_ptr; | |||||
| y_ptr = y; | |||||
| if (lda == 1 && inc_y == 1) { | |||||
| for (j = 0; j < (n & -4); j += 4) { | |||||
| y_ptr[j] += aj[j] * xtemp; | |||||
| y_ptr[j + 1] += aj[j + 1] * xtemp; | |||||
| y_ptr[j + 2] += aj[j + 2] * xtemp; | |||||
| y_ptr[j + 3] += aj[j + 3] * xtemp; | |||||
| } | |||||
| for (; j < n; j++) { | |||||
| y_ptr[j] += aj[j] * xtemp; | |||||
| } | |||||
| } else { | |||||
| if (inc_y == 1) { | |||||
| BLASLONG register lda2 = lda << 1; | |||||
| BLASLONG register lda4 = lda << 2; | |||||
| BLASLONG register lda3 = lda2 + lda; | |||||
| for (j = 0; j < (n & -4); j += 4) { | |||||
| y_ptr[j] += *aj * xtemp; | |||||
| y_ptr[j + 1] += *(aj + lda) * xtemp; | |||||
| y_ptr[j + 2] += *(aj + lda2) * xtemp; | |||||
| y_ptr[j + 3] += *(aj + lda3) * xtemp; | |||||
| aj += lda4; | |||||
| } | |||||
| for (; j < n; j++) { | |||||
| y_ptr[j] += *aj * xtemp; | |||||
| aj += lda; | |||||
| } | |||||
| } else { | |||||
| for (j = 0; j < n; j++) { | |||||
| *y_ptr += *aj * xtemp; | |||||
| y_ptr += inc_y; | |||||
| aj += lda; | |||||
| } | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,383 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #include <altivec.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| /** | |||||
| * Find maximum index | |||||
| * Warning: requirements n>0 and n % 32 == 0 | |||||
| * @param n | |||||
| * @param x pointer to the vector | |||||
| * @param maxf (out) maximum absolute value .( only for output ) | |||||
| * @return index | |||||
| */ | |||||
| static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||||
| BLASLONG index; | |||||
| register __vector long long start = {1,0}; | |||||
| register __vector long long temp_add_index = {2, 2}; | |||||
| __asm__( | |||||
| "lxvd2x 44, 0,%[ptr_tmp] \n\t" | |||||
| "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" | |||||
| "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 | |||||
| "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 | |||||
| "xxlxor 37,37 ,37 \n\t" //v5 v37 index_count | |||||
| "vaddudm 10,9,%[adder] \n\t" //{5,4} vs42 | |||||
| "xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_max_index | |||||
| "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 | |||||
| "xxlxor 39,39,39 \n\t" // vs39 vec_max_value | |||||
| "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 | |||||
| "xxspltd 36,36,0 \n\t" | |||||
| "xvabsdp 44, 44 \n\t" | |||||
| "xvabsdp 45, 45 \n\t" | |||||
| "xvabsdp 46, 46 \n\t" | |||||
| "xvabsdp 47, 47 \n\t" | |||||
| "xvabsdp 48, 48 \n\t" | |||||
| "xvabsdp 49, 49 \n\t" | |||||
| "xvabsdp 50, 50 \n\t" | |||||
| "xvabsdp 51, 51 \n\t" | |||||
| //jump first half forward | |||||
| "b 2f \n\t" | |||||
| //=================================================================== | |||||
| ".p2align 5 \n\t" | |||||
| "1: \n\t" | |||||
| "xvcmpgtdp 2,45,44 \n\t " | |||||
| "xvcmpgtdp 3,47,46 \n\t " | |||||
| "xvcmpgtdp 4,49,48 \n\t " | |||||
| "xvcmpgtdp 5,51,50 \n\t" | |||||
| "xxsel 32,40,41,2 \n\t" | |||||
| "xxsel 0,44,45,2 \n\t" | |||||
| "xxsel 33,42,43,3 \n\t" | |||||
| "xxsel 1,46,47,3 \n\t" | |||||
| "xxsel 34,40,41,4 \n\t" | |||||
| "xxsel 45,48,49,4 \n\t" | |||||
| "xxsel 35,42,43,5 \n\t" | |||||
| "xxsel 47,50,51,5 \n\t" | |||||
| "xvcmpgtdp 2, 1,0 \n\t" | |||||
| "xvcmpgtdp 3,47, 45 \n\t" | |||||
| "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" | |||||
| "xxsel 32,32,33,2 \n\t" | |||||
| "xxsel 0 ,0,1,2 \n\t" | |||||
| "xxsel 34,34,35,3 \n\t" | |||||
| "xxsel 5,45,47,3 \n\t" | |||||
| //load next 64 | |||||
| "lxvd2x 44, 0,%[ptr_tmp] \n\t" | |||||
| "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" | |||||
| // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16} | |||||
| "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} | |||||
| "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" | |||||
| //choose bigger from first and second part | |||||
| "xvcmpgtdp 4,5 , 0 \n\t" | |||||
| "xxsel 3, 0,5,4 \n\t" | |||||
| "xxsel 33,32,34,4 \n\t" | |||||
| //load next 64 | |||||
| "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" | |||||
| "vaddudm 1,1,5 \n\t" // get real index for first bigger | |||||
| "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" | |||||
| //compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39) | |||||
| "xvcmpgtdp 2, 3,39 \n\t" | |||||
| "xxsel 39,39,3,2 \n\t" | |||||
| "xxsel 38,38,33,2 \n\t" | |||||
| //update index += 8 | |||||
| "vaddudm 5,5,4 \n\t" | |||||
| "xvabsdp 44, 44 \n\t" | |||||
| "xvabsdp 45, 45 \n\t" | |||||
| "xvabsdp 46, 46 \n\t" | |||||
| "xvabsdp 47, 47 \n\t" | |||||
| //update index += 8 | |||||
| "vaddudm 5,5,4 \n\t" | |||||
| "xvabsdp 48, 48 \n\t" | |||||
| "xvabsdp 49, 49 \n\t" | |||||
| "xvabsdp 50, 50 \n\t" | |||||
| "xvabsdp 51, 51 \n\t" | |||||
| //<-----------jump here from first load | |||||
| "2: \n\t" | |||||
| "xvcmpgtdp 2,45,44 \n\t " | |||||
| "xvcmpgtdp 3,47,46 \n\t " | |||||
| "xvcmpgtdp 4,49,48 \n\t " | |||||
| "xvcmpgtdp 5,51,50 \n\t" | |||||
| "xxsel 32,40,41,2 \n\t" | |||||
| "xxsel 0,44,45,2 \n\t" | |||||
| "xxsel 33,42,43,3 \n\t" | |||||
| "xxsel 1,46,47,3 \n\t" | |||||
| "xxsel 34,40,41,4 \n\t" | |||||
| "xxsel 45,48,49,4 \n\t" | |||||
| "xxsel 35,42,43,5 \n\t" | |||||
| "xxsel 47,50,51,5 \n\t" | |||||
| "xvcmpgtdp 2, 1,0 \n\t" | |||||
| "xvcmpgtdp 3,47, 45 \n\t" | |||||
| "xxsel 32,32,33,2 \n\t" | |||||
| "xxsel 0 ,0,1,2 \n\t" | |||||
| "xxsel 34,34,35,3 \n\t" | |||||
| "xxsel 5,45,47,3 \n\t" | |||||
| "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" | |||||
| // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16} | |||||
| "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} | |||||
| //load next 64 | |||||
| "lxvd2x 44, 0,%[ptr_tmp] \n\t" | |||||
| "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" | |||||
| //choose bigger from first and second part | |||||
| "xvcmpgtdp 4,5 , 0 \n\t" | |||||
| "xxsel 3, 0,5,4 \n\t" | |||||
| "xxsel 33,32,34,4 \n\t" | |||||
| //load next 64 | |||||
| "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" | |||||
| "vaddudm 1,1,5 \n\t" // get real index for first bigger | |||||
| "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" | |||||
| //compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39) | |||||
| "xvcmpgtdp 2, 3,39 \n\t" | |||||
| "xxsel 39,39,3,2 \n\t" | |||||
| "xxsel 38,38,33,2 \n\t" | |||||
| //update index += 8 | |||||
| "vaddudm 5,5,4 \n\t" | |||||
| "xvabsdp 44, 44 \n\t" | |||||
| "xvabsdp 45, 45 \n\t" | |||||
| "xvabsdp 46, 46 \n\t" | |||||
| "xvabsdp 47, 47 \n\t" | |||||
| //update index += 8 | |||||
| "vaddudm 5,5,4 \n\t" | |||||
| "xvabsdp 48, 48 \n\t" | |||||
| "xvabsdp 49, 49 \n\t" | |||||
| "xvabsdp 50, 50 \n\t" | |||||
| "xvabsdp 51, 51 \n\t" | |||||
| //decrement n | |||||
| "addic. %[n], %[n], -32 \n\t" | |||||
| //Loop back if >0 | |||||
| "bgt+ 1b \n\t" | |||||
| //============================================================================== | |||||
| "xvcmpgtdp 2,45,44 \n\t " | |||||
| "xvcmpgtdp 3,47,46 \n\t " | |||||
| "xvcmpgtdp 4,49,48 \n\t " | |||||
| "xvcmpgtdp 5,51,50 \n\t" | |||||
| "xxsel 32,40,41,2 \n\t" | |||||
| "xxsel 0,44,45,2 \n\t" | |||||
| "xxsel 33,42,43,3 \n\t" | |||||
| "xxsel 1,46,47,3 \n\t" | |||||
| "xxsel 34,40,41,4 \n\t" | |||||
| "xxsel 45,48,49,4 \n\t" | |||||
| "xxsel 35,42,43,5 \n\t" | |||||
| "xxsel 47,50,51,5 \n\t" | |||||
| "xvcmpgtdp 2, 1,0 \n\t" | |||||
| "xvcmpgtdp 3,47, 45 \n\t" | |||||
| "xxsel 32,32,33,2 \n\t" | |||||
| "xxsel 0 ,0,1,2 \n\t" | |||||
| "xxsel 34,34,35,3 \n\t" | |||||
| "xxsel 5,45,47,3 \n\t" | |||||
| // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16} | |||||
| "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} | |||||
| //choose bigger from first and second part | |||||
| "xvcmpgtdp 4,5 , 0 \n\t" | |||||
| "xxsel 3, 0,5,4 \n\t" | |||||
| "xxsel 33,32,34,4 \n\t" | |||||
| "vaddudm 1,1,5 \n\t" // get real index for first bigger | |||||
| //compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39) | |||||
| "xvcmpgtdp 2, 3,39 \n\t" | |||||
| "xxsel 39,39,3,2 \n\t" | |||||
| "xxsel 38,38,33,2 \n\t" | |||||
| ///////extract max value and max index from vector | |||||
| "xxspltd 32,38,1 \n\t" | |||||
| "xxspltd 40,39,1 \n\t" | |||||
| "xvcmpeqdp. 2, 40,39 \n\t" | |||||
| //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely | |||||
| //0b001110=14 | |||||
| "bc 14,24, 3f \n\t" | |||||
| "xvcmpgtdp 4, 40,39 \n\t" | |||||
| "xxsel 0,39,40,4 \n\t" | |||||
| "xxsel 1,38,32,4 \n\t" | |||||
| "stxsdx 0,0,%[ptr_maxf] \n\t" | |||||
| "b 4f \n\t" | |||||
| "3: \n\t" | |||||
| //if elements value are equal then choose minimum index | |||||
| "xxspltd 0,40,0 \n\t" | |||||
| "vminud 0,0,6 \n\t" //vs32 vs38 | |||||
| "xxlor 1,32,32 \n\t" | |||||
| "stxsdx 0,0,%[ptr_maxf] \n\t" | |||||
| "4: \n\t" | |||||
| "mfvsrd %[index],1 \n\t" | |||||
| : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) | |||||
| : [mem] "m"(*(const double (*)[n])x), [ptr_x] "b"(x), [ptr_maxf] "b"(maxf) , | |||||
| [i16] "b"(16), [i32] "b"(32), [i48] "b"(48), | |||||
| [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), | |||||
| [start] "v"(start), [adder] "v"(temp_add_index) | |||||
| : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", | |||||
| "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" | |||||
| ); | |||||
| return index; | |||||
| } | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG max = 0; | |||||
| if (n <= 0 || inc_x <= 0) return (max); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| max = diamax_kernel_32(n1, x, &maxf); | |||||
| i = n1; | |||||
| } | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = i; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (max + 1); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = j; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) > maxf) { | |||||
| max = j + 1; | |||||
| maxf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) > maxf) { | |||||
| max = j + 2; | |||||
| maxf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) > maxf) { | |||||
| max = j + 3; | |||||
| maxf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = j; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (max + 1); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,384 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| /** | |||||
| * Find minimum index | |||||
| * Warning: requirements n>0 and n % 32 == 0 | |||||
| * @param n | |||||
| * @param x pointer to the vector | |||||
| * @param minf (out) minimum absolute value .( only for output ) | |||||
| * @return minimum index | |||||
| */ | |||||
| static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||||
| BLASLONG index; | |||||
| register __vector long long start = {1,0}; | |||||
| register __vector long long temp_add_index = {2, 2}; | |||||
| __asm__( | |||||
| "lxvd2x 44, 0,%[ptr_tmp] \n\t" | |||||
| "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" | |||||
| "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 | |||||
| "vaddudm 9,8, %[adder] \n\t" //{3,2} vs41 | |||||
| "xxlxor 37,37 ,37 \n\t" //v5 v37 index_count | |||||
| "vaddudm 10,9,%[adder] \n\t" //{5,4} vs42 | |||||
| "xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_min_index | |||||
| "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 | |||||
| "lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value | |||||
| "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 | |||||
| "xxspltd 36,36,0 \n\t" | |||||
| "xvabsdp 39, 39 \n\t" | |||||
| "xvabsdp 44, 44 \n\t" | |||||
| "xvabsdp 45, 45 \n\t" | |||||
| "xvabsdp 46, 46 \n\t" | |||||
| "xvabsdp 47, 47 \n\t" | |||||
| "xvabsdp 48, 48 \n\t" | |||||
| "xvabsdp 49, 49 \n\t" | |||||
| "xvabsdp 50, 50 \n\t" | |||||
| "xvabsdp 51, 51 \n\t" | |||||
| //jump first half forward | |||||
| "b 2f \n\t" | |||||
| //=================================================================== | |||||
| ".p2align 5 \n\t" | |||||
| "1: \n\t" | |||||
| "xvcmpgedp 2,44,45 \n\t " | |||||
| "xvcmpgedp 3,46,47 \n\t " | |||||
| "xvcmpgedp 4,48,49 \n\t " | |||||
| "xvcmpgedp 5,50,51 \n\t" | |||||
| "xxsel 32,40,41,2 \n\t" | |||||
| "xxsel 0,44,45,2 \n\t" | |||||
| "xxsel 33,42,43,3 \n\t" | |||||
| "xxsel 1,46,47,3 \n\t" | |||||
| "xxsel 34,40,41,4 \n\t" | |||||
| "xxsel 45,48,49,4 \n\t" | |||||
| "xxsel 35,42,43,5 \n\t" | |||||
| "xxsel 47,50,51,5 \n\t" | |||||
| "xvcmpgedp 2,0, 1 \n\t" | |||||
| "xvcmpgedp 3, 45,47 \n\t" | |||||
| "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" | |||||
| "xxsel 32,32,33,2 \n\t" | |||||
| "xxsel 0 ,0,1,2 \n\t" | |||||
| "xxsel 34,34,35,3 \n\t" | |||||
| "xxsel 5,45,47,3 \n\t" | |||||
| //load next 64 | |||||
| "lxvd2x 44, 0,%[ptr_tmp] \n\t" | |||||
| "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" | |||||
| // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16} | |||||
| "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} | |||||
| "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" | |||||
| //choose smaller from first and second part | |||||
| "xvcmpgedp 4, 0,5 \n\t" | |||||
| "xxsel 3, 0,5,4 \n\t" | |||||
| "xxsel 33,32,34,4 \n\t" | |||||
| //load next 64 | |||||
| "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" | |||||
| "vaddudm 1,1,5 \n\t" // get real index for first smaller | |||||
| "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" | |||||
| //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) | |||||
| "xvcmpgedp 2,39, 3 \n\t" | |||||
| "xxsel 39,39,3,2 \n\t" | |||||
| "xxsel 38,38,33,2 \n\t" | |||||
| //update index += 8 | |||||
| "vaddudm 5,5,4 \n\t" | |||||
| "xvabsdp 44, 44 \n\t" | |||||
| "xvabsdp 45, 45 \n\t" | |||||
| "xvabsdp 46, 46 \n\t" | |||||
| "xvabsdp 47, 47 \n\t" | |||||
| //update index += 8 | |||||
| "vaddudm 5,5,4 \n\t" | |||||
| "xvabsdp 48, 48 \n\t" | |||||
| "xvabsdp 49, 49 \n\t" | |||||
| "xvabsdp 50, 50 \n\t" | |||||
| "xvabsdp 51, 51 \n\t" | |||||
| //<-----------jump here from first load | |||||
| "2: \n\t" | |||||
| "xvcmpgedp 2,44,45 \n\t " | |||||
| "xvcmpgedp 3,46,47 \n\t " | |||||
| "xvcmpgedp 4,48,49 \n\t " | |||||
| "xvcmpgedp 5,50,51 \n\t" | |||||
| "xxsel 32,40,41,2 \n\t" | |||||
| "xxsel 0,44,45,2 \n\t" | |||||
| "xxsel 33,42,43,3 \n\t" | |||||
| "xxsel 1,46,47,3 \n\t" | |||||
| "xxsel 34,40,41,4 \n\t" | |||||
| "xxsel 45,48,49,4 \n\t" | |||||
| "xxsel 35,42,43,5 \n\t" | |||||
| "xxsel 47,50,51,5 \n\t" | |||||
| "xvcmpgedp 2,0, 1 \n\t" | |||||
| "xvcmpgedp 3, 45,47 \n\t" | |||||
| "xxsel 32,32,33,2 \n\t" | |||||
| "xxsel 0 ,0,1,2 \n\t" | |||||
| "xxsel 34,34,35,3 \n\t" | |||||
| "xxsel 5,45,47,3 \n\t" | |||||
| "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" | |||||
| // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16} | |||||
| "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} | |||||
| //load next 64 | |||||
| "lxvd2x 44, 0,%[ptr_tmp] \n\t" | |||||
| "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" | |||||
| //choose smaller from first and second part | |||||
| "xvcmpgedp 4, 0,5 \n\t" | |||||
| "xxsel 3, 0,5,4 \n\t" | |||||
| "xxsel 33,32,34,4 \n\t" | |||||
| //load next 64 | |||||
| "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" | |||||
| "vaddudm 1,1,5 \n\t" // get real index for first smaller | |||||
| "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" | |||||
| //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) | |||||
| "xvcmpgedp 2,39, 3 \n\t" | |||||
| "xxsel 39,39,3,2 \n\t" | |||||
| "xxsel 38,38,33,2 \n\t" | |||||
| //update index += 8 | |||||
| "vaddudm 5,5,4 \n\t" | |||||
| "xvabsdp 44, 44 \n\t" | |||||
| "xvabsdp 45, 45 \n\t" | |||||
| "xvabsdp 46, 46 \n\t" | |||||
| "xvabsdp 47, 47 \n\t" | |||||
| //update index += 8 | |||||
| "vaddudm 5,5,4 \n\t" | |||||
| "xvabsdp 48, 48 \n\t" | |||||
| "xvabsdp 49, 49 \n\t" | |||||
| "xvabsdp 50, 50 \n\t" | |||||
| "xvabsdp 51, 51 \n\t" | |||||
| //decrement n | |||||
| "addic. %[n], %[n], -32 \n\t" | |||||
| //Loop back if >0 | |||||
| "bgt+ 1b \n\t" | |||||
| //============================================================================== | |||||
| "xvcmpgedp 2,44,45 \n\t " | |||||
| "xvcmpgedp 3,46,47 \n\t " | |||||
| "xvcmpgedp 4,48,49 \n\t " | |||||
| "xvcmpgedp 5,50,51 \n\t" | |||||
| "xxsel 32,40,41,2 \n\t" | |||||
| "xxsel 0,44,45,2 \n\t" | |||||
| "xxsel 33,42,43,3 \n\t" | |||||
| "xxsel 1,46,47,3 \n\t" | |||||
| "xxsel 34,40,41,4 \n\t" | |||||
| "xxsel 45,48,49,4 \n\t" | |||||
| "xxsel 35,42,43,5 \n\t" | |||||
| "xxsel 47,50,51,5 \n\t" | |||||
| "xvcmpgedp 2,0, 1 \n\t" | |||||
| "xvcmpgedp 3, 45,47 \n\t" | |||||
| "xxsel 32,32,33,2 \n\t" | |||||
| "xxsel 0 ,0,1,2 \n\t" | |||||
| "xxsel 34,34,35,3 \n\t" | |||||
| "xxsel 5,45,47,3 \n\t" | |||||
| // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16} | |||||
| "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} | |||||
| //choose smaller from first and second part | |||||
| "xvcmpgedp 4, 0,5 \n\t" | |||||
| "xxsel 3, 0,5,4 \n\t" | |||||
| "xxsel 33,32,34,4 \n\t" | |||||
| "vaddudm 1,1,5 \n\t" // get real index for first smaller | |||||
| //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) | |||||
| "xvcmpgedp 2,39, 3 \n\t" | |||||
| "xxsel 39,39,3,2 \n\t" | |||||
| "xxsel 38,38,33,2 \n\t" | |||||
| ///////extract min value and min index from vector | |||||
| "xxspltd 32,38,1 \n\t" | |||||
| "xxspltd 40,39,1 \n\t" | |||||
| "xvcmpeqdp. 2, 40,39 \n\t" | |||||
| //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely | |||||
| //0b001110=14 | |||||
| "bc 14,24, 3f \n\t" | |||||
| "xvcmpgedp 4,39, 40 \n\t" | |||||
| "xxsel 0,39,40,4 \n\t" | |||||
| "xxsel 1,38,32,4 \n\t" | |||||
| "stxsdx 0,0,%[ptr_minf] \n\t" | |||||
| "b 4f \n\t" | |||||
| "3: \n\t" | |||||
| //if elements value are equal then choose minimum index | |||||
| "xxspltd 0,40,0 \n\t" | |||||
| "vminud 0,0,6 \n\t" //vs32 vs38 | |||||
| "xxlor 1,32,32 \n\t" | |||||
| "stxsdx 0,0,%[ptr_minf] \n\t" | |||||
| "4: \n\t" | |||||
| "mfvsrd %[index],1 \n\t" | |||||
| : [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) | |||||
| : [mem] "m"(*(const double (*)[n])x), [ptr_x] "b"(x), [ptr_minf] "b"(minf) , | |||||
| [i16] "b"(16), [i32] "b"(32), [i48] "b"(48), | |||||
| [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), | |||||
| [start] "v"(start), [adder] "v"(temp_add_index) | |||||
| : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", | |||||
| "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" | |||||
| ); | |||||
| return index; | |||||
| } | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| BLASLONG min = 0; | |||||
| FLOAT minf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return (min); | |||||
| minf = ABS(x[0]); //index's not incremented | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| min = diamin_kernel_32(n1, x, &minf); | |||||
| i = n1; | |||||
| } | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = i; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (min + 1); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = j; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) < minf) { | |||||
| min = j + 1; | |||||
| minf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) < minf) { | |||||
| min = j + 2; | |||||
| minf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) < minf) { | |||||
| min = j + 3; | |||||
| minf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = j; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (min + 1); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,362 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #define ABS fabs | |||||
| #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | |||||
| /** | |||||
| * Find maximum index | |||||
| * Warning: requirements n>0 and n % 16 == 0 | |||||
| * @param n | |||||
| * @param x pointer to the vector | |||||
| * @param maxf (out) maximum absolute value .( only for output ) | |||||
| * @return index | |||||
| */ | |||||
| static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||||
| BLASLONG index; | |||||
| register __vector long long start = {1,0}; | |||||
| register __vector long long temp_add_index = {2, 2}; | |||||
| __asm__( | |||||
| "lxvd2x 44, 0,%[ptr_tmp] \n\t" | |||||
| "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" | |||||
| "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 | |||||
| "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 | |||||
| "xxlxor 37,37 ,37 \n\t" //v5 v37 index_count | |||||
| "vaddudm 10,9,%[adder] \n\t" //{5,4} vs42 | |||||
| "xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_max_index | |||||
| "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 | |||||
| "xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero | |||||
| "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 | |||||
| "xxspltd 36,36,0 \n\t" | |||||
| "xvabsdp 44, 44 \n\t" | |||||
| "xvabsdp 45, 45 \n\t" | |||||
| "xvabsdp 46, 46 \n\t" | |||||
| "xvabsdp 47, 47 \n\t" | |||||
| "xvabsdp 48, 48 \n\t" | |||||
| "xvabsdp 49, 49 \n\t" | |||||
| "xvabsdp 50, 50 \n\t" | |||||
| "xvabsdp 51, 51 \n\t" | |||||
| //jump first half forward | |||||
| "b 2f \n\t" | |||||
| ".p2align 5 \n\t" | |||||
| "1: \n\t" | |||||
| "xxmrghd 0,44,45 \n\t" | |||||
| "xxmrgld 1,44,45 \n\t" | |||||
| "xxmrghd 2,46,47 \n\t" | |||||
| "xxmrgld 3,46,47 \n\t" | |||||
| "xxmrghd 4,48,49 \n\t" | |||||
| "xxmrgld 5,48,49 \n\t" | |||||
| "xxmrghd 44,50,51 \n\t" | |||||
| "xxmrgld 45,50,51 \n\t" | |||||
| "xvadddp 46, 0,1 \n\t" | |||||
| "xvadddp 47, 2,3 \n\t" | |||||
| "xvadddp 48, 4,5 \n\t" | |||||
| "xvadddp 49, 44,45 \n\t" | |||||
| "xvcmpgtdp 50,47,46 \n\t " | |||||
| "xvcmpgtdp 51,49,48 \n\t " | |||||
| "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" | |||||
| "xxsel 32,40,41,50 \n\t" | |||||
| "xxsel 0,46,47,50 \n\t" | |||||
| "xxsel 33,42,43,51 \n\t" | |||||
| "xxsel 1,48,49,51 \n\t" | |||||
| "lxvd2x 44, 0,%[ptr_tmp] \n\t" | |||||
| "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" | |||||
| "xvcmpgtdp 2,1,0 \n\t " | |||||
| "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" | |||||
| "xxsel 32,32,33,2 \n\t" | |||||
| "xxsel 3,0,1,2 \n\t" | |||||
| "vaddudm 0,0,5 \n\t" | |||||
| //cmp with previous | |||||
| "xvcmpgtdp 4,3,39 \n\t " | |||||
| "vaddudm 5,5,4 \n\t" | |||||
| "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" | |||||
| //select with previous | |||||
| "xxsel 38,38,32,4 \n\t" | |||||
| "xxsel 39,39,3,4 \n\t" | |||||
| "xvabsdp 44, 44 \n\t" | |||||
| "xvabsdp 45, 45 \n\t" | |||||
| "xvabsdp 46, 46 \n\t" | |||||
| "xvabsdp 47, 47 \n\t" | |||||
| "xvabsdp 48, 48 \n\t" | |||||
| "xvabsdp 49, 49 \n\t" | |||||
| "xvabsdp 50, 50 \n\t" | |||||
| "xvabsdp 51, 51 \n\t" | |||||
| //>>/////////////////////////////// half start | |||||
| "2: \n\t" | |||||
| "xxmrghd 0,44,45 \n\t" | |||||
| "xxmrgld 1,44,45 \n\t" | |||||
| "xxmrghd 2,46,47 \n\t" | |||||
| "xxmrgld 3,46,47 \n\t" | |||||
| "xxmrghd 4,48,49 \n\t" | |||||
| "xxmrgld 5,48,49 \n\t" | |||||
| "xxmrghd 44,50,51 \n\t" | |||||
| "xxmrgld 45,50,51 \n\t" | |||||
| "xvadddp 46, 0,1 \n\t" | |||||
| "xvadddp 47, 2,3 \n\t" | |||||
| "xvadddp 48, 4,5 \n\t" | |||||
| "xvadddp 49, 44,45 \n\t" | |||||
| "xvcmpgtdp 50,47,46 \n\t " | |||||
| "xvcmpgtdp 51,49,48 \n\t " | |||||
| "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" | |||||
| "xxsel 32,40,41,50 \n\t" | |||||
| "xxsel 0,46,47,50 \n\t" | |||||
| "xxsel 33,42,43,51 \n\t" | |||||
| "xxsel 1,48,49,51 \n\t" | |||||
| "lxvd2x 44, 0,%[ptr_tmp] \n\t" | |||||
| "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" | |||||
| "xvcmpgtdp 2,1,0 \n\t " | |||||
| "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" | |||||
| "xxsel 32,32,33,2 \n\t" | |||||
| "xxsel 3,0,1,2 \n\t" | |||||
| "vaddudm 0,0,5 \n\t" | |||||
| //cmp with previous | |||||
| "xvcmpgtdp 4,3,39 \n\t " | |||||
| "vaddudm 5,5,4 \n\t" | |||||
| "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" | |||||
| //select with previous | |||||
| "xxsel 38,38,32,4 \n\t" | |||||
| "xxsel 39,39,3,4 \n\t" | |||||
| "xvabsdp 44, 44 \n\t" | |||||
| "xvabsdp 45, 45 \n\t" | |||||
| "xvabsdp 46, 46 \n\t" | |||||
| "xvabsdp 47, 47 \n\t" | |||||
| "xvabsdp 48, 48 \n\t" | |||||
| "xvabsdp 49, 49 \n\t" | |||||
| "xvabsdp 50, 50 \n\t" | |||||
| "xvabsdp 51, 51 \n\t" | |||||
| //decrement n | |||||
| "addic. %[n], %[n], -16 \n\t" | |||||
| //Loop back if >0 | |||||
| "bgt+ 1b \n\t" | |||||
| "xxmrghd 0,44,45 \n\t" | |||||
| "xxmrgld 1,44,45 \n\t" | |||||
| "xxmrghd 2,46,47 \n\t" | |||||
| "xxmrgld 3,46,47 \n\t" | |||||
| "xxmrghd 4,48,49 \n\t" | |||||
| "xxmrgld 5,48,49 \n\t" | |||||
| "xxmrghd 44,50,51 \n\t" | |||||
| "xxmrgld 45,50,51 \n\t" | |||||
| "xvadddp 46, 0,1 \n\t" | |||||
| "xvadddp 47, 2,3 \n\t" | |||||
| "xvadddp 48, 4,5 \n\t" | |||||
| "xvadddp 49, 44,45 \n\t" | |||||
| "xvcmpgtdp 50,47,46 \n\t " | |||||
| "xvcmpgtdp 51,49,48 \n\t " | |||||
| "xxsel 32,40,41,50 \n\t" | |||||
| "xxsel 0,46,47,50 \n\t" | |||||
| "xxsel 33,42,43,51 \n\t" | |||||
| "xxsel 1,48,49,51 \n\t" | |||||
| "xvcmpgtdp 2,1,0 \n\t " | |||||
| "xxsel 32,32,33,2 \n\t" | |||||
| "xxsel 3,0,1,2 \n\t" | |||||
| "vaddudm 0,0,5 \n\t" | |||||
| "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" | |||||
| //cmp with previous | |||||
| "xvcmpgtdp 4,3,39 \n\t " | |||||
| "vaddudm 5,5,4 \n\t" | |||||
| "xxsel 38,38,32,4 \n\t" | |||||
| "xxsel 39,39,3,4 \n\t" | |||||
| ///////extract max value and max index from vector | |||||
| "xxspltd 32,38,1 \n\t" | |||||
| "xxspltd 40,39,1 \n\t" | |||||
| "xvcmpeqdp. 2, 40,39 \n\t" | |||||
| //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely | |||||
| //0b001110=14 | |||||
| "bc 14,24, 3f \n\t" | |||||
| "xvcmpgtdp 4, 40,39 \n\t" | |||||
| "xxsel 0,39,40,4 \n\t" | |||||
| "xxsel 1,38,32,4 \n\t" | |||||
| "stxsdx 0,0,%[ptr_maxf] \n\t" | |||||
| "b 4f \n\t" | |||||
| "3: \n\t" | |||||
| //if elements value are equal then choose minimum index | |||||
| "xxspltd 0,40,0 \n\t" | |||||
| "vminud 0,0,6 \n\t" //vs32 vs38 | |||||
| "xxlor 1,32,32 \n\t" | |||||
| "stxsdx 0,0,%[ptr_maxf] \n\t" | |||||
| "4: \n\t" | |||||
| "mfvsrd %[index],1 \n\t" | |||||
| : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) | |||||
| : [mem] "m"(*(const double (*)[2*n])x), [ptr_x] "b"(x), [ptr_maxf] "b"(maxf) , | |||||
| [i16] "b"(16), [i32] "b"(32), [i48] "b"(48), | |||||
| [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), | |||||
| [start] "v"(start), [adder] "v"(temp_add_index) | |||||
| : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", | |||||
| "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" | |||||
| ); | |||||
| return index; | |||||
| } | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT maxf = 0; | |||||
| BLASLONG max = 0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) return(max); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| max = ziamax_kernel_16(n1, x, &maxf); | |||||
| i = n1; | |||||
| ix = n1 << 1; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) > maxf ) | |||||
| { | |||||
| max = i; | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (max + 1); | |||||
| } else { | |||||
| inc_x2 = 2 * inc_x; | |||||
| maxf = CABS1(x,0); | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) > maxf ) | |||||
| { | |||||
| max = i; | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | |||||
| return (max + 1); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,361 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #define ABS fabs | |||||
| #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | |||||
| /** | |||||
| * Find minimum index | |||||
| * Warning: requirements n>0 and n % 16 == 0 | |||||
| * @param n | |||||
| * @param x pointer to the vector | |||||
| * @param minf (out) minimum absolute value .( only for output ) | |||||
| * @return minimum index | |||||
| */ | |||||
| static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||||
| BLASLONG index; | |||||
| register __vector long long start = {1,0}; | |||||
| register __vector long long temp_add_index = {2, 2}; | |||||
| __asm__( | |||||
| "lxvd2x 44, 0,%[ptr_tmp] \n\t" | |||||
| "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" | |||||
| "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 | |||||
| "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 | |||||
| "xxlxor 37,37 ,37 \n\t" //v5 v37 index_count | |||||
| "vaddudm 10,9,%[adder] \n\t" //{5,4} vs42 | |||||
| "xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_min_index | |||||
| "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 | |||||
| "lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value | |||||
| "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 | |||||
| "xxspltd 36,36,0 \n\t" | |||||
| "xvabsdp 44, 44 \n\t" | |||||
| "xvabsdp 45, 45 \n\t" | |||||
| "xvabsdp 46, 46 \n\t" | |||||
| "xvabsdp 47, 47 \n\t" | |||||
| "xvabsdp 48, 48 \n\t" | |||||
| "xvabsdp 49, 49 \n\t" | |||||
| "xvabsdp 50, 50 \n\t" | |||||
| "xvabsdp 51, 51 \n\t" | |||||
| //jump first half forward | |||||
| "b 2f \n\t" | |||||
| ".p2align 5 \n\t" | |||||
| "1: \n\t" | |||||
| "xxmrghd 0,44,45 \n\t" | |||||
| "xxmrgld 1,44,45 \n\t" | |||||
| "xxmrghd 2,46,47 \n\t" | |||||
| "xxmrgld 3,46,47 \n\t" | |||||
| "xxmrghd 4,48,49 \n\t" | |||||
| "xxmrgld 5,48,49 \n\t" | |||||
| "xxmrghd 44,50,51 \n\t" | |||||
| "xxmrgld 45,50,51 \n\t" | |||||
| "xvadddp 46, 0,1 \n\t" | |||||
| "xvadddp 47, 2,3 \n\t" | |||||
| "xvadddp 48, 4,5 \n\t" | |||||
| "xvadddp 49, 44,45 \n\t" | |||||
| "xvcmpgedp 50,46,47 \n\t " | |||||
| "xvcmpgedp 51,48,49 \n\t " | |||||
| "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" | |||||
| "xxsel 32,40,41,50 \n\t" | |||||
| "xxsel 0,46,47,50 \n\t" | |||||
| "xxsel 33,42,43,51 \n\t" | |||||
| "xxsel 1,48,49,51 \n\t" | |||||
| "lxvd2x 44, 0,%[ptr_tmp] \n\t" | |||||
| "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" | |||||
| "xvcmpgedp 2,0,1 \n\t " | |||||
| "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" | |||||
| "xxsel 32,32,33,2 \n\t" | |||||
| "xxsel 3,0,1,2 \n\t" | |||||
| "vaddudm 0,0,5 \n\t" | |||||
| //cmp with previous | |||||
| "xvcmpgedp 4,39,3 \n\t " | |||||
| "vaddudm 5,5,4 \n\t" | |||||
| "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" | |||||
| //select with previous | |||||
| "xxsel 38,38,32,4 \n\t" | |||||
| "xxsel 39,39,3,4 \n\t" | |||||
| "xvabsdp 44, 44 \n\t" | |||||
| "xvabsdp 45, 45 \n\t" | |||||
| "xvabsdp 46, 46 \n\t" | |||||
| "xvabsdp 47, 47 \n\t" | |||||
| "xvabsdp 48, 48 \n\t" | |||||
| "xvabsdp 49, 49 \n\t" | |||||
| "xvabsdp 50, 50 \n\t" | |||||
| "xvabsdp 51, 51 \n\t" | |||||
| //>>/////////////////////////////// half start | |||||
| "2: \n\t" | |||||
| "xxmrghd 0,44,45 \n\t" | |||||
| "xxmrgld 1,44,45 \n\t" | |||||
| "xxmrghd 2,46,47 \n\t" | |||||
| "xxmrgld 3,46,47 \n\t" | |||||
| "xxmrghd 4,48,49 \n\t" | |||||
| "xxmrgld 5,48,49 \n\t" | |||||
| "xxmrghd 44,50,51 \n\t" | |||||
| "xxmrgld 45,50,51 \n\t" | |||||
| "xvadddp 46, 0,1 \n\t" | |||||
| "xvadddp 47, 2,3 \n\t" | |||||
| "xvadddp 48, 4,5 \n\t" | |||||
| "xvadddp 49, 44,45 \n\t" | |||||
| "xvcmpgedp 50,46,47 \n\t " | |||||
| "xvcmpgedp 51,48,49 \n\t " | |||||
| "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" | |||||
| "xxsel 32,40,41,50 \n\t" | |||||
| "xxsel 0,46,47,50 \n\t" | |||||
| "xxsel 33,42,43,51 \n\t" | |||||
| "xxsel 1,48,49,51 \n\t" | |||||
| "lxvd2x 44, 0,%[ptr_tmp] \n\t" | |||||
| "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" | |||||
| "xvcmpgedp 2,0,1 \n\t " | |||||
| "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" | |||||
| "xxsel 32,32,33,2 \n\t" | |||||
| "xxsel 3,0,1,2 \n\t" | |||||
| "vaddudm 0,0,5 \n\t" | |||||
| //cmp with previous | |||||
| "xvcmpgedp 4,39,3 \n\t " | |||||
| "vaddudm 5,5,4 \n\t" | |||||
| "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" | |||||
| "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" | |||||
| //select with previous | |||||
| "xxsel 38,38,32,4 \n\t" | |||||
| "xxsel 39,39,3,4 \n\t" | |||||
| "xvabsdp 44, 44 \n\t" | |||||
| "xvabsdp 45, 45 \n\t" | |||||
| "xvabsdp 46, 46 \n\t" | |||||
| "xvabsdp 47, 47 \n\t" | |||||
| "xvabsdp 48, 48 \n\t" | |||||
| "xvabsdp 49, 49 \n\t" | |||||
| "xvabsdp 50, 50 \n\t" | |||||
| "xvabsdp 51, 51 \n\t" | |||||
| //decrement n | |||||
| "addic. %[n], %[n], -16 \n\t" | |||||
| //Loop back if >0 | |||||
| "bgt+ 1b \n\t" | |||||
| "xxmrghd 0,44,45 \n\t" | |||||
| "xxmrgld 1,44,45 \n\t" | |||||
| "xxmrghd 2,46,47 \n\t" | |||||
| "xxmrgld 3,46,47 \n\t" | |||||
| "xxmrghd 4,48,49 \n\t" | |||||
| "xxmrgld 5,48,49 \n\t" | |||||
| "xxmrghd 44,50,51 \n\t" | |||||
| "xxmrgld 45,50,51 \n\t" | |||||
| "xvadddp 46, 0,1 \n\t" | |||||
| "xvadddp 47, 2,3 \n\t" | |||||
| "xvadddp 48, 4,5 \n\t" | |||||
| "xvadddp 49, 44,45 \n\t" | |||||
| "xvcmpgedp 50,46,47 \n\t " | |||||
| "xvcmpgedp 51,48,49 \n\t " | |||||
| "xxsel 32,40,41,50 \n\t" | |||||
| "xxsel 0,46,47,50 \n\t" | |||||
| "xxsel 33,42,43,51 \n\t" | |||||
| "xxsel 1,48,49,51 \n\t" | |||||
| "xvcmpgedp 2,0,1 \n\t " | |||||
| "xxsel 32,32,33,2 \n\t" | |||||
| "xxsel 3,0,1,2 \n\t" | |||||
| "vaddudm 0,0,5 \n\t" | |||||
| "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" | |||||
| //cmp with previous | |||||
| "xvcmpgedp 4,39,3 \n\t " | |||||
| "vaddudm 5,5,4 \n\t" | |||||
| "xxsel 38,38,32,4 \n\t" | |||||
| "xxsel 39,39,3,4 \n\t" | |||||
| ///////extract min value and min index from vector | |||||
| "xxspltd 32,38,1 \n\t" | |||||
| "xxspltd 40,39,1 \n\t" | |||||
| "xvcmpeqdp. 2, 40,39 \n\t" | |||||
| //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely | |||||
| //0b001110=14 | |||||
| "bc 14,24, 3f \n\t" | |||||
| "xvcmpgedp 4,39, 40 \n\t" | |||||
| "xxsel 0,39,40,4 \n\t" | |||||
| "xxsel 1,38,32,4 \n\t" | |||||
| "stxsdx 0,0,%[ptr_minf] \n\t" | |||||
| "b 4f \n\t" | |||||
| "3: \n\t" | |||||
| //if elements value are equal then choose minimum index | |||||
| "xxspltd 0,40,0 \n\t" | |||||
| "vminud 0,0,6 \n\t" //vs32 vs38 | |||||
| "xxlor 1,32,32 \n\t" | |||||
| "stxsdx 0,0,%[ptr_minf] \n\t" | |||||
| "4: \n\t" | |||||
| "mfvsrd %[index],1 \n\t" | |||||
| : [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) | |||||
| : [mem] "m"(*(const double (*)[2*n])x), [ptr_x] "b"(x), [ptr_minf] "b"(minf) , | |||||
| [i16] "b"(16), [i32] "b"(32), [i48] "b"(48), | |||||
| [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), | |||||
| [start] "v"(start), [adder] "v"(temp_add_index) | |||||
| : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", | |||||
| "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" | |||||
| ); | |||||
| return index; | |||||
| } | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0; | |||||
| FLOAT minf; | |||||
| BLASLONG min=0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) return(min); | |||||
| if (inc_x == 1) { | |||||
| minf = CABS1(x,0); //index will not be incremented | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| min = ziamin_kernel_16_TUNED(n1, x, &minf); | |||||
| i = n1; | |||||
| ix = n1 << 1; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) < minf ) | |||||
| { | |||||
| min = i; | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (min + 1); | |||||
| } else { | |||||
| inc_x2 = 2 * inc_x; | |||||
| minf = CABS1(x,0); | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) < minf ) | |||||
| { | |||||
| min = i; | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | |||||
| return (min + 1); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,958 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2018, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include <stdlib.h> | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| #define HAVE_KERNEL_4x4_VEC 1 | |||||
| #define HAVE_KERNEL_4x2_VEC 1 | |||||
| #define HAVE_KERNEL_4x1_VEC 1 | |||||
| #define HAVE_KERNEL_ADDY 1 | |||||
| #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) | |||||
| #include <altivec.h> | |||||
| #endif | |||||
| // | |||||
| #define NBMAX 4096 | |||||
| #ifdef HAVE_KERNEL_4x4_VEC_ASM | |||||
| #elif HAVE_KERNEL_4x4_VEC | |||||
| static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { | |||||
| FLOAT *a0, *a1, *a2, *a3; | |||||
| a0 = ap; | |||||
| a1 = ap + lda; | |||||
| a2 = a1 + lda; | |||||
| a3 = a2 + lda; | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| register __vector double vx0_r = {x[0], x[0]}; | |||||
| register __vector double vx0_i = {-x[1], x[1]}; | |||||
| register __vector double vx1_r = {x[2], x[2]}; | |||||
| register __vector double vx1_i = {-x[3], x[3]}; | |||||
| register __vector double vx2_r = {x[4], x[4]}; | |||||
| register __vector double vx2_i = {-x[5], x[5]}; | |||||
| register __vector double vx3_r = {x[6], x[6]}; | |||||
| register __vector double vx3_i = {-x[7], x[7]}; | |||||
| #else | |||||
| register __vector double vx0_r = {x[0], -x[0]}; | |||||
| register __vector double vx0_i = {x[1], x[1]}; | |||||
| register __vector double vx1_r = {x[2], -x[2]}; | |||||
| register __vector double vx1_i = {x[3], x[3]}; | |||||
| register __vector double vx2_r = {x[4], -x[4]}; | |||||
| register __vector double vx2_i = {x[5], x[5]}; | |||||
| register __vector double vx3_r = {x[6], -x[6]}; | |||||
| register __vector double vx3_i = {x[7], x[7]}; | |||||
| #endif | |||||
| register __vector double *vy = (__vector double *) y; | |||||
| register __vector double *vptr_a0 = (__vector double *) a0; | |||||
| register __vector double *vptr_a1 = (__vector double *) a1; | |||||
| register __vector double *vptr_a2 = (__vector double *) a2; | |||||
| register __vector double *vptr_a3 = (__vector double *) a3; | |||||
| register __vector double vy_0; | |||||
| register __vector double va0; | |||||
| register __vector double va1; | |||||
| register __vector double va2; | |||||
| register __vector double va3; | |||||
| register __vector double vy_1; | |||||
| register __vector double va0_1; | |||||
| register __vector double va1_1; | |||||
| register __vector double va2_1; | |||||
| register __vector double va3_1; | |||||
| register __vector double vy_2; | |||||
| register __vector double va0_2; | |||||
| register __vector double va1_2; | |||||
| register __vector double va2_2; | |||||
| register __vector double va3_2; | |||||
| register __vector double vy_3; | |||||
| register __vector double va0_3; | |||||
| register __vector double va1_3; | |||||
| register __vector double va2_3; | |||||
| register __vector double va3_3; | |||||
| BLASLONG i = 0; | |||||
| while (i < n) { | |||||
| vy_0 = vy[i]; | |||||
| va0 = vptr_a0[i]; | |||||
| va1 = vptr_a1[i]; | |||||
| va2 = vptr_a2[i]; | |||||
| va3 = vptr_a3[i]; | |||||
| vy_1 = vy[i + 1]; | |||||
| va0_1 = vptr_a0[i + 1]; | |||||
| va1_1 = vptr_a1[i + 1]; | |||||
| va2_1 = vptr_a2[i + 1]; | |||||
| va3_1 = vptr_a3[i + 1]; | |||||
| vy_2 = vy[i + 2]; | |||||
| va0_2 = vptr_a0[i + 2]; | |||||
| va1_2 = vptr_a1[i + 2]; | |||||
| va2_2 = vptr_a2[i + 2]; | |||||
| va3_2 = vptr_a3[i + 2]; | |||||
| vy_3 = vy[i + 3]; | |||||
| va0_3 = vptr_a0[i + 3]; | |||||
| va1_3 = vptr_a1[i + 3]; | |||||
| va2_3 = vptr_a2[i + 3]; | |||||
| va3_3 = vptr_a3[i + 3]; | |||||
| vy_0 += va0*vx0_r; | |||||
| vy_1 += va0_1*vx0_r; | |||||
| vy_2 += va0_2*vx0_r; | |||||
| vy_3 += va0_3*vx0_r; | |||||
| vy_0 += va1*vx1_r; | |||||
| vy_1 += va1_1*vx1_r; | |||||
| vy_2 += va1_2*vx1_r; | |||||
| vy_3 += va1_3*vx1_r; | |||||
| va0 = vec_xxpermdi(va0, va0, 2); | |||||
| va0_1 = vec_xxpermdi(va0_1, va0_1, 2); | |||||
| vy_0 += va2*vx2_r; | |||||
| vy_1 += va2_1*vx2_r; | |||||
| va0_2 = vec_xxpermdi(va0_2, va0_2, 2); | |||||
| va0_3 = vec_xxpermdi(va0_3, va0_3, 2); | |||||
| vy_2 += va2_2*vx2_r; | |||||
| vy_3 += va2_3*vx2_r; | |||||
| va1 = vec_xxpermdi(va1, va1, 2); | |||||
| va1_1 = vec_xxpermdi(va1_1, va1_1, 2); | |||||
| vy_0 += va3*vx3_r; | |||||
| vy_1 += va3_1*vx3_r; | |||||
| va1_2 = vec_xxpermdi(va1_2, va1_2, 2); | |||||
| va1_3 = vec_xxpermdi(va1_3, va1_3, 2); | |||||
| vy_2 += va3_2*vx3_r; | |||||
| vy_3 += va3_3*vx3_r; | |||||
| va2 = vec_xxpermdi(va2, va2, 2); | |||||
| va2_1 = vec_xxpermdi(va2_1, va2_1, 2); | |||||
| vy_0 += va0*vx0_i; | |||||
| vy_1 += va0_1*vx0_i; | |||||
| va2_2 = vec_xxpermdi(va2_2, va2_2, 2); | |||||
| va2_3 = vec_xxpermdi(va2_3, va2_3, 2); | |||||
| vy_2 += va0_2*vx0_i; | |||||
| vy_3 += va0_3*vx0_i; | |||||
| va3 = vec_xxpermdi(va3, va3, 2); | |||||
| va3_1 = vec_xxpermdi(va3_1, va3_1, 2); | |||||
| vy_0 += va1*vx1_i; | |||||
| vy_1 += va1_1*vx1_i; | |||||
| va3_2 = vec_xxpermdi(va3_2, va3_2, 2); | |||||
| va3_3 = vec_xxpermdi(va3_3, va3_3, 2); | |||||
| vy_2 += va1_2*vx1_i; | |||||
| vy_3 += va1_3*vx1_i; | |||||
| vy_0 += va2*vx2_i; | |||||
| vy_1 += va2_1*vx2_i; | |||||
| vy_2 += va2_2*vx2_i; | |||||
| vy_3 += va2_3*vx2_i; | |||||
| vy_0 += va3*vx3_i; | |||||
| vy_1 += va3_1*vx3_i; | |||||
| vy_2 += va3_2*vx3_i; | |||||
| vy_3 += va3_3*vx3_i; | |||||
| vy[i] = vy_0; | |||||
| vy[i + 1] = vy_1; | |||||
| vy[i + 2] = vy_2; | |||||
| vy[i + 3] = vy_3; | |||||
| i += 4; | |||||
| } | |||||
| } | |||||
| #else | |||||
| static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { | |||||
| BLASLONG i; | |||||
| FLOAT *a0, *a1, *a2, *a3; | |||||
| a0 = ap; | |||||
| a1 = ap + lda; | |||||
| a2 = a1 + lda; | |||||
| a3 = a2 + lda; | |||||
| for (i = 0; i < 2 * n; i += 2) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; | |||||
| y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; | |||||
| y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; | |||||
| y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; | |||||
| y[i] += a2[i] * x[4] - a2[i + 1] * x[5]; | |||||
| y[i + 1] += a2[i] * x[5] + a2[i + 1] * x[4]; | |||||
| y[i] += a3[i] * x[6] - a3[i + 1] * x[7]; | |||||
| y[i + 1] += a3[i] * x[7] + a3[i + 1] * x[6]; | |||||
| #else | |||||
| y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; | |||||
| y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; | |||||
| y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; | |||||
| y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; | |||||
| y[i] += a2[i] * x[4] + a2[i + 1] * x[5]; | |||||
| y[i + 1] += a2[i] * x[5] - a2[i + 1] * x[4]; | |||||
| y[i] += a3[i] * x[6] + a3[i + 1] * x[7]; | |||||
| y[i + 1] += a3[i] * x[7] - a3[i + 1] * x[6]; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_4x2_VEC | |||||
| static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { | |||||
| BLASLONG i; | |||||
| FLOAT *a0, *a1; | |||||
| a0 = ap; | |||||
| a1 = ap + lda; | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| register __vector double vx0_r = {x[0], x[0]}; | |||||
| register __vector double vx0_i = {-x[1], x[1]}; | |||||
| register __vector double vx1_r = {x[2], x[2]}; | |||||
| register __vector double vx1_i = {-x[3], x[3]}; | |||||
| #else | |||||
| register __vector double vx0_r = {x[0], -x[0]}; | |||||
| register __vector double vx0_i = {x[1], x[1]}; | |||||
| register __vector double vx1_r = {x[2], -x[2]}; | |||||
| register __vector double vx1_i = {x[3], x[3]}; | |||||
| #endif | |||||
| register __vector double *vy = (__vector double *) y; | |||||
| register __vector double *vptr_a0 = (__vector double *) a0; | |||||
| register __vector double *vptr_a1 = (__vector double *) a1; | |||||
| for (i = 0; i < n; i += 4) { | |||||
| register __vector double vy_0 = vy[i]; | |||||
| register __vector double vy_1 = vy[i + 1]; | |||||
| register __vector double vy_2 = vy[i + 2]; | |||||
| register __vector double vy_3 = vy[i + 3]; | |||||
| register __vector double va0 = vptr_a0[i]; | |||||
| register __vector double va0_1 = vptr_a0[i + 1]; | |||||
| register __vector double va0_2 = vptr_a0[i + 2]; | |||||
| register __vector double va0_3 = vptr_a0[i + 3]; | |||||
| register __vector double va1 = vptr_a1[i]; | |||||
| register __vector double va1_1 = vptr_a1[i + 1]; | |||||
| register __vector double va1_2 = vptr_a1[i + 2]; | |||||
| register __vector double va1_3 = vptr_a1[i + 3]; | |||||
| vy_0 += va0*vx0_r; | |||||
| vy_1 += va0_1*vx0_r; | |||||
| vy_2 += va0_2*vx0_r; | |||||
| vy_3 += va0_3*vx0_r; | |||||
| va0 = vec_xxpermdi(va0, va0, 2); | |||||
| va0_1 = vec_xxpermdi(va0_1, va0_1, 2); | |||||
| va0_2 = vec_xxpermdi(va0_2, va0_2, 2); | |||||
| va0_3 = vec_xxpermdi(va0_3, va0_3, 2); | |||||
| vy_0 += va1*vx1_r; | |||||
| vy_1 += va1_1*vx1_r; | |||||
| vy_2 += va1_2*vx1_r; | |||||
| vy_3 += va1_3*vx1_r; | |||||
| va1 = vec_xxpermdi(va1, va1, 2); | |||||
| va1_1 = vec_xxpermdi(va1_1, va1_1, 2); | |||||
| va1_2 = vec_xxpermdi(va1_2, va1_2, 2); | |||||
| va1_3 = vec_xxpermdi(va1_3, va1_3, 2); | |||||
| vy_0 += va0*vx0_i; | |||||
| vy_1 += va0_1*vx0_i; | |||||
| vy_2 += va0_2*vx0_i; | |||||
| vy_3 += va0_3*vx0_i; | |||||
| vy_0 += va1*vx1_i; | |||||
| vy_1 += va1_1*vx1_i; | |||||
| vy_2 += va1_2*vx1_i; | |||||
| vy_3 += va1_3*vx1_i; | |||||
| vy[i] = vy_0; | |||||
| vy[i + 1] = vy_1; | |||||
| vy[i + 2] = vy_2; | |||||
| vy[i + 3] = vy_3; | |||||
| } | |||||
| } | |||||
| #else | |||||
| static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { | |||||
| BLASLONG i; | |||||
| FLOAT *a0, *a1; | |||||
| a0 = ap; | |||||
| a1 = ap + lda; | |||||
| for (i = 0; i < 2 * n; i += 2) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; | |||||
| y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; | |||||
| y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; | |||||
| y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; | |||||
| #else | |||||
| y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; | |||||
| y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; | |||||
| y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; | |||||
| y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_4x1_VEC | |||||
| static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { | |||||
| BLASLONG i; | |||||
| FLOAT *a0; | |||||
| a0 = ap; | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| register __vector double vx0_r = {x[0], x[0]}; | |||||
| register __vector double vx0_i = {-x[1], x[1]}; | |||||
| #else | |||||
| register __vector double vx0_r = {x[0], -x[0]}; | |||||
| register __vector double vx0_i = {x[1], x[1]}; | |||||
| #endif | |||||
| register __vector double *vy = (__vector double *) y; | |||||
| register __vector double *vptr_a0 = (__vector double *) a0; | |||||
| for (i = 0; i < n; i += 4) { | |||||
| register __vector double vy_0 = vy[i]; | |||||
| register __vector double vy_1 = vy[i + 1]; | |||||
| register __vector double vy_2 = vy[i + 2]; | |||||
| register __vector double vy_3 = vy[i + 3]; | |||||
| register __vector double va0 = vptr_a0[i]; | |||||
| register __vector double va0_1 = vptr_a0[i + 1]; | |||||
| register __vector double va0_2 = vptr_a0[i + 2]; | |||||
| register __vector double va0_3 = vptr_a0[i + 3]; | |||||
| vy_0 += va0*vx0_r; | |||||
| vy_1 += va0_1*vx0_r; | |||||
| vy_2 += va0_2*vx0_r; | |||||
| vy_3 += va0_3*vx0_r; | |||||
| va0 = vec_xxpermdi(va0, va0, 2); | |||||
| va0_1 = vec_xxpermdi(va0_1, va0_1, 2); | |||||
| va0_2 = vec_xxpermdi(va0_2, va0_2, 2); | |||||
| va0_3 = vec_xxpermdi(va0_3, va0_3, 2); | |||||
| vy_0 += va0*vx0_i; | |||||
| vy_1 += va0_1*vx0_i; | |||||
| vy_2 += va0_2*vx0_i; | |||||
| vy_3 += va0_3*vx0_i; | |||||
| vy[i] = vy_0; | |||||
| vy[i + 1] = vy_1; | |||||
| vy[i + 2] = vy_2; | |||||
| vy[i + 3] = vy_3; | |||||
| } | |||||
| } | |||||
| #else | |||||
| static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { | |||||
| BLASLONG i; | |||||
| FLOAT *a0; | |||||
| a0 = ap; | |||||
| for (i = 0; i < 2 * n; i += 2) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; | |||||
| y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; | |||||
| #else | |||||
| y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; | |||||
| y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_ADDY | |||||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { | |||||
| BLASLONG i; | |||||
| #if !defined(XCONJ) | |||||
| register __vector double valpha_r = {alpha_r, alpha_r}; | |||||
| register __vector double valpha_i = {-alpha_i, alpha_i}; | |||||
| #else | |||||
| register __vector double valpha_r = {alpha_r, -alpha_r}; | |||||
| register __vector double valpha_i = {alpha_i, alpha_i}; | |||||
| #endif | |||||
| register __vector double *vptr_src = (__vector double *) src; | |||||
| if (inc_dest != 2) { | |||||
| register __vector double *vptr_y = (__vector double *) dest; | |||||
| //note that inc_dest is already 2x. so we should add it to double* | |||||
| register __vector double *vptr_y1 = (__vector double *) (dest + inc_dest); | |||||
| register __vector double *vptr_y2 = (__vector double *) (dest + 2 * inc_dest); | |||||
| register __vector double *vptr_y3 = (__vector double *) (dest + 3 * inc_dest); | |||||
| BLASLONG dest_t = 0; | |||||
| BLASLONG add_dest = inc_dest << 1; //inc_dest is already multiplied by 2, so for vector 4 we just multiply 2 times | |||||
| for (i = 0; i < n; i += 4) { | |||||
| register __vector double vy_0 = vptr_y[dest_t]; | |||||
| register __vector double vy_1 = vptr_y1[dest_t]; | |||||
| register __vector double vy_2 = vptr_y2[dest_t]; | |||||
| register __vector double vy_3 = vptr_y3[dest_t]; | |||||
| register __vector double vsrc = vptr_src[i]; | |||||
| register __vector double vsrc_1 = vptr_src[i + 1]; | |||||
| register __vector double vsrc_2 = vptr_src[i + 2]; | |||||
| register __vector double vsrc_3 = vptr_src[i + 3]; | |||||
| vy_0 += vsrc*valpha_r; | |||||
| vy_1 += vsrc_1*valpha_r; | |||||
| vy_2 += vsrc_2*valpha_r; | |||||
| vy_3 += vsrc_3*valpha_r; | |||||
| vsrc = vec_xxpermdi(vsrc, vsrc, 2); | |||||
| vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2); | |||||
| vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2); | |||||
| vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2); | |||||
| vy_0 += vsrc*valpha_i; | |||||
| vy_1 += vsrc_1*valpha_i; | |||||
| vy_2 += vsrc_2*valpha_i; | |||||
| vy_3 += vsrc_3*valpha_i; | |||||
| vptr_y[dest_t] = vy_0; | |||||
| vptr_y1[dest_t ] = vy_1; | |||||
| vptr_y2[dest_t] = vy_2; | |||||
| vptr_y3[dest_t] = vy_3; | |||||
| dest_t += add_dest; | |||||
| } | |||||
| return; | |||||
| } else { | |||||
| register __vector double *vptr_y = (__vector double *) dest; | |||||
| for (i = 0; i < n; i += 4) { | |||||
| register __vector double vy_0 = vptr_y[i]; | |||||
| register __vector double vy_1 = vptr_y[i + 1]; | |||||
| register __vector double vy_2 = vptr_y[i + 2]; | |||||
| register __vector double vy_3 = vptr_y[i + 3]; | |||||
| register __vector double vsrc = vptr_src[i]; | |||||
| register __vector double vsrc_1 = vptr_src[i + 1]; | |||||
| register __vector double vsrc_2 = vptr_src[i + 2]; | |||||
| register __vector double vsrc_3 = vptr_src[i + 3]; | |||||
| vy_0 += vsrc*valpha_r; | |||||
| vy_1 += vsrc_1*valpha_r; | |||||
| vy_2 += vsrc_2*valpha_r; | |||||
| vy_3 += vsrc_3*valpha_r; | |||||
| vsrc = vec_xxpermdi(vsrc, vsrc, 2); | |||||
| vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2); | |||||
| vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2); | |||||
| vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2); | |||||
| vy_0 += vsrc*valpha_i; | |||||
| vy_1 += vsrc_1*valpha_i; | |||||
| vy_2 += vsrc_2*valpha_i; | |||||
| vy_3 += vsrc_3*valpha_i; | |||||
| vptr_y[i] = vy_0; | |||||
| vptr_y[i + 1 ] = vy_1; | |||||
| vptr_y[i + 2] = vy_2; | |||||
| vptr_y[i + 3] = vy_3; | |||||
| } | |||||
| return; | |||||
| } | |||||
| return; | |||||
| } | |||||
| #else | |||||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { | |||||
| BLASLONG i; | |||||
| if (inc_dest != 2) { | |||||
| FLOAT temp_r; | |||||
| FLOAT temp_i; | |||||
| for (i = 0; i < n; i++) { | |||||
| #if !defined(XCONJ) | |||||
| temp_r = alpha_r * src[0] - alpha_i * src[1]; | |||||
| temp_i = alpha_r * src[1] + alpha_i * src[0]; | |||||
| #else | |||||
| temp_r = alpha_r * src[0] + alpha_i * src[1]; | |||||
| temp_i = -alpha_r * src[1] + alpha_i * src[0]; | |||||
| #endif | |||||
| *dest += temp_r; | |||||
| *(dest + 1) += temp_i; | |||||
| src += 2; | |||||
| dest += inc_dest; | |||||
| } | |||||
| return; | |||||
| } | |||||
| FLOAT temp_r0; | |||||
| FLOAT temp_i0; | |||||
| FLOAT temp_r1; | |||||
| FLOAT temp_i1; | |||||
| FLOAT temp_r2; | |||||
| FLOAT temp_i2; | |||||
| FLOAT temp_r3; | |||||
| FLOAT temp_i3; | |||||
| for (i = 0; i < n; i += 4) { | |||||
| #if !defined(XCONJ) | |||||
| temp_r0 = alpha_r * src[0] - alpha_i * src[1]; | |||||
| temp_i0 = alpha_r * src[1] + alpha_i * src[0]; | |||||
| temp_r1 = alpha_r * src[2] - alpha_i * src[3]; | |||||
| temp_i1 = alpha_r * src[3] + alpha_i * src[2]; | |||||
| temp_r2 = alpha_r * src[4] - alpha_i * src[5]; | |||||
| temp_i2 = alpha_r * src[5] + alpha_i * src[4]; | |||||
| temp_r3 = alpha_r * src[6] - alpha_i * src[7]; | |||||
| temp_i3 = alpha_r * src[7] + alpha_i * src[6]; | |||||
| #else | |||||
| temp_r0 = alpha_r * src[0] + alpha_i * src[1]; | |||||
| temp_i0 = -alpha_r * src[1] + alpha_i * src[0]; | |||||
| temp_r1 = alpha_r * src[2] + alpha_i * src[3]; | |||||
| temp_i1 = -alpha_r * src[3] + alpha_i * src[2]; | |||||
| temp_r2 = alpha_r * src[4] + alpha_i * src[5]; | |||||
| temp_i2 = -alpha_r * src[5] + alpha_i * src[4]; | |||||
| temp_r3 = alpha_r * src[6] + alpha_i * src[7]; | |||||
| temp_i3 = -alpha_r * src[7] + alpha_i * src[6]; | |||||
| #endif | |||||
| dest[0] += temp_r0; | |||||
| dest[1] += temp_i0; | |||||
| dest[2] += temp_r1; | |||||
| dest[3] += temp_i1; | |||||
| dest[4] += temp_r2; | |||||
| dest[5] += temp_i2; | |||||
| dest[6] += temp_r3; | |||||
| dest[7] += temp_i3; | |||||
| src += 8; | |||||
| dest += 8; | |||||
| } | |||||
| return; | |||||
| } | |||||
| #endif | |||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { | |||||
| BLASLONG i; | |||||
| BLASLONG j; | |||||
| FLOAT *a_ptr; | |||||
| FLOAT *x_ptr; | |||||
| FLOAT *y_ptr; | |||||
| BLASLONG n1; | |||||
| BLASLONG m1; | |||||
| BLASLONG m2; | |||||
| BLASLONG m3; | |||||
| BLASLONG n2; | |||||
| FLOAT xbuffer[8], *ybuffer; | |||||
| if (m < 1) return (0); | |||||
| if (n < 1) return (0); | |||||
| ybuffer = buffer; | |||||
| inc_x *= 2; | |||||
| inc_y *= 2; | |||||
| lda *= 2; | |||||
| n1 = n / 4; | |||||
| n2 = n % 4; | |||||
| m3 = m % 4; | |||||
| m1 = m - (m % 4); | |||||
| m2 = (m % NBMAX) - (m % 4); | |||||
| y_ptr = y; | |||||
| BLASLONG NB = NBMAX; | |||||
| while (NB == NBMAX) { | |||||
| m1 -= NB; | |||||
| if (m1 < 0) { | |||||
| if (m2 == 0) break; | |||||
| NB = m2; | |||||
| } | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| //zero_y(NB,ybuffer); | |||||
| memset(ybuffer, 0, NB * 16); | |||||
| if (inc_x == 2) { | |||||
| for (i = 0; i < n1; i++) { | |||||
| zgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer); | |||||
| a_ptr += lda << 2; | |||||
| x_ptr += 8; | |||||
| } | |||||
| if (n2 & 2) { | |||||
| zgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer); | |||||
| x_ptr += 4; | |||||
| a_ptr += 2 * lda; | |||||
| } | |||||
| if (n2 & 1) { | |||||
| zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer); | |||||
| x_ptr += 2; | |||||
| a_ptr += lda; | |||||
| } | |||||
| } else { | |||||
| for (i = 0; i < n1; i++) { | |||||
| xbuffer[0] = x_ptr[0]; | |||||
| xbuffer[1] = x_ptr[1]; | |||||
| x_ptr += inc_x; | |||||
| xbuffer[2] = x_ptr[0]; | |||||
| xbuffer[3] = x_ptr[1]; | |||||
| x_ptr += inc_x; | |||||
| xbuffer[4] = x_ptr[0]; | |||||
| xbuffer[5] = x_ptr[1]; | |||||
| x_ptr += inc_x; | |||||
| xbuffer[6] = x_ptr[0]; | |||||
| xbuffer[7] = x_ptr[1]; | |||||
| x_ptr += inc_x; | |||||
| zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer); | |||||
| a_ptr += lda << 2; | |||||
| } | |||||
| for (i = 0; i < n2; i++) { | |||||
| xbuffer[0] = x_ptr[0]; | |||||
| xbuffer[1] = x_ptr[1]; | |||||
| x_ptr += inc_x; | |||||
| zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); | |||||
| a_ptr += lda; | |||||
| } | |||||
| } | |||||
| add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i); | |||||
| a += 2 * NB; | |||||
| y_ptr += NB * inc_y; | |||||
| } | |||||
| if (m3 == 0) return (0); | |||||
| if (m3 == 1) { | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| FLOAT temp_r = 0.0; | |||||
| FLOAT temp_i = 0.0; | |||||
| if (lda == 2 && inc_x == 2) { | |||||
| for (i = 0; i < (n & -2); i += 2) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
| temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
| temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3]; | |||||
| temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2]; | |||||
| #else | |||||
| temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
| temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
| temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3]; | |||||
| temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2]; | |||||
| #endif | |||||
| a_ptr += 4; | |||||
| x_ptr += 4; | |||||
| } | |||||
| for (; i < n; i++) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
| temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
| #else | |||||
| temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
| temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
| #endif | |||||
| a_ptr += 2; | |||||
| x_ptr += 2; | |||||
| } | |||||
| } else { | |||||
| for (i = 0; i < n; i++) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
| temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
| #else | |||||
| temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
| temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
| #endif | |||||
| a_ptr += lda; | |||||
| x_ptr += inc_x; | |||||
| } | |||||
| } | |||||
| #if !defined(XCONJ) | |||||
| y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||||
| y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||||
| #else | |||||
| y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||||
| y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||||
| #endif | |||||
| return (0); | |||||
| } | |||||
| if (m3 == 2) { | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| FLOAT temp_r0 = 0.0; | |||||
| FLOAT temp_i0 = 0.0; | |||||
| FLOAT temp_r1 = 0.0; | |||||
| FLOAT temp_i1 = 0.0; | |||||
| if (lda == 4 && inc_x == 2) { | |||||
| for (i = 0; i < (n & -2); i += 2) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||||
| temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3]; | |||||
| temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2]; | |||||
| temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3]; | |||||
| temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2]; | |||||
| #else | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||||
| temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3]; | |||||
| temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2]; | |||||
| temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||||
| temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2]; | |||||
| #endif | |||||
| a_ptr += 8; | |||||
| x_ptr += 4; | |||||
| } | |||||
| for (; i < n; i++) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||||
| #else | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||||
| #endif | |||||
| a_ptr += 4; | |||||
| x_ptr += 2; | |||||
| } | |||||
| } else { | |||||
| for (i = 0; i < n; i++) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||||
| #else | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||||
| #endif | |||||
| a_ptr += lda; | |||||
| x_ptr += inc_x; | |||||
| } | |||||
| } | |||||
| #if !defined(XCONJ) | |||||
| y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
| y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
| y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
| #else | |||||
| y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
| y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
| y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
| #endif | |||||
| return (0); | |||||
| } | |||||
| if (m3 == 3) { | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| FLOAT temp_r0 = 0.0; | |||||
| FLOAT temp_i0 = 0.0; | |||||
| FLOAT temp_r1 = 0.0; | |||||
| FLOAT temp_i1 = 0.0; | |||||
| FLOAT temp_r2 = 0.0; | |||||
| FLOAT temp_i2 = 0.0; | |||||
| if (lda == 6 && inc_x == 2) { | |||||
| for (i = 0; i < n; i++) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||||
| temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; | |||||
| temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; | |||||
| #else | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||||
| temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||||
| temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; | |||||
| #endif | |||||
| a_ptr += 6; | |||||
| x_ptr += 2; | |||||
| } | |||||
| } else { | |||||
| for (i = 0; i < n; i++) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||||
| temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; | |||||
| temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; | |||||
| #else | |||||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||||
| temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||||
| temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; | |||||
| #endif | |||||
| a_ptr += lda; | |||||
| x_ptr += inc_x; | |||||
| } | |||||
| } | |||||
| #if !defined(XCONJ) | |||||
| y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
| y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
| y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2; | |||||
| y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2; | |||||
| #else | |||||
| y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
| y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
| y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2; | |||||
| y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2; | |||||
| #endif | |||||
| return (0); | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,847 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2018, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #define NBMAX 4096 | |||||
| #define HAVE_KERNEL_4x4_VEC 1 | |||||
| #define HAVE_KERNEL_4x2_VEC 1 | |||||
| #define HAVE_KERNEL_4x1_VEC 1 | |||||
| #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) | |||||
| #include <altivec.h> | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_4x4_VEC_ASM | |||||
| #elif HAVE_KERNEL_4x4_VEC | |||||
| static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | |||||
| BLASLONG i; | |||||
| FLOAT *a0, *a1, *a2, *a3; | |||||
| a0 = ap; | |||||
| a1 = ap + lda; | |||||
| a2 = a1 + lda; | |||||
| a3 = a2 + lda; | |||||
| //p for positive(real*real,image*image) r for image (real*image,image*real) | |||||
| register __vector double vtemp0_p = {0.0, 0.0}; | |||||
| register __vector double vtemp0_r = {0.0, 0.0}; | |||||
| register __vector double vtemp1_p = {0.0, 0.0}; | |||||
| register __vector double vtemp1_r = {0.0, 0.0}; | |||||
| register __vector double vtemp2_p = {0.0, 0.0}; | |||||
| register __vector double vtemp2_r = {0.0, 0.0}; | |||||
| register __vector double vtemp3_p = {0.0, 0.0}; | |||||
| register __vector double vtemp3_r = {0.0, 0.0}; | |||||
| i = 0; | |||||
| n = n << 1; | |||||
| while (i < n) { | |||||
| // __builtin_prefetch(&x[i]); | |||||
| // __builtin_prefetch(&a0[i]); | |||||
| // __builtin_prefetch(&a1[i]); | |||||
| // __builtin_prefetch(&a2[i]); | |||||
| // __builtin_prefetch(&a3[i]); | |||||
| register __vector double vx_0 = *(__vector double*) (&x[i]); | |||||
| register __vector double vx_1 = *(__vector double*) (&x[i + 2]); | |||||
| register __vector double vx_2 = *(__vector double*) (&x[i + 4]); | |||||
| register __vector double vx_3 = *(__vector double*) (&x[i + 6]); | |||||
| register __vector double va0 = *(__vector double*) (&a0[i]); | |||||
| register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); | |||||
| register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); | |||||
| register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); | |||||
| register __vector double va1 = *(__vector double*) (&a1[i]); | |||||
| register __vector double va1_1 = *(__vector double*) (&a1[i + 2]); | |||||
| register __vector double va1_2 = *(__vector double*) (&a1[i + 4]); | |||||
| register __vector double va1_3 = *(__vector double*) (&a1[i + 6]); | |||||
| register __vector double va2 = *(__vector double*) (&a2[i]); | |||||
| register __vector double va2_1 = *(__vector double*) (&a2[i + 2]); | |||||
| register __vector double va2_2 = *(__vector double*) (&a2[i + 4]); | |||||
| register __vector double va2_3 = *(__vector double*) (&a2[i + 6]); | |||||
| register __vector double va3 = *(__vector double*) (&a3[i]); | |||||
| register __vector double va3_1 = *(__vector double*) (&a3[i + 2]); | |||||
| register __vector double va3_2 = *(__vector double*) (&a3[i + 4]); | |||||
| register __vector double va3_3 = *(__vector double*) (&a3[i + 6]); | |||||
| register __vector double vxr_0 = vec_xxpermdi(vx_0, vx_0, 2); | |||||
| register __vector double vxr_1 = vec_xxpermdi(vx_1, vx_1, 2); | |||||
| i += 8; | |||||
| vtemp0_p += vx_0*va0; | |||||
| vtemp0_r += vxr_0*va0; | |||||
| vtemp1_p += vx_0*va1; | |||||
| vtemp1_r += vxr_0*va1; | |||||
| vtemp2_p += vx_0*va2; | |||||
| vtemp2_r += vxr_0*va2; | |||||
| vtemp3_p += vx_0*va3; | |||||
| vtemp3_r += vxr_0*va3; | |||||
| vtemp0_p += vx_1*va0_1; | |||||
| vtemp0_r += vxr_1*va0_1; | |||||
| vtemp1_p += vx_1*va1_1; | |||||
| vtemp1_r += vxr_1*va1_1; | |||||
| vxr_0 = vec_xxpermdi(vx_2, vx_2, 2); | |||||
| vtemp2_p += vx_1*va2_1; | |||||
| vtemp2_r += vxr_1*va2_1; | |||||
| vtemp3_p += vx_1*va3_1; | |||||
| vtemp3_r += vxr_1*va3_1; | |||||
| vtemp0_p += vx_2*va0_2; | |||||
| vtemp0_r += vxr_0*va0_2; | |||||
| vxr_1 = vec_xxpermdi(vx_3, vx_3, 2); | |||||
| vtemp1_p += vx_2*va1_2; | |||||
| vtemp1_r += vxr_0*va1_2; | |||||
| vtemp2_p += vx_2*va2_2; | |||||
| vtemp2_r += vxr_0*va2_2; | |||||
| vtemp3_p += vx_2*va3_2; | |||||
| vtemp3_r += vxr_0*va3_2; | |||||
| vtemp0_p += vx_3*va0_3; | |||||
| vtemp0_r += vxr_1*va0_3; | |||||
| vtemp1_p += vx_3*va1_3; | |||||
| vtemp1_r += vxr_1*va1_3; | |||||
| vtemp2_p += vx_3*va2_3; | |||||
| vtemp2_r += vxr_1*va2_3; | |||||
| vtemp3_p += vx_3*va3_3; | |||||
| vtemp3_r += vxr_1*va3_3; | |||||
| } | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; | |||||
| register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; | |||||
| register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1]; | |||||
| register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1]; | |||||
| register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1]; | |||||
| register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1]; | |||||
| register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1]; | |||||
| register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1]; | |||||
| #else | |||||
| register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; | |||||
| register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; | |||||
| register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1]; | |||||
| register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1]; | |||||
| register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1]; | |||||
| register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1]; | |||||
| register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1]; | |||||
| register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1]; | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
| y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
| y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
| y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; | |||||
| y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; | |||||
| y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; | |||||
| y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; | |||||
| #else | |||||
| y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
| y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
| y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
| y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; | |||||
| y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; | |||||
| y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; | |||||
| y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; | |||||
| #endif | |||||
| } | |||||
| #else | |||||
| static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | |||||
| BLASLONG i; | |||||
| FLOAT *a0, *a1, *a2, *a3; | |||||
| a0 = ap; | |||||
| a1 = ap + lda; | |||||
| a2 = a1 + lda; | |||||
| a3 = a2 + lda; | |||||
| FLOAT temp_r0 = 0.0; | |||||
| FLOAT temp_r1 = 0.0; | |||||
| FLOAT temp_r2 = 0.0; | |||||
| FLOAT temp_r3 = 0.0; | |||||
| FLOAT temp_i0 = 0.0; | |||||
| FLOAT temp_i1 = 0.0; | |||||
| FLOAT temp_i2 = 0.0; | |||||
| FLOAT temp_i3 = 0.0; | |||||
| for (i = 0; i < 2 * n; i += 2) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; | |||||
| temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; | |||||
| temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1]; | |||||
| temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i]; | |||||
| temp_r2 += a2[i] * x[i] - a2[i + 1] * x[i + 1]; | |||||
| temp_i2 += a2[i] * x[i + 1] + a2[i + 1] * x[i]; | |||||
| temp_r3 += a3[i] * x[i] - a3[i + 1] * x[i + 1]; | |||||
| temp_i3 += a3[i] * x[i + 1] + a3[i + 1] * x[i]; | |||||
| #else | |||||
| temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; | |||||
| temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; | |||||
| temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1]; | |||||
| temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i]; | |||||
| temp_r2 += a2[i] * x[i] + a2[i + 1] * x[i + 1]; | |||||
| temp_i2 += a2[i] * x[i + 1] - a2[i + 1] * x[i]; | |||||
| temp_r3 += a3[i] * x[i] + a3[i + 1] * x[i + 1]; | |||||
| temp_i3 += a3[i] * x[i + 1] - a3[i + 1] * x[i]; | |||||
| #endif | |||||
| } | |||||
| #if !defined(XCONJ) | |||||
| y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
| y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
| y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
| y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; | |||||
| y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; | |||||
| y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; | |||||
| y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; | |||||
| #else | |||||
| y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
| y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
| y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
| y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; | |||||
| y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; | |||||
| y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; | |||||
| y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; | |||||
| #endif | |||||
| } | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_4x2_VEC | |||||
| static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | |||||
| BLASLONG i; | |||||
| FLOAT *a0, *a1; | |||||
| a0 = ap; | |||||
| a1 = ap + lda; | |||||
| //p for positive(real*real,image*image) r for image (real*image,image*real) | |||||
| register __vector double vtemp0_p = {0.0, 0.0}; | |||||
| register __vector double vtemp0_r = {0.0, 0.0}; | |||||
| register __vector double vtemp1_p = {0.0, 0.0}; | |||||
| register __vector double vtemp1_r = {0.0, 0.0}; | |||||
| i = 0; | |||||
| n = n << 1; | |||||
| while (i < n) { | |||||
| register __vector double vx_0 = *(__vector double*) (&x[i]); | |||||
| register __vector double vx_1 = *(__vector double*) (&x[i + 2]); | |||||
| register __vector double vx_2 = *(__vector double*) (&x[i + 4]); | |||||
| register __vector double vx_3 = *(__vector double*) (&x[i + 6]); | |||||
| register __vector double va0 = *(__vector double*) (&a0[i]); | |||||
| register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); | |||||
| register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); | |||||
| register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); | |||||
| register __vector double va1 = *(__vector double*) (&a1[i]); | |||||
| register __vector double va1_1 = *(__vector double*) (&a1[i + 2]); | |||||
| register __vector double va1_2 = *(__vector double*) (&a1[i + 4]); | |||||
| register __vector double va1_3 = *(__vector double*) (&a1[i + 6]); | |||||
| register __vector double vxr_0 = vec_xxpermdi(vx_0, vx_0, 2); | |||||
| register __vector double vxr_1 = vec_xxpermdi(vx_1, vx_1, 2); | |||||
| i += 8; | |||||
| vtemp0_p += vx_0*va0; | |||||
| vtemp0_r += vxr_0*va0; | |||||
| vtemp1_p += vx_0*va1; | |||||
| vtemp1_r += vxr_0*va1; | |||||
| vxr_0 = vec_xxpermdi(vx_2, vx_2, 2); | |||||
| vtemp0_p += vx_1*va0_1; | |||||
| vtemp0_r += vxr_1*va0_1; | |||||
| vtemp1_p += vx_1*va1_1; | |||||
| vtemp1_r += vxr_1*va1_1; | |||||
| vxr_1 = vec_xxpermdi(vx_3, vx_3, 2); | |||||
| vtemp0_p += vx_2*va0_2; | |||||
| vtemp0_r += vxr_0*va0_2; | |||||
| vtemp1_p += vx_2*va1_2; | |||||
| vtemp1_r += vxr_0*va1_2; | |||||
| vtemp0_p += vx_3*va0_3; | |||||
| vtemp0_r += vxr_1*va0_3; | |||||
| vtemp1_p += vx_3*va1_3; | |||||
| vtemp1_r += vxr_1*va1_3; | |||||
| } | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; | |||||
| register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; | |||||
| register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1]; | |||||
| register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1]; | |||||
| #else | |||||
| register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; | |||||
| register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; | |||||
| register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1]; | |||||
| register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1]; | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
| y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
| y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
| #else | |||||
| y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
| y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
| y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
| #endif | |||||
| } | |||||
| #else | |||||
| static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | |||||
| BLASLONG i; | |||||
| FLOAT *a0, *a1; | |||||
| a0 = ap; | |||||
| a1 = ap + lda; | |||||
| FLOAT temp_r0 = 0.0; | |||||
| FLOAT temp_r1 = 0.0; | |||||
| FLOAT temp_i0 = 0.0; | |||||
| FLOAT temp_i1 = 0.0; | |||||
| for (i = 0; i < 2 * n; i += 2) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; | |||||
| temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; | |||||
| temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1]; | |||||
| temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i]; | |||||
| #else | |||||
| temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; | |||||
| temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; | |||||
| temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1]; | |||||
| temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i]; | |||||
| #endif | |||||
| } | |||||
| #if !defined(XCONJ) | |||||
| y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
| y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
| y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
| #else | |||||
| y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
| y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
| y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
| #endif | |||||
| } | |||||
| #endif | |||||
| #ifdef HAVE_KERNEL_4x1_VEC | |||||
| static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | |||||
| BLASLONG i; | |||||
| FLOAT *a0 ; | |||||
| a0 = ap; | |||||
| //p for positive(real*real,image*image) r for image (real*image,image*real) | |||||
| register __vector double vtemp0_p = {0.0, 0.0}; | |||||
| register __vector double vtemp0_r = {0.0, 0.0}; | |||||
| i = 0; | |||||
| n = n << 1; | |||||
| while (i < n) { | |||||
| register __vector double vx_0 = *(__vector double*) (&x[i]); | |||||
| register __vector double vx_1 = *(__vector double*) (&x[i + 2]); | |||||
| register __vector double vx_2 = *(__vector double*) (&x[i + 4]); | |||||
| register __vector double vx_3 = *(__vector double*) (&x[i + 6]); | |||||
| register __vector double va0 = *(__vector double*) (&a0[i]); | |||||
| register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); | |||||
| register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); | |||||
| register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); | |||||
| register __vector double vxr_0 = vec_xxpermdi(vx_0, vx_0, 2); | |||||
| register __vector double vxr_1 = vec_xxpermdi(vx_1, vx_1, 2); | |||||
| i += 8; | |||||
| vtemp0_p += vx_0*va0; | |||||
| vtemp0_r += vxr_0*va0; | |||||
| vxr_0 = vec_xxpermdi(vx_2, vx_2, 2); | |||||
| vtemp0_p += vx_1*va0_1; | |||||
| vtemp0_r += vxr_1*va0_1; | |||||
| vxr_1 = vec_xxpermdi(vx_3, vx_3, 2); | |||||
| vtemp0_p += vx_2*va0_2; | |||||
| vtemp0_r += vxr_0*va0_2; | |||||
| vtemp0_p += vx_3*va0_3; | |||||
| vtemp0_r += vxr_1*va0_3; | |||||
| } | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; | |||||
| register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; | |||||
| #else | |||||
| register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; | |||||
| register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
| y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
| #else | |||||
| y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
| y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
| #endif | |||||
| } | |||||
| #else | |||||
| static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | |||||
| BLASLONG i; | |||||
| FLOAT *a0; | |||||
| a0 = ap; | |||||
| FLOAT temp_r0 = 0.0; | |||||
| FLOAT temp_i0 = 0.0; | |||||
| for (i = 0; i < 2 * n; i += 2) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; | |||||
| temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; | |||||
| #else | |||||
| temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; | |||||
| temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; | |||||
| #endif | |||||
| } | |||||
| #if !defined(XCONJ) | |||||
| y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
| y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
| #else | |||||
| y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
| y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
| #endif | |||||
| } | |||||
| #endif | |||||
| static __attribute__((always_inline)) void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||||
| BLASLONG i; | |||||
| for (i = 0; i < n; i++) { | |||||
| *dest = *src; | |||||
| *(dest + 1) = *(src + 1); | |||||
| dest += 2; | |||||
| src += inc_src; | |||||
| } | |||||
| } | |||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { | |||||
| BLASLONG i; | |||||
| BLASLONG j; | |||||
| FLOAT *a_ptr; | |||||
| FLOAT *x_ptr; | |||||
| FLOAT *y_ptr; | |||||
| BLASLONG n1; | |||||
| BLASLONG m1; | |||||
| BLASLONG m2; | |||||
| BLASLONG m3; | |||||
| BLASLONG n2; | |||||
| FLOAT ybuffer[8], *xbuffer; | |||||
| if (m < 1) return (0); | |||||
| if (n < 1) return (0); | |||||
| inc_x <<= 1; | |||||
| inc_y <<= 1; | |||||
| lda <<= 1; | |||||
| xbuffer = buffer; | |||||
| n1 = n >> 2; | |||||
| n2 = n & 3; | |||||
| m3 = m & 3; | |||||
| m1 = m - m3; | |||||
| m2 = (m & (NBMAX - 1)) - m3; | |||||
| BLASLONG NB = NBMAX; | |||||
| while (NB == NBMAX) { | |||||
| m1 -= NB; | |||||
| if (m1 < 0) { | |||||
| if (m2 == 0) break; | |||||
| NB = m2; | |||||
| } | |||||
| y_ptr = y; | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| if (inc_x != 2) | |||||
| copy_x(NB, x_ptr, xbuffer, inc_x); | |||||
| else | |||||
| xbuffer = x_ptr; | |||||
| if (inc_y == 2) { | |||||
| for (i = 0; i < n1; i++) { | |||||
| zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); | |||||
| a_ptr += lda << 2; | |||||
| y_ptr += 8; | |||||
| } | |||||
| if (n2 & 2) { | |||||
| zgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); | |||||
| a_ptr += lda << 1; | |||||
| y_ptr += 4; | |||||
| } | |||||
| if (n2 & 1) { | |||||
| zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); | |||||
| a_ptr += lda; | |||||
| y_ptr += 2; | |||||
| } | |||||
| } else { | |||||
| for (i = 0; i < n1; i++) { | |||||
| memset(ybuffer, 0, sizeof (ybuffer)); | |||||
| zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); | |||||
| a_ptr += lda << 2; | |||||
| y_ptr[0] += ybuffer[0]; | |||||
| y_ptr[1] += ybuffer[1]; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += ybuffer[2]; | |||||
| y_ptr[1] += ybuffer[3]; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += ybuffer[4]; | |||||
| y_ptr[1] += ybuffer[5]; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += ybuffer[6]; | |||||
| y_ptr[1] += ybuffer[7]; | |||||
| y_ptr += inc_y; | |||||
| } | |||||
| for (i = 0; i < n2; i++) { | |||||
| memset(ybuffer, 0, sizeof (ybuffer)); | |||||
| zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); | |||||
| a_ptr += lda; | |||||
| y_ptr[0] += ybuffer[0]; | |||||
| y_ptr[1] += ybuffer[1]; | |||||
| y_ptr += inc_y; | |||||
| } | |||||
| } | |||||
| a += 2 * NB; | |||||
| x += NB * inc_x; | |||||
| } | |||||
| if (m3 == 0) return (0); | |||||
| x_ptr = x; | |||||
| j = 0; | |||||
| a_ptr = a; | |||||
| y_ptr = y; | |||||
| if (m3 == 3) { | |||||
| FLOAT temp_r; | |||||
| FLOAT temp_i; | |||||
| FLOAT x0 = x_ptr[0]; | |||||
| FLOAT x1 = x_ptr[1]; | |||||
| x_ptr += inc_x; | |||||
| FLOAT x2 = x_ptr[0]; | |||||
| FLOAT x3 = x_ptr[1]; | |||||
| x_ptr += inc_x; | |||||
| FLOAT x4 = x_ptr[0]; | |||||
| FLOAT x5 = x_ptr[1]; | |||||
| while (j < n) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||||
| temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; | |||||
| temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; | |||||
| temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; | |||||
| temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; | |||||
| #else | |||||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||||
| temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; | |||||
| temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; | |||||
| temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; | |||||
| temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||||
| y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||||
| #else | |||||
| y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||||
| y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||||
| #endif | |||||
| a_ptr += lda; | |||||
| y_ptr += inc_y; | |||||
| j++; | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| if (m3 == 2) { | |||||
| FLOAT temp_r; | |||||
| FLOAT temp_i; | |||||
| FLOAT temp_r1; | |||||
| FLOAT temp_i1; | |||||
| FLOAT x0 = x_ptr[0]; | |||||
| FLOAT x1 = x_ptr[1]; | |||||
| x_ptr += inc_x; | |||||
| FLOAT x2 = x_ptr[0]; | |||||
| FLOAT x3 = x_ptr[1]; | |||||
| while (j < (n & -2)) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||||
| temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; | |||||
| temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; | |||||
| a_ptr += lda; | |||||
| temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; | |||||
| temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; | |||||
| temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; | |||||
| temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; | |||||
| #else | |||||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||||
| temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; | |||||
| temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; | |||||
| a_ptr += lda; | |||||
| temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; | |||||
| temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; | |||||
| temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; | |||||
| temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||||
| y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
| y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
| #else | |||||
| y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||||
| y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
| y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
| #endif | |||||
| a_ptr += lda; | |||||
| y_ptr += inc_y; | |||||
| j += 2; | |||||
| } | |||||
| while (j < n) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||||
| temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; | |||||
| temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; | |||||
| #else | |||||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||||
| temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; | |||||
| temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||||
| y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||||
| #else | |||||
| y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||||
| y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||||
| #endif | |||||
| a_ptr += lda; | |||||
| y_ptr += inc_y; | |||||
| j++; | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| if (m3 == 1) { | |||||
| FLOAT temp_r; | |||||
| FLOAT temp_i; | |||||
| FLOAT temp_r1; | |||||
| FLOAT temp_i1; | |||||
| FLOAT x0 = x_ptr[0]; | |||||
| FLOAT x1 = x_ptr[1]; | |||||
| while (j < (n & -2)) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||||
| a_ptr += lda; | |||||
| temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; | |||||
| temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; | |||||
| #else | |||||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||||
| a_ptr += lda; | |||||
| temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; | |||||
| temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||||
| y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
| y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
| #else | |||||
| y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||||
| y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||||
| y_ptr += inc_y; | |||||
| y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
| y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
| #endif | |||||
| a_ptr += lda; | |||||
| y_ptr += inc_y; | |||||
| j += 2; | |||||
| } | |||||
| while (j < n) { | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||||
| #else | |||||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||||
| y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||||
| #else | |||||
| y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||||
| y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||||
| #endif | |||||
| a_ptr += lda; | |||||
| y_ptr += inc_y; | |||||
| j++; | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,265 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2018, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) | |||||
| { | |||||
| __vector double t0; | |||||
| __vector double t1; | |||||
| __vector double t2; | |||||
| __vector double t3; | |||||
| __vector double t4; | |||||
| __vector double t5; | |||||
| __vector double t6; | |||||
| __vector double t7; | |||||
| __asm__ | |||||
| ( | |||||
| "xxspltd 36, %x[cos], 0 \n\t" // load c to both dwords | |||||
| "xxspltd 37, %x[sin], 0 \n\t" // load s to both dwords | |||||
| "lxvd2x 32, 0, %[x_ptr] \n\t" // load x | |||||
| "lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||||
| "lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||||
| "lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||||
| "lxvd2x 48, 0, %[y_ptr] \n\t" // load y | |||||
| "lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||||
| "lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||||
| "lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||||
| "addi %[x_ptr], %[x_ptr], 64 \n\t" | |||||
| "addi %[y_ptr], %[y_ptr], 64 \n\t" | |||||
| "addic. %[temp_n], %[temp_n], -4 \n\t" | |||||
| "ble 2f \n\t" | |||||
| ".p2align 5 \n" | |||||
| "1: \n\t" | |||||
| "xvmuldp 40, 32, 36 \n\t" // c * x | |||||
| "xvmuldp 41, 33, 36 \n\t" | |||||
| "xvmuldp 42, 34, 36 \n\t" | |||||
| "xvmuldp 43, 35, 36 \n\t" | |||||
| "xvmuldp %x[x0], 48, 36 \n\t" // c * y | |||||
| "xvmuldp %x[x1], 49, 36 \n\t" | |||||
| "xvmuldp %x[x2], 50, 36 \n\t" | |||||
| "xvmuldp %x[x3], 51, 36 \n\t" | |||||
| "xvmuldp 44, 32, 37 \n\t" // s * x | |||||
| "xvmuldp 45, 33, 37 \n\t" | |||||
| "lxvd2x 32, 0, %[x_ptr] \n\t" // load x | |||||
| "lxvd2x 33, %[i16],%[x_ptr] \n\t" | |||||
| "xvmuldp 46, 34, 37 \n\t" | |||||
| "xvmuldp 47, 35, 37 \n\t" | |||||
| "lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||||
| "lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||||
| "xvmuldp %x[x4], 48, 37 \n\t" // s * y | |||||
| "xvmuldp %x[x5], 49, 37 \n\t" | |||||
| "lxvd2x 48, 0, %[y_ptr] \n\t" // load y | |||||
| "lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||||
| "xvmuldp %x[x6], 50, 37 \n\t" | |||||
| "xvmuldp %x[x7], 51, 37 \n\t" | |||||
| "lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||||
| "lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||||
| "xvadddp 40, 40, %x[x4] \n\t" // c * x + s * y | |||||
| "xvadddp 41, 41, %x[x5] \n\t" // c * x + s * y | |||||
| "addi %[x_ptr], %[x_ptr], -64 \n\t" | |||||
| "addi %[y_ptr], %[y_ptr], -64 \n\t" | |||||
| "xvadddp 42, 42, %x[x6] \n\t" // c * x + s * y | |||||
| "xvadddp 43, 43, %x[x7] \n\t" // c * x + s * y | |||||
| "xvsubdp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||||
| "xvsubdp %x[x1], %x[x1], 45 \n\t" // c * y - s * x | |||||
| "xvsubdp %x[x2], %x[x2], 46 \n\t" // c * y - s * x | |||||
| "xvsubdp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||||
| "stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||||
| "stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||||
| "stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||||
| "stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||||
| "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||||
| "stxvd2x %x[x1], %[i16], %[y_ptr] \n\t" | |||||
| "stxvd2x %x[x2], %[i32], %[y_ptr] \n\t" | |||||
| "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" | |||||
| "addi %[x_ptr], %[x_ptr], 128 \n\t" | |||||
| "addi %[y_ptr], %[y_ptr], 128 \n\t" | |||||
| "addic. %[temp_n], %[temp_n], -4 \n\t" | |||||
| "bgt+ 1b \n" | |||||
| "2: \n\t" | |||||
| "xvmuldp 40, 32, 36 \n\t" // c * x | |||||
| "xvmuldp 41, 33, 36 \n\t" | |||||
| "xvmuldp 42, 34, 36 \n\t" | |||||
| "xvmuldp 43, 35, 36 \n\t" | |||||
| "xvmuldp %x[x0], 48, 36 \n\t" // c * y | |||||
| "xvmuldp %x[x1], 49, 36 \n\t" | |||||
| "xvmuldp %x[x2], 50, 36 \n\t" | |||||
| "xvmuldp %x[x3], 51, 36 \n\t" | |||||
| "xvmuldp 44, 32, 37 \n\t" // s * x | |||||
| "xvmuldp 45, 33, 37 \n\t" | |||||
| "xvmuldp 46, 34, 37 \n\t" | |||||
| "xvmuldp 47, 35, 37 \n\t" | |||||
| "xvmuldp %x[x4], 48, 37 \n\t" // s * y | |||||
| "xvmuldp %x[x5], 49, 37 \n\t" | |||||
| "xvmuldp %x[x6], 50, 37 \n\t" | |||||
| "xvmuldp %x[x7], 51, 37 \n\t" | |||||
| "addi %[x_ptr], %[x_ptr], -64 \n\t" | |||||
| "addi %[y_ptr], %[y_ptr], -64 \n\t" | |||||
| "xvadddp 40, 40, %x[x4] \n\t" // c * x + s * y | |||||
| "xvadddp 41, 41, %x[x5] \n\t" // c * x + s * y | |||||
| "xvadddp 42, 42, %x[x6] \n\t" // c * x + s * y | |||||
| "xvadddp 43, 43, %x[x7] \n\t" // c * x + s * y | |||||
| "xvsubdp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||||
| "xvsubdp %x[x1], %x[x1], 45 \n\t" // c * y - s * x | |||||
| "xvsubdp %x[x2], %x[x2], 46 \n\t" // c * y - s * x | |||||
| "xvsubdp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||||
| "stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||||
| "stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||||
| "stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||||
| "stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||||
| "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||||
| "stxvd2x %x[x1], %[i16], %[y_ptr] \n\t" | |||||
| "stxvd2x %x[x2], %[i32], %[y_ptr] \n\t" | |||||
| "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" | |||||
| : | |||||
| [mem_x] "+m" (*(double (*)[2*n])x), | |||||
| [mem_y] "+m" (*(double (*)[2*n])y), | |||||
| [temp_n] "+&r" (n), | |||||
| [x_ptr] "+&b"(x), [y_ptr] "+&b"(y), | |||||
| [x0] "=wa" (t0), | |||||
| [x1] "=wa" (t1), | |||||
| [x2] "=wa" (t2), | |||||
| [x3] "=wa" (t3), | |||||
| [x4] "=wa" (t4), | |||||
| [x5] "=wa" (t5), | |||||
| [x6] "=wa" (t6), | |||||
| [x7] "=wa" (t7) | |||||
| : | |||||
| [cos] "d" (cosA), | |||||
| [sin] "d" (sinA), | |||||
| [i16] "b" (16), | |||||
| [i32] "b" (32), | |||||
| [i48] "b" (48) | |||||
| : | |||||
| "cr0", | |||||
| "vs32","vs33","vs34","vs35","vs36","vs37", | |||||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||||
| "vs48","vs49","vs50","vs51" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT temp[2]; | |||||
| BLASLONG inc_x2; | |||||
| BLASLONG inc_y2; | |||||
| if ( n <= 0 ) return(0); | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| BLASLONG n1 = n & -4; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| zrot_kernel_4(n1, x, y, c, s); | |||||
| i=n1; | |||||
| ix=2*n1; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| temp[0] = c*x[ix] + s*y[ix] ; | |||||
| temp[1] = c*x[ix+1] + s*y[ix+1] ; | |||||
| y[ix] = c*y[ix] - s*x[ix] ; | |||||
| y[ix+1] = c*y[ix+1] - s*x[ix+1] ; | |||||
| x[ix] = temp[0] ; | |||||
| x[ix+1] = temp[1] ; | |||||
| ix += 2 ; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| inc_x2 = 2 * inc_x ; | |||||
| inc_y2 = 2 * inc_y ; | |||||
| while(i < n) | |||||
| { | |||||
| temp[0] = c*x[ix] + s*y[iy] ; | |||||
| temp[1] = c*x[ix+1] + s*y[iy+1] ; | |||||
| y[iy] = c*y[iy] - s*x[ix] ; | |||||
| y[iy+1] = c*y[iy+1] - s*x[ix+1] ; | |||||
| x[ix] = temp[0] ; | |||||
| x[ix+1] = temp[1] ; | |||||
| ix += inc_x2 ; | |||||
| iy += inc_y2 ; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| @@ -28,126 +28,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) { | static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) { | ||||
| BLASLONG tempR1 ; | |||||
| __asm__ ("pfd 1, 0(%[x_tmp]) \n\t" | __asm__ ("pfd 1, 0(%[x_tmp]) \n\t" | ||||
| "pfd 2, 0(%[y_tmp]) \n\t" | |||||
| "lgdr %%r1,%[alpha_r] \n\t" | |||||
| "vlvgp %%v28,%%r1,%%r1 \n\t" | |||||
| "lgdr %%r1,%[alpha_i] \n\t" | |||||
| "vlvgp %%v29,%%r1,%%r1 \n\t" | |||||
| "sllg %[tmp],%[tmp],4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "pfd 2, 0(%[y_tmp]) \n\t" | |||||
| #if !defined(CONJ) | |||||
| "lgdr %[t1],%[alpha_r] \n\t" | |||||
| "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint | |||||
| "lgdr %[t1],%[alpha_i] \n\t" | |||||
| "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint | |||||
| "vflcdb %%v29,%%v29 \n\t" //complement both | |||||
| "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} | |||||
| #else | |||||
| "lgdr %[t1],%[alpha_i] \n\t" | |||||
| "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint | |||||
| "lgdr %[t1],%[alpha_r] \n\t" | |||||
| "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint | |||||
| "vflcdb %%v28,%%v28 \n\t" //complement both | |||||
| "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} | |||||
| #endif | |||||
| "xgr %[t1],%[t1] \n\t" | |||||
| "sllg %[tmp],%[tmp],4 \n\t" | |||||
| "vl %%v30 , 0(%[t1],%[y_tmp]) \n\t" | |||||
| "vl %%v31 , 16(%[t1],%[y_tmp]) \n\t" | |||||
| "vl %%v6 , 32(%[t1],%[y_tmp]) \n\t" | |||||
| "vl %%v7 , 48(%[t1],%[y_tmp]) \n\t" | |||||
| "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" | |||||
| "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition | |||||
| "j 2f \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | "1: \n\t" | ||||
| "pfd 1, 256(%%r1,%[x_tmp]) \n\t" | |||||
| "pfd 2, 256(%%r1,%[y_tmp]) \n\t" | |||||
| "vleg %%v16 , 0(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vleg %%v17 , 8(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vleg %%v16 , 16(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v17 , 24(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v18 , 32(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vleg %%v19 , 40(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vleg %%v18 , 48(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v19 , 56(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v24 , 0(%%r1,%[x_tmp]),0 \n\t" | |||||
| "vleg %%v25 , 8(%%r1,%[x_tmp]),0 \n\t" | |||||
| "vleg %%v24 , 16(%%r1,%[x_tmp]),1 \n\t" | |||||
| "vleg %%v25 , 24(%%r1,%[x_tmp]),1 \n\t" | |||||
| "vleg %%v26 , 32(%%r1,%[x_tmp]),0 \n\t" | |||||
| "vleg %%v27 , 40(%%r1,%[x_tmp]),0 \n\t" | |||||
| "vleg %%v26 , 48(%%r1,%[x_tmp]),1 \n\t" | |||||
| "vleg %%v27 , 56(%%r1,%[x_tmp]),1 \n\t" | |||||
| #if !defined(CONJ) | |||||
| "vfmsdb %%v16, %%v25, %%v29,%%v16 \n\t" | |||||
| "vfmadb %%v17, %%v24, %%v29, %%v17 \n\t" | |||||
| "vfmsdb %%v18, %%v27, %%v29, %%v18 \n\t" | |||||
| "vfmadb %%v19, %%v26, %%v29, %%v19 \n\t" | |||||
| "vfmsdb %%v16, %%v24, %%v28 ,%%v16 \n\t" | |||||
| "vfmadb %%v17, %%v25, %%v28, %%v17 \n\t" | |||||
| "vfmsdb %%v18, %%v26, %%v28, %%v18 \n\t" | |||||
| "vfmadb %%v19, %%v27, %%v28, %%v19 \n\t" | |||||
| #else | |||||
| "vfmadb %%v16, %%v25, %%v29, %%v16 \n\t" | |||||
| "vfmsdb %%v17, %%v25, %%v28, %%v17 \n\t" | |||||
| "vfmadb %%v18, %%v27, %%v29, %%v18 \n\t" | |||||
| "vfmsdb %%v19, %%v27, %%v28, %%v19 \n\t" | |||||
| "vfmadb %%v16, %%v24, %%v28, %%v16 \n\t" | |||||
| "vfmsdb %%v17, %%v24, %%v29, %%v17 \n\t" | |||||
| "vfmadb %%v18, %%v26, %%v28, %%v18 \n\t" | |||||
| "vfmsdb %%v19, %%v26, %%v29, %%v19 \n\t" | |||||
| #endif | |||||
| "vsteg %%v16 , 0(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vsteg %%v17 , 8(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vsteg %%v16 , 16(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vsteg %%v17 , 24(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vsteg %%v18 , 32(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vsteg %%v19 , 40(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vsteg %%v18 , 48(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vsteg %%v19 , 56(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v20 , 64(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vleg %%v21 , 72(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vleg %%v20 , 80(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v21 , 88(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v22 , 96(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vleg %%v23 , 104(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vleg %%v22 , 112(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v23 , 120(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v24 , 64(%%r1,%[x_tmp]),0 \n\t" | |||||
| "vleg %%v25 , 72(%%r1,%[x_tmp]),0 \n\t" | |||||
| "vleg %%v24 , 80(%%r1,%[x_tmp]),1 \n\t" | |||||
| "vleg %%v25 , 88(%%r1,%[x_tmp]),1 \n\t" | |||||
| "vleg %%v26 , 96(%%r1,%[x_tmp]),0 \n\t" | |||||
| "vleg %%v27 , 104(%%r1,%[x_tmp]),0 \n\t" | |||||
| "vleg %%v26 , 112(%%r1,%[x_tmp]),1 \n\t" | |||||
| "vleg %%v27 , 120(%%r1,%[x_tmp]),1 \n\t" | |||||
| #if !defined(CONJ) | |||||
| "vfmsdb %%v20, %%v25, %%v29,%%v20 \n\t" | |||||
| "vfmadb %%v21, %%v24, %%v29, %%v21 \n\t" | |||||
| "vfmsdb %%v22, %%v27, %%v29, %%v22 \n\t" | |||||
| "vfmadb %%v23, %%v26, %%v29, %%v23 \n\t" | |||||
| "vfmsdb %%v20, %%v24, %%v28 ,%%v20 \n\t" | |||||
| "vfmadb %%v21, %%v25, %%v28, %%v21 \n\t" | |||||
| "vfmsdb %%v22, %%v26, %%v28, %%v22 \n\t" | |||||
| "vfmadb %%v23, %%v27, %%v28, %%v23 \n\t" | |||||
| #else | |||||
| "vfmadb %%v20, %%v25, %%v29, %%v20 \n\t" | |||||
| "vfmsdb %%v21, %%v25, %%v28, %%v21 \n\t" | |||||
| "vfmadb %%v22, %%v27, %%v29, %%v22 \n\t" | |||||
| "vfmsdb %%v23, %%v27, %%v28, %%v23 \n\t" | |||||
| "vfmadb %%v20, %%v24, %%v28, %%v20 \n\t" | |||||
| "vfmsdb %%v21, %%v24, %%v29, %%v21 \n\t" | |||||
| "vfmadb %%v22, %%v26, %%v28, %%v22 \n\t" | |||||
| "vfmsdb %%v23, %%v26, %%v29, %%v23 \n\t" | |||||
| #endif | |||||
| "vsteg %%v20 , 64(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vsteg %%v21 , 72(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vsteg %%v20 , 80(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vsteg %%v21 , 88(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vsteg %%v22 , 96(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vsteg %%v23 , 104(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vsteg %%v22 , 112(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vsteg %%v23 , 120(%%r1,%[y_tmp]),1 \n\t" | |||||
| "la %%r1,128(%%r1) \n\t" | |||||
| "clgrjl %%r1,%[tmp],1b \n\t" | |||||
| : [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) | |||||
| "vpdi %%v24 , %%v20, %%v20, 4 \n\t" | |||||
| "vpdi %%v25 , %%v21, %%v21, 4 \n\t" | |||||
| "vpdi %%v26 , %%v22, %%v22, 4 \n\t" | |||||
| "vpdi %%v27 , %%v23, %%v23, 4 \n\t" | |||||
| "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" | |||||
| "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" | |||||
| "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" | |||||
| "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" | |||||
| "vl %%v30, 64(%[t1],%[y_tmp]) \n\t" | |||||
| "vl %%v31, 80(%[t1],%[y_tmp]) \n\t" | |||||
| "vl %%v6 , 96(%[t1],%[y_tmp]) \n\t" | |||||
| "vl %%v7 , 112(%[t1],%[y_tmp]) \n\t" | |||||
| "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" | |||||
| "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" | |||||
| "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" | |||||
| "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" | |||||
| "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" | |||||
| "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" | |||||
| "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" | |||||
| "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" | |||||
| "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" | |||||
| "la %[t1],64(%[t1] ) \n\t" | |||||
| "2: \n\t" | |||||
| "pfd 1, 256(%[t1],%[x_tmp]) \n\t" | |||||
| "pfd 2, 256(%[t1],%[y_tmp]) \n\t" | |||||
| "vpdi %%v24 , %%v20, %%v20, 4 \n\t" | |||||
| "vpdi %%v25 , %%v21, %%v21, 4 \n\t" | |||||
| "vpdi %%v26 , %%v22, %%v22, 4 \n\t" | |||||
| "vpdi %%v27 , %%v23, %%v23, 4 \n\t" | |||||
| "vfmadb %%v30, %%v20, %%v28, %%v30 \n\t" | |||||
| "vfmadb %%v31, %%v21, %%v28, %%v31 \n\t" | |||||
| "vfmadb %%v6, %%v22, %%v28, %%v6 \n\t" | |||||
| "vfmadb %%v7, %%v23, %%v28, %%v7 \n\t" | |||||
| "vl %%v16, 64(%[t1],%[y_tmp]) \n\t" | |||||
| "vl %%v17, 80(%[t1],%[y_tmp]) \n\t" | |||||
| "vl %%v18, 96(%[t1],%[y_tmp]) \n\t" | |||||
| "vl %%v19, 112(%[t1],%[y_tmp]) \n\t" | |||||
| "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" | |||||
| "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" | |||||
| "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" | |||||
| "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" | |||||
| "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" | |||||
| "vst %%v30 , 0(%[t1],%[y_tmp]) \n\t" | |||||
| "vst %%v31 , 16(%[t1],%[y_tmp]) \n\t" | |||||
| "vst %%v6 , 32(%[t1],%[y_tmp]) \n\t" | |||||
| "vst %%v7 , 48(%[t1],%[y_tmp]) \n\t" | |||||
| "la %[t1],64(%[t1] ) \n\t" | |||||
| "clgrjl %[t1],%[tmp],1b \n\t" | |||||
| //---------------------------------------------------------------------- | |||||
| "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" | |||||
| "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" | |||||
| "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" | |||||
| "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" | |||||
| "vpdi %%v24 , %%v20, %%v20, 4 \n\t" | |||||
| "vpdi %%v25 , %%v21, %%v21, 4 \n\t" | |||||
| "vpdi %%v26 , %%v22, %%v22, 4 \n\t" | |||||
| "vpdi %%v27 , %%v23, %%v23, 4 \n\t" | |||||
| "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" | |||||
| "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" | |||||
| "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" | |||||
| "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" | |||||
| "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" | |||||
| "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" | |||||
| "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" | |||||
| "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" | |||||
| : [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) | |||||
| : [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) | : [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) | ||||
| : "cc", "r1","v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29" | |||||
| : "cc", "v6","v7", "v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | ); | ||||
| } | } | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { | int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { | ||||
| BLASLONG i = 0; | BLASLONG i = 0; | ||||
| BLASLONG ix = 0, iy = 0; | BLASLONG ix = 0, iy = 0; | ||||
| @@ -30,74 +30,119 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) { | static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) { | ||||
| __asm__( | |||||
| "pfd 1, 0(%[x_ptr]) \n\t" | |||||
| "lgdr %%r0,%[alpha_r] \n\t" | |||||
| "vlvgp %%v24,%%r0,%%r0 \n\t" | |||||
| "lgdr %%r0,%[alpha_i] \n\t" | |||||
| "vlvgp %%v25,%%r0,%%r0 \n\t" | |||||
| "sllg %%r0,%[n],4 \n\t" | |||||
| "agr %%r0,%[x_ptr] \n\t" | |||||
| BLASLONG tempR1 ; | |||||
| __asm__ ( | |||||
| "pfd 2, 0(%[x_tmp]) \n\t" | |||||
| #if !defined(CONJ) | |||||
| "lgdr %[t1],%[alpha_r] \n\t" | |||||
| "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint | |||||
| "lgdr %[t1],%[alpha_i] \n\t" | |||||
| "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint | |||||
| "vflcdb %%v29,%%v29 \n\t" //complement both | |||||
| "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} | |||||
| #else | |||||
| "lgdr %[t1],%[alpha_i] \n\t" | |||||
| "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint | |||||
| "lgdr %[t1],%[alpha_r] \n\t" | |||||
| "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint | |||||
| "vflcdb %%v28,%%v28 \n\t" //complement both | |||||
| "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} | |||||
| #endif | |||||
| "xgr %[t1],%[t1] \n\t" | |||||
| "sllg %[tmp],%[tmp],4 \n\t" | |||||
| "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" | |||||
| "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition | |||||
| "j 2f \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | "1: \n\t" | ||||
| "pfd 2, 256(%[x_ptr] ) \n\t" | |||||
| "vleg %%v20 , 0(%[x_ptr]),0 \n\t" | |||||
| "vleg %%v21 , 8(%[x_ptr]),0 \n\t" | |||||
| "vleg %%v20 , 16(%[x_ptr]),1 \n\t" | |||||
| "vleg %%v21 , 24(%[x_ptr]),1 \n\t" | |||||
| "vleg %%v22 , 32(%[x_ptr]),0 \n\t" | |||||
| "vleg %%v23 , 40(%[x_ptr]),0 \n\t" | |||||
| "vleg %%v22 , 48(%[x_ptr]),1 \n\t" | |||||
| "vleg %%v23 , 56(%[x_ptr]),1 \n\t" | |||||
| "vfmdb %%v16, %%v21, %%v25 \n\t" | |||||
| "vfmdb %%v17, %%v20, %%v25 \n\t" | |||||
| "vfmdb %%v18, %%v23, %%v25 \n\t" | |||||
| "vfmdb %%v19, %%v22, %%v25 \n\t" | |||||
| "vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t" | |||||
| "vfmadb %%v17, %%v21, %%v24, %%v17 \n\t" | |||||
| "vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t" | |||||
| "vfmadb %%v19, %%v23, %%v24, %%v19 \n\t" | |||||
| "vsteg %%v16 , 0(%[x_ptr]),0 \n\t" | |||||
| "vsteg %%v17 , 8(%[x_ptr]),0 \n\t" | |||||
| "vsteg %%v16 , 16(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v17 , 24(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v18 , 32(%[x_ptr]),0 \n\t" | |||||
| "vsteg %%v19 , 40(%[x_ptr]),0 \n\t" | |||||
| "vsteg %%v18 , 48(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v19 , 56(%[x_ptr]),1 \n\t" | |||||
| "vleg %%v20 , 64(%[x_ptr]),0 \n\t" | |||||
| "vleg %%v21 , 72(%[x_ptr]),0 \n\t" | |||||
| "vleg %%v20 , 80(%[x_ptr]),1 \n\t" | |||||
| "vleg %%v21 , 88(%[x_ptr]),1 \n\t" | |||||
| "vleg %%v22 , 96(%[x_ptr]),0 \n\t" | |||||
| "vleg %%v23 , 104(%[x_ptr]),0 \n\t" | |||||
| "vleg %%v22 , 112(%[x_ptr]),1 \n\t" | |||||
| "vleg %%v23 , 120(%[x_ptr]),1 \n\t" | |||||
| "vfmdb %%v16, %%v21, %%v25 \n\t" | |||||
| "vfmdb %%v17, %%v20, %%v25 \n\t" | |||||
| "vfmdb %%v18, %%v23, %%v25 \n\t" | |||||
| "vfmdb %%v19, %%v22, %%v25 \n\t" | |||||
| "vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t" | |||||
| "vfmadb %%v17, %%v21, %%v24, %%v17 \n\t" | |||||
| "vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t" | |||||
| "vfmadb %%v19, %%v23, %%v24, %%v19 \n\t" | |||||
| "vsteg %%v16 , 64(%[x_ptr]),0 \n\t" | |||||
| "vsteg %%v17 , 72(%[x_ptr]),0 \n\t" | |||||
| "vsteg %%v16 , 80(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v17 , 88(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v18 , 96(%[x_ptr]),0 \n\t" | |||||
| "vsteg %%v19 , 104(%[x_ptr]),0 \n\t" | |||||
| "vsteg %%v18 , 112(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v19 , 120(%[x_ptr]),1 \n\t" | |||||
| "la %[x_ptr],128(%[x_ptr]) \n\t" | |||||
| "clgrjl %[x_ptr],%%r0,1b \n\t" | |||||
| : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) | |||||
| : [n] "r"(n), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) | |||||
| : "cc", "r0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25" | |||||
| "vpdi %%v24 , %%v20, %%v20, 4 \n\t" | |||||
| "vpdi %%v25 , %%v21, %%v21, 4 \n\t" | |||||
| "vpdi %%v26 , %%v22, %%v22, 4 \n\t" | |||||
| "vpdi %%v27 , %%v23, %%v23, 4 \n\t" | |||||
| "vfmdb %%v16, %%v20, %%v28 \n\t" | |||||
| "vfmdb %%v17, %%v21, %%v28 \n\t" | |||||
| "vfmdb %%v18, %%v22, %%v28 \n\t" | |||||
| "vfmdb %%v19, %%v23, %%v28 \n\t" | |||||
| "vl %%v20, 64(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v21, 80(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v22, 96(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v23, 112(%[t1],%[x_tmp]) \n\t" | |||||
| "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" | |||||
| "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" | |||||
| "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" | |||||
| "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" | |||||
| "vst %%v16 , 0(%[t1],%[x_tmp]) \n\t" | |||||
| "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" | |||||
| "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" | |||||
| "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" | |||||
| "la %[t1],64(%[t1] ) \n\t" | |||||
| "2: \n\t" | |||||
| "pfd 2, 256(%[t1],%[x_tmp]) \n\t" | |||||
| "vpdi %%v24 , %%v20, %%v20, 4 \n\t" | |||||
| "vpdi %%v25 , %%v21, %%v21, 4 \n\t" | |||||
| "vpdi %%v26 , %%v22, %%v22, 4 \n\t" | |||||
| "vpdi %%v27 , %%v23, %%v23, 4 \n\t" | |||||
| "vfmdb %%v30, %%v20, %%v28 \n\t" | |||||
| "vfmdb %%v31, %%v21, %%v28 \n\t" | |||||
| "vfmdb %%v6, %%v22, %%v28 \n\t" | |||||
| "vfmdb %%v7, %%v23, %%v28 \n\t" | |||||
| "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" | |||||
| "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" | |||||
| "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" | |||||
| "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" | |||||
| "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" | |||||
| "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" | |||||
| "vst %%v30 , 0(%[t1],%[x_tmp]) \n\t" | |||||
| "vst %%v31 , 16(%[t1],%[x_tmp]) \n\t" | |||||
| "vst %%v6 , 32(%[t1],%[x_tmp]) \n\t" | |||||
| "vst %%v7 , 48(%[t1],%[x_tmp]) \n\t" | |||||
| "la %[t1],64(%[t1] ) \n\t" | |||||
| "clgrjl %[t1],%[tmp],1b \n\t" | |||||
| //---------------------------------------------------------------------- | |||||
| "vfmdb %%v16, %%v20, %%v28 \n\t" | |||||
| "vfmdb %%v17, %%v21, %%v28 \n\t" | |||||
| "vfmdb %%v18, %%v22, %%v28 \n\t" | |||||
| "vfmdb %%v19, %%v23, %%v28 \n\t" | |||||
| "vpdi %%v24 , %%v20, %%v20, 4 \n\t" | |||||
| "vpdi %%v25 , %%v21, %%v21, 4 \n\t" | |||||
| "vpdi %%v26 , %%v22, %%v22, 4 \n\t" | |||||
| "vpdi %%v27 , %%v23, %%v23, 4 \n\t" | |||||
| "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" | |||||
| "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" | |||||
| "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" | |||||
| "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" | |||||
| "vst %%v16 , 0(%[t1],%[x_tmp]) \n\t" | |||||
| "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" | |||||
| "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" | |||||
| "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" | |||||
| : [mem_x] "+m" (*(double (*)[2*n])x),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) | |||||
| : [x_tmp] "a"(x), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) | |||||
| : "cc", "v6","v7", "v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | ); | ||||
| } | } | ||||