| @@ -27,9 +27,9 @@ | |||
| #include <string.h> | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_Z13 1 | |||
| #define CPU_Z14 2 | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_Z13 1 | |||
| #define CPU_Z14 2 | |||
| static char *cpuname[] = { | |||
| "ZARCH_GENERIC", | |||
| @@ -112,7 +112,7 @@ void get_cpuconfig(void) | |||
| printf("#define Z13\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| break; | |||
| case CPU_Z14: | |||
| case CPU_Z14: | |||
| printf("#define Z14\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| break; | |||
| @@ -74,12 +74,12 @@ ZSWAPKERNEL = zswap.c | |||
| SGEMVNKERNEL = ../arm/gemv_n.c | |||
| DGEMVNKERNEL = dgemv_n_4.c | |||
| CGEMVNKERNEL = ../arm/zgemv_n.c | |||
| ZGEMVNKERNEL = ../arm/zgemv_n.c | |||
| ZGEMVNKERNEL = zgemv_n_4.c | |||
| SGEMVTKERNEL = ../arm/gemv_t.c | |||
| DGEMVTKERNEL = dgemv_t_4.c | |||
| CGEMVTKERNEL = ../arm/zgemv_t.c | |||
| ZGEMVTKERNEL = ../arm/zgemv_t.c | |||
| ZGEMVTKERNEL = zgemv_t_4.c | |||
| STRMMKERNEL = strmm8x4V.S | |||
| DTRMMKERNEL = trmm8x4V.S | |||
| @@ -73,13 +73,13 @@ ZSWAPKERNEL = zswap.c | |||
| SGEMVNKERNEL = sgemv_n_4.c | |||
| DGEMVNKERNEL = dgemv_n_4.c | |||
| CGEMVNKERNEL = ../arm/zgemv_n.c | |||
| ZGEMVNKERNEL = ../arm/zgemv_n.c | |||
| CGEMVNKERNEL = cgemv_n_4.c | |||
| ZGEMVNKERNEL = zgemv_n_4.c | |||
| SGEMVTKERNEL = sgemv_t_4.c | |||
| DGEMVTKERNEL = dgemv_t_4.c | |||
| CGEMVTKERNEL = ../arm/zgemv_t.c | |||
| ZGEMVTKERNEL = ../arm/zgemv_t.c | |||
| CGEMVTKERNEL = cgemv_t_4.c | |||
| ZGEMVTKERNEL = zgemv_t_4.c | |||
| STRMMKERNEL = strmm8x4V.S | |||
| DTRMMKERNEL = trmm8x4V.S | |||
| @@ -198,7 +198,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG i = 0; | |||
| BLASLONG j = 0; | |||
| BLASLONG ix = 0; | |||
| FLOAT maxf = 0.0; | |||
| BLASLONG inc_x2; | |||
| @@ -216,53 +216,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| else | |||
| { | |||
| maxf=CABS1(x,0); | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| while (i < n) { | |||
| if (ABS(x[i*2]) > maxf) { | |||
| maxf = ABS(x[i*2]); | |||
| if (CABS1(x,ix) > maxf) { | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| return (maxf); | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| maxf=CABS1(x,0); | |||
| i += inc_x2; | |||
| j++; | |||
| inc_x2 = 2 * inc_x; | |||
| ix += inc_x2; | |||
| i++; | |||
| BLASLONG n1 = (n - 1) & -4; | |||
| while (j < n1) { | |||
| while (i < n1) { | |||
| if (CABS1(x,i) > maxf) { | |||
| maxf = CABS1(x,i); | |||
| if (CABS1(x,ix) > maxf) { | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| if (CABS1(x,i+inc_x2) > maxf) { | |||
| maxf = CABS1(x,i+inc_x2); | |||
| if (CABS1(x,ix+inc_x2) > maxf) { | |||
| maxf = CABS1(x,ix+inc_x2); | |||
| } | |||
| if (CABS1(x,i+inc_x2*2) > maxf) { | |||
| maxf = CABS1(x,i+inc_x2*2); | |||
| if (CABS1(x,ix+inc_x2*2) > maxf) { | |||
| maxf = CABS1(x,ix+inc_x2*2); | |||
| } | |||
| if (CABS1(x,i+inc_x2*3) > maxf) { | |||
| maxf = CABS1(x,i+inc_x2*3); | |||
| if (CABS1(x,ix+inc_x2*3) > maxf) { | |||
| maxf = CABS1(x,ix+inc_x2*3); | |||
| } | |||
| i += inc_x2 * 4; | |||
| ix += inc_x2 * 4; | |||
| j += 4; | |||
| i += 4; | |||
| } | |||
| while (j < n) { | |||
| if (CABS1(x,i) > maxf) { | |||
| maxf = CABS1(x,i); | |||
| while (i < n) { | |||
| if (CABS1(x,ix) > maxf) { | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| i += inc_x2; | |||
| j++; | |||
| ix += inc_x2; | |||
| i++; | |||
| } | |||
| return (maxf); | |||
| } | |||
| @@ -198,7 +198,7 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG i = 0; | |||
| BLASLONG j = 0; | |||
| BLASLONG ix = 0; | |||
| FLOAT minf = 0.0; | |||
| BLASLONG inc_x2; | |||
| @@ -216,53 +216,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| else | |||
| { | |||
| minf=CABS1(x,0); | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| while (i < n) { | |||
| if (ABS(x[i*2]) < minf) { | |||
| minf = ABS(x[i*2]); | |||
| if (CABS1(x,ix) < minf) { | |||
| minf = CABS1(x,ix); | |||
| } | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| return (minf); | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| minf=CABS1(x,0); | |||
| i += inc_x2; | |||
| j++; | |||
| inc_x2 = 2 * inc_x; | |||
| ix += inc_x2; | |||
| i++; | |||
| BLASLONG n1 = (n - 1) & -4; | |||
| while (j < n1) { | |||
| while (i < n1) { | |||
| if (CABS1(x,i) < minf) { | |||
| minf = CABS1(x,i); | |||
| if (CABS1(x,ix) < minf) { | |||
| minf = CABS1(x,ix); | |||
| } | |||
| if (CABS1(x,i+inc_x2) < minf) { | |||
| minf = CABS1(x,i+inc_x2); | |||
| if (CABS1(x,ix+inc_x2) < minf) { | |||
| minf = CABS1(x,ix+inc_x2); | |||
| } | |||
| if (CABS1(x,i+inc_x2*2) < minf) { | |||
| minf = CABS1(x,i+inc_x2*2); | |||
| if (CABS1(x,ix+inc_x2*2) < minf) { | |||
| minf = CABS1(x,ix+inc_x2*2); | |||
| } | |||
| if (CABS1(x,i+inc_x2*3) < minf) { | |||
| minf = CABS1(x,i+inc_x2*3); | |||
| if (CABS1(x,ix+inc_x2*3) < minf) { | |||
| minf = CABS1(x,ix+inc_x2*3); | |||
| } | |||
| i += inc_x2 * 4; | |||
| ix += inc_x2 * 4; | |||
| j += 4; | |||
| i += 4; | |||
| } | |||
| while (j < n) { | |||
| if (CABS1(x,i) < minf) { | |||
| minf = CABS1(x,i); | |||
| while (i < n) { | |||
| if (CABS1(x,ix) < minf) { | |||
| minf = CABS1(x,ix); | |||
| } | |||
| i += inc_x2; | |||
| j++; | |||
| ix += inc_x2; | |||
| i++; | |||
| } | |||
| return (minf); | |||
| } | |||
| @@ -110,7 +110,7 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "agfi %%r1,128 \n\t" | |||
| "brctg %%r0,0b " | |||
| : | |||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) | |||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) | |||
| :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||
| ); | |||
| } | |||
| @@ -118,7 +118,7 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { | |||
| BLASLONG i = 0; | |||
| BLASLONG ix = 0, iy = 0; | |||
| FLOAT da[2]; | |||
| FLOAT da[2] __attribute__ ((aligned(16))); | |||
| if (n <= 0) return (0); | |||
| @@ -0,0 +1,743 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdlib.h> | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #define NBMAX 1024 | |||
| static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| __asm__ volatile ( | |||
| "vlrepg %%v16,0(%5) \n\t" | |||
| "vlrepg %%v17,8(%5) \n\t" | |||
| "vlrepg %%v18,16(%5) \n\t" | |||
| "vlrepg %%v19,24(%5) \n\t" | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vlef %%v20,4(%5),0 \n\t" | |||
| "vlef %%v20,4(%5),2 \n\t" | |||
| "vflcsb %%v20,%%v20 \n\t" | |||
| "vlef %%v20,0(%5),1 \n\t" | |||
| "vlef %%v20,0(%5),3 \n\t" | |||
| "vlef %%v21,12(%5),0 \n\t" | |||
| "vlef %%v21,12(%5),2 \n\t" | |||
| "vflcsb %%v21,%%v21 \n\t" | |||
| "vlef %%v21,8(%5),1 \n\t" | |||
| "vlef %%v21,8(%5),3 \n\t" | |||
| "vlef %%v22,20(%5),0 \n\t" | |||
| "vlef %%v22,20(%5),2 \n\t" | |||
| "vflcsb %%v22,%%v22 \n\t" | |||
| "vlef %%v22,16(%5),1 \n\t" | |||
| "vlef %%v22,16(%5),3 \n\t" | |||
| "vlef %%v23,28(%5),0 \n\t" | |||
| "vlef %%v23,28(%5),2 \n\t" | |||
| "vflcsb %%v23,%%v23 \n\t" | |||
| "vlef %%v23,24(%5),1 \n\t" | |||
| "vlef %%v23,24(%5),3 \n\t" | |||
| #else | |||
| "vlef %%v20,0(%5),1 \n\t" | |||
| "vlef %%v20,0(%5),3 \n\t" | |||
| "vflcsb %%v20,%%v20 \n\t" | |||
| "vlef %%v20,4(%5),0 \n\t" | |||
| "vlef %%v20,4(%5),2 \n\t" | |||
| "vlef %%v21,8(%5),1 \n\t" | |||
| "vlef %%v21,8(%5),3 \n\t" | |||
| "vflcsb %%v21,%%v21 \n\t" | |||
| "vlef %%v21,12(%5),0 \n\t" | |||
| "vlef %%v21,12(%5),2 \n\t" | |||
| "vlef %%v22,16(%5),1 \n\t" | |||
| "vlef %%v22,16(%5),3 \n\t" | |||
| "vflcsb %%v22,%%v22 \n\t" | |||
| "vlef %%v22,20(%5),0 \n\t" | |||
| "vlef %%v22,20(%5),2 \n\t" | |||
| "vlef %%v23,24(%5),1 \n\t" | |||
| "vlef %%v23,24(%5),3 \n\t" | |||
| "vflcsb %%v23,%%v23 \n\t" | |||
| "vlef %%v23,28(%5),0 \n\t" | |||
| "vlef %%v23,28(%5),2 \n\t" | |||
| #endif | |||
| "xgr %%r1,%%r1 \n\t" | |||
| "srlg %%r0,%%r0,1 \n\t" | |||
| "0: \n\t" | |||
| "pfd 1,1024(%%r1,%1) \n\t" | |||
| "pfd 1,1024(%%r1,%2) \n\t" | |||
| "pfd 1,1024(%%r1,%3) \n\t" | |||
| "pfd 1,1024(%%r1,%4) \n\t" | |||
| "pfd 2,1024(%%r1,%6) \n\t" | |||
| "vlef %%v24,0(%%r1,%1),0 \n\t" | |||
| "vlef %%v24,0(%%r1,%1),1 \n\t" | |||
| "vlef %%v24,8(%%r1,%1),2 \n\t" | |||
| "vlef %%v24,8(%%r1,%1),3 \n\t" | |||
| "vlef %%v25,4(%%r1,%1),0 \n\t" | |||
| "vlef %%v25,4(%%r1,%1),1 \n\t" | |||
| "vlef %%v25,12(%%r1,%1),2 \n\t" | |||
| "vlef %%v25,12(%%r1,%1),3 \n\t" | |||
| "vlef %%v26,0(%%r1,%2),0 \n\t" | |||
| "vlef %%v26,0(%%r1,%2),1 \n\t" | |||
| "vlef %%v26,8(%%r1,%2),2 \n\t" | |||
| "vlef %%v26,8(%%r1,%2),3 \n\t" | |||
| "vlef %%v27,4(%%r1,%2),0 \n\t" | |||
| "vlef %%v27,4(%%r1,%2),1 \n\t" | |||
| "vlef %%v27,12(%%r1,%2),2 \n\t" | |||
| "vlef %%v27,12(%%r1,%2),3 \n\t" | |||
| "vl %%v0,0(%%r1,%6) \n\t" | |||
| "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" | |||
| "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t" | |||
| "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" | |||
| "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" | |||
| "vlef %%v28,0(%%r1,%1),0 \n\t" | |||
| "vlef %%v28,0(%%r1,%1),1 \n\t" | |||
| "vlef %%v28,8(%%r1,%1),2 \n\t" | |||
| "vlef %%v28,8(%%r1,%1),3 \n\t" | |||
| "vlef %%v29,4(%%r1,%1),0 \n\t" | |||
| "vlef %%v29,4(%%r1,%1),1 \n\t" | |||
| "vlef %%v29,12(%%r1,%1),2 \n\t" | |||
| "vlef %%v29,12(%%r1,%1),3 \n\t" | |||
| "vlef %%v30,0(%%r1,%2),0 \n\t" | |||
| "vlef %%v30,0(%%r1,%2),1 \n\t" | |||
| "vlef %%v30,8(%%r1,%2),2 \n\t" | |||
| "vlef %%v30,8(%%r1,%2),3 \n\t" | |||
| "vlef %%v31,4(%%r1,%2),0 \n\t" | |||
| "vlef %%v31,4(%%r1,%2),1 \n\t" | |||
| "vlef %%v31,12(%%r1,%2),2 \n\t" | |||
| "vlef %%v31,12(%%r1,%2),3 \n\t" | |||
| "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" | |||
| "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" | |||
| "vfmasb %%v0,%%v30,%%v19,%%v0 \n\t" | |||
| "vfmasb %%v0,%%v31,%%v23,%%v0 \n\t" | |||
| "vst %%v0,0(%%r1,%6) \n\t" | |||
| "agfi %%r1,16 \n\t" | |||
| "brctg %%r0,0b \n\t" | |||
| : | |||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) | |||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||
| ); | |||
| } | |||
| static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| __asm__ volatile ( | |||
| "vlrepg %%v16,0(%3) \n\t" | |||
| "vlrepg %%v17,8(%3) \n\t" | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vlef %%v18,4(%3),0 \n\t" | |||
| "vlef %%v18,4(%3),2 \n\t" | |||
| "vflcsb %%v18,%%v18 \n\t" | |||
| "vlef %%v18,0(%3),1 \n\t" | |||
| "vlef %%v18,0(%3),3 \n\t" | |||
| "vlef %%v19,12(%3),0 \n\t" | |||
| "vlef %%v19,12(%3),2 \n\t" | |||
| "vflcsb %%v19,%%v19 \n\t" | |||
| "vlef %%v19,8(%3),1 \n\t" | |||
| "vlef %%v19,8(%3),3 \n\t" | |||
| #else | |||
| "vlef %%v18,0(%3),1 \n\t" | |||
| "vlef %%v18,0(%3),3 \n\t" | |||
| "vflcsb %%v18,%%v18 \n\t" | |||
| "vlef %%v18,4(%3),0 \n\t" | |||
| "vlef %%v18,4(%3),2 \n\t" | |||
| "vlef %%v19,8(%3),1 \n\t" | |||
| "vlef %%v19,8(%3),3 \n\t" | |||
| "vflcsb %%v19,%%v19 \n\t" | |||
| "vlef %%v19,12(%3),0 \n\t" | |||
| "vlef %%v19,12(%3),2 \n\t" | |||
| #endif | |||
| "xgr %%r1,%%r1 \n\t" | |||
| "srlg %%r0,%%r0,1 \n\t" | |||
| "0: \n\t" | |||
| "pfd 1,1024(%%r1,%1) \n\t" | |||
| "pfd 1,1024(%%r1,%2) \n\t" | |||
| "pfd 2,1024(%%r1,%4) \n\t" | |||
| "vlef %%v20,0(%%r1,%1),0 \n\t" | |||
| "vlef %%v20,0(%%r1,%1),1 \n\t" | |||
| "vlef %%v20,8(%%r1,%1),2 \n\t" | |||
| "vlef %%v20,8(%%r1,%1),3 \n\t" | |||
| "vlef %%v21,4(%%r1,%1),0 \n\t" | |||
| "vlef %%v21,4(%%r1,%1),1 \n\t" | |||
| "vlef %%v21,12(%%r1,%1),2 \n\t" | |||
| "vlef %%v21,12(%%r1,%1),3 \n\t" | |||
| "vlef %%v22,0(%%r1,%2),0 \n\t" | |||
| "vlef %%v22,0(%%r1,%2),1 \n\t" | |||
| "vlef %%v22,8(%%r1,%2),2 \n\t" | |||
| "vlef %%v22,8(%%r1,%2),3 \n\t" | |||
| "vlef %%v23,4(%%r1,%2),0 \n\t" | |||
| "vlef %%v23,4(%%r1,%2),1 \n\t" | |||
| "vlef %%v23,12(%%r1,%2),2 \n\t" | |||
| "vlef %%v23,12(%%r1,%2),3 \n\t" | |||
| "vl %%v0,0(%%r1,%4) \n\t" | |||
| "vfmasb %%v0,%%v20,%%v16,%%v0 \n\t" | |||
| "vfmasb %%v0,%%v21,%%v18,%%v0 \n\t" | |||
| "vfmasb %%v0,%%v22,%%v17,%%v0 \n\t" | |||
| "vfmasb %%v0,%%v23,%%v19,%%v0 \n\t" | |||
| "vst %%v0,0(%%r1,%4) \n\t" | |||
| "agfi %%r1,16 \n\t" | |||
| "brctg %%r0,0b \n\t" | |||
| : | |||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) | |||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" | |||
| ); | |||
| } | |||
| static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| __asm__ volatile ( | |||
| "vlrepg %%v16,0(%2) \n\t" | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vlef %%v17,4(%2),0 \n\t" | |||
| "vlef %%v17,4(%2),2 \n\t" | |||
| "vflcsb %%v17,%%v17 \n\t" | |||
| "vlef %%v17,0(%2),1 \n\t" | |||
| "vlef %%v17,0(%2),3 \n\t" | |||
| #else | |||
| "vlef %%v17,0(%2),1 \n\t" | |||
| "vlef %%v17,0(%2),3 \n\t" | |||
| "vflcsb %%v17,%%v17 \n\t" | |||
| "vlef %%v17,4(%2),0 \n\t" | |||
| "vlef %%v17,4(%2),2 \n\t" | |||
| #endif | |||
| "xgr %%r1,%%r1 \n\t" | |||
| "srlg %%r0,%%r0,1 \n\t" | |||
| "0: \n\t" | |||
| "pfd 1,1024(%%r1,%1) \n\t" | |||
| "pfd 2,1024(%%r1,%3) \n\t" | |||
| "vlef %%v18,0(%%r1,%1),0 \n\t" | |||
| "vlef %%v18,0(%%r1,%1),1 \n\t" | |||
| "vlef %%v18,8(%%r1,%1),2 \n\t" | |||
| "vlef %%v18,8(%%r1,%1),3 \n\t" | |||
| "vlef %%v19,4(%%r1,%1),0 \n\t" | |||
| "vlef %%v19,4(%%r1,%1),1 \n\t" | |||
| "vlef %%v19,12(%%r1,%1),2 \n\t" | |||
| "vlef %%v19,12(%%r1,%1),3 \n\t" | |||
| "vl %%v0,0(%%r1,%3) \n\t" | |||
| "vfmasb %%v0,%%v18,%%v16,%%v0 \n\t" | |||
| "vfmasb %%v0,%%v19,%%v17,%%v0 \n\t" | |||
| "vst %%v0,0(%%r1,%3) \n\t" | |||
| "agfi %%r1,16 \n\t" | |||
| "brctg %%r0,0b \n\t" | |||
| : | |||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) | |||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19" | |||
| ); | |||
| } | |||
| static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) | |||
| { | |||
| __asm__ volatile ( | |||
| #if !defined(XCONJ) | |||
| "vlrepf %%v0,%3 \n\t" | |||
| "vlef %%v1,%4,0 \n\t" | |||
| "vlef %%v1,%4,2 \n\t" | |||
| "vflcsb %%v1,%%v1 \n\t" | |||
| "vlef %%v1,%4,1 \n\t" | |||
| "vlef %%v1,%4,3 \n\t" | |||
| #else | |||
| "vlef %%v0,%3,1 \n\t" | |||
| "vlef %%v0,%3,3 \n\t" | |||
| "vflcsb %%v0,%%v0 \n\t" | |||
| "vlef %%v0,%3,0 \n\t" | |||
| "vlef %%v0,%3,2 \n\t" | |||
| "vlrepf %%v1,%4 \n\t" | |||
| #endif | |||
| "xgr %%r1,%%r1 \n\t" | |||
| "srlg %%r0,%0,2 \n\t" | |||
| "0: \n\t" | |||
| "pfd 1,1024(%%r1,%1) \n\t" | |||
| "pfd 2,1024(%%r1,%2) \n\t" | |||
| "vl %%v16,0(%%r1,%1) \n\t" | |||
| "vl %%v17,16(%%r1,%1) \n\t" | |||
| "vl %%v18,0(%%r1,%2) \n\t" | |||
| "vl %%v19,16(%%r1,%2) \n\t" | |||
| "verllg %%v20,%%v16,32 \n\t" | |||
| "verllg %%v21,%%v17,32 \n\t" | |||
| "vfmasb %%v22,%%v16,%%v0,%%v18 \n\t" | |||
| "vfmasb %%v23,%%v17,%%v0,%%v19 \n\t" | |||
| "vfmasb %%v22,%%v20,%%v1,%%v22 \n\t" | |||
| "vfmasb %%v23,%%v21,%%v1,%%v23 \n\t" | |||
| "vst %%v22,0(%%r1,%2) \n\t" | |||
| "vst %%v23,16(%%r1,%2) \n\t" | |||
| "agfi %%r1,32 \n\t" | |||
| "brctg %%r0,0b " | |||
| : | |||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) | |||
| :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23" | |||
| ); | |||
| } | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) | |||
| { | |||
| BLASLONG i; | |||
| if ( inc_dest != 2 ) | |||
| { | |||
| FLOAT temp_r; | |||
| FLOAT temp_i; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| #if !defined(XCONJ) | |||
| temp_r = alpha_r * src[0] - alpha_i * src[1]; | |||
| temp_i = alpha_r * src[1] + alpha_i * src[0]; | |||
| #else | |||
| temp_r = alpha_r * src[0] + alpha_i * src[1]; | |||
| temp_i = -alpha_r * src[1] + alpha_i * src[0]; | |||
| #endif | |||
| *dest += temp_r; | |||
| *(dest+1) += temp_i; | |||
| src+=2; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| add_y_4(n, src, dest, alpha_r, alpha_i); | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT *ap[4]; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| BLASLONG lda4; | |||
| FLOAT xbuffer[8],*ybuffer; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| ybuffer = buffer; | |||
| inc_x *= 2; | |||
| inc_y *= 2; | |||
| lda *= 2; | |||
| lda4 = 4 * lda; | |||
| n1 = n / 4 ; | |||
| n2 = n % 4 ; | |||
| m3 = m % 4; | |||
| m1 = m - ( m % 4 ); | |||
| m2 = (m % NBMAX) - (m % 4) ; | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
| while ( NB == NBMAX ) | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| x_ptr = x; | |||
| //zero_y(NB,ybuffer); | |||
| memset(ybuffer,0,NB*16); | |||
| if ( inc_x == 2 ) | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| cgemv_kernel_4x4(NB,ap,x_ptr,ybuffer); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| x_ptr += 8; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| cgemv_kernel_4x2(NB,ap,x_ptr,ybuffer); | |||
| x_ptr += 4; | |||
| a_ptr += 2 * lda; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| cgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer); | |||
| /* x_ptr += 2; | |||
| a_ptr += lda; */ | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| xbuffer[1] = x_ptr[1]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| xbuffer[3] = x_ptr[1]; | |||
| x_ptr += inc_x; | |||
| xbuffer[4] = x_ptr[0]; | |||
| xbuffer[5] = x_ptr[1]; | |||
| x_ptr += inc_x; | |||
| xbuffer[6] = x_ptr[0]; | |||
| xbuffer[7] = x_ptr[1]; | |||
| x_ptr += inc_x; | |||
| cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| } | |||
| for( i = 0; i < n2 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| xbuffer[1] = x_ptr[1]; | |||
| x_ptr += inc_x; | |||
| cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); | |||
| a_ptr += 1 * lda; | |||
| } | |||
| } | |||
| add_y(NB,ybuffer,y_ptr,inc_y,alpha_r,alpha_i); | |||
| a += 2 * NB; | |||
| y_ptr += NB * inc_y; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
| if ( m3 == 1 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp_r = 0.0; | |||
| FLOAT temp_i = 0.0; | |||
| if ( lda == 2 && inc_x == 2 ) | |||
| { | |||
| for( i=0 ; i < (n & -2); i+=2 ) | |||
| { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||
| temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||
| temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3]; | |||
| temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2]; | |||
| #else | |||
| temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||
| temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||
| temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3]; | |||
| temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2]; | |||
| #endif | |||
| a_ptr += 4; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||
| temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||
| #else | |||
| temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||
| temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||
| #endif | |||
| a_ptr += 2; | |||
| x_ptr += 2; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||
| temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||
| #else | |||
| temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||
| temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||
| #endif | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| #if !defined(XCONJ) | |||
| y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||
| y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||
| #else | |||
| y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||
| y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||
| #endif | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp_r0 = 0.0; | |||
| FLOAT temp_i0 = 0.0; | |||
| FLOAT temp_r1 = 0.0; | |||
| FLOAT temp_i1 = 0.0; | |||
| if ( lda == 4 && inc_x == 2 ) | |||
| { | |||
| for( i = 0; i < (n & -2); i+=2 ) | |||
| { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||
| temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3]; | |||
| temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2]; | |||
| temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3]; | |||
| temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2]; | |||
| #else | |||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||
| temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3]; | |||
| temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2]; | |||
| temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2]; | |||
| #endif | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||
| #else | |||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||
| #endif | |||
| a_ptr += 4; | |||
| x_ptr += 2; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i=0 ; i < n; i++ ) | |||
| { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||
| #else | |||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||
| #endif | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| #if !defined(XCONJ) | |||
| y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||
| y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||
| y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||
| #else | |||
| y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||
| y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||
| y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||
| #endif | |||
| return(0); | |||
| } | |||
| if ( m3 == 3 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp_r0 = 0.0; | |||
| FLOAT temp_i0 = 0.0; | |||
| FLOAT temp_r1 = 0.0; | |||
| FLOAT temp_i1 = 0.0; | |||
| FLOAT temp_r2 = 0.0; | |||
| FLOAT temp_i2 = 0.0; | |||
| if ( lda == 6 && inc_x == 2 ) | |||
| { | |||
| for( i=0 ; i < n; i++ ) | |||
| { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||
| temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; | |||
| temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; | |||
| #else | |||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||
| temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; | |||
| #endif | |||
| a_ptr += 6; | |||
| x_ptr += 2; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||
| temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; | |||
| temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; | |||
| #else | |||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||
| temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; | |||
| #endif | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| #if !defined(XCONJ) | |||
| y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||
| y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||
| y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2; | |||
| y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2; | |||
| #else | |||
| y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||
| y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||
| y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2; | |||
| y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2; | |||
| #endif | |||
| return(0); | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,671 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #define NBMAX 1024 | |||
| static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) /* For each of the four columns ap[0..3]: sum the complex products of n column entries with x, then add that sum combined with alpha to y[0..3]. Data is handled as interleaved (re, im) 4-byte elements. */ | |||
| { /* The trip count is n >> 1 and the counter is only tested after the body has run, so this appears to need an even n >= 2; the caller in this file passes a nonzero multiple of 4. */ | |||
| __asm__ volatile ( | |||
| "vzero %%v16 \n\t" /* v16..v19: running sums for columns ap[0]..ap[3] */ | |||
| "vzero %%v17 \n\t" | |||
| "vzero %%v18 \n\t" | |||
| "vzero %%v19 \n\t" | |||
| "xgr %%r1,%%r1 \n\t" /* r1 = 0: byte offset shared by the columns and x */ | |||
| "srlg %%r0,%0,1 \n\t" /* r0 = n >> 1: each iteration consumes two complex elements */ | |||
| "0: \n\t" | |||
| "pfd 1,1024(%%r1,%1) \n\t" /* prefetch hints 1024 bytes ahead on the four columns and on x */ | |||
| "pfd 1,1024(%%r1,%2) \n\t" | |||
| "pfd 1,1024(%%r1,%3) \n\t" | |||
| "pfd 1,1024(%%r1,%4) \n\t" | |||
| "pfd 1,1024(%%r1,%5) \n\t" | |||
| "vl %%v20,0(%%r1,%5) \n\t" /* v20 = (xr0, xi0, xr1, xi1) */ | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vlef %%v21,4(%%r1,%5),0 \n\t" /* this variant builds v21 = (-xi0, xr0, -xi1, xr1) */ | |||
| "vlef %%v21,12(%%r1,%5),2 \n\t" | |||
| "vflcsb %%v21,%%v21 \n\t" /* negate all elements; elements 1 and 3 are overwritten just below */ | |||
| "vlef %%v21,0(%%r1,%5),1 \n\t" | |||
| "vlef %%v21,8(%%r1,%5),3 \n\t" | |||
| #else | |||
| "vlef %%v21,0(%%r1,%5),1 \n\t" /* other variant builds v21 = (xi0, -xr0, xi1, -xr1) */ | |||
| "vlef %%v21,8(%%r1,%5),3 \n\t" | |||
| "vflcsb %%v21,%%v21 \n\t" | |||
| "vlef %%v21,4(%%r1,%5),0 \n\t" | |||
| "vlef %%v21,12(%%r1,%5),2 \n\t" | |||
| #endif | |||
| "vlef %%v22,0(%%r1,%1),0 \n\t" /* v22 = real parts of two ap[0] entries, each duplicated */ | |||
| "vlef %%v22,0(%%r1,%1),1 \n\t" | |||
| "vlef %%v22,8(%%r1,%1),2 \n\t" | |||
| "vlef %%v22,8(%%r1,%1),3 \n\t" | |||
| "vlef %%v23,4(%%r1,%1),0 \n\t" /* v23 = imaginary parts of the same ap[0] entries, each duplicated */ | |||
| "vlef %%v23,4(%%r1,%1),1 \n\t" | |||
| "vlef %%v23,12(%%r1,%1),2 \n\t" | |||
| "vlef %%v23,12(%%r1,%1),3 \n\t" | |||
| "vlef %%v24,0(%%r1,%2),0 \n\t" /* v24 / v25: same layout for ap[1] */ | |||
| "vlef %%v24,0(%%r1,%2),1 \n\t" | |||
| "vlef %%v24,8(%%r1,%2),2 \n\t" | |||
| "vlef %%v24,8(%%r1,%2),3 \n\t" | |||
| "vlef %%v25,4(%%r1,%2),0 \n\t" | |||
| "vlef %%v25,4(%%r1,%2),1 \n\t" | |||
| "vlef %%v25,12(%%r1,%2),2 \n\t" | |||
| "vlef %%v25,12(%%r1,%2),3 \n\t" | |||
| "vfmasb %%v16,%%v22,%%v20,%%v16 \n\t" /* v16 += v22 * v20 */ | |||
| "vfmasb %%v16,%%v23,%%v21,%%v16 \n\t" /* v16 += v23 * v21: completes the complex multiply-accumulate for ap[0] */ | |||
| "vfmasb %%v17,%%v24,%%v20,%%v17 \n\t" /* same two steps for ap[1] into v17 */ | |||
| "vfmasb %%v17,%%v25,%%v21,%%v17 \n\t" | |||
| "vlef %%v26,0(%%r1,%3),0 \n\t" /* v26 / v27: duplicated real and imaginary parts of ap[2] */ | |||
| "vlef %%v26,0(%%r1,%3),1 \n\t" | |||
| "vlef %%v26,8(%%r1,%3),2 \n\t" | |||
| "vlef %%v26,8(%%r1,%3),3 \n\t" | |||
| "vlef %%v27,4(%%r1,%3),0 \n\t" | |||
| "vlef %%v27,4(%%r1,%3),1 \n\t" | |||
| "vlef %%v27,12(%%r1,%3),2 \n\t" | |||
| "vlef %%v27,12(%%r1,%3),3 \n\t" | |||
| "vlef %%v28,0(%%r1,%4),0 \n\t" /* v28 / v29: duplicated real and imaginary parts of ap[3] */ | |||
| "vlef %%v28,0(%%r1,%4),1 \n\t" | |||
| "vlef %%v28,8(%%r1,%4),2 \n\t" | |||
| "vlef %%v28,8(%%r1,%4),3 \n\t" | |||
| "vlef %%v29,4(%%r1,%4),0 \n\t" | |||
| "vlef %%v29,4(%%r1,%4),1 \n\t" | |||
| "vlef %%v29,12(%%r1,%4),2 \n\t" | |||
| "vlef %%v29,12(%%r1,%4),3 \n\t" | |||
| "vfmasb %%v18,%%v26,%%v20,%%v18 \n\t" /* accumulate ap[2] into v18 */ | |||
| "vfmasb %%v18,%%v27,%%v21,%%v18 \n\t" | |||
| "vfmasb %%v19,%%v28,%%v20,%%v19 \n\t" /* accumulate ap[3] into v19 */ | |||
| "vfmasb %%v19,%%v29,%%v21,%%v19 \n\t" | |||
| "agfi %%r1,16 \n\t" /* advance the offset by 16 bytes = two complex elements */ | |||
| "brctg %%r0,0b \n\t" /* decrement r0 and loop while it is nonzero */ | |||
| "vrepg %%v20,%%v16,1 \n\t" /* copy doubleword 1 of each accumulator into both halves of a scratch register ... */ | |||
| "vrepg %%v21,%%v17,1 \n\t" | |||
| "vrepg %%v22,%%v18,1 \n\t" | |||
| "vrepg %%v23,%%v19,1 \n\t" | |||
| "vfasb %%v16,%%v16,%%v20 \n\t" /* ... and add, so doubleword 0 of each accumulator holds the full (re, im) sum */ | |||
| "vfasb %%v17,%%v17,%%v21 \n\t" | |||
| "vfasb %%v18,%%v18,%%v22 \n\t" | |||
| "vfasb %%v19,%%v19,%%v23 \n\t" | |||
| "vmrhg %%v16,%%v16,%%v17 \n\t" /* v16 = sums for columns 0 and 1 */ | |||
| "vmrhg %%v17,%%v18,%%v19 \n\t" /* v17 = sums for columns 2 and 3 */ | |||
| "verllg %%v18,%%v16,32 \n\t" /* v18 / v19: the same sums with re and im swapped (each doubleword rotated by 32 bits) */ | |||
| "verllg %%v19,%%v17,32 \n\t" | |||
| #if !defined(XCONJ) | |||
| "vlrepf %%v20,0(%7) \n\t" /* v20 = alpha[0] in every element */ | |||
| "vlef %%v21,4(%7),0 \n\t" /* v21 = (-alpha[1], alpha[1], -alpha[1], alpha[1]) once the next five rows have run */ | |||
| "vlef %%v21,4(%7),2 \n\t" | |||
| "vflcsb %%v21,%%v21 \n\t" | |||
| "vlef %%v21,4(%7),1 \n\t" | |||
| "vlef %%v21,4(%7),3 \n\t" | |||
| #else | |||
| "vlef %%v20,0(%7),1 \n\t" /* v20 = (alpha[0], -alpha[0], alpha[0], -alpha[0]) once the next five rows have run */ | |||
| "vlef %%v20,0(%7),3 \n\t" | |||
| "vflcsb %%v20,%%v20 \n\t" | |||
| "vlef %%v20,0(%7),0 \n\t" | |||
| "vlef %%v20,0(%7),2 \n\t" | |||
| "vlrepf %%v21,4(%7) \n\t" /* v21 = alpha[1] in every element */ | |||
| #endif | |||
| "vl %%v22,0(%6) \n\t" /* v22 = y[0..3], v23 = y[4..7] (FLOAT elements) */ | |||
| "vl %%v23,16(%6) \n\t" | |||
| "vfmasb %%v22,%%v16,%%v20,%%v22 \n\t" /* y += sum * v20 + swapped_sum * v21 */ | |||
| "vfmasb %%v22,%%v18,%%v21,%%v22 \n\t" | |||
| "vfmasb %%v23,%%v17,%%v20,%%v23 \n\t" | |||
| "vfmasb %%v23,%%v19,%%v21,%%v23 \n\t" | |||
| "vst %%v22,0(%6) \n\t" /* write the four updated complex results back to y */ | |||
| "vst %%v23,16(%6) " | |||
| : /* no output operands: y is written through address operand %6 and covered by the "memory" clobber */ | |||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[8])y),"ZQ"((const FLOAT (*)[2])alpha) | |||
| :"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29" | |||
| ); | |||
| } | |||
| static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) /* Two-column version: for ap[0] and ap[1], sum the complex products of n column entries with x, then add that sum combined with alpha to y[0..1] (complex). Data is handled as interleaved (re, im) 4-byte elements. */ | |||
| { /* The trip count is n >> 1 and is only tested after the body has run, so this appears to need an even n >= 2; the caller in this file passes a nonzero multiple of 4. */ | |||
| __asm__ volatile ( | |||
| "vzero %%v16 \n\t" /* v16 / v17: running sums for ap[0] / ap[1] */ | |||
| "vzero %%v17 \n\t" | |||
| "xgr %%r1,%%r1 \n\t" /* r1 = 0: byte offset shared by the columns and x */ | |||
| "srlg %%r0,%0,1 \n\t" /* r0 = n >> 1: two complex elements per iteration */ | |||
| "0: \n\t" | |||
| "pfd 1,1024(%%r1,%1) \n\t" /* prefetch hints 1024 bytes ahead on both columns and on x */ | |||
| "pfd 1,1024(%%r1,%2) \n\t" | |||
| "pfd 1,1024(%%r1,%3) \n\t" | |||
| "vl %%v18,0(%%r1,%3) \n\t" /* v18 = (xr0, xi0, xr1, xi1) */ | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vlef %%v19,4(%%r1,%3),0 \n\t" /* this variant builds v19 = (-xi0, xr0, -xi1, xr1) */ | |||
| "vlef %%v19,12(%%r1,%3),2 \n\t" | |||
| "vflcsb %%v19,%%v19 \n\t" /* negate all elements; elements 1 and 3 are overwritten just below */ | |||
| "vlef %%v19,0(%%r1,%3),1 \n\t" | |||
| "vlef %%v19,8(%%r1,%3),3 \n\t" | |||
| #else | |||
| "vlef %%v19,0(%%r1,%3),1 \n\t" /* other variant builds v19 = (xi0, -xr0, xi1, -xr1) */ | |||
| "vlef %%v19,8(%%r1,%3),3 \n\t" | |||
| "vflcsb %%v19,%%v19 \n\t" | |||
| "vlef %%v19,4(%%r1,%3),0 \n\t" | |||
| "vlef %%v19,12(%%r1,%3),2 \n\t" | |||
| #endif | |||
| "vlef %%v20,0(%%r1,%1),0 \n\t" /* v20 = real parts of two ap[0] entries, each duplicated */ | |||
| "vlef %%v20,0(%%r1,%1),1 \n\t" | |||
| "vlef %%v20,8(%%r1,%1),2 \n\t" | |||
| "vlef %%v20,8(%%r1,%1),3 \n\t" | |||
| "vlef %%v21,4(%%r1,%1),0 \n\t" /* v21 = imaginary parts of the same ap[0] entries, each duplicated */ | |||
| "vlef %%v21,4(%%r1,%1),1 \n\t" | |||
| "vlef %%v21,12(%%r1,%1),2 \n\t" | |||
| "vlef %%v21,12(%%r1,%1),3 \n\t" | |||
| "vlef %%v22,0(%%r1,%2),0 \n\t" /* v22 / v23: same layout for ap[1] */ | |||
| "vlef %%v22,0(%%r1,%2),1 \n\t" | |||
| "vlef %%v22,8(%%r1,%2),2 \n\t" | |||
| "vlef %%v22,8(%%r1,%2),3 \n\t" | |||
| "vlef %%v23,4(%%r1,%2),0 \n\t" | |||
| "vlef %%v23,4(%%r1,%2),1 \n\t" | |||
| "vlef %%v23,12(%%r1,%2),2 \n\t" | |||
| "vlef %%v23,12(%%r1,%2),3 \n\t" | |||
| "vfmasb %%v16,%%v20,%%v18,%%v16 \n\t" /* v16 += v20 * v18 */ | |||
| "vfmasb %%v16,%%v21,%%v19,%%v16 \n\t" /* v16 += v21 * v19: completes the complex multiply-accumulate for ap[0] */ | |||
| "vfmasb %%v17,%%v22,%%v18,%%v17 \n\t" /* same two steps for ap[1] into v17 */ | |||
| "vfmasb %%v17,%%v23,%%v19,%%v17 \n\t" | |||
| "agfi %%r1,16 \n\t" /* advance the offset by 16 bytes = two complex elements */ | |||
| "brctg %%r0,0b \n\t" /* decrement r0 and loop while it is nonzero */ | |||
| "vrepg %%v18,%%v16,1 \n\t" /* copy doubleword 1 of each accumulator into a scratch register ... */ | |||
| "vrepg %%v19,%%v17,1 \n\t" | |||
| "vfasb %%v16,%%v16,%%v18 \n\t" /* ... and add, so doubleword 0 holds the full (re, im) sum */ | |||
| "vfasb %%v17,%%v17,%%v19 \n\t" | |||
| "vmrhg %%v16,%%v16,%%v17 \n\t" /* v16 = sums for ap[0] and ap[1] */ | |||
| "verllg %%v17,%%v16,32 \n\t" /* v17 = the same sums with re and im swapped */ | |||
| #if !defined(XCONJ) | |||
| "vlrepf %%v18,0(%5) \n\t" /* v18 = alpha[0] in every element */ | |||
| "vlef %%v19,4(%5),0 \n\t" /* v19 = (-alpha[1], alpha[1], -alpha[1], alpha[1]) once the next five rows have run */ | |||
| "vlef %%v19,4(%5),2 \n\t" | |||
| "vflcsb %%v19,%%v19 \n\t" | |||
| "vlef %%v19,4(%5),1 \n\t" | |||
| "vlef %%v19,4(%5),3 \n\t" | |||
| #else | |||
| "vlef %%v18,0(%5),1 \n\t" /* v18 = (alpha[0], -alpha[0], alpha[0], -alpha[0]) once the next five rows have run */ | |||
| "vlef %%v18,0(%5),3 \n\t" | |||
| "vflcsb %%v18,%%v18 \n\t" | |||
| "vlef %%v18,0(%5),0 \n\t" | |||
| "vlef %%v18,0(%5),2 \n\t" | |||
| "vlrepf %%v19,4(%5) \n\t" /* v19 = alpha[1] in every element */ | |||
| #endif | |||
| "vl %%v20,0(%4) \n\t" /* v20 = y[0..3] (FLOAT elements) */ | |||
| "vfmasb %%v20,%%v16,%%v18,%%v20 \n\t" /* y += sum * v18 + swapped_sum * v19 */ | |||
| "vfmasb %%v20,%%v17,%%v19,%%v20 \n\t" | |||
| "vst %%v20,0(%4) " /* write the two updated complex results back to y */ | |||
| : /* no output operands: y is written through address operand %4 and covered by the "memory" clobber */ | |||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[4])y),"ZQ"((const FLOAT (*)[2])alpha) | |||
| :"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23" | |||
| ); | |||
| } | |||
| static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) /* Single-column version: sum the complex products of n entries of ap with x, then add that sum combined with alpha to the one complex value at y. Data is handled as interleaved (re, im) 4-byte elements. */ | |||
| { /* The trip count is n >> 1 and is only tested after the body has run, so this appears to need an even n >= 2; the caller in this file passes a nonzero multiple of 4. */ | |||
| __asm__ volatile ( | |||
| "vzero %%v16 \n\t" /* v16: running sum */ | |||
| "xgr %%r1,%%r1 \n\t" /* r1 = 0: byte offset shared by ap and x */ | |||
| "srlg %%r0,%0,1 \n\t" /* r0 = n >> 1: two complex elements per iteration */ | |||
| "0: \n\t" | |||
| "pfd 1,1024(%%r1,%1) \n\t" /* prefetch hints 1024 bytes ahead on ap and x */ | |||
| "pfd 1,1024(%%r1,%2) \n\t" | |||
| "vl %%v17,0(%%r1,%2) \n\t" /* v17 = (xr0, xi0, xr1, xi1) */ | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vlef %%v18,4(%%r1,%2),0 \n\t" /* this variant builds v18 = (-xi0, xr0, -xi1, xr1) */ | |||
| "vlef %%v18,12(%%r1,%2),2 \n\t" | |||
| "vflcsb %%v18,%%v18 \n\t" /* negate all elements; elements 1 and 3 are overwritten just below */ | |||
| "vlef %%v18,0(%%r1,%2),1 \n\t" | |||
| "vlef %%v18,8(%%r1,%2),3 \n\t" | |||
| #else | |||
| "vlef %%v18,0(%%r1,%2),1 \n\t" /* other variant builds v18 = (xi0, -xr0, xi1, -xr1) */ | |||
| "vlef %%v18,8(%%r1,%2),3 \n\t" | |||
| "vflcsb %%v18,%%v18 \n\t" | |||
| "vlef %%v18,4(%%r1,%2),0 \n\t" | |||
| "vlef %%v18,12(%%r1,%2),2 \n\t" | |||
| #endif | |||
| "vlef %%v19,0(%%r1,%1),0 \n\t" /* v19 = real parts of two ap entries, each duplicated */ | |||
| "vlef %%v19,0(%%r1,%1),1 \n\t" | |||
| "vlef %%v19,8(%%r1,%1),2 \n\t" | |||
| "vlef %%v19,8(%%r1,%1),3 \n\t" | |||
| "vlef %%v20,4(%%r1,%1),0 \n\t" /* v20 = imaginary parts of the same ap entries, each duplicated */ | |||
| "vlef %%v20,4(%%r1,%1),1 \n\t" | |||
| "vlef %%v20,12(%%r1,%1),2 \n\t" | |||
| "vlef %%v20,12(%%r1,%1),3 \n\t" | |||
| "vfmasb %%v16,%%v19,%%v17,%%v16 \n\t" /* v16 += v19 * v17 */ | |||
| "vfmasb %%v16,%%v20,%%v18,%%v16 \n\t" /* v16 += v20 * v18: completes the complex multiply-accumulate */ | |||
| "agfi %%r1,16 \n\t" /* advance the offset by 16 bytes = two complex elements */ | |||
| "brctg %%r0,0b \n\t" /* decrement r0 and loop while it is nonzero */ | |||
| "vrepg %%v17,%%v16,1 \n\t" /* copy doubleword 1 of the accumulator into a scratch register ... */ | |||
| "vfasb %%v16,%%v16,%%v17 \n\t" /* ... and add, so doubleword 0 holds the full (re, im) sum */ | |||
| "verllg %%v17,%%v16,32 \n\t" /* v17 = the sum with re and im swapped */ | |||
| #if !defined(XCONJ) | |||
| "vlrepf %%v18,0(%4) \n\t" /* v18 = alpha[0] in every element */ | |||
| "vlef %%v19,4(%4),0 \n\t" /* elements 0 and 1 of v19 become (-alpha[1], alpha[1]); only doubleword 0 of the result is stored */ | |||
| "vflcsb %%v19,%%v19 \n\t" | |||
| "vlef %%v19,4(%4),1 \n\t" | |||
| #else | |||
| "vlef %%v18,0(%4),1 \n\t" /* elements 0 and 1 of v18 become (alpha[0], -alpha[0]) */ | |||
| "vflcsb %%v18,%%v18 \n\t" | |||
| "vlef %%v18,0(%4),0 \n\t" | |||
| "vlrepf %%v19,4(%4) \n\t" /* v19 = alpha[1] in every element */ | |||
| #endif | |||
| "vleg %%v20,0(%3),0 \n\t" /* load y[0..1] (one complex value) into doubleword 0 of v20 */ | |||
| "vfmasb %%v20,%%v16,%%v18,%%v20 \n\t" /* y += sum * v18 + swapped_sum * v19 */ | |||
| "vfmasb %%v20,%%v17,%%v19,%%v20 \n\t" | |||
| "vsteg %%v20,0(%3),0 " /* store doubleword 0 back to y */ | |||
| : /* no output operands: y is written through address operand %3 and covered by the "memory" clobber */ | |||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[2])y),"ZQ"((const FLOAT (*)[2])alpha) | |||
| :"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23" /* v21..v23 are listed although the instructions above only use v16..v20 */ | |||
| ); | |||
| } | |||
| /* Gather n complex values, spaced inc_src FLOATs apart in src, into consecutive (re, im) pairs at dest. */ | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) | |||
| { | |||
| BLASLONG remaining = n; | |||
| while ( remaining > 0 ) | |||
| { | |||
| /* one complex value: real part, then imaginary part */ | |||
| dest[0] = src[0]; | |||
| dest[1] = src[1]; | |||
| src += inc_src; | |||
| dest += 2; | |||
| remaining--; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) /* For each of the n columns of a (lda complex elements apart), adds to the matching y entry the column's m-entry complex dot product with x, combined with (alpha_r, alpha_i); CONJ / XCONJ select the sign variant at compile time. buffer receives a packed copy of x when x is strided. Always returns 0. */ | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT *ap[8]; /* only ap[0..3] are used in this function */ | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| BLASLONG lda4; | |||
| FLOAT ybuffer[8],*xbuffer; | |||
| FLOAT alpha[2]; | |||
| if ( m < 1 ) return(0); /* empty problem: nothing to do */ | |||
| if ( n < 1 ) return(0); | |||
| inc_x <<= 1; /* strides and lda are converted from complex elements to FLOATs */ | |||
| inc_y <<= 1; | |||
| lda <<= 1; | |||
| lda4 = lda << 2; /* FLOAT distance across four columns */ | |||
| xbuffer = buffer; | |||
| n1 = n >> 2 ; /* number of full four-column groups */ | |||
| n2 = n & 3 ; /* leftover columns */ | |||
| m3 = m & 3 ; /* rows left for the scalar tail code */ | |||
| m1 = m - m3; /* rows handled by the kernels (multiple of 4) */ | |||
| m2 = (m & (NBMAX-1)) - m3 ; /* size of the final partial row block, possibly 0 */ | |||
| alpha[0] = alpha_r; | |||
| alpha[1] = alpha_i; | |||
| BLASLONG NB = NBMAX; | |||
| while ( NB == NBMAX ) /* row blocks of NBMAX; a shorter final block ends the loop */ | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) /* fewer than NBMAX kernel rows remain */ | |||
| { | |||
| if ( m2 == 0 ) break; /* no partial block left */ | |||
| NB = m2; | |||
| } | |||
| y_ptr = y; | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| ap[0] = a_ptr; /* ap[0..3]: four adjacent columns of the current row block */ | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( inc_x != 2 ) /* strided x: pack NB entries into the work buffer */ | |||
| copy_x(NB,x_ptr,xbuffer,inc_x); | |||
| else /* contiguous x: use it in place */ | |||
| xbuffer = x_ptr; | |||
| if ( inc_y == 2 ) /* contiguous y: the kernels update y directly */ | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| cgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| y_ptr += 8; | |||
| } | |||
| if ( n2 & 2 ) /* two or three columns left: do two of them here */ | |||
| { | |||
| cgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); /* ap[0], ap[1] already point past the four-column groups */ | |||
| a_ptr += lda * 2; | |||
| y_ptr += 4; | |||
| } | |||
| if ( n2 & 1 ) /* one last column */ | |||
| { | |||
| cgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); | |||
| /* a_ptr += lda; | |||
| y_ptr += 2; */ | |||
| } | |||
| } | |||
| else /* strided y: kernels accumulate into a zeroed ybuffer that is then added into y */ | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| memset(ybuffer,0,sizeof(ybuffer)); | |||
| cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| y_ptr[0] += ybuffer[0]; | |||
| y_ptr[1] += ybuffer[1]; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += ybuffer[2]; | |||
| y_ptr[1] += ybuffer[3]; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += ybuffer[4]; | |||
| y_ptr[1] += ybuffer[5]; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += ybuffer[6]; | |||
| y_ptr[1] += ybuffer[7]; | |||
| y_ptr += inc_y; | |||
| } | |||
| for( i = 0; i < n2 ; i++) /* leftover columns, one at a time */ | |||
| { | |||
| memset(ybuffer,0,sizeof(ybuffer)); | |||
| cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); | |||
| a_ptr += lda; | |||
| y_ptr[0] += ybuffer[0]; | |||
| y_ptr[1] += ybuffer[1]; | |||
| y_ptr += inc_y; | |||
| } | |||
| } | |||
| a += 2 * NB; /* step a and x past the NB rows just processed */ | |||
| x += NB * inc_x; | |||
| } | |||
| if ( m3 == 0 ) return(0); /* no tail rows */ | |||
| x_ptr = x; /* a and x now point at the 1..3 tail rows, handled in scalar code */ | |||
| j=0; | |||
| a_ptr = a; | |||
| y_ptr = y; | |||
| if ( m3 == 3 ) /* three tail rows, one column per iteration */ | |||
| { | |||
| FLOAT temp_r ; | |||
| FLOAT temp_i ; | |||
| FLOAT x0 = x_ptr[0]; | |||
| FLOAT x1 = x_ptr[1]; | |||
| x_ptr += inc_x; | |||
| FLOAT x2 = x_ptr[0]; | |||
| FLOAT x3 = x_ptr[1]; | |||
| x_ptr += inc_x; | |||
| FLOAT x4 = x_ptr[0]; | |||
| FLOAT x5 = x_ptr[1]; | |||
| while ( j < n) | |||
| { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||
| temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; | |||
| temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; | |||
| temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; | |||
| temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; | |||
| #else | |||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||
| temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; | |||
| temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; | |||
| temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; | |||
| temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; | |||
| #endif | |||
| #if !defined(XCONJ) | |||
| y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||
| y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||
| #else | |||
| y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||
| y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||
| #endif | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| j++; | |||
| } | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) /* two tail rows */ | |||
| { | |||
| FLOAT temp_r ; | |||
| FLOAT temp_i ; | |||
| FLOAT temp_r1 ; | |||
| FLOAT temp_i1 ; | |||
| FLOAT x0 = x_ptr[0]; | |||
| FLOAT x1 = x_ptr[1]; | |||
| x_ptr += inc_x; | |||
| FLOAT x2 = x_ptr[0]; | |||
| FLOAT x3 = x_ptr[1]; | |||
| FLOAT ar = alpha[0]; | |||
| FLOAT ai = alpha[1]; | |||
| while ( j < ( n & -2 )) /* two columns per iteration */ | |||
| { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||
| temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; | |||
| temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; | |||
| a_ptr += lda; | |||
| temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; | |||
| temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; | |||
| temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; | |||
| temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; | |||
| #else | |||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||
| temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; | |||
| temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; | |||
| a_ptr += lda; | |||
| temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; | |||
| temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; | |||
| temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; | |||
| temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; | |||
| #endif | |||
| #if !defined(XCONJ) | |||
| y_ptr[0] += ar * temp_r - ai * temp_i; | |||
| y_ptr[1] += ar * temp_i + ai * temp_r; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += ar * temp_r1 - ai * temp_i1; | |||
| y_ptr[1] += ar * temp_i1 + ai * temp_r1; | |||
| #else | |||
| y_ptr[0] += ar * temp_r + ai * temp_i; | |||
| y_ptr[1] -= ar * temp_i - ai * temp_r; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += ar * temp_r1 + ai * temp_i1; | |||
| y_ptr[1] -= ar * temp_i1 - ai * temp_r1; | |||
| #endif | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| j+=2; | |||
| } | |||
| while ( j < n) /* last column when n is odd */ | |||
| { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||
| temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; | |||
| temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; | |||
| #else | |||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||
| temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; | |||
| temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; | |||
| #endif | |||
| #if !defined(XCONJ) | |||
| y_ptr[0] += ar * temp_r - ai * temp_i; | |||
| y_ptr[1] += ar * temp_i + ai * temp_r; | |||
| #else | |||
| y_ptr[0] += ar * temp_r + ai * temp_i; | |||
| y_ptr[1] -= ar * temp_i - ai * temp_r; | |||
| #endif | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| j++; | |||
| } | |||
| return(0); | |||
| } | |||
| if ( m3 == 1 ) /* a single tail row */ | |||
| { | |||
| FLOAT temp_r ; | |||
| FLOAT temp_i ; | |||
| FLOAT temp_r1 ; | |||
| FLOAT temp_i1 ; | |||
| FLOAT x0 = x_ptr[0]; | |||
| FLOAT x1 = x_ptr[1]; | |||
| FLOAT ar = alpha[0]; | |||
| FLOAT ai = alpha[1]; | |||
| while ( j < ( n & -2 )) /* two columns per iteration */ | |||
| { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||
| a_ptr += lda; | |||
| temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; | |||
| temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; | |||
| #else | |||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||
| a_ptr += lda; | |||
| temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; | |||
| temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; | |||
| #endif | |||
| #if !defined(XCONJ) | |||
| y_ptr[0] += ar * temp_r - ai * temp_i; | |||
| y_ptr[1] += ar * temp_i + ai * temp_r; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += ar * temp_r1 - ai * temp_i1; | |||
| y_ptr[1] += ar * temp_i1 + ai * temp_r1; | |||
| #else | |||
| y_ptr[0] += ar * temp_r + ai * temp_i; | |||
| y_ptr[1] -= ar * temp_i - ai * temp_r; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += ar * temp_r1 + ai * temp_i1; | |||
| y_ptr[1] -= ar * temp_i1 - ai * temp_r1; | |||
| #endif | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| j+=2; | |||
| } | |||
| while ( j < n) /* last column when n is odd */ | |||
| { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||
| #else | |||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||
| #endif | |||
| #if !defined(XCONJ) | |||
| y_ptr[0] += ar * temp_r - ai * temp_i; | |||
| y_ptr[1] += ar * temp_i + ai * temp_r; | |||
| #else | |||
| y_ptr[0] += ar * temp_r + ai * temp_i; | |||
| y_ptr[1] -= ar * temp_i - ai * temp_r; | |||
| #endif | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| j++; | |||
| } | |||
| return(0); | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -281,6 +281,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| i = n1; | |||
| } | |||
| else | |||
| { | |||
| maxf = CABS1(x,0); | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| while(i < n) | |||
| { | |||
| @@ -296,9 +302,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| maxf = CABS1(x,0); | |||
| inc_x2 = 2 * inc_x; | |||
| ix += inc_x2; | |||
| i++; | |||
| @@ -281,6 +281,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| i = n1; | |||
| } | |||
| else | |||
| { | |||
| minf = CABS1(x,0); | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| while(i < n) | |||
| { | |||
| @@ -296,9 +302,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| minf = CABS1(x,0); | |||
| inc_x2 = 2 * inc_x; | |||
| ix += inc_x2; | |||
| i++; | |||
| @@ -204,6 +204,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| i = n1; | |||
| } | |||
| else | |||
| { | |||
| maxf = ABS(x[0]); | |||
| i++; | |||
| } | |||
| while (i < n) { | |||
| if (ABS(x[i]) > maxf) { | |||
| @@ -216,7 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| } else { | |||
| BLASLONG n1 = n & -4; | |||
| maxf = ABS(x[0]); | |||
| i += inc_x; | |||
| j++; | |||
| BLASLONG n1 = (n - 1) & -4; | |||
| while (j < n1) { | |||
| if (ABS(x[i]) > maxf) { | |||
| @@ -204,6 +204,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| i = n1; | |||
| } | |||
| else | |||
| { | |||
| minf = ABS(x[0]); | |||
| i++; | |||
| } | |||
| while (i < n) { | |||
| if (ABS(x[i]) < minf) { | |||
| @@ -216,7 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| } else { | |||
| BLASLONG n1 = n & -4; | |||
| minf = ABS(x[0]); | |||
| i += inc_x; | |||
| j++; | |||
| BLASLONG n1 = (n - 1) & -4; | |||
| while (j < n1) { | |||
| if (ABS(x[i]) < minf) { | |||
| @@ -180,6 +180,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| i = n1; | |||
| } | |||
| else | |||
| { | |||
| maxf = x[0]; | |||
| i++; | |||
| } | |||
| while (i < n) { | |||
| if (x[i] > maxf) { | |||
| @@ -192,7 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| } else { | |||
| BLASLONG n1 = n & -4; | |||
| maxf = x[0]; | |||
| i += inc_x; | |||
| j++; | |||
| BLASLONG n1 = (n - 1) & -4; | |||
| while (j < n1) { | |||
| if (x[i] > maxf) { | |||
| @@ -180,6 +180,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| i = n1; | |||
| } | |||
| else | |||
| { | |||
| minf = x[0]; | |||
| i++; | |||
| } | |||
| while (i < n) { | |||
| if (x[i] < minf) { | |||
| @@ -192,7 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| } else { | |||
| BLASLONG n1 = n & -4; | |||
| minf = x[0]; | |||
| i += inc_x; | |||
| j++; | |||
| BLASLONG n1 = (n - 1) & -4; | |||
| while (j < n1) { | |||
| if (x[i] < minf) { | |||
| @@ -247,6 +247,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| i = n1; | |||
| } | |||
| else | |||
| { | |||
| maxf = ABS(x[0]); | |||
| i++; | |||
| } | |||
| while (i < n) { | |||
| if (ABS(x[i]) > maxf) { | |||
| @@ -259,7 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| } else { | |||
| BLASLONG n1 = n & -4; | |||
| maxf = ABS(x[0]); | |||
| i += inc_x; | |||
| j++; | |||
| BLASLONG n1 = (n - 1) & -4; | |||
| while (j < n1) { | |||
| if (ABS(x[i]) > maxf) { | |||
| @@ -247,6 +247,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| i = n1; | |||
| } | |||
| else | |||
| { | |||
| minf = ABS(x[0]); | |||
| i++; | |||
| } | |||
| while (i < n) { | |||
| if (ABS(x[i]) < minf) { | |||
| @@ -259,7 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| } else { | |||
| BLASLONG n1 = n & -4; | |||
| minf = ABS(x[0]); | |||
| i += inc_x; | |||
| j++; | |||
| BLASLONG n1 = (n - 1) & -4; | |||
| while (j < n1) { | |||
| if (ABS(x[i]) < minf) { | |||
| @@ -223,6 +223,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| i = n1; | |||
| } | |||
| else | |||
| { | |||
| maxf = x[0]; | |||
| i++; | |||
| } | |||
| while (i < n) { | |||
| if (x[i] > maxf) { | |||
| @@ -235,7 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| } else { | |||
| BLASLONG n1 = n & -4; | |||
| maxf = x[0]; | |||
| i += inc_x; | |||
| j++; | |||
| BLASLONG n1 = (n - 1) & -4; | |||
| while (j < n1) { | |||
| if (x[i] > maxf) { | |||
| @@ -223,6 +223,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| i = n1; | |||
| } | |||
| else | |||
| { | |||
| minf = x[0]; | |||
| i++; | |||
| } | |||
| while (i < n) { | |||
| if (x[i] < minf) { | |||
| @@ -235,7 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| } else { | |||
| BLASLONG n1 = n & -4; | |||
| minf = x[0]; | |||
| i += inc_x; | |||
| j++; | |||
| BLASLONG n1 = (n - 1) & -4; | |||
| while (j < n1) { | |||
| if (x[i] < minf) { | |||
| @@ -202,6 +202,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| i = n1; | |||
| } | |||
| else | |||
| { | |||
| maxf = CABS1(x,0); | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| while(i < n) | |||
| { | |||
| @@ -217,9 +223,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| maxf = CABS1(x,0); | |||
| inc_x2 = 2 * inc_x; | |||
| ix += inc_x2; | |||
| i++; | |||
| @@ -202,6 +202,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| i = n1; | |||
| } | |||
| else | |||
| { | |||
| minf = CABS1(x,0); | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| while(i < n) | |||
| { | |||
| @@ -217,9 +223,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| minf = CABS1(x,0); | |||
| inc_x2 = 2 * inc_x; | |||
| ix += inc_x2; | |||
| i++; | |||
| @@ -150,7 +150,7 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG i = 0; | |||
| BLASLONG j = 0; | |||
| BLASLONG ix = 0; | |||
| FLOAT maxf = 0.0; | |||
| BLASLONG inc_x2; | |||
| @@ -168,53 +168,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| else | |||
| { | |||
| maxf=CABS1(x,0); | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| while (i < n) { | |||
| if (ABS(x[i*2]) > maxf) { | |||
| maxf = ABS(x[i*2]); | |||
| if (CABS1(x,ix) > maxf) { | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| return (maxf); | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| maxf=CABS1(x,0); | |||
| i += inc_x2; | |||
| j++; | |||
| inc_x2 = 2 * inc_x; | |||
| ix += inc_x2; | |||
| i++; | |||
| BLASLONG n1 = (n - 1) & -4; | |||
| while (j < n1) { | |||
| while (i < n1) { | |||
| if (CABS1(x,i) > maxf) { | |||
| maxf = CABS1(x,i); | |||
| if (CABS1(x,ix) > maxf) { | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| if (CABS1(x,i+inc_x2) > maxf) { | |||
| maxf = CABS1(x,i+inc_x2); | |||
| if (CABS1(x,ix+inc_x2) > maxf) { | |||
| maxf = CABS1(x,ix+inc_x2); | |||
| } | |||
| if (CABS1(x,i+inc_x2*2) > maxf) { | |||
| maxf = CABS1(x,i+inc_x2*2); | |||
| if (CABS1(x,ix+inc_x2*2) > maxf) { | |||
| maxf = CABS1(x,ix+inc_x2*2); | |||
| } | |||
| if (CABS1(x,i+inc_x2*3) > maxf) { | |||
| maxf = CABS1(x,i+inc_x2*3); | |||
| if (CABS1(x,ix+inc_x2*3) > maxf) { | |||
| maxf = CABS1(x,ix+inc_x2*3); | |||
| } | |||
| i += inc_x2 * 4; | |||
| ix += inc_x2 * 4; | |||
| j += 4; | |||
| i += 4; | |||
| } | |||
| while (j < n) { | |||
| if (CABS1(x,i) > maxf) { | |||
| maxf = CABS1(x,i); | |||
| while (i < n) { | |||
| if (CABS1(x,ix) > maxf) { | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| i += inc_x2; | |||
| j++; | |||
| ix += inc_x2; | |||
| i++; | |||
| } | |||
| return (maxf); | |||
| } | |||
| @@ -150,7 +150,7 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG i = 0; | |||
| BLASLONG j = 0; | |||
| BLASLONG ix = 0; | |||
| FLOAT minf = 0.0; | |||
| BLASLONG inc_x2; | |||
| @@ -168,53 +168,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| else | |||
| { | |||
| minf=CABS1(x,0); | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| while (i < n) { | |||
| if (ABS(x[i*2]) < minf) { | |||
| minf = ABS(x[i*2]); | |||
| if (CABS1(x,ix) < minf) { | |||
| minf = CABS1(x,ix); | |||
| } | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| return (minf); | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| minf=CABS1(x,0); | |||
| i += inc_x2; | |||
| j++; | |||
| inc_x2 = 2 * inc_x; | |||
| ix += inc_x2; | |||
| i++; | |||
| BLASLONG n1 = (n - 1) & -4; | |||
| while (j < n1) { | |||
| while (i < n1) { | |||
| if (CABS1(x,i) < minf) { | |||
| minf = CABS1(x,i); | |||
| if (CABS1(x,ix) < minf) { | |||
| minf = CABS1(x,ix); | |||
| } | |||
| if (CABS1(x,i+inc_x2) < minf) { | |||
| minf = CABS1(x,i+inc_x2); | |||
| if (CABS1(x,ix+inc_x2) < minf) { | |||
| minf = CABS1(x,ix+inc_x2); | |||
| } | |||
| if (CABS1(x,i+inc_x2*2) < minf) { | |||
| minf = CABS1(x,i+inc_x2*2); | |||
| if (CABS1(x,ix+inc_x2*2) < minf) { | |||
| minf = CABS1(x,ix+inc_x2*2); | |||
| } | |||
| if (CABS1(x,i+inc_x2*3) < minf) { | |||
| minf = CABS1(x,i+inc_x2*3); | |||
| if (CABS1(x,ix+inc_x2*3) < minf) { | |||
| minf = CABS1(x,ix+inc_x2*3); | |||
| } | |||
| i += inc_x2 * 4; | |||
| ix += inc_x2 * 4; | |||
| j += 4; | |||
| i += 4; | |||
| } | |||
| while (j < n) { | |||
| if (CABS1(x,i) < minf) { | |||
| minf = CABS1(x,i); | |||
| while (i < n) { | |||
| if (CABS1(x,ix) < minf) { | |||
| minf = CABS1(x,ix); | |||
| } | |||
| i += inc_x2; | |||
| j++; | |||
| ix += inc_x2; | |||
| i++; | |||
| } | |||
| return (minf); | |||
| } | |||
| @@ -106,7 +106,7 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "agfi %%r1,128 \n\t" | |||
| "brctg %%r0,0b " | |||
| : | |||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) | |||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) | |||
| :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||
| ); | |||
| } | |||
| @@ -114,7 +114,7 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { | |||
| BLASLONG i = 0; | |||
| BLASLONG ix = 0, iy = 0; | |||
| FLOAT da[2]; | |||
| FLOAT da[2] __attribute__ ((aligned(16))); | |||
| if (n <= 0) return (0); | |||
| @@ -52,67 +52,66 @@ int assert_dbl_near(double exp, double real, double tol) { | |||
| int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| BLASLONG ix, iy; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT temp_r,temp_i; | |||
| BLASLONG inc_x2,inc_y2; | |||
| FLOAT temp_r, temp_i; | |||
| BLASLONG inc_x2, inc_y2; | |||
| BLASLONG lda2; | |||
| BLASLONG i2; | |||
| lda2 = 2*lda; | |||
| lda2 = 2 * lda; | |||
| ix = 0; | |||
| a_ptr = a; | |||
| if ( inc_x == 1 && inc_y == 1 ) | |||
| if (inc_x == 1 && inc_y == 1) | |||
| { | |||
| for (j=0; j<n; j++) | |||
| { | |||
| for (j = 0; j<n; j++) | |||
| { | |||
| #if !defined(XCONJ) | |||
| temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; | |||
| temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; | |||
| temp_r = alpha_r * x[ix] - alpha_i * x[ix + 1]; | |||
| temp_i = alpha_r * x[ix + 1] + alpha_i * x[ix]; | |||
| #else | |||
| temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; | |||
| temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; | |||
| temp_r = alpha_r * x[ix] + alpha_i * x[ix + 1]; | |||
| temp_i = alpha_r * x[ix + 1] - alpha_i * x[ix]; | |||
| #endif | |||
| iy = 0; | |||
| i2=0; | |||
| iy = 0; | |||
| i2 = 0; | |||
| for (i=0; i<m; i++) | |||
| { | |||
| for (i = 0; i<m; i++) | |||
| { | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| printf("\nParO: %f %f %f %f\n", a_ptr[i2], a_ptr[i2+1], temp_r, temp_i); | |||
| y[iy] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1]; | |||
| y[iy+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2]; | |||
| y[iy] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2 + 1]; | |||
| y[iy + 1] += temp_r * a_ptr[i2 + 1] + temp_i * a_ptr[i2]; | |||
| #else | |||
| y[iy] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1]; | |||
| y[iy+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2]; | |||
| y[iy] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2 + 1]; | |||
| y[iy + 1] += temp_r * a_ptr[i2 + 1] - temp_i * a_ptr[i2]; | |||
| #endif | |||
| #else | |||
| #if !defined(XCONJ) | |||
| y[iy] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1]; | |||
| y[iy+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2]; | |||
| y[iy] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2 + 1]; | |||
| y[iy + 1] -= temp_r * a_ptr[i2 + 1] - temp_i * a_ptr[i2]; | |||
| #else | |||
| y[iy] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1]; | |||
| y[iy+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2]; | |||
| y[iy] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2 + 1]; | |||
| y[iy + 1] -= temp_r * a_ptr[i2 + 1] + temp_i * a_ptr[i2]; | |||
| #endif | |||
| #endif | |||
| i2 += 2; | |||
| iy += 2; | |||
| i2 += 2; | |||
| iy += 2; | |||
| } | |||
| a_ptr += lda2; | |||
| ix += 2; | |||
| } | |||
| a_ptr += lda2; | |||
| ix += 2; | |||
| } | |||
| return(0); | |||
| return(0); | |||
| } | |||
| @@ -120,39 +119,39 @@ int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alph | |||
| inc_x2 = 2 * inc_x; | |||
| inc_y2 = 2 * inc_y; | |||
| for (j=0; j<n; j++) | |||
| for (j = 0; j<n; j++) | |||
| { | |||
| #if !defined(XCONJ) | |||
| temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; | |||
| temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; | |||
| temp_r = alpha_r * x[ix] - alpha_i * x[ix + 1]; | |||
| temp_i = alpha_r * x[ix + 1] + alpha_i * x[ix]; | |||
| #else | |||
| temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; | |||
| temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; | |||
| temp_r = alpha_r * x[ix] + alpha_i * x[ix + 1]; | |||
| temp_i = alpha_r * x[ix + 1] - alpha_i * x[ix]; | |||
| #endif | |||
| iy = 0; | |||
| i2=0; | |||
| i2 = 0; | |||
| for (i=0; i<m; i++) | |||
| for (i = 0; i<m; i++) | |||
| { | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| y[iy] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1]; | |||
| y[iy+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2]; | |||
| y[iy] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2 + 1]; | |||
| y[iy + 1] += temp_r * a_ptr[i2 + 1] + temp_i * a_ptr[i2]; | |||
| #else | |||
| y[iy] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1]; | |||
| y[iy+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2]; | |||
| y[iy] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2 + 1]; | |||
| y[iy + 1] += temp_r * a_ptr[i2 + 1] - temp_i * a_ptr[i2]; | |||
| #endif | |||
| #else | |||
| #if !defined(XCONJ) | |||
| y[iy] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1]; | |||
| y[iy+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2]; | |||
| y[iy] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2 + 1]; | |||
| y[iy + 1] -= temp_r * a_ptr[i2 + 1] - temp_i * a_ptr[i2]; | |||
| #else | |||
| y[iy] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1]; | |||
| y[iy+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2]; | |||
| y[iy] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2 + 1]; | |||
| y[iy + 1] -= temp_r * a_ptr[i2 + 1] + temp_i * a_ptr[i2]; | |||
| #endif | |||
| #endif | |||
| @@ -160,7 +159,7 @@ int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alph | |||
| iy += inc_y2; | |||
| } | |||
| a_ptr += lda2; | |||
| ix += inc_x2; | |||
| ix += inc_x2; | |||
| } | |||
| @@ -404,7 +403,7 @@ int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y, *y_c; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {1.0, 1.0}; | |||
| FLOAT beta [] = {1.0, 0.0}; | |||
| char trans='N'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1,inc_y=1; | |||
| @@ -422,6 +421,7 @@ int main(int argc, char *argv[]){ | |||
| struct timeval start, stop; | |||
| double time1,timeg,timeg_c; | |||
| blasint y_size; | |||
| blasint iy; | |||
| int test = 1; | |||
| @@ -500,13 +500,15 @@ int main(int argc, char *argv[]){ | |||
| for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){ | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| y_c[i]= y[i]; | |||
| y_c[i]= y[i]; | |||
| } | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| timeg += time1; | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| timeg += time1; | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| #ifdef COMPLEX | |||
| @@ -524,21 +526,27 @@ int main(int argc, char *argv[]){ | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| timeg_c += time1; | |||
| iy = 0; | |||
| if (trans == 'N') | |||
| y_size = m; | |||
| else | |||
| y_size = n; | |||
| iy = 0; | |||
| for (i = 0; i < y_size; i++) | |||
| { | |||
| #ifdef COMPLEX | |||
| for (i = 0; i < m * 2; i++) | |||
| #else | |||
| for (i = 0; i < m; i++) | |||
| #endif | |||
| { | |||
| test &= assert_dbl_near(y[iy], y_c[iy], SINGLE_EPS); | |||
| test &= assert_dbl_near(y[iy + 1], y_c[iy + 1], SINGLE_EPS); | |||
| iy += (inc_y * 2); | |||
| #else | |||
| test &= assert_dbl_near(y[iy], y_c[iy], SINGLE_EPS); | |||
| iy += inc_y; | |||
| #endif | |||
| } | |||
| } | |||
| timeg /= loops; | |||
| timeg_c /= loops; | |||
| timeg_c /= loops; | |||
| fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD"); | |||
| @@ -567,13 +575,14 @@ int main(int argc, char *argv[]){ | |||
| for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){ | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| y_c[i]= y[i]; | |||
| y_c[i]= y[i]; | |||
| } | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| timeg += time1; | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| timeg += time1; | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| #ifdef COMPLEX | |||
| @@ -591,21 +600,27 @@ int main(int argc, char *argv[]){ | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| timeg_c += time1; | |||
| if (trans == 'N') | |||
| y_size = m; | |||
| else | |||
| y_size = n; | |||
| iy = 0; | |||
| for (i = 0; i < y_size; i++) | |||
| { | |||
| #ifdef COMPLEX | |||
| for (i = 0; i < m * 2; i++) | |||
| #else | |||
| for (i = 0; i < m; i++) | |||
| #endif | |||
| { | |||
| test &= assert_dbl_near(y[iy], y_c[iy], SINGLE_EPS); | |||
| test &= assert_dbl_near(y[iy + 1], y_c[iy + 1], SINGLE_EPS); | |||
| iy += (inc_y * 2); | |||
| #else | |||
| test &= assert_dbl_near(y[iy], y_c[iy], SINGLE_EPS); | |||
| iy += inc_y; | |||
| #endif | |||
| } | |||
| } | |||
| timeg /= loops; | |||
| timeg_c /= loops; | |||
| timeg_c /= loops; | |||
| fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD"); | |||