| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,214 +28,188 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| #define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) | |||||
| static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT amax; | |||||
| __asm__ volatile ( | |||||
| "vlef %%v0,0(%2),0 \n\t" | |||||
| "vlef %%v16,4(%2),0 \n\t" | |||||
| "vlef %%v0,8(%2),1 \n\t" | |||||
| "vlef %%v16,12(%2),1 \n\t" | |||||
| "vlef %%v0,16(%2),2 \n\t" | |||||
| "vlef %%v16,20(%2),2 \n\t" | |||||
| "vlef %%v0,24(%2),3 \n\t" | |||||
| "vlef %%v16,28(%2),3 \n\t" | |||||
| "vflpsb %%v0,%%v0 \n\t" | |||||
| "vflpsb %%v16,%%v16 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v16 \n\t" | |||||
| "vleib %%v1,0,0 \n\t" | |||||
| "vleib %%v1,1,1 \n\t" | |||||
| "vleib %%v1,2,2 \n\t" | |||||
| "vleib %%v1,3,3 \n\t" | |||||
| "vleib %%v1,8,4 \n\t" | |||||
| "vleib %%v1,9,5 \n\t" | |||||
| "vleib %%v1,10,6 \n\t" | |||||
| "vleib %%v1,11,7 \n\t" | |||||
| "vleib %%v1,16,8 \n\t" | |||||
| "vleib %%v1,17,9 \n\t" | |||||
| "vleib %%v1,18,10 \n\t" | |||||
| "vleib %%v1,19,11 \n\t" | |||||
| "vleib %%v1,24,12 \n\t" | |||||
| "vleib %%v1,25,13 \n\t" | |||||
| "vleib %%v1,26,14 \n\t" | |||||
| "vleib %%v1,27,15 \n\t" | |||||
| "srlg %%r0,%1,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v2,16(%%r1,%2) \n\t" | |||||
| "vpkg %%v17,%%v16,%%v2 \n\t" | |||||
| "vperm %%v16,%%v16,%%v2,%%v1 \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v2,48(%%r1,%2) \n\t" | |||||
| "vpkg %%v19,%%v18,%%v2 \n\t" | |||||
| "vperm %%v18,%%v18,%%v2,%%v1 \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v2,80(%%r1,%2) \n\t" | |||||
| "vpkg %%v21,%%v20,%%v2 \n\t" | |||||
| "vperm %%v20,%%v20,%%v2,%%v1 \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v2,112(%%r1,%2) \n\t" | |||||
| "vpkg %%v23,%%v22,%%v2 \n\t" | |||||
| "vperm %%v22,%%v22,%%v2,%%v1 \n\t" | |||||
| "vl %%v24,128(%%r1,%2) \n\t" | |||||
| "vl %%v2,144(%%r1,%2) \n\t" | |||||
| "vpkg %%v25,%%v24,%%v2 \n\t" | |||||
| "vperm %%v24,%%v24,%%v2,%%v1 \n\t" | |||||
| "vl %%v26,160(%%r1,%2) \n\t" | |||||
| "vl %%v2,176(%%r1,%2) \n\t" | |||||
| "vpkg %%v27,%%v26,%%v2 \n\t" | |||||
| "vperm %%v26,%%v26,%%v2,%%v1 \n\t" | |||||
| "vl %%v28,192(%%r1,%2) \n\t" | |||||
| "vl %%v2,208(%%r1,%2) \n\t" | |||||
| "vpkg %%v29,%%v28,%%v2 \n\t" | |||||
| "vperm %%v28,%%v28,%%v2,%%v1 \n\t" | |||||
| "vl %%v30,224(%%r1,%2) \n\t" | |||||
| "vl %%v2,240(%%r1,%2) \n\t" | |||||
| "vpkg %%v31,%%v30,%%v2 \n\t" | |||||
| "vperm %%v30,%%v30,%%v2,%%v1 \n\t" | |||||
| "vflpsb %%v16,%%v16 \n\t" | |||||
| "vflpsb %%v17,%%v17 \n\t" | |||||
| "vflpsb %%v18,%%v18 \n\t" | |||||
| "vflpsb %%v19,%%v19 \n\t" | |||||
| "vflpsb %%v20,%%v20 \n\t" | |||||
| "vflpsb %%v21,%%v21 \n\t" | |||||
| "vflpsb %%v22,%%v22 \n\t" | |||||
| "vflpsb %%v23,%%v23 \n\t" | |||||
| "vflpsb %%v24,%%v24 \n\t" | |||||
| "vflpsb %%v25,%%v25 \n\t" | |||||
| "vflpsb %%v26,%%v26 \n\t" | |||||
| "vflpsb %%v27,%%v27 \n\t" | |||||
| "vflpsb %%v28,%%v28 \n\t" | |||||
| "vflpsb %%v29,%%v29 \n\t" | |||||
| "vflpsb %%v30,%%v30 \n\t" | |||||
| "vflpsb %%v31,%%v31 \n\t" | |||||
| "vfasb %%v16,%%v16,%%v17 \n\t" | |||||
| "vfasb %%v18,%%v18,%%v19 \n\t" | |||||
| "vfasb %%v20,%%v20,%%v21 \n\t" | |||||
| "vfasb %%v22,%%v22,%%v23 \n\t" | |||||
| "vfasb %%v24,%%v24,%%v25 \n\t" | |||||
| "vfasb %%v26,%%v26,%%v27 \n\t" | |||||
| "vfasb %%v28,%%v28,%%v29 \n\t" | |||||
| "vfasb %%v30,%%v30,%%v31 \n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v24,0 \n\t" | |||||
| "vfmaxsb %%v18,%%v18,%%v26,0 \n\t" | |||||
| "vfmaxsb %%v20,%%v20,%%v28,0 \n\t" | |||||
| "vfmaxsb %%v22,%%v22,%%v30,0 \n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v20,0 \n\t" | |||||
| "vfmaxsb %%v18,%%v18,%%v22,0 \n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v18,0 \n\t" | |||||
| "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "veslg %%v16,%%v0,32 \n\t" | |||||
| "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "vrepf %%v16,%%v0,2 \n\t" | |||||
| "wfmaxsb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "ler %0,%%f0 " | |||||
| :"=f"(amax) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return amax; | |||||
| #define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) | |||||
| static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| FLOAT amax; | |||||
| __asm__("vlef %%v0,0(%[x]),0\n\t" | |||||
| "vlef %%v16,4(%[x]),0\n\t" | |||||
| "vlef %%v0,8(%[x]),1\n\t" | |||||
| "vlef %%v16,12(%[x]),1\n\t" | |||||
| "vlef %%v0,16(%[x]),2\n\t" | |||||
| "vlef %%v16,20(%[x]),2\n\t" | |||||
| "vlef %%v0,24(%[x]),3\n\t" | |||||
| "vlef %%v16,28(%[x]),3\n\t" | |||||
| "vflpsb %%v0,%%v0\n\t" | |||||
| "vflpsb %%v16,%%v16\n\t" | |||||
| "vfasb %%v0,%%v0,%%v16\n\t" | |||||
| "vleib %%v1,0,0\n\t" | |||||
| "vleib %%v1,1,1\n\t" | |||||
| "vleib %%v1,2,2\n\t" | |||||
| "vleib %%v1,3,3\n\t" | |||||
| "vleib %%v1,8,4\n\t" | |||||
| "vleib %%v1,9,5\n\t" | |||||
| "vleib %%v1,10,6\n\t" | |||||
| "vleib %%v1,11,7\n\t" | |||||
| "vleib %%v1,16,8\n\t" | |||||
| "vleib %%v1,17,9\n\t" | |||||
| "vleib %%v1,18,10\n\t" | |||||
| "vleib %%v1,19,11\n\t" | |||||
| "vleib %%v1,24,12\n\t" | |||||
| "vleib %%v1,25,13\n\t" | |||||
| "vleib %%v1,26,14\n\t" | |||||
| "vleib %%v1,27,15\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v2,16(%%r1,%[x])\n\t" | |||||
| "vpkg %%v17,%%v16,%%v2\n\t" | |||||
| "vperm %%v16,%%v16,%%v2,%%v1\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v2,48(%%r1,%[x])\n\t" | |||||
| "vpkg %%v19,%%v18,%%v2\n\t" | |||||
| "vperm %%v18,%%v18,%%v2,%%v1\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v2,80(%%r1,%[x])\n\t" | |||||
| "vpkg %%v21,%%v20,%%v2\n\t" | |||||
| "vperm %%v20,%%v20,%%v2,%%v1\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v2,112(%%r1,%[x])\n\t" | |||||
| "vpkg %%v23,%%v22,%%v2\n\t" | |||||
| "vperm %%v22,%%v22,%%v2,%%v1\n\t" | |||||
| "vl %%v24,128(%%r1,%[x])\n\t" | |||||
| "vl %%v2,144(%%r1,%[x])\n\t" | |||||
| "vpkg %%v25,%%v24,%%v2\n\t" | |||||
| "vperm %%v24,%%v24,%%v2,%%v1\n\t" | |||||
| "vl %%v26,160(%%r1,%[x])\n\t" | |||||
| "vl %%v2,176(%%r1,%[x])\n\t" | |||||
| "vpkg %%v27,%%v26,%%v2\n\t" | |||||
| "vperm %%v26,%%v26,%%v2,%%v1\n\t" | |||||
| "vl %%v28,192(%%r1,%[x])\n\t" | |||||
| "vl %%v2,208(%%r1,%[x])\n\t" | |||||
| "vpkg %%v29,%%v28,%%v2\n\t" | |||||
| "vperm %%v28,%%v28,%%v2,%%v1\n\t" | |||||
| "vl %%v30,224(%%r1,%[x])\n\t" | |||||
| "vl %%v2,240(%%r1,%[x])\n\t" | |||||
| "vpkg %%v31,%%v30,%%v2\n\t" | |||||
| "vperm %%v30,%%v30,%%v2,%%v1\n\t" | |||||
| "vflpsb %%v16,%%v16\n\t" | |||||
| "vflpsb %%v17,%%v17\n\t" | |||||
| "vflpsb %%v18,%%v18\n\t" | |||||
| "vflpsb %%v19,%%v19\n\t" | |||||
| "vflpsb %%v20,%%v20\n\t" | |||||
| "vflpsb %%v21,%%v21\n\t" | |||||
| "vflpsb %%v22,%%v22\n\t" | |||||
| "vflpsb %%v23,%%v23\n\t" | |||||
| "vflpsb %%v24,%%v24\n\t" | |||||
| "vflpsb %%v25,%%v25\n\t" | |||||
| "vflpsb %%v26,%%v26\n\t" | |||||
| "vflpsb %%v27,%%v27\n\t" | |||||
| "vflpsb %%v28,%%v28\n\t" | |||||
| "vflpsb %%v29,%%v29\n\t" | |||||
| "vflpsb %%v30,%%v30\n\t" | |||||
| "vflpsb %%v31,%%v31\n\t" | |||||
| "vfasb %%v16,%%v16,%%v17\n\t" | |||||
| "vfasb %%v18,%%v18,%%v19\n\t" | |||||
| "vfasb %%v20,%%v20,%%v21\n\t" | |||||
| "vfasb %%v22,%%v22,%%v23\n\t" | |||||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||||
| "vfasb %%v26,%%v26,%%v27\n\t" | |||||
| "vfasb %%v28,%%v28,%%v29\n\t" | |||||
| "vfasb %%v30,%%v30,%%v31\n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v24,0\n\t" | |||||
| "vfmaxsb %%v18,%%v18,%%v26,0\n\t" | |||||
| "vfmaxsb %%v20,%%v20,%%v28,0\n\t" | |||||
| "vfmaxsb %%v22,%%v22,%%v30,0\n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v20,0\n\t" | |||||
| "vfmaxsb %%v18,%%v18,%%v22,0\n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v18,0\n\t" | |||||
| "vfmaxsb %%v0,%%v0,%%v16,0\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "veslg %%v16,%%v0,32\n\t" | |||||
| "vfmaxsb %%v0,%%v0,%%v16,0\n\t" | |||||
| "vrepf %%v16,%%v0,2\n\t" | |||||
| "wfmaxsb %%v0,%%v0,%%v16,0\n\t" | |||||
| "ler %[amax],%%f0" | |||||
| : [amax] "=f"(amax),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", | |||||
| "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", | |||||
| "v31"); | |||||
| return amax; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | ||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) return (maxf); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| maxf = camax_kernel_32(n1, x); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| maxf=CABS1(x,0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x,ix) > maxf) { | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (maxf); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| maxf = camax_kernel_32(n1, x); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| maxf = CABS1(x, 0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) > maxf) { | |||||
| maxf = CABS1(x, ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| maxf=CABS1(x,0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| maxf = CABS1(x, 0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| if (CABS1(x,ix) > maxf) { | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2) > maxf) { | |||||
| maxf = CABS1(x,ix+inc_x2); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2*2) > maxf) { | |||||
| maxf = CABS1(x,ix+inc_x2*2); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2*3) > maxf) { | |||||
| maxf = CABS1(x,ix+inc_x2*3); | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| ix += inc_x2 * 4; | |||||
| if (CABS1(x, ix) > maxf) { | |||||
| maxf = CABS1(x, ix); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2) > maxf) { | |||||
| maxf = CABS1(x, ix + inc_x2); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2 * 2) > maxf) { | |||||
| maxf = CABS1(x, ix + inc_x2 * 2); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2 * 3) > maxf) { | |||||
| maxf = CABS1(x, ix + inc_x2 * 3); | |||||
| } | |||||
| i += 4; | |||||
| ix += inc_x2 * 4; | |||||
| } | |||||
| i += 4; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x,ix) > maxf) { | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) > maxf) { | |||||
| maxf = CABS1(x, ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | } | ||||
| return (maxf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,214 +28,188 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| #define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) | |||||
| static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT amin; | |||||
| __asm__ volatile ( | |||||
| "vlef %%v0,0(%2),0 \n\t" | |||||
| "vlef %%v16,4(%2),0 \n\t" | |||||
| "vlef %%v0,8(%2),1 \n\t" | |||||
| "vlef %%v16,12(%2),1 \n\t" | |||||
| "vlef %%v0,16(%2),2 \n\t" | |||||
| "vlef %%v16,20(%2),2 \n\t" | |||||
| "vlef %%v0,24(%2),3 \n\t" | |||||
| "vlef %%v16,28(%2),3 \n\t" | |||||
| "vflpsb %%v0,%%v0 \n\t" | |||||
| "vflpsb %%v16,%%v16 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v16 \n\t" | |||||
| "vleib %%v1,0,0 \n\t" | |||||
| "vleib %%v1,1,1 \n\t" | |||||
| "vleib %%v1,2,2 \n\t" | |||||
| "vleib %%v1,3,3 \n\t" | |||||
| "vleib %%v1,8,4 \n\t" | |||||
| "vleib %%v1,9,5 \n\t" | |||||
| "vleib %%v1,10,6 \n\t" | |||||
| "vleib %%v1,11,7 \n\t" | |||||
| "vleib %%v1,16,8 \n\t" | |||||
| "vleib %%v1,17,9 \n\t" | |||||
| "vleib %%v1,18,10 \n\t" | |||||
| "vleib %%v1,19,11 \n\t" | |||||
| "vleib %%v1,24,12 \n\t" | |||||
| "vleib %%v1,25,13 \n\t" | |||||
| "vleib %%v1,26,14 \n\t" | |||||
| "vleib %%v1,27,15 \n\t" | |||||
| "srlg %%r0,%1,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v2,16(%%r1,%2) \n\t" | |||||
| "vpkg %%v17,%%v16,%%v2 \n\t" | |||||
| "vperm %%v16,%%v16,%%v2,%%v1 \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v2,48(%%r1,%2) \n\t" | |||||
| "vpkg %%v19,%%v18,%%v2 \n\t" | |||||
| "vperm %%v18,%%v18,%%v2,%%v1 \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v2,80(%%r1,%2) \n\t" | |||||
| "vpkg %%v21,%%v20,%%v2 \n\t" | |||||
| "vperm %%v20,%%v20,%%v2,%%v1 \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v2,112(%%r1,%2) \n\t" | |||||
| "vpkg %%v23,%%v22,%%v2 \n\t" | |||||
| "vperm %%v22,%%v22,%%v2,%%v1 \n\t" | |||||
| "vl %%v24,128(%%r1,%2) \n\t" | |||||
| "vl %%v2,144(%%r1,%2) \n\t" | |||||
| "vpkg %%v25,%%v24,%%v2 \n\t" | |||||
| "vperm %%v24,%%v24,%%v2,%%v1 \n\t" | |||||
| "vl %%v26,160(%%r1,%2) \n\t" | |||||
| "vl %%v2,176(%%r1,%2) \n\t" | |||||
| "vpkg %%v27,%%v26,%%v2 \n\t" | |||||
| "vperm %%v26,%%v26,%%v2,%%v1 \n\t" | |||||
| "vl %%v28,192(%%r1,%2) \n\t" | |||||
| "vl %%v2,208(%%r1,%2) \n\t" | |||||
| "vpkg %%v29,%%v28,%%v2 \n\t" | |||||
| "vperm %%v28,%%v28,%%v2,%%v1 \n\t" | |||||
| "vl %%v30,224(%%r1,%2) \n\t" | |||||
| "vl %%v2,240(%%r1,%2) \n\t" | |||||
| "vpkg %%v31,%%v30,%%v2 \n\t" | |||||
| "vperm %%v30,%%v30,%%v2,%%v1 \n\t" | |||||
| "vflpsb %%v16,%%v16 \n\t" | |||||
| "vflpsb %%v17,%%v17 \n\t" | |||||
| "vflpsb %%v18,%%v18 \n\t" | |||||
| "vflpsb %%v19,%%v19 \n\t" | |||||
| "vflpsb %%v20,%%v20 \n\t" | |||||
| "vflpsb %%v21,%%v21 \n\t" | |||||
| "vflpsb %%v22,%%v22 \n\t" | |||||
| "vflpsb %%v23,%%v23 \n\t" | |||||
| "vflpsb %%v24,%%v24 \n\t" | |||||
| "vflpsb %%v25,%%v25 \n\t" | |||||
| "vflpsb %%v26,%%v26 \n\t" | |||||
| "vflpsb %%v27,%%v27 \n\t" | |||||
| "vflpsb %%v28,%%v28 \n\t" | |||||
| "vflpsb %%v29,%%v29 \n\t" | |||||
| "vflpsb %%v30,%%v30 \n\t" | |||||
| "vflpsb %%v31,%%v31 \n\t" | |||||
| "vfasb %%v16,%%v16,%%v17 \n\t" | |||||
| "vfasb %%v18,%%v18,%%v19 \n\t" | |||||
| "vfasb %%v20,%%v20,%%v21 \n\t" | |||||
| "vfasb %%v22,%%v22,%%v23 \n\t" | |||||
| "vfasb %%v24,%%v24,%%v25 \n\t" | |||||
| "vfasb %%v26,%%v26,%%v27 \n\t" | |||||
| "vfasb %%v28,%%v28,%%v29 \n\t" | |||||
| "vfasb %%v30,%%v30,%%v31 \n\t" | |||||
| "vfminsb %%v16,%%v16,%%v24,0 \n\t" | |||||
| "vfminsb %%v18,%%v18,%%v26,0 \n\t" | |||||
| "vfminsb %%v20,%%v20,%%v28,0 \n\t" | |||||
| "vfminsb %%v22,%%v22,%%v30,0 \n\t" | |||||
| "vfminsb %%v16,%%v16,%%v20,0 \n\t" | |||||
| "vfminsb %%v18,%%v18,%%v22,0 \n\t" | |||||
| "vfminsb %%v16,%%v16,%%v18,0 \n\t" | |||||
| "vfminsb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "veslg %%v16,%%v0,32 \n\t" | |||||
| "vfminsb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "vrepf %%v16,%%v0,2 \n\t" | |||||
| "wfminsb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "ler %0,%%f0 " | |||||
| :"=f"(amin) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return amin; | |||||
| #define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) | |||||
| static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| FLOAT amin; | |||||
| __asm__("vlef %%v0,0(%[x]),0\n\t" | |||||
| "vlef %%v16,4(%[x]),0\n\t" | |||||
| "vlef %%v0,8(%[x]),1\n\t" | |||||
| "vlef %%v16,12(%[x]),1\n\t" | |||||
| "vlef %%v0,16(%[x]),2\n\t" | |||||
| "vlef %%v16,20(%[x]),2\n\t" | |||||
| "vlef %%v0,24(%[x]),3\n\t" | |||||
| "vlef %%v16,28(%[x]),3\n\t" | |||||
| "vflpsb %%v0,%%v0\n\t" | |||||
| "vflpsb %%v16,%%v16\n\t" | |||||
| "vfasb %%v0,%%v0,%%v16\n\t" | |||||
| "vleib %%v1,0,0\n\t" | |||||
| "vleib %%v1,1,1\n\t" | |||||
| "vleib %%v1,2,2\n\t" | |||||
| "vleib %%v1,3,3\n\t" | |||||
| "vleib %%v1,8,4\n\t" | |||||
| "vleib %%v1,9,5\n\t" | |||||
| "vleib %%v1,10,6\n\t" | |||||
| "vleib %%v1,11,7\n\t" | |||||
| "vleib %%v1,16,8\n\t" | |||||
| "vleib %%v1,17,9\n\t" | |||||
| "vleib %%v1,18,10\n\t" | |||||
| "vleib %%v1,19,11\n\t" | |||||
| "vleib %%v1,24,12\n\t" | |||||
| "vleib %%v1,25,13\n\t" | |||||
| "vleib %%v1,26,14\n\t" | |||||
| "vleib %%v1,27,15\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v2,16(%%r1,%[x])\n\t" | |||||
| "vpkg %%v17,%%v16,%%v2\n\t" | |||||
| "vperm %%v16,%%v16,%%v2,%%v1\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v2,48(%%r1,%[x])\n\t" | |||||
| "vpkg %%v19,%%v18,%%v2\n\t" | |||||
| "vperm %%v18,%%v18,%%v2,%%v1\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v2,80(%%r1,%[x])\n\t" | |||||
| "vpkg %%v21,%%v20,%%v2\n\t" | |||||
| "vperm %%v20,%%v20,%%v2,%%v1\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v2,112(%%r1,%[x])\n\t" | |||||
| "vpkg %%v23,%%v22,%%v2\n\t" | |||||
| "vperm %%v22,%%v22,%%v2,%%v1\n\t" | |||||
| "vl %%v24,128(%%r1,%[x])\n\t" | |||||
| "vl %%v2,144(%%r1,%[x])\n\t" | |||||
| "vpkg %%v25,%%v24,%%v2\n\t" | |||||
| "vperm %%v24,%%v24,%%v2,%%v1\n\t" | |||||
| "vl %%v26,160(%%r1,%[x])\n\t" | |||||
| "vl %%v2,176(%%r1,%[x])\n\t" | |||||
| "vpkg %%v27,%%v26,%%v2\n\t" | |||||
| "vperm %%v26,%%v26,%%v2,%%v1\n\t" | |||||
| "vl %%v28,192(%%r1,%[x])\n\t" | |||||
| "vl %%v2,208(%%r1,%[x])\n\t" | |||||
| "vpkg %%v29,%%v28,%%v2\n\t" | |||||
| "vperm %%v28,%%v28,%%v2,%%v1\n\t" | |||||
| "vl %%v30,224(%%r1,%[x])\n\t" | |||||
| "vl %%v2,240(%%r1,%[x])\n\t" | |||||
| "vpkg %%v31,%%v30,%%v2\n\t" | |||||
| "vperm %%v30,%%v30,%%v2,%%v1\n\t" | |||||
| "vflpsb %%v16,%%v16\n\t" | |||||
| "vflpsb %%v17,%%v17\n\t" | |||||
| "vflpsb %%v18,%%v18\n\t" | |||||
| "vflpsb %%v19,%%v19\n\t" | |||||
| "vflpsb %%v20,%%v20\n\t" | |||||
| "vflpsb %%v21,%%v21\n\t" | |||||
| "vflpsb %%v22,%%v22\n\t" | |||||
| "vflpsb %%v23,%%v23\n\t" | |||||
| "vflpsb %%v24,%%v24\n\t" | |||||
| "vflpsb %%v25,%%v25\n\t" | |||||
| "vflpsb %%v26,%%v26\n\t" | |||||
| "vflpsb %%v27,%%v27\n\t" | |||||
| "vflpsb %%v28,%%v28\n\t" | |||||
| "vflpsb %%v29,%%v29\n\t" | |||||
| "vflpsb %%v30,%%v30\n\t" | |||||
| "vflpsb %%v31,%%v31\n\t" | |||||
| "vfasb %%v16,%%v16,%%v17\n\t" | |||||
| "vfasb %%v18,%%v18,%%v19\n\t" | |||||
| "vfasb %%v20,%%v20,%%v21\n\t" | |||||
| "vfasb %%v22,%%v22,%%v23\n\t" | |||||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||||
| "vfasb %%v26,%%v26,%%v27\n\t" | |||||
| "vfasb %%v28,%%v28,%%v29\n\t" | |||||
| "vfasb %%v30,%%v30,%%v31\n\t" | |||||
| "vfminsb %%v16,%%v16,%%v24,0\n\t" | |||||
| "vfminsb %%v18,%%v18,%%v26,0\n\t" | |||||
| "vfminsb %%v20,%%v20,%%v28,0\n\t" | |||||
| "vfminsb %%v22,%%v22,%%v30,0\n\t" | |||||
| "vfminsb %%v16,%%v16,%%v20,0\n\t" | |||||
| "vfminsb %%v18,%%v18,%%v22,0\n\t" | |||||
| "vfminsb %%v16,%%v16,%%v18,0\n\t" | |||||
| "vfminsb %%v0,%%v0,%%v16,0\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "veslg %%v16,%%v0,32\n\t" | |||||
| "vfminsb %%v0,%%v0,%%v16,0\n\t" | |||||
| "vrepf %%v16,%%v0,2\n\t" | |||||
| "wfminsb %%v0,%%v0,%%v16,0\n\t" | |||||
| "ler %[amin],%%f0" | |||||
| : [amin] "=f"(amin),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", | |||||
| "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", | |||||
| "v31"); | |||||
| return amin; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | ||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) return (minf); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| minf = camin_kernel_32(n1, x); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| minf=CABS1(x,0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x,ix) < minf) { | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (minf); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| minf = camin_kernel_32(n1, x); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| minf = CABS1(x, 0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) < minf) { | |||||
| minf = CABS1(x, ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| minf=CABS1(x,0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| minf = CABS1(x, 0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| if (CABS1(x,ix) < minf) { | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2) < minf) { | |||||
| minf = CABS1(x,ix+inc_x2); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2*2) < minf) { | |||||
| minf = CABS1(x,ix+inc_x2*2); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2*3) < minf) { | |||||
| minf = CABS1(x,ix+inc_x2*3); | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| ix += inc_x2 * 4; | |||||
| if (CABS1(x, ix) < minf) { | |||||
| minf = CABS1(x, ix); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2) < minf) { | |||||
| minf = CABS1(x, ix + inc_x2); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2 * 2) < minf) { | |||||
| minf = CABS1(x, ix + inc_x2 * 2); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2 * 3) < minf) { | |||||
| minf = CABS1(x, ix + inc_x2 * 3); | |||||
| } | |||||
| i += 4; | |||||
| ix += inc_x2 * 4; | |||||
| } | |||||
| i += 4; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x,ix) < minf) { | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) < minf) { | |||||
| minf = CABS1(x, ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | } | ||||
| return (minf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,140 +28,128 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | #define ABS fabsf | ||||
| #endif | |||||
| static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT asum; | |||||
| __asm__ ( | |||||
| "vzero %%v0 \n\t" | |||||
| "vzero %%v1 \n\t" | |||||
| "vzero %%v2 \n\t" | |||||
| "vzero %%v3 \n\t" | |||||
| "srlg %%r0,%1,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vl %%v20, 64(%%r1,%2) \n\t" | |||||
| "vl %%v21, 80(%%r1,%2) \n\t" | |||||
| "vl %%v22, 96(%%r1,%2) \n\t" | |||||
| "vl %%v23, 112(%%r1,%2) \n\t" | |||||
| "vflpsb %%v16, %%v16 \n\t" | |||||
| "vflpsb %%v17, %%v17 \n\t" | |||||
| "vflpsb %%v18, %%v18 \n\t" | |||||
| "vflpsb %%v19, %%v19 \n\t" | |||||
| "vflpsb %%v20, %%v20 \n\t" | |||||
| "vflpsb %%v21, %%v21 \n\t" | |||||
| "vflpsb %%v22, %%v22 \n\t" | |||||
| "vflpsb %%v23, %%v23 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v16 \n\t" | |||||
| "vfasb %%v1,%%v1,%%v17 \n\t" | |||||
| "vfasb %%v2,%%v2,%%v18 \n\t" | |||||
| "vfasb %%v3,%%v3,%%v19 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v20 \n\t" | |||||
| "vfasb %%v1,%%v1,%%v21 \n\t" | |||||
| "vfasb %%v2,%%v2,%%v22 \n\t" | |||||
| "vfasb %%v3,%%v3,%%v23 \n\t" | |||||
| "vl %%v16, 128(%%r1,%2) \n\t" | |||||
| "vl %%v17, 144(%%r1,%2) \n\t" | |||||
| "vl %%v18, 160(%%r1,%2) \n\t" | |||||
| "vl %%v19, 176(%%r1,%2) \n\t" | |||||
| "vl %%v20, 192(%%r1,%2) \n\t" | |||||
| "vl %%v21, 208(%%r1,%2) \n\t" | |||||
| "vl %%v22, 224(%%r1,%2) \n\t" | |||||
| "vl %%v23, 240(%%r1,%2) \n\t" | |||||
| "vflpsb %%v16, %%v16 \n\t" | |||||
| "vflpsb %%v17, %%v17 \n\t" | |||||
| "vflpsb %%v18, %%v18 \n\t" | |||||
| "vflpsb %%v19, %%v19 \n\t" | |||||
| "vflpsb %%v20, %%v20 \n\t" | |||||
| "vflpsb %%v21, %%v21 \n\t" | |||||
| "vflpsb %%v22, %%v22 \n\t" | |||||
| "vflpsb %%v23, %%v23 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v16 \n\t" | |||||
| "vfasb %%v1,%%v1,%%v17 \n\t" | |||||
| "vfasb %%v2,%%v2,%%v18 \n\t" | |||||
| "vfasb %%v3,%%v3,%%v19 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v20 \n\t" | |||||
| "vfasb %%v1,%%v1,%%v21 \n\t" | |||||
| "vfasb %%v2,%%v2,%%v22 \n\t" | |||||
| "vfasb %%v3,%%v3,%%v23 \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "brctg %%r0,0b \n\t" | |||||
| "vfasb %%v0,%%v0,%%v1 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v2 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v3 \n\t" | |||||
| "veslg %%v1,%%v0,32 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v1 \n\t" | |||||
| "vrepf %%v1,%%v0,2 \n\t" | |||||
| "aebr %%f0,%%f1 \n\t" | |||||
| "ler %0,%%f0 " | |||||
| :"=f"(asum) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" | |||||
| ); | |||||
| return asum; | |||||
| static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| FLOAT asum; | |||||
| __asm__("vzero %%v24\n\t" | |||||
| "vzero %%v25\n\t" | |||||
| "vzero %%v26\n\t" | |||||
| "vzero %%v27\n\t" | |||||
| "vzero %%v28\n\t" | |||||
| "vzero %%v29\n\t" | |||||
| "vzero %%v30\n\t" | |||||
| "vzero %%v31\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 0(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 64(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 112(%%r1,%[x])\n\t" | |||||
| "vflpsb %%v16, %%v16\n\t" | |||||
| "vflpsb %%v17, %%v17\n\t" | |||||
| "vflpsb %%v18, %%v18\n\t" | |||||
| "vflpsb %%v19, %%v19\n\t" | |||||
| "vflpsb %%v20, %%v20\n\t" | |||||
| "vflpsb %%v21, %%v21\n\t" | |||||
| "vflpsb %%v22, %%v22\n\t" | |||||
| "vflpsb %%v23, %%v23\n\t" | |||||
| "vfasb %%v24,%%v24,%%v16\n\t" | |||||
| "vfasb %%v25,%%v25,%%v17\n\t" | |||||
| "vfasb %%v26,%%v26,%%v18\n\t" | |||||
| "vfasb %%v27,%%v27,%%v19\n\t" | |||||
| "vfasb %%v28,%%v28,%%v20\n\t" | |||||
| "vfasb %%v29,%%v29,%%v21\n\t" | |||||
| "vfasb %%v30,%%v30,%%v22\n\t" | |||||
| "vfasb %%v31,%%v31,%%v23\n\t" | |||||
| "vl %%v16, 128(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 144(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 160(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 176(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 192(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 208(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 224(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 240(%%r1,%[x])\n\t" | |||||
| "vflpsb %%v16, %%v16\n\t" | |||||
| "vflpsb %%v17, %%v17\n\t" | |||||
| "vflpsb %%v18, %%v18\n\t" | |||||
| "vflpsb %%v19, %%v19\n\t" | |||||
| "vflpsb %%v20, %%v20\n\t" | |||||
| "vflpsb %%v21, %%v21\n\t" | |||||
| "vflpsb %%v22, %%v22\n\t" | |||||
| "vflpsb %%v23, %%v23\n\t" | |||||
| "vfasb %%v24,%%v24,%%v16\n\t" | |||||
| "vfasb %%v25,%%v25,%%v17\n\t" | |||||
| "vfasb %%v26,%%v26,%%v18\n\t" | |||||
| "vfasb %%v27,%%v27,%%v19\n\t" | |||||
| "vfasb %%v28,%%v28,%%v20\n\t" | |||||
| "vfasb %%v29,%%v29,%%v21\n\t" | |||||
| "vfasb %%v30,%%v30,%%v22\n\t" | |||||
| "vfasb %%v31,%%v31,%%v23\n\t" | |||||
| "agfi %%r1,256\n\t" | |||||
| "brctg %[n],0b\n\t" | |||||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||||
| "vfasb %%v24,%%v24,%%v26\n\t" | |||||
| "vfasb %%v24,%%v24,%%v27\n\t" | |||||
| "vfasb %%v24,%%v24,%%v28\n\t" | |||||
| "vfasb %%v24,%%v24,%%v29\n\t" | |||||
| "vfasb %%v24,%%v24,%%v30\n\t" | |||||
| "vfasb %%v24,%%v24,%%v31\n\t" | |||||
| "veslg %%v25,%%v24,32\n\t" | |||||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||||
| "vrepf %%v25,%%v24,2\n\t" | |||||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||||
| "vstef %%v24,%[asum],0" | |||||
| : [asum] "=m"(asum),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||||
| "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return asum; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ip=0; | |||||
| FLOAT sumf = 0.0; | |||||
| BLASLONG n1; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ip = 0; | |||||
| FLOAT sumf = 0.0; | |||||
| BLASLONG n1; | |||||
| BLASLONG inc_x2; | |||||
| if ( inc_x == 1 ) | |||||
| { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (sumf); | |||||
| n1 = n & -32; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| if (inc_x == 1) { | |||||
| sumf = casum_kernel_32(n1, x); | |||||
| i=n1; | |||||
| ip=2*n1; | |||||
| } | |||||
| n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| while(i < n) | |||||
| { | |||||
| sumf += ABS(x[ip]) + ABS(x[ip+1]); | |||||
| i++; | |||||
| ip+=2; | |||||
| } | |||||
| sumf = casum_kernel_32(n1, x); | |||||
| i = n1; | |||||
| ip = 2 * n1; | |||||
| } | |||||
| while (i < n) { | |||||
| sumf += ABS(x[ip]) + ABS(x[ip + 1]); | |||||
| i++; | |||||
| ip += 2; | |||||
| } | } | ||||
| else | |||||
| { | |||||
| inc_x2 = 2* inc_x; | |||||
| while(i < n) | |||||
| { | |||||
| sumf += ABS(x[ip]) + ABS(x[ip+1]); | |||||
| ip+=inc_x2; | |||||
| i++; | |||||
| } | |||||
| } else { | |||||
| inc_x2 = 2 * inc_x; | |||||
| while (i < n) { | |||||
| sumf += ABS(x[ip]) + ABS(x[ip + 1]); | |||||
| ip += inc_x2; | |||||
| i++; | |||||
| } | } | ||||
| return(sumf); | |||||
| } | |||||
| } | |||||
| return (sumf); | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,148 +27,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| __asm__ volatile( | |||||
| static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { | |||||
| __asm__( | |||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| "vlrepf %%v0,0(%3) \n\t" | |||||
| "vlef %%v1,4(%3),0 \n\t" | |||||
| "vlef %%v1,4(%3),2 \n\t" | |||||
| "vflcsb %%v1,%%v1 \n\t" | |||||
| "vlef %%v1,4(%3),1 \n\t" | |||||
| "vlef %%v1,4(%3),3 \n\t" | |||||
| #else | |||||
| "vlef %%v0,0(%3),1 \n\t" | |||||
| "vlef %%v0,0(%3),3 \n\t" | |||||
| "vflcsb %%v0,%%v0 \n\t" | |||||
| "vlef %%v0,0(%3),0 \n\t" | |||||
| "vlef %%v0,0(%3),2 \n\t" | |||||
| "vlrepf %%v1,4(%3) \n\t" | |||||
| "vlrepf %%v0,0(%[alpha])\n\t" | |||||
| "vlef %%v1,4(%[alpha]),0\n\t" | |||||
| "vlef %%v1,4(%[alpha]),2\n\t" | |||||
| "vflcsb %%v1,%%v1\n\t" | |||||
| "vlef %%v1,4(%[alpha]),1\n\t" | |||||
| "vlef %%v1,4(%[alpha]),3\n\t" | |||||
| #else | |||||
| "vlef %%v0,0(%[alpha]),1\n\t" | |||||
| "vlef %%v0,0(%[alpha]),3\n\t" | |||||
| "vflcsb %%v0,%%v0\n\t" | |||||
| "vlef %%v0,0(%[alpha]),0\n\t" | |||||
| "vlef %%v0,0(%[alpha]),2\n\t" | |||||
| "vlrepf %%v1,4(%[alpha])\n\t" | |||||
| #endif | #endif | ||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%1) \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%1) \n\t" | |||||
| "vl %%v17,16(%%r1,%1) \n\t" | |||||
| "vl %%v18,32(%%r1,%1) \n\t" | |||||
| "vl %%v19,48(%%r1,%1) \n\t" | |||||
| "vl %%v20,0(%%r1,%2) \n\t" | |||||
| "vl %%v21,16(%%r1,%2) \n\t" | |||||
| "vl %%v22,32(%%r1,%2) \n\t" | |||||
| "vl %%v23,48(%%r1,%2) \n\t" | |||||
| "verllg %%v24,%%v16,32 \n\t" | |||||
| "verllg %%v25,%%v17,32 \n\t" | |||||
| "verllg %%v26,%%v18,32 \n\t" | |||||
| "verllg %%v27,%%v19,32 \n\t" | |||||
| "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" | |||||
| "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" | |||||
| "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" | |||||
| "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" | |||||
| "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" | |||||
| "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" | |||||
| "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" | |||||
| "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" | |||||
| "vst %%v28,0(%%r1,%2) \n\t" | |||||
| "vst %%v29,16(%%r1,%2) \n\t" | |||||
| "vst %%v30,32(%%r1,%2) \n\t" | |||||
| "vst %%v31,48(%%r1,%2) \n\t" | |||||
| "vl %%v16,64(%%r1,%1) \n\t" | |||||
| "vl %%v17,80(%%r1,%1) \n\t" | |||||
| "vl %%v18,96(%%r1,%1) \n\t" | |||||
| "vl %%v19,112(%%r1,%1) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "verllg %%v24,%%v16,32 \n\t" | |||||
| "verllg %%v25,%%v17,32 \n\t" | |||||
| "verllg %%v26,%%v18,32 \n\t" | |||||
| "verllg %%v27,%%v19,32 \n\t" | |||||
| "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" | |||||
| "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" | |||||
| "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" | |||||
| "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" | |||||
| "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" | |||||
| "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" | |||||
| "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" | |||||
| "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" | |||||
| "vst %%v28,64(%%r1,%2) \n\t" | |||||
| "vst %%v29,80(%%r1,%2) \n\t" | |||||
| "vst %%v30,96(%%r1,%2) \n\t" | |||||
| "vst %%v31,112(%%r1,%2) \n\t" | |||||
| "agfi %%r1,128 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) | |||||
| :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| "srlg %[n],%[n],4\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "pfd 2, 1024(%%r1,%[y])\n\t" | |||||
| "vl %%v8,0(%%r1,%[x])\n\t" | |||||
| "vl %%v9,16(%%r1,%[x])\n\t" | |||||
| "vl %%v10,32(%%r1,%[x])\n\t" | |||||
| "vl %%v11,48(%%r1,%[x])\n\t" | |||||
| "vl %%v12,0(%%r1,%[y])\n\t" | |||||
| "vl %%v13,16(%%r1,%[y])\n\t" | |||||
| "vl %%v14,32(%%r1,%[y])\n\t" | |||||
| "vl %%v15,48(%%r1,%[y])\n\t" | |||||
| "vl %%v16,64(%%r1,%[x])\n\t" | |||||
| "vl %%v17,80(%%r1,%[x])\n\t" | |||||
| "vl %%v18,96(%%r1,%[x])\n\t" | |||||
| "vl %%v19,112(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[y])\n\t" | |||||
| "vl %%v21,80(%%r1,%[y])\n\t" | |||||
| "vl %%v22,96(%%r1,%[y])\n\t" | |||||
| "vl %%v23,112(%%r1,%[y])\n\t" | |||||
| "vfmasb %%v8,%%v8,%%v0,%%v12\n\t" | |||||
| "vfmasb %%v9,%%v9,%%v0,%%v13\n\t" | |||||
| "vfmasb %%v10,%%v10,%%v0,%%v14\n\t" | |||||
| "vfmasb %%v11,%%v11,%%v0,%%v15\n\t" | |||||
| "vfmasb %%v16,%%v16,%%v0,%%v20\n\t" | |||||
| "vfmasb %%v17,%%v17,%%v0,%%v21\n\t" | |||||
| "vfmasb %%v18,%%v18,%%v0,%%v22\n\t" | |||||
| "vfmasb %%v19,%%v19,%%v0,%%v23\n\t" | |||||
| "vfmasb %%v8,%%v24,%%v1,%%v8\n\t" | |||||
| "vfmasb %%v9,%%v25,%%v1,%%v9\n\t" | |||||
| "vfmasb %%v10,%%v26,%%v1,%%v10\n\t" | |||||
| "vfmasb %%v11,%%v27,%%v1,%%v11\n\t" | |||||
| "vfmasb %%v16,%%v28,%%v1,%%v16\n\t" | |||||
| "vfmasb %%v17,%%v29,%%v1,%%v17\n\t" | |||||
| "vfmasb %%v18,%%v30,%%v1,%%v18\n\t" | |||||
| "vfmasb %%v19,%%v31,%%v1,%%v19\n\t" | |||||
| "vst %%v8,0(%%r1,%[y])\n\t" | |||||
| "vst %%v9,16(%%r1,%[y])\n\t" | |||||
| "vst %%v10,32(%%r1,%[y])\n\t" | |||||
| "vst %%v11,48(%%r1,%[y])\n\t" | |||||
| "vst %%v16,64(%%r1,%[y])\n\t" | |||||
| "vst %%v17,80(%%r1,%[y])\n\t" | |||||
| "vst %%v18,96(%%r1,%[y])\n\t" | |||||
| "vst %%v19,112(%%r1,%[y])\n\t" | |||||
| "agfi %%r1,128\n\t" | |||||
| "brctg %[n],0b" | |||||
| : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) | |||||
| : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), | |||||
| "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) | |||||
| : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", | |||||
| "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||||
| "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| FLOAT da[2] __attribute__ ((aligned(16))); | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||||
| BLASLONG dummy2) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| FLOAT da[2] __attribute__ ((aligned(16))); | |||||
| if (n <= 0) return (0); | |||||
| if (n <= 0) | |||||
| return (0); | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -16; | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1) { | |||||
| da[0] = da_r; | |||||
| da[1] = da_i; | |||||
| caxpy_kernel_16(n1, x, y, da); | |||||
| ix = 2 * n1; | |||||
| } | |||||
| i = n1; | |||||
| while (i < n) { | |||||
| if (n1) { | |||||
| da[0] = da_r; | |||||
| da[1] = da_i; | |||||
| caxpy_kernel_16(n1, x, y, da); | |||||
| ix = 2 * n1; | |||||
| } | |||||
| i = n1; | |||||
| while (i < n) { | |||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); | |||||
| y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); | |||||
| y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); | |||||
| y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); | |||||
| #else | #else | ||||
| y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); | |||||
| y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); | |||||
| y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); | |||||
| y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); | |||||
| #endif | #endif | ||||
| i++; | |||||
| ix += 2; | |||||
| } | |||||
| return (0); | |||||
| i++; | |||||
| ix += 2; | |||||
| } | } | ||||
| return (0); | |||||
| inc_x *= 2; | |||||
| inc_y *= 2; | |||||
| } | |||||
| while (i < n) { | |||||
| inc_x *= 2; | |||||
| inc_y *= 2; | |||||
| while (i < n) { | |||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); | |||||
| y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); | |||||
| y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); | |||||
| y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); | |||||
| #else | #else | ||||
| y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); | |||||
| y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); | |||||
| y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); | |||||
| y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); | |||||
| #endif | #endif | ||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| return (0); | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,73 +27,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| __asm__ volatile ( | |||||
| "lgr %%r1,%1 \n\t" | |||||
| "lgr %%r2,%2 \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1) \n\t" | |||||
| "pfd 2, 1024(%%r2) \n\t" | |||||
| "mvc 0(256,%%r2),0(%%r1) \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "agfi %%r2,256 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) | |||||
| :"memory","cc","r0","r1","r2" | |||||
| ); | |||||
| static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| __asm__("srlg %[n],%[n],5\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%[x])\n\t" | |||||
| "pfd 2, 1024(%[y])\n\t" | |||||
| "mvc 0(256,%[y]),0(%[x])\n\t" | |||||
| "la %[x],256(%[x])\n\t" | |||||
| "la %[y],256(%[y])\n\t" | |||||
| "brctg %[n],0b" | |||||
| : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y) | |||||
| : "m"(*(const FLOAT (*)[n * 2]) x) | |||||
| : "cc"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| if ( n <= 0 ) return(0); | |||||
| if (n <= 0) | |||||
| return (0); | |||||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||||
| { | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| ccopy_kernel_32(n1, x, y); | |||||
| i=n1; | |||||
| ix=n1*2; | |||||
| iy=n1*2; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] = x[iy] ; | |||||
| y[iy+1] = x[ix+1] ; | |||||
| ix+=2; | |||||
| iy+=2; | |||||
| i++ ; | |||||
| } | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| ccopy_kernel_32(n1, x, y); | |||||
| i = n1; | |||||
| ix = n1 * 2; | |||||
| iy = n1 * 2; | |||||
| } | |||||
| while (i < n) { | |||||
| y[iy] = x[iy]; | |||||
| y[iy + 1] = x[ix + 1]; | |||||
| ix += 2; | |||||
| iy += 2; | |||||
| i++; | |||||
| } | } | ||||
| else | |||||
| { | |||||
| BLASLONG inc_x2 = 2 * inc_x; | |||||
| BLASLONG inc_y2 = 2 * inc_y; | |||||
| } else { | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] = x[ix] ; | |||||
| y[iy+1] = x[ix+1] ; | |||||
| ix += inc_x2 ; | |||||
| iy += inc_y2 ; | |||||
| i++ ; | |||||
| BLASLONG inc_x2 = 2 * inc_x; | |||||
| BLASLONG inc_y2 = 2 * inc_y; | |||||
| } | |||||
| while (i < n) { | |||||
| y[iy] = x[ix]; | |||||
| y[iy + 1] = x[ix + 1]; | |||||
| ix += inc_x2; | |||||
| iy += inc_y2; | |||||
| i++; | |||||
| } | } | ||||
| return(0); | |||||
| } | |||||
| return (0); | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,156 +27,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "vzero %%v24 \n\t" | |||||
| "vzero %%v25 \n\t" | |||||
| "vzero %%v26 \n\t" | |||||
| "vzero %%v27 \n\t" | |||||
| "vzero %%v28 \n\t" | |||||
| "vzero %%v29 \n\t" | |||||
| "vzero %%v30 \n\t" | |||||
| "vzero %%v31 \n\t" | |||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%1) \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%1) \n\t" | |||||
| "vl %%v17, 16(%%r1,%1) \n\t" | |||||
| "vl %%v18, 32(%%r1,%1) \n\t" | |||||
| "vl %%v19, 48(%%r1,%1) \n\t" | |||||
| "vl %%v0, 0(%%r1,%2) \n\t" | |||||
| "vl %%v1, 16(%%r1,%2) \n\t" | |||||
| "vl %%v2, 32(%%r1,%2) \n\t" | |||||
| "vl %%v3, 48(%%r1,%2) \n\t" | |||||
| "verllg %%v20,%%v16,32 \n\t" | |||||
| "verllg %%v21,%%v17,32 \n\t" | |||||
| "verllg %%v22,%%v18,32 \n\t" | |||||
| "verllg %%v23,%%v19,32 \n\t" | |||||
| "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" | |||||
| "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" | |||||
| "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" | |||||
| "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" | |||||
| "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" | |||||
| "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" | |||||
| "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" | |||||
| "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" | |||||
| "vl %%v16, 64(%%r1,%1) \n\t" | |||||
| "vl %%v17, 80(%%r1,%1) \n\t" | |||||
| "vl %%v18, 96(%%r1,%1) \n\t" | |||||
| "vl %%v19, 112(%%r1,%1) \n\t" | |||||
| "vl %%v0, 64(%%r1,%2) \n\t" | |||||
| "vl %%v1, 80(%%r1,%2) \n\t" | |||||
| "vl %%v2, 96(%%r1,%2) \n\t" | |||||
| "vl %%v3, 112(%%r1,%2) \n\t" | |||||
| "verllg %%v20,%%v16,32 \n\t" | |||||
| "verllg %%v21,%%v17,32 \n\t" | |||||
| "verllg %%v22,%%v18,32 \n\t" | |||||
| "verllg %%v23,%%v19,32 \n\t" | |||||
| "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" | |||||
| "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" | |||||
| "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" | |||||
| "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" | |||||
| "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" | |||||
| "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" | |||||
| "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" | |||||
| "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" | |||||
| "agfi %%r1,128 \n\t" | |||||
| "brctg %%r0,0b \n\t" | |||||
| "vfasb %%v24,%%v24,%%v26 \n\t" | |||||
| "vfasb %%v24,%%v24,%%v28 \n\t" | |||||
| "vfasb %%v24,%%v24,%%v30 \n\t" | |||||
| "vrepg %%v26,%%v24,1 \n\t" | |||||
| "vfasb %%v24,%%v24,%%v26 \n\t" | |||||
| "vfasb %%v25,%%v25,%%v27 \n\t" | |||||
| "vfasb %%v25,%%v25,%%v29 \n\t" | |||||
| "vfasb %%v25,%%v25,%%v31 \n\t" | |||||
| "vrepg %%v27,%%v25,1 \n\t" | |||||
| "vfasb %%v25,%%v25,%%v27 \n\t" | |||||
| "vstef %%v24,0(%3),0 \n\t" | |||||
| "vstef %%v24,4(%3),1 \n\t" | |||||
| "vstef %%v25,8(%3),1 \n\t" | |||||
| "vstef %%v25,12(%3),0 " | |||||
| : | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { | |||||
| __asm__("vzero %%v24\n\t" | |||||
| "vzero %%v25\n\t" | |||||
| "vzero %%v26\n\t" | |||||
| "vzero %%v27\n\t" | |||||
| "vzero %%v28\n\t" | |||||
| "vzero %%v29\n\t" | |||||
| "vzero %%v30\n\t" | |||||
| "vzero %%v31\n\t" | |||||
| "srlg %[n],%[n],4\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "pfd 1, 1024(%%r1,%[y])\n\t" | |||||
| "vl %%v16, 0(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v0, 0(%%r1,%[y])\n\t" | |||||
| "vl %%v1, 16(%%r1,%[y])\n\t" | |||||
| "vl %%v2, 32(%%r1,%[y])\n\t" | |||||
| "vl %%v3, 48(%%r1,%[y])\n\t" | |||||
| "verllg %%v20,%%v16,32\n\t" | |||||
| "verllg %%v21,%%v17,32\n\t" | |||||
| "verllg %%v22,%%v18,32\n\t" | |||||
| "verllg %%v23,%%v19,32\n\t" | |||||
| "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" | |||||
| "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" | |||||
| "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" | |||||
| "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" | |||||
| "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" | |||||
| "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" | |||||
| "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" | |||||
| "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" | |||||
| "vl %%v16, 64(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 112(%%r1,%[x])\n\t" | |||||
| "vl %%v0, 64(%%r1,%[y])\n\t" | |||||
| "vl %%v1, 80(%%r1,%[y])\n\t" | |||||
| "vl %%v2, 96(%%r1,%[y])\n\t" | |||||
| "vl %%v3, 112(%%r1,%[y])\n\t" | |||||
| "verllg %%v20,%%v16,32\n\t" | |||||
| "verllg %%v21,%%v17,32\n\t" | |||||
| "verllg %%v22,%%v18,32\n\t" | |||||
| "verllg %%v23,%%v19,32\n\t" | |||||
| "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" | |||||
| "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" | |||||
| "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" | |||||
| "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" | |||||
| "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" | |||||
| "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" | |||||
| "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" | |||||
| "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" | |||||
| "agfi %%r1,128\n\t" | |||||
| "brctg %[n],0b\n\t" | |||||
| "vfasb %%v24,%%v24,%%v26\n\t" | |||||
| "vfasb %%v24,%%v24,%%v28\n\t" | |||||
| "vfasb %%v24,%%v24,%%v30\n\t" | |||||
| "vrepg %%v26,%%v24,1\n\t" | |||||
| "vfasb %%v24,%%v24,%%v26\n\t" | |||||
| "vfasb %%v25,%%v25,%%v27\n\t" | |||||
| "vfasb %%v25,%%v25,%%v29\n\t" | |||||
| "vfasb %%v25,%%v25,%%v31\n\t" | |||||
| "vrepg %%v27,%%v25,1\n\t" | |||||
| "vfasb %%v25,%%v25,%%v27\n\t" | |||||
| "vstef %%v24,0(%[d]),0\n\t" | |||||
| "vstef %%v24,4(%[d]),1\n\t" | |||||
| "vstef %%v25,8(%[d]),1\n\t" | |||||
| "vstef %%v25,12(%[d]),0" | |||||
| : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) | |||||
| : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), | |||||
| "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", | |||||
| "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", | |||||
| "v31"); | |||||
| } | } | ||||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | |||||
| BLASLONG i; | |||||
| BLASLONG ix, iy; | |||||
| OPENBLAS_COMPLEX_FLOAT result; | |||||
| FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; | |||||
| if (n <= 0) { | |||||
| CREAL(result) = 0.0; | |||||
| CIMAG(result) = 0.0; | |||||
| return (result); | |||||
| } | |||||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, | |||||
| BLASLONG inc_y) { | |||||
| BLASLONG i; | |||||
| BLASLONG ix, iy; | |||||
| OPENBLAS_COMPLEX_FLOAT result; | |||||
| FLOAT dot[4] __attribute__ ((aligned(16))) = { | |||||
| 0.0, 0.0, 0.0, 0.0}; | |||||
| if (n <= 0) { | |||||
| CREAL(result) = 0.0; | |||||
| CIMAG(result) = 0.0; | |||||
| return (result); | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| } | |||||
| BLASLONG n1 = n & -16; | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| if (n1) | |||||
| cdot_kernel_16(n1, x, y, dot); | |||||
| BLASLONG n1 = n & -16; | |||||
| i = n1; | |||||
| BLASLONG j = i * 2; | |||||
| if (n1) | |||||
| cdot_kernel_16(n1, x, y, dot); | |||||
| while (i < n) { | |||||
| i = n1; | |||||
| BLASLONG j = i * 2; | |||||
| dot[0] += x[j] * y[j]; | |||||
| dot[1] += x[j + 1] * y[j + 1]; | |||||
| dot[2] += x[j] * y[j + 1]; | |||||
| dot[3] += x[j + 1] * y[j]; | |||||
| while (i < n) { | |||||
| j += 2; | |||||
| i++; | |||||
| dot[0] += x[j] * y[j]; | |||||
| dot[1] += x[j + 1] * y[j + 1]; | |||||
| dot[2] += x[j] * y[j + 1]; | |||||
| dot[3] += x[j + 1] * y[j]; | |||||
| } | |||||
| j += 2; | |||||
| i++; | |||||
| } | |||||
| } else { | |||||
| i = 0; | |||||
| ix = 0; | |||||
| iy = 0; | |||||
| inc_x <<= 1; | |||||
| inc_y <<= 1; | |||||
| while (i < n) { | |||||
| } else { | |||||
| i = 0; | |||||
| ix = 0; | |||||
| iy = 0; | |||||
| inc_x <<= 1; | |||||
| inc_y <<= 1; | |||||
| while (i < n) { | |||||
| dot[0] += x[ix] * y[iy]; | |||||
| dot[1] += x[ix + 1] * y[iy + 1]; | |||||
| dot[2] += x[ix] * y[iy + 1]; | |||||
| dot[3] += x[ix + 1] * y[iy]; | |||||
| dot[0] += x[ix] * y[iy]; | |||||
| dot[1] += x[ix + 1] * y[iy + 1]; | |||||
| dot[2] += x[ix] * y[iy + 1]; | |||||
| dot[3] += x[ix + 1] * y[iy]; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| } | |||||
| } | } | ||||
| } | |||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| CREAL(result) = dot[0] - dot[1]; | |||||
| CIMAG(result) = dot[2] + dot[3]; | |||||
| CREAL(result) = dot[0] - dot[1]; | |||||
| CIMAG(result) = dot[2] + dot[3]; | |||||
| #else | #else | ||||
| CREAL(result) = dot[0] + dot[1]; | |||||
| CIMAG(result) = dot[2] - dot[3]; | |||||
| CREAL(result) = dot[0] + dot[1]; | |||||
| CIMAG(result) = dot[2] - dot[3]; | |||||
| #endif | #endif | ||||
| return (result); | |||||
| return (result); | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,230 +27,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) | |||||
| { | |||||
| __asm__ ( | |||||
| "vlrepf %%v0,%3 \n\t" | |||||
| "vlrepf %%v1,%4 \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%1) \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vfmsb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmsb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 0(%%r1,%1) \n\t" | |||||
| "vst %%v29, 16(%%r1,%1) \n\t" | |||||
| "vst %%v30, 32(%%r1,%1) \n\t" | |||||
| "vst %%v31, 48(%%r1,%1) \n\t" | |||||
| "vst %%v20, 0(%%r1,%2) \n\t" | |||||
| "vst %%v21, 16(%%r1,%2) \n\t" | |||||
| "vst %%v22, 32(%%r1,%2) \n\t" | |||||
| "vst %%v23, 48(%%r1,%2) \n\t" | |||||
| "vl %%v24, 64(%%r1,%1) \n\t" | |||||
| "vl %%v25, 80(%%r1,%1) \n\t" | |||||
| "vl %%v26, 96(%%r1,%1) \n\t" | |||||
| "vl %%v27, 112(%%r1,%1) \n\t" | |||||
| "vl %%v16, 64(%%r1,%2) \n\t" | |||||
| "vl %%v17, 80(%%r1,%2) \n\t" | |||||
| "vl %%v18, 96(%%r1,%2) \n\t" | |||||
| "vl %%v19, 112(%%r1,%2) \n\t" | |||||
| "vfmsb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmsb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 64(%%r1,%1) \n\t" | |||||
| "vst %%v29, 80(%%r1,%1) \n\t" | |||||
| "vst %%v30, 96(%%r1,%1) \n\t" | |||||
| "vst %%v31, 112(%%r1,%1) \n\t" | |||||
| "vst %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%2) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v16, 128(%%r1,%2) \n\t" | |||||
| "vl %%v17, 144(%%r1,%2) \n\t" | |||||
| "vl %%v18, 160(%%r1,%2) \n\t" | |||||
| "vl %%v19, 176(%%r1,%2) \n\t" | |||||
| "vfmsb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmsb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 128(%%r1,%1) \n\t" | |||||
| "vst %%v29, 144(%%r1,%1) \n\t" | |||||
| "vst %%v30, 160(%%r1,%1) \n\t" | |||||
| "vst %%v31, 176(%%r1,%1) \n\t" | |||||
| "vst %%v20, 128(%%r1,%2) \n\t" | |||||
| "vst %%v21, 144(%%r1,%2) \n\t" | |||||
| "vst %%v22, 160(%%r1,%2) \n\t" | |||||
| "vst %%v23, 176(%%r1,%2) \n\t" | |||||
| "vl %%v24, 192(%%r1,%1) \n\t" | |||||
| "vl %%v25, 208(%%r1,%1) \n\t" | |||||
| "vl %%v26, 224(%%r1,%1) \n\t" | |||||
| "vl %%v27, 240(%%r1,%1) \n\t" | |||||
| "vl %%v16, 192(%%r1,%2) \n\t" | |||||
| "vl %%v17, 208(%%r1,%2) \n\t" | |||||
| "vl %%v18, 224(%%r1,%2) \n\t" | |||||
| "vl %%v19, 240(%%r1,%2) \n\t" | |||||
| "vfmsb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmsb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 192(%%r1,%1) \n\t" | |||||
| "vst %%v29, 208(%%r1,%1) \n\t" | |||||
| "vst %%v30, 224(%%r1,%1) \n\t" | |||||
| "vst %%v31, 240(%%r1,%1) \n\t" | |||||
| "vst %%v20, 192(%%r1,%2) \n\t" | |||||
| "vst %%v21, 208(%%r1,%2) \n\t" | |||||
| "vst %%v22, 224(%%r1,%2) \n\t" | |||||
| "vst %%v23, 240(%%r1,%2) \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) | |||||
| :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
/*
 * Applies a plane rotation, in place, to the FLOAT arrays x and y:
 *     x[k] = c * x[k] + s * y[k]
 *     y[k] = c * y[k] - s * x[k]
 * The same formula is used for every FLOAT, so the real and imaginary parts
 * of each complex element are rotated alike.
 *
 * n  element count; the memory operands cover n * 2 FLOATs of x and of y.
 *    The loop runs n >> 5 times and is bottom-tested (brctg), so n >> 5 must
 *    be non-zero. The caller in this file passes n & -32 and only when that
 *    value is > 0.
 * c, s  pointers to the two rotation coefficients.
 *
 * Each loop pass advances the byte offset in r1 by 256 and handles four
 * 64-byte groups of x and y.
 * NOTE(review): the *sb vector instructions work on 32-bit elements, so this
 * presumably is only built with FLOAT == float - confirm.
 */
| static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { | |||||
| __asm__("vlrepf %%v0,%[c]\n\t" /* v0 = c replicated into every element */ | |||||
| "vlrepf %%v1,%[s]\n\t" /* v1 = s replicated into every element */ | |||||
| "srlg %[n],%[n],5\n\t" /* loop count = n / 32 */ | |||||
| "xgr %%r1,%%r1\n\t" /* r1 = byte offset into x and y, starts at 0 */ | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead in x */ | |||||
| "pfd 2, 1024(%%r1,%[y])\n\t" /* prefetch 1024 bytes ahead in y */ | |||||
| "vl %%v24, 0(%%r1,%[x])\n\t" /* group 1: bytes 0..63, v24-v27 = x */ | |||||
| "vl %%v25, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 0(%%r1,%[y])\n\t" /* v16-v19 = y */ | |||||
| "vl %%v17, 16(%%r1,%[y])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[y])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[y])\n\t" | |||||
| "vfmsb %%v28,%%v24,%%v0\n\t" /* xn=x*c */ | |||||
| "vfmsb %%v29,%%v25,%%v0\n\t" /* xn=x*c */ | |||||
| "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v30,%%v26,%%v0\n\t" /* xn=x*c */ | |||||
| "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v31,%%v27,%%v0\n\t" /* xn=x*c */ | |||||
| "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ | |||||
| /* 2nd part: fold in the y terms */ | |||||
| "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" /* xn=y*s+xn */ | |||||
| "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" /* xn=y*s+xn */ | |||||
| "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" /* xn=y*s+xn */ | |||||
| "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" /* xn=y*s+xn */ | |||||
| "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 0(%%r1,%[x])\n\t" /* write rotated x back */ | |||||
| "vst %%v29, 16(%%r1,%[x])\n\t" | |||||
| "vst %%v30, 32(%%r1,%[x])\n\t" | |||||
| "vst %%v31, 48(%%r1,%[x])\n\t" | |||||
| "vst %%v20, 0(%%r1,%[y])\n\t" /* write rotated y back */ | |||||
| "vst %%v21, 16(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 32(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 48(%%r1,%[y])\n\t" | |||||
| "vl %%v24, 64(%%r1,%[x])\n\t" /* group 2: bytes 64..127, same sequence */ | |||||
| "vl %%v25, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 112(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 64(%%r1,%[y])\n\t" | |||||
| "vl %%v17, 80(%%r1,%[y])\n\t" | |||||
| "vl %%v18, 96(%%r1,%[y])\n\t" | |||||
| "vl %%v19, 112(%%r1,%[y])\n\t" | |||||
| "vfmsb %%v28,%%v24,%%v0\n\t" | |||||
| "vfmsb %%v29,%%v25,%%v0\n\t" | |||||
| "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v30,%%v26,%%v0\n\t" | |||||
| "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v31,%%v27,%%v0\n\t" | |||||
| "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ | |||||
| /* 2nd parts */ | |||||
| "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" | |||||
| "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" | |||||
| "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" | |||||
| "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" | |||||
| "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 64(%%r1,%[x])\n\t" | |||||
| "vst %%v29, 80(%%r1,%[x])\n\t" | |||||
| "vst %%v30, 96(%%r1,%[x])\n\t" | |||||
| "vst %%v31, 112(%%r1,%[x])\n\t" | |||||
| "vst %%v20, 64(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 80(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 96(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 112(%%r1,%[y])\n\t" | |||||
| "vl %%v24, 128(%%r1,%[x])\n\t" /* group 3: bytes 128..191, same sequence */ | |||||
| "vl %%v25, 144(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 160(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 176(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 128(%%r1,%[y])\n\t" | |||||
| "vl %%v17, 144(%%r1,%[y])\n\t" | |||||
| "vl %%v18, 160(%%r1,%[y])\n\t" | |||||
| "vl %%v19, 176(%%r1,%[y])\n\t" | |||||
| "vfmsb %%v28,%%v24,%%v0\n\t" | |||||
| "vfmsb %%v29,%%v25,%%v0\n\t" | |||||
| "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v30,%%v26,%%v0\n\t" | |||||
| "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v31,%%v27,%%v0\n\t" | |||||
| "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ | |||||
| /* 2nd parts */ | |||||
| "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" | |||||
| "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" | |||||
| "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" | |||||
| "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" | |||||
| "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 128(%%r1,%[x])\n\t" | |||||
| "vst %%v29, 144(%%r1,%[x])\n\t" | |||||
| "vst %%v30, 160(%%r1,%[x])\n\t" | |||||
| "vst %%v31, 176(%%r1,%[x])\n\t" | |||||
| "vst %%v20, 128(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 144(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 160(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 176(%%r1,%[y])\n\t" | |||||
| "vl %%v24, 192(%%r1,%[x])\n\t" /* group 4: bytes 192..255, same sequence */ | |||||
| "vl %%v25, 208(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 224(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 240(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 192(%%r1,%[y])\n\t" | |||||
| "vl %%v17, 208(%%r1,%[y])\n\t" | |||||
| "vl %%v18, 224(%%r1,%[y])\n\t" | |||||
| "vl %%v19, 240(%%r1,%[y])\n\t" | |||||
| "vfmsb %%v28,%%v24,%%v0\n\t" | |||||
| "vfmsb %%v29,%%v25,%%v0\n\t" | |||||
| "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v30,%%v26,%%v0\n\t" | |||||
| "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v31,%%v27,%%v0\n\t" | |||||
| "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ | |||||
| /* 2nd parts */ | |||||
| "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" | |||||
| "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" | |||||
| "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" | |||||
| "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" | |||||
| "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 192(%%r1,%[x])\n\t" | |||||
| "vst %%v29, 208(%%r1,%[x])\n\t" | |||||
| "vst %%v30, 224(%%r1,%[x])\n\t" | |||||
| "vst %%v31, 240(%%r1,%[x])\n\t" | |||||
| "vst %%v20, 192(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 208(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 224(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 240(%%r1,%[y])\n\t" | |||||
| "agfi %%r1,256\n\t" /* advance the byte offset past the 256 bytes just handled */ | |||||
| "brctg %[n],0b" /* decrement the pass counter, loop while it is non-zero */ | |||||
| : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) | |||||
| : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) | |||||
| : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", | |||||
| "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", | |||||
| "v31"); | |||||
| } | } | ||||
/*
 * Rotation driver. The rows below come from both sides of a side-by-side
 * diff, so two renderings of the same function follow one another; both
 * perform the steps described here.
 *
 * Returns 0 immediately when n <= 0.
 * When inc_x == 1 and inc_y == 1, the first n & -32 elements are handed to
 * crot_kernel_32 (only if that count is > 0) and the remaining elements are
 * processed by the scalar loop. Otherwise every element is processed by the
 * scalar loop, stepping 2 * inc_x FLOATs through x and 2 * inc_y through y.
 *
 * Per element (two consecutive FLOATs, rotated with the same formula):
 *     x' = c * x + s * y
 *     y' = c * y - s * x
 * temp[] holds the new x values so that y is still computed from the old x.
 * Always returns 0.
 */
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT temp[2]; | |||||
| BLASLONG inc_x2; | |||||
| BLASLONG inc_y2; | |||||
| if ( n <= 0 ) return(0); | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| BLASLONG n1 = n & -32; /* n rounded down to a multiple of 32 */ | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| FLOAT cosa,sina; | |||||
| cosa=c; | |||||
| sina=s; | |||||
| crot_kernel_32(n1, x, y, &cosa, &sina); | |||||
| i=n1; | |||||
| ix=2*n1; /* FLOAT index of the first element the kernel did not process */ | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| temp[0] = c*x[ix] + s*y[ix] ; | |||||
| temp[1] = c*x[ix+1] + s*y[ix+1] ; | |||||
| y[ix] = c*y[ix] - s*x[ix] ; | |||||
| y[ix+1] = c*y[ix+1] - s*x[ix+1] ; | |||||
| x[ix] = temp[0] ; | |||||
| x[ix+1] = temp[1] ; | |||||
| ix += 2 ; | |||||
| i++ ; | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||||
| FLOAT c, FLOAT s) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| FLOAT temp[2]; | |||||
| BLASLONG inc_x2; | |||||
| BLASLONG inc_y2; | |||||
| if (n <= 0) | |||||
| return (0); | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -32; /* n rounded down to a multiple of 32 */ | |||||
| if (n1 > 0) { | |||||
| FLOAT cosa, sina; | |||||
| cosa = c; | |||||
| sina = s; | |||||
| crot_kernel_32(n1, x, y, &cosa, &sina); | |||||
| i = n1; | |||||
| ix = 2 * n1; /* FLOAT index of the first element the kernel did not process */ | |||||
| } | |||||
| } | |||||
| while (i < n) { | |||||
| temp[0] = c * x[ix] + s * y[ix]; | |||||
| temp[1] = c * x[ix + 1] + s * y[ix + 1]; | |||||
| y[ix] = c * y[ix] - s * x[ix]; | |||||
| y[ix + 1] = c * y[ix + 1] - s * x[ix + 1]; | |||||
| x[ix] = temp[0]; | |||||
| x[ix + 1] = temp[1]; | |||||
| ix += 2; | |||||
| i++; | |||||
| } | } | ||||
| else | |||||
| { | |||||
| inc_x2 = 2 * inc_x ; | |||||
| inc_y2 = 2 * inc_y ; | |||||
| while(i < n) | |||||
| { | |||||
| temp[0] = c*x[ix] + s*y[iy] ; | |||||
| temp[1] = c*x[ix+1] + s*y[iy+1] ; | |||||
| y[iy] = c*y[iy] - s*x[ix] ; | |||||
| y[iy+1] = c*y[iy+1] - s*x[ix+1] ; | |||||
| x[ix] = temp[0] ; | |||||
| x[ix+1] = temp[1] ; | |||||
| ix += inc_x2 ; | |||||
| iy += inc_y2 ; | |||||
| i++ ; | |||||
| } | |||||
| } else { | |||||
| inc_x2 = 2 * inc_x; /* strides in FLOATs: two per element */ | |||||
| inc_y2 = 2 * inc_y; | |||||
| while (i < n) { | |||||
| temp[0] = c * x[ix] + s * y[iy]; | |||||
| temp[1] = c * x[ix + 1] + s * y[iy + 1]; | |||||
| y[iy] = c * y[iy] - s * x[ix]; | |||||
| y[iy + 1] = c * y[iy + 1] - s * x[ix + 1]; | |||||
| x[ix] = temp[0]; | |||||
| x[ix + 1] = temp[1]; | |||||
| ix += inc_x2; | |||||
| iy += inc_y2; | |||||
| i++; | |||||
| } | } | ||||
| return(0); | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013 - 2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,430 +27,400 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
/*
 * Multiplies, in place, the complex values in x by the complex scalar
 * alpha[0] + i*alpha[1].
 *
 * Operands: %0 = n, %1 = alpha, %2 = x. The loop runs n >> 4 times and is
 * bottom-tested (brctg), so n >> 4 must be non-zero. Each pass handles
 * 128 bytes of x.
 * v0 = alpha[0] in every element; v1 = {-alpha[1], +alpha[1], -alpha[1],
 * +alpha[1]}. Rotating each 64-bit lane of x by 32 bits swaps the two 32-bit
 * halves (re <-> im), so  x*v0 + swapped*v1  yields
 *     re' = re*alpha[0] - im*alpha[1]
 *     im' = im*alpha[0] + re*alpha[1]
 * NOTE(review): the *sb instructions work on 32-bit elements, so this
 * presumably is only built with FLOAT == float - confirm.
 */
| static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "vlrepf %%v0,0(%1) \n\t" /* v0 = alpha[0] replicated */ | |||||
| "vlef %%v1,4(%1),0 \n\t" /* elements 0 and 2 of v1 = alpha[1] ... */ | |||||
| "vlef %%v1,4(%1),2 \n\t" | |||||
| "vflcsb %%v1,%%v1 \n\t" /* ... negated */ | |||||
| "vlef %%v1,4(%1),1 \n\t" /* elements 1 and 3 of v1 = +alpha[1] */ | |||||
| "vlef %%v1,4(%1),3 \n\t" | |||||
| "srlg %%r0,%0,4 \n\t" /* r0 = pass count = n / 16 */ | |||||
| "xgr %%r1,%%r1 \n\t" /* r1 = byte offset into x */ | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "verllg %%v24,%%v16,32 \n\t" /* v24-v31 = x with the halves of each 64-bit lane swapped */ | |||||
| "verllg %%v25,%%v17,32 \n\t" | |||||
| "verllg %%v26,%%v18,32 \n\t" | |||||
| "verllg %%v27,%%v19,32 \n\t" | |||||
| "verllg %%v28,%%v20,32 \n\t" | |||||
| "verllg %%v29,%%v21,32 \n\t" | |||||
| "verllg %%v30,%%v22,32 \n\t" | |||||
| "verllg %%v31,%%v23,32 \n\t" | |||||
| "vfmsb %%v16,%%v16,%%v0 \n\t" /* x * alpha[0] */ | |||||
| "vfmsb %%v17,%%v17,%%v0 \n\t" | |||||
| "vfmsb %%v18,%%v18,%%v0 \n\t" | |||||
| "vfmsb %%v19,%%v19,%%v0 \n\t" | |||||
| "vfmsb %%v20,%%v20,%%v0 \n\t" | |||||
| "vfmsb %%v21,%%v21,%%v0 \n\t" | |||||
| "vfmsb %%v22,%%v22,%%v0 \n\t" | |||||
| "vfmsb %%v23,%%v23,%%v0 \n\t" | |||||
| "vfmasb %%v16,%%v24,%%v1,%%v16 \n\t" /* + swapped * (-/+ alpha[1]) */ | |||||
| "vfmasb %%v17,%%v25,%%v1,%%v17 \n\t" | |||||
| "vfmasb %%v18,%%v26,%%v1,%%v18 \n\t" | |||||
| "vfmasb %%v19,%%v27,%%v1,%%v19 \n\t" | |||||
| "vfmasb %%v20,%%v28,%%v1,%%v20 \n\t" | |||||
| "vfmasb %%v21,%%v29,%%v1,%%v21 \n\t" | |||||
| "vfmasb %%v22,%%v30,%%v1,%%v22 \n\t" | |||||
| "vfmasb %%v23,%%v31,%%v1,%%v23 \n\t" | |||||
| "vst %%v16,0(%%r1,%2) \n\t" | |||||
| "vst %%v17,16(%%r1,%2) \n\t" | |||||
| "vst %%v18,32(%%r1,%2) \n\t" | |||||
| "vst %%v19,48(%%r1,%2) \n\t" | |||||
| "vst %%v20,64(%%r1,%2) \n\t" | |||||
| "vst %%v21,80(%%r1,%2) \n\t" | |||||
| "vst %%v22,96(%%r1,%2) \n\t" | |||||
| "vst %%v23,112(%%r1,%2) \n\t" | |||||
| "agfi %%r1,128 \n\t" /* advance past the 128 bytes just handled */ | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| } | |||||
| static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "vlef %%v0,4(%1),0 \n\t" | |||||
| "vlef %%v0,4(%1),2 \n\t" | |||||
| "vflcsb %%v0,%%v0 \n\t" | |||||
| "vlef %%v0,4(%1),1 \n\t" | |||||
| "vlef %%v0,4(%1),3 \n\t" | |||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "verllg %%v16,%%v16,32 \n\t" | |||||
| "verllg %%v17,%%v17,32 \n\t" | |||||
| "verllg %%v18,%%v18,32 \n\t" | |||||
| "verllg %%v19,%%v19,32 \n\t" | |||||
| "verllg %%v20,%%v20,32 \n\t" | |||||
| "verllg %%v21,%%v21,32 \n\t" | |||||
| "verllg %%v22,%%v22,32 \n\t" | |||||
| "verllg %%v23,%%v23,32 \n\t" | |||||
| "vfmsb %%v16,%%v16,%%v0 \n\t" | |||||
| "vfmsb %%v17,%%v17,%%v0 \n\t" | |||||
| "vfmsb %%v18,%%v18,%%v0 \n\t" | |||||
| "vfmsb %%v19,%%v19,%%v0 \n\t" | |||||
| "vfmsb %%v20,%%v20,%%v0 \n\t" | |||||
| "vfmsb %%v21,%%v21,%%v0 \n\t" | |||||
| "vfmsb %%v22,%%v22,%%v0 \n\t" | |||||
| "vfmsb %%v23,%%v23,%%v0 \n\t" | |||||
| "vst %%v16,0(%%r1,%2) \n\t" | |||||
| "vst %%v17,16(%%r1,%2) \n\t" | |||||
| "vst %%v18,32(%%r1,%2) \n\t" | |||||
| "vst %%v19,48(%%r1,%2) \n\t" | |||||
| "vst %%v20,64(%%r1,%2) \n\t" | |||||
| "vst %%v21,80(%%r1,%2) \n\t" | |||||
| "vst %%v22,96(%%r1,%2) \n\t" | |||||
| "vst %%v23,112(%%r1,%2) \n\t" | |||||
| "agfi %%r1,128 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" | |||||
| ); | |||||
/*
 * Multiplies, in place, the complex values in x by the complex scalar
 * alpha[0] + i*alpha[1].
 *
 * The loop runs n >> 4 times and is bottom-tested (brctg), so n >> 4 must be
 * non-zero. Each pass handles 128 bytes of x; the memory operand covers
 * n * 2 FLOATs.
 * v0 = alpha[0] in every element; v1 = {-alpha[1], +alpha[1], -alpha[1],
 * +alpha[1]}. Rotating each 64-bit lane of x by 32 bits swaps the two 32-bit
 * halves (re <-> im), so  x*v0 + swapped*v1  yields
 *     re' = re*alpha[0] - im*alpha[1]
 *     im' = im*alpha[0] + re*alpha[1]
 * NOTE(review): the *sb instructions work on 32-bit elements, so this
 * presumably is only built with FLOAT == float - confirm.
 */
| static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) { | |||||
| __asm__("vlrepf %%v0,0(%[alpha])\n\t" /* v0 = alpha[0] replicated */ | |||||
| "vlef %%v1,4(%[alpha]),0\n\t" /* elements 0 and 2 of v1 = alpha[1] ... */ | |||||
| "vlef %%v1,4(%[alpha]),2\n\t" | |||||
| "vflcsb %%v1,%%v1\n\t" /* ... negated */ | |||||
| "vlef %%v1,4(%[alpha]),1\n\t" /* elements 1 and 3 of v1 = +alpha[1] */ | |||||
| "vlef %%v1,4(%[alpha]),3\n\t" | |||||
| "srlg %[n],%[n],4\n\t" /* pass count = n / 16 */ | |||||
| "xgr %%r1,%%r1\n\t" /* r1 = byte offset into x */ | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "verllg %%v24,%%v16,32\n\t" /* v24-v31 = x with the halves of each 64-bit lane swapped */ | |||||
| "verllg %%v25,%%v17,32\n\t" | |||||
| "verllg %%v26,%%v18,32\n\t" | |||||
| "verllg %%v27,%%v19,32\n\t" | |||||
| "verllg %%v28,%%v20,32\n\t" | |||||
| "verllg %%v29,%%v21,32\n\t" | |||||
| "verllg %%v30,%%v22,32\n\t" | |||||
| "verllg %%v31,%%v23,32\n\t" | |||||
| "vfmsb %%v16,%%v16,%%v0\n\t" /* x * alpha[0] */ | |||||
| "vfmsb %%v17,%%v17,%%v0\n\t" | |||||
| "vfmsb %%v18,%%v18,%%v0\n\t" | |||||
| "vfmsb %%v19,%%v19,%%v0\n\t" | |||||
| "vfmsb %%v20,%%v20,%%v0\n\t" | |||||
| "vfmsb %%v21,%%v21,%%v0\n\t" | |||||
| "vfmsb %%v22,%%v22,%%v0\n\t" | |||||
| "vfmsb %%v23,%%v23,%%v0\n\t" | |||||
| "vfmasb %%v16,%%v24,%%v1,%%v16\n\t" /* + swapped * (-/+ alpha[1]) */ | |||||
| "vfmasb %%v17,%%v25,%%v1,%%v17\n\t" | |||||
| "vfmasb %%v18,%%v26,%%v1,%%v18\n\t" | |||||
| "vfmasb %%v19,%%v27,%%v1,%%v19\n\t" | |||||
| "vfmasb %%v20,%%v28,%%v1,%%v20\n\t" | |||||
| "vfmasb %%v21,%%v29,%%v1,%%v21\n\t" | |||||
| "vfmasb %%v22,%%v30,%%v1,%%v22\n\t" | |||||
| "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" | |||||
| "vst %%v16,0(%%r1,%[x])\n\t" | |||||
| "vst %%v17,16(%%r1,%[x])\n\t" | |||||
| "vst %%v18,32(%%r1,%[x])\n\t" | |||||
| "vst %%v19,48(%%r1,%[x])\n\t" | |||||
| "vst %%v20,64(%%r1,%[x])\n\t" | |||||
| "vst %%v21,80(%%r1,%[x])\n\t" | |||||
| "vst %%v22,96(%%r1,%[x])\n\t" | |||||
| "vst %%v23,112(%%r1,%[x])\n\t" | |||||
| "agfi %%r1,128\n\t" /* advance past the 128 bytes just handled */ | |||||
| "brctg %[n],0b" | |||||
| : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) | |||||
| : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) | |||||
| : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", | |||||
| "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", | |||||
| "v31"); | |||||
| } | } | ||||
| static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "vlrepf %%v0,0(%1) \n\t" | |||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vfmsb %%v16,%%v16,%%v0 \n\t" | |||||
| "vfmsb %%v17,%%v17,%%v0 \n\t" | |||||
| "vfmsb %%v18,%%v18,%%v0 \n\t" | |||||
| "vfmsb %%v19,%%v19,%%v0 \n\t" | |||||
| "vfmsb %%v20,%%v20,%%v0 \n\t" | |||||
| "vfmsb %%v21,%%v21,%%v0 \n\t" | |||||
| "vfmsb %%v22,%%v22,%%v0 \n\t" | |||||
| "vfmsb %%v23,%%v23,%%v0 \n\t" | |||||
| "vst %%v16,0(%%r1,%2) \n\t" | |||||
| "vst %%v17,16(%%r1,%2) \n\t" | |||||
| "vst %%v18,32(%%r1,%2) \n\t" | |||||
| "vst %%v19,48(%%r1,%2) \n\t" | |||||
| "vst %%v20,64(%%r1,%2) \n\t" | |||||
| "vst %%v21,80(%%r1,%2) \n\t" | |||||
| "vst %%v22,96(%%r1,%2) \n\t" | |||||
| "vst %%v23,112(%%r1,%2) \n\t" | |||||
| "agfi %%r1,128 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" | |||||
| ); | |||||
/*
 * Variant of the scaling kernel that only reads alpha[1] (alpha[0] is never
 * loaded); the driver in this file selects it when da_r == 0.0 and da_i is
 * non-zero.
 *
 * v0 = {-alpha[1], +alpha[1], -alpha[1], +alpha[1]}. Each 64-bit lane of x is
 * rotated by 32 bits (swapping its two 32-bit halves) and multiplied by v0:
 *     re' = -alpha[1] * im
 *     im' =  alpha[1] * re
 * The loop runs n >> 4 times, is bottom-tested (so n >> 4 must be non-zero),
 * and handles 128 bytes of x per pass.
 */
| static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { | |||||
| __asm__("vlef %%v0,4(%[alpha]),0\n\t" /* elements 0 and 2 = alpha[1] ... */ | |||||
| "vlef %%v0,4(%[alpha]),2\n\t" | |||||
| "vflcsb %%v0,%%v0\n\t" /* ... negated */ | |||||
| "vlef %%v0,4(%[alpha]),1\n\t" /* elements 1 and 3 = +alpha[1] */ | |||||
| "vlef %%v0,4(%[alpha]),3\n\t" | |||||
| "srlg %[n],%[n],4\n\t" /* pass count = n / 16 */ | |||||
| "xgr %%r1,%%r1\n\t" /* r1 = byte offset into x */ | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "verllg %%v16,%%v16,32\n\t" /* swap the halves of each 64-bit lane in place */ | |||||
| "verllg %%v17,%%v17,32\n\t" | |||||
| "verllg %%v18,%%v18,32\n\t" | |||||
| "verllg %%v19,%%v19,32\n\t" | |||||
| "verllg %%v20,%%v20,32\n\t" | |||||
| "verllg %%v21,%%v21,32\n\t" | |||||
| "verllg %%v22,%%v22,32\n\t" | |||||
| "verllg %%v23,%%v23,32\n\t" | |||||
| "vfmsb %%v16,%%v16,%%v0\n\t" /* swapped * (-/+ alpha[1]) */ | |||||
| "vfmsb %%v17,%%v17,%%v0\n\t" | |||||
| "vfmsb %%v18,%%v18,%%v0\n\t" | |||||
| "vfmsb %%v19,%%v19,%%v0\n\t" | |||||
| "vfmsb %%v20,%%v20,%%v0\n\t" | |||||
| "vfmsb %%v21,%%v21,%%v0\n\t" | |||||
| "vfmsb %%v22,%%v22,%%v0\n\t" | |||||
| "vfmsb %%v23,%%v23,%%v0\n\t" | |||||
| "vst %%v16,0(%%r1,%[x])\n\t" | |||||
| "vst %%v17,16(%%r1,%[x])\n\t" | |||||
| "vst %%v18,32(%%r1,%[x])\n\t" | |||||
| "vst %%v19,48(%%r1,%[x])\n\t" | |||||
| "vst %%v20,64(%%r1,%[x])\n\t" | |||||
| "vst %%v21,80(%%r1,%[x])\n\t" | |||||
| "vst %%v22,96(%%r1,%[x])\n\t" | |||||
| "vst %%v23,112(%%r1,%[x])\n\t" | |||||
| "agfi %%r1,128\n\t" /* advance past the 128 bytes just handled */ | |||||
| "brctg %[n],0b" | |||||
| : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) | |||||
| : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23"); | |||||
| } | } | ||||
| static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "vzero %%v24 \n\t" | |||||
| "vzero %%v25 \n\t" | |||||
| "vzero %%v26 \n\t" | |||||
| "vzero %%v27 \n\t" | |||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%1) \n\t" | |||||
| "vst %%v24,0(%%r1,%1) \n\t" | |||||
| "vst %%v25,16(%%r1,%1) \n\t" | |||||
| "vst %%v26,32(%%r1,%1) \n\t" | |||||
| "vst %%v27,48(%%r1,%1) \n\t" | |||||
| "vst %%v24,64(%%r1,%1) \n\t" | |||||
| "vst %%v25,80(%%r1,%1) \n\t" | |||||
| "vst %%v26,96(%%r1,%1) \n\t" | |||||
| "vst %%v27,112(%%r1,%1) \n\t" | |||||
| "agfi %%r1,128 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZR"((FLOAT (*)[n * 2])x) | |||||
| :"memory","cc","r0","r1","v24","v25","v26","v27" | |||||
| ); | |||||
/*
 * Variant of the scaling kernel that only reads alpha[0] (alpha[1] is never
 * loaded); the driver in this file selects it when da_i == 0 and da_r is
 * non-zero. Every FLOAT of x is multiplied by alpha[0] in place.
 *
 * The loop runs n >> 4 times, is bottom-tested (so n >> 4 must be non-zero),
 * and handles 128 bytes of x per pass.
 */
| static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { | |||||
| __asm__("vlrepf %%v0,0(%[alpha])\n\t" /* v0 = alpha[0] replicated */ | |||||
| "srlg %[n],%[n],4\n\t" /* pass count = n / 16 */ | |||||
| "xgr %%r1,%%r1\n\t" /* r1 = byte offset into x */ | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vfmsb %%v16,%%v16,%%v0\n\t" /* x * alpha[0] */ | |||||
| "vfmsb %%v17,%%v17,%%v0\n\t" | |||||
| "vfmsb %%v18,%%v18,%%v0\n\t" | |||||
| "vfmsb %%v19,%%v19,%%v0\n\t" | |||||
| "vfmsb %%v20,%%v20,%%v0\n\t" | |||||
| "vfmsb %%v21,%%v21,%%v0\n\t" | |||||
| "vfmsb %%v22,%%v22,%%v0\n\t" | |||||
| "vfmsb %%v23,%%v23,%%v0\n\t" | |||||
| "vst %%v16,0(%%r1,%[x])\n\t" | |||||
| "vst %%v17,16(%%r1,%[x])\n\t" | |||||
| "vst %%v18,32(%%r1,%[x])\n\t" | |||||
| "vst %%v19,48(%%r1,%[x])\n\t" | |||||
| "vst %%v20,64(%%r1,%[x])\n\t" | |||||
| "vst %%v21,80(%%r1,%[x])\n\t" | |||||
| "vst %%v22,96(%%r1,%[x])\n\t" | |||||
| "vst %%v23,112(%%r1,%[x])\n\t" | |||||
| "agfi %%r1,128\n\t" /* advance past the 128 bytes just handled */ | |||||
| "brctg %[n],0b" | |||||
| : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) | |||||
| : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23"); | |||||
| } | } | ||||
| static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i; | |||||
| BLASLONG inc_x2 = 2 * inc_x; | |||||
| BLASLONG inc_x3 = inc_x2 + inc_x; | |||||
| FLOAT t0, t1, t2, t3; | |||||
| FLOAT da_r = alpha[0]; | |||||
| FLOAT da_i = alpha[1]; | |||||
| for (i = 0; i < n; i += 4) | |||||
| { | |||||
| t0 = da_r * x[0] - da_i * x[1]; | |||||
| t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; | |||||
| t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; | |||||
| t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; | |||||
| x[1] = da_i * x[0] + da_r * x[1]; | |||||
| x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; | |||||
| x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; | |||||
| x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; | |||||
| x[0] = t0; | |||||
| x[inc_x] = t1; | |||||
| x[inc_x2] = t2; | |||||
| x[inc_x3] = t3; | |||||
| x += 4 * inc_x; | |||||
| } | |||||
/*
 * Overwrites x with zeros without reading it (x is a write-only "=m"
 * operand covering n * 2 FLOATs).
 *
 * The loop runs n >> 4 times, is bottom-tested (so n >> 4 must be non-zero),
 * and stores eight 16-byte zero vectors, i.e. 128 bytes, per pass.
 */
| static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) { | |||||
| __asm__("vzero %%v0\n\t" /* v0 = all zero bits */ | |||||
| "srlg %[n],%[n],4\n\t" /* pass count = n / 16 */ | |||||
| "xgr %%r1,%%r1\n\t" /* r1 = byte offset into x */ | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" | |||||
| "vst %%v0,0(%%r1,%[x])\n\t" | |||||
| "vst %%v0,16(%%r1,%[x])\n\t" | |||||
| "vst %%v0,32(%%r1,%[x])\n\t" | |||||
| "vst %%v0,48(%%r1,%[x])\n\t" | |||||
| "vst %%v0,64(%%r1,%[x])\n\t" | |||||
| "vst %%v0,80(%%r1,%[x])\n\t" | |||||
| "vst %%v0,96(%%r1,%[x])\n\t" | |||||
| "vst %%v0,112(%%r1,%[x])\n\t" | |||||
| "agfi %%r1,128\n\t" /* advance past the 128 bytes just written */ | |||||
| "brctg %[n],0b" | |||||
| : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) | |||||
| : [x] "a"(x) | |||||
| : "cc", "r1", "v0"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { | |||||
| BLASLONG i = 0, j = 0; | |||||
| FLOAT temp0; | |||||
| FLOAT temp1; | |||||
| FLOAT alpha[2] __attribute__ ((aligned(16))); | |||||
| if (inc_x != 1) { | |||||
| inc_x <<= 1; | |||||
| if (da_r == 0.0) { | |||||
| BLASLONG n1 = n & -2; | |||||
| if (da_i == 0.0) { | |||||
/*
 * Scalar, strided complex scaling: multiplies elements of x in place by
 * alpha[0] + i*alpha[1], four elements per loop pass.
 *
 * n      number of elements to process; the loop steps i by 4, so n is
 *        expected to be a multiple of 4 (the caller in this file passes
 *        n & -8).
 * alpha  alpha[0] is the real part, alpha[1] the imaginary part.
 * x      first element; the real part sits at x[k * inc_x] and the imaginary
 *        part one FLOAT after it.
 * inc_x  distance between consecutive elements, in FLOATs (the caller passes
 *        its already doubled stride).
 *
 * All four new real parts are computed into t0..t3 before anything is stored,
 * so the imaginary updates still read the old real parts.
 */
| static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, | |||||
| BLASLONG inc_x) { | |||||
| BLASLONG i; | |||||
| BLASLONG inc_x2 = 2 * inc_x; | |||||
| BLASLONG inc_x3 = inc_x2 + inc_x; | |||||
| FLOAT t0, t1, t2, t3; | |||||
| FLOAT da_r = alpha[0]; | |||||
| FLOAT da_i = alpha[1]; | |||||
| for (i = 0; i < n; i += 4) { | |||||
| t0 = da_r * x[0] - da_i * x[1]; /* new real parts, held back until the end */ | |||||
| t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; | |||||
| t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; | |||||
| t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; | |||||
| x[1] = da_i * x[0] + da_r * x[1]; /* new imaginary parts, from the old real parts */ | |||||
| x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; | |||||
| x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; | |||||
| x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; | |||||
| x[0] = t0; | |||||
| x[inc_x] = t1; | |||||
| x[inc_x2] = t2; | |||||
| x[inc_x3] = t3; | |||||
| x += 4 * inc_x; /* move on to the next four elements */ | |||||
| } | |||||
| } | |||||
| while (j < n1) { | |||||
/*
 * Complex scaling driver: multiplies the elements of x in place by
 * (da_r + i*da_i):
 *     re' = da_r * re - da_i * im
 *     im' = da_r * im + da_i * re
 *
 * The rows below come from both sides of a side-by-side diff, so statements
 * of the two renderings of this function alternate; both encode the steps
 * described here.
 *
 * inc_x != 1: inc_x is doubled into a stride in FLOATs, then one of four
 *   scalar paths is chosen by comparing da_r and da_i with 0.0:
 *     both zero      -> zeros are stored without reading x;
 *     da_r zero only -> re' = -da_i * im, im' = da_i * re;
 *     da_i zero only -> both parts multiplied by da_r;
 *     neither        -> the first n & -8 elements go to cscal_kernel_inc_8,
 *                       the remaining ones use the full formula.
 *   The first three paths process two elements per pass up to n & -2 and
 *   then finish element by element. This branch returns 0.
 * inc_x == 1: the first n & -16 elements (if any) go to one of the
 *   cscal_kernel_16* kernels picked by the same zero tests, and the leftover
 *   elements run through the matching scalar loop.
 *
 * y, inc_y, dummy0, dummy1, dummy and dummy2 are not referenced.
 * Always returns 0.
 */
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||||
| BLASLONG dummy2) { | |||||
| BLASLONG i = 0, j = 0; | |||||
| FLOAT temp0; | |||||
| FLOAT temp1; | |||||
| FLOAT alpha[2] __attribute__ ((aligned(16))); | |||||
| x[i] = 0.0; | |||||
| x[i + 1] = 0.0; | |||||
| x[i + inc_x] = 0.0; | |||||
| x[i + 1 + inc_x] = 0.0; | |||||
| i += 2 * inc_x; | |||||
| j += 2; | |||||
| if (inc_x != 1) { | |||||
| inc_x <<= 1; /* stride in FLOATs: two per complex element */ | |||||
| } | |||||
| if (da_r == 0.0) { | |||||
| while (j < n) { | |||||
| BLASLONG n1 = n & -2; | |||||
| x[i] = 0.0; | |||||
| x[i + 1] = 0.0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| if (da_i == 0.0) { | |||||
| } | |||||
| while (j < n1) { | |||||
| } else { | |||||
| x[i] = 0.0; | |||||
| x[i + 1] = 0.0; | |||||
| x[i + inc_x] = 0.0; | |||||
| x[i + 1 + inc_x] = 0.0; | |||||
| i += 2 * inc_x; | |||||
| j += 2; | |||||
| while (j < n1) { | |||||
| } | |||||
| temp0 = -da_i * x[i + 1]; | |||||
| x[i + 1] = da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| temp1 = -da_i * x[i + 1 + inc_x]; | |||||
| x[i + 1 + inc_x] = da_i * x[i + inc_x]; | |||||
| x[i + inc_x] = temp1; | |||||
| i += 2 * inc_x; | |||||
| j += 2; | |||||
| while (j < n) { | |||||
| } | |||||
| x[i] = 0.0; | |||||
| x[i + 1] = 0.0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| while (j < n) { | |||||
| } | |||||
| temp0 = -da_i * x[i + 1]; | |||||
| x[i + 1] = da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } else { | |||||
| } | |||||
| while (j < n1) { | |||||
| temp0 = -da_i * x[i + 1]; | |||||
| x[i + 1] = da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| temp1 = -da_i * x[i + 1 + inc_x]; | |||||
| x[i + 1 + inc_x] = da_i * x[i + inc_x]; | |||||
| x[i + inc_x] = temp1; | |||||
| i += 2 * inc_x; | |||||
| j += 2; | |||||
| } | |||||
| } | |||||
| while (j < n) { | |||||
| } else { | |||||
| temp0 = -da_i * x[i + 1]; | |||||
| x[i + 1] = da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| if (da_i == 0.0) { | |||||
| BLASLONG n1 = n & -2; | |||||
| } | |||||
| while (j < n1) { | |||||
| } else { | |||||
| temp0 = da_r * x[i]; | |||||
| x[i + 1] = da_r * x[i + 1]; | |||||
| x[i] = temp0; | |||||
| temp1 = da_r * x[i + inc_x]; | |||||
| x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; | |||||
| x[i + inc_x] = temp1; | |||||
| i += 2 * inc_x; | |||||
| j += 2; | |||||
| if (da_i == 0.0) { | |||||
| BLASLONG n1 = n & -2; | |||||
| } | |||||
| while (j < n1) { | |||||
| while (j < n) { | |||||
| temp0 = da_r * x[i]; | |||||
| x[i + 1] = da_r * x[i + 1]; | |||||
| x[i] = temp0; | |||||
| temp1 = da_r * x[i + inc_x]; | |||||
| x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; | |||||
| x[i + inc_x] = temp1; | |||||
| i += 2 * inc_x; | |||||
| j += 2; | |||||
| temp0 = da_r * x[i]; | |||||
| x[i + 1] = da_r * x[i + 1]; | |||||
| x[i] = temp0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| while (j < n) { | |||||
| } else { | |||||
| temp0 = da_r * x[i]; | |||||
| x[i + 1] = da_r * x[i + 1]; | |||||
| x[i] = temp0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| BLASLONG n1 = n & -8; | |||||
| if (n1 > 0) { | |||||
| alpha[0] = da_r; | |||||
| alpha[1] = da_i; | |||||
| cscal_kernel_inc_8(n1, alpha, x, inc_x); | |||||
| j = n1; | |||||
| i = n1 * inc_x; | |||||
| } | |||||
| } | |||||
| while (j < n) { | |||||
| } else { | |||||
| temp0 = da_r * x[i] - da_i * x[i + 1]; | |||||
| x[i + 1] = da_r * x[i + 1] + da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| BLASLONG n1 = n & -8; | |||||
| if (n1 > 0) { | |||||
| alpha[0] = da_r; | |||||
| alpha[1] = da_i; | |||||
| cscal_kernel_inc_8(n1, alpha, x, inc_x); | |||||
| j = n1; | |||||
| i = n1 * inc_x; | |||||
| } | |||||
| } | |||||
| while (j < n) { | |||||
| } | |||||
| temp0 = da_r * x[i] - da_i * x[i + 1]; | |||||
| x[i + 1] = da_r * x[i + 1] + da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (0); | |||||
| } | |||||
| } | |||||
| } | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| return (0); | |||||
| } | |||||
| alpha[0] = da_r; | |||||
| alpha[1] = da_i; | |||||
| BLASLONG n1 = n & -16; /* n rounded down to a multiple of 16 for the vector kernels */ | |||||
| if (n1 > 0) { | |||||
| if (da_r == 0.0) | |||||
| if (da_i == 0) | |||||
| cscal_kernel_16_zero(n1, x); | |||||
| else | |||||
| cscal_kernel_16_zero_r(n1, alpha, x); | |||||
| else | |||||
| if (da_i == 0) | |||||
| cscal_kernel_16_zero_i(n1, alpha, x); | |||||
| else | |||||
| cscal_kernel_16(n1, alpha, x); | |||||
| alpha[0] = da_r; | |||||
| alpha[1] = da_i; | |||||
| i = n1 << 1; | |||||
| j = n1; | |||||
| } | |||||
| if (da_r == 0.0) | |||||
| if (da_i == 0) | |||||
| cscal_kernel_16_zero(n1, x); | |||||
| else | |||||
| cscal_kernel_16_zero_r(n1, alpha, x); | |||||
| else if (da_i == 0) | |||||
| cscal_kernel_16_zero_i(n1, alpha, x); | |||||
| else | |||||
| cscal_kernel_16(n1, alpha, x); | |||||
| i = n1 << 1; /* FLOAT index of the first element the kernel did not process */ | |||||
| j = n1; | |||||
| } | |||||
| if (da_r == 0.0) { | |||||
| if (da_r == 0.0) { | |||||
| if (da_i == 0.0) { | |||||
| if (da_i == 0.0) { | |||||
| while (j < n) { | |||||
| while (j < n) { | |||||
| x[i] = 0.0; | |||||
| x[i + 1] = 0.0; | |||||
| i += 2; | |||||
| j++; | |||||
| x[i] = 0.0; | |||||
| x[i + 1] = 0.0; | |||||
| i += 2; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| } else { | |||||
| while (j < n) { | |||||
| while (j < n) { | |||||
| temp0 = -da_i * x[i + 1]; | |||||
| x[i + 1] = da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += 2; | |||||
| j++; | |||||
| temp0 = -da_i * x[i + 1]; | |||||
| x[i + 1] = da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += 2; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } else { | |||||
| } else { | |||||
| if (da_i == 0.0) { | |||||
| if (da_i == 0.0) { | |||||
| while (j < n) { | |||||
| while (j < n) { | |||||
| temp0 = da_r * x[i]; | |||||
| x[i + 1] = da_r * x[i + 1]; | |||||
| x[i] = temp0; | |||||
| i += 2; | |||||
| j++; | |||||
| temp0 = da_r * x[i]; | |||||
| x[i + 1] = da_r * x[i + 1]; | |||||
| x[i] = temp0; | |||||
| i += 2; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| } else { | |||||
| while (j < n) { | |||||
| while (j < n) { | |||||
| temp0 = da_r * x[i] - da_i * x[i + 1]; | |||||
| x[i + 1] = da_r * x[i + 1] + da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += 2; | |||||
| j++; | |||||
| temp0 = da_r * x[i] - da_i * x[i + 1]; | |||||
| x[i + 1] = da_r * x[i + 1] + da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += 2; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | } | ||||
| return (0); | |||||
| } | |||||
| return (0); | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,157 +27,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%1) \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%1) \n\t" | |||||
| "vl %%v17, 16(%%r1,%1) \n\t" | |||||
| "vl %%v18, 32(%%r1,%1) \n\t" | |||||
| "vl %%v19, 48(%%r1,%1) \n\t" | |||||
| "vl %%v20, 64(%%r1,%1) \n\t" | |||||
| "vl %%v21, 80(%%r1,%1) \n\t" | |||||
| "vl %%v22, 96(%%r1,%1) \n\t" | |||||
| "vl %%v23, 112(%%r1,%1) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v28, 192(%%r1,%1) \n\t" | |||||
| "vl %%v29, 208(%%r1,%1) \n\t" | |||||
| "vl %%v30, 224(%%r1,%1) \n\t" | |||||
| "vl %%v31, 240(%%r1,%1) \n\t" | |||||
| "vl %%v0, 0(%%r1,%2) \n\t" | |||||
| "vl %%v1, 16(%%r1,%2) \n\t" | |||||
| "vl %%v2, 32(%%r1,%2) \n\t" | |||||
| "vl %%v3, 48(%%r1,%2) \n\t" | |||||
| "vl %%v4, 64(%%r1,%2) \n\t" | |||||
| "vl %%v5, 80(%%r1,%2) \n\t" | |||||
| "vl %%v6, 96(%%r1,%2) \n\t" | |||||
| "vl %%v7, 112(%%r1,%2) \n\t" | |||||
| "vst %%v0, 0(%%r1,%1) \n\t" | |||||
| "vst %%v1, 16(%%r1,%1) \n\t" | |||||
| "vst %%v2, 32(%%r1,%1) \n\t" | |||||
| "vst %%v3, 48(%%r1,%1) \n\t" | |||||
| "vst %%v4, 64(%%r1,%1) \n\t" | |||||
| "vst %%v5, 80(%%r1,%1) \n\t" | |||||
| "vst %%v6, 96(%%r1,%1) \n\t" | |||||
| "vst %%v7, 112(%%r1,%1) \n\t" | |||||
| "vl %%v0, 128(%%r1,%2) \n\t" | |||||
| "vl %%v1, 144(%%r1,%2) \n\t" | |||||
| "vl %%v2, 160(%%r1,%2) \n\t" | |||||
| "vl %%v3, 176(%%r1,%2) \n\t" | |||||
| "vl %%v4, 192(%%r1,%2) \n\t" | |||||
| "vl %%v5, 208(%%r1,%2) \n\t" | |||||
| "vl %%v6, 224(%%r1,%2) \n\t" | |||||
| "vl %%v7, 240(%%r1,%2) \n\t" | |||||
| "vst %%v0, 128(%%r1,%1) \n\t" | |||||
| "vst %%v1, 144(%%r1,%1) \n\t" | |||||
| "vst %%v2, 160(%%r1,%1) \n\t" | |||||
| "vst %%v3, 176(%%r1,%1) \n\t" | |||||
| "vst %%v4, 192(%%r1,%1) \n\t" | |||||
| "vst %%v5, 208(%%r1,%1) \n\t" | |||||
| "vst %%v6, 224(%%r1,%1) \n\t" | |||||
| "vst %%v7, 240(%%r1,%1) \n\t" | |||||
| "vst %%v16, 0(%%r1,%2) \n\t" | |||||
| "vst %%v17, 16(%%r1,%2) \n\t" | |||||
| "vst %%v18, 32(%%r1,%2) \n\t" | |||||
| "vst %%v19, 48(%%r1,%2) \n\t" | |||||
| "vst %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%2) \n\t" | |||||
| "vst %%v24, 128(%%r1,%2) \n\t" | |||||
| "vst %%v25, 144(%%r1,%2) \n\t" | |||||
| "vst %%v26, 160(%%r1,%2) \n\t" | |||||
| "vst %%v27, 176(%%r1,%2) \n\t" | |||||
| "vst %%v28, 192(%%r1,%2) \n\t" | |||||
| "vst %%v29, 208(%%r1,%2) \n\t" | |||||
| "vst %%v30, 224(%%r1,%2) \n\t" | |||||
| "vst %%v31, 240(%%r1,%2) \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| __asm__("srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" | |||||
| "pfd 2, 1024(%%r1,%[y])\n\t" | |||||
| "vl %%v16, 0(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 64(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 112(%%r1,%[x])\n\t" | |||||
| "vl %%v24, 128(%%r1,%[x])\n\t" | |||||
| "vl %%v25, 144(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 160(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 176(%%r1,%[x])\n\t" | |||||
| "vl %%v28, 192(%%r1,%[x])\n\t" | |||||
| "vl %%v29, 208(%%r1,%[x])\n\t" | |||||
| "vl %%v30, 224(%%r1,%[x])\n\t" | |||||
| "vl %%v31, 240(%%r1,%[x])\n\t" | |||||
| "vl %%v0, 0(%%r1,%[y])\n\t" | |||||
| "vl %%v1, 16(%%r1,%[y])\n\t" | |||||
| "vl %%v2, 32(%%r1,%[y])\n\t" | |||||
| "vl %%v3, 48(%%r1,%[y])\n\t" | |||||
| "vl %%v4, 64(%%r1,%[y])\n\t" | |||||
| "vl %%v5, 80(%%r1,%[y])\n\t" | |||||
| "vl %%v6, 96(%%r1,%[y])\n\t" | |||||
| "vl %%v7, 112(%%r1,%[y])\n\t" | |||||
| "vst %%v0, 0(%%r1,%[x])\n\t" | |||||
| "vst %%v1, 16(%%r1,%[x])\n\t" | |||||
| "vst %%v2, 32(%%r1,%[x])\n\t" | |||||
| "vst %%v3, 48(%%r1,%[x])\n\t" | |||||
| "vst %%v4, 64(%%r1,%[x])\n\t" | |||||
| "vst %%v5, 80(%%r1,%[x])\n\t" | |||||
| "vst %%v6, 96(%%r1,%[x])\n\t" | |||||
| "vst %%v7, 112(%%r1,%[x])\n\t" | |||||
| "vl %%v0, 128(%%r1,%[y])\n\t" | |||||
| "vl %%v1, 144(%%r1,%[y])\n\t" | |||||
| "vl %%v2, 160(%%r1,%[y])\n\t" | |||||
| "vl %%v3, 176(%%r1,%[y])\n\t" | |||||
| "vl %%v4, 192(%%r1,%[y])\n\t" | |||||
| "vl %%v5, 208(%%r1,%[y])\n\t" | |||||
| "vl %%v6, 224(%%r1,%[y])\n\t" | |||||
| "vl %%v7, 240(%%r1,%[y])\n\t" | |||||
| "vst %%v0, 128(%%r1,%[x])\n\t" | |||||
| "vst %%v1, 144(%%r1,%[x])\n\t" | |||||
| "vst %%v2, 160(%%r1,%[x])\n\t" | |||||
| "vst %%v3, 176(%%r1,%[x])\n\t" | |||||
| "vst %%v4, 192(%%r1,%[x])\n\t" | |||||
| "vst %%v5, 208(%%r1,%[x])\n\t" | |||||
| "vst %%v6, 224(%%r1,%[x])\n\t" | |||||
| "vst %%v7, 240(%%r1,%[x])\n\t" | |||||
| "vst %%v16, 0(%%r1,%[y])\n\t" | |||||
| "vst %%v17, 16(%%r1,%[y])\n\t" | |||||
| "vst %%v18, 32(%%r1,%[y])\n\t" | |||||
| "vst %%v19, 48(%%r1,%[y])\n\t" | |||||
| "vst %%v20, 64(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 80(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 96(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 112(%%r1,%[y])\n\t" | |||||
| "vst %%v24, 128(%%r1,%[y])\n\t" | |||||
| "vst %%v25, 144(%%r1,%[y])\n\t" | |||||
| "vst %%v26, 160(%%r1,%[y])\n\t" | |||||
| "vst %%v27, 176(%%r1,%[y])\n\t" | |||||
| "vst %%v28, 192(%%r1,%[y])\n\t" | |||||
| "vst %%v29, 208(%%r1,%[y])\n\t" | |||||
| "vst %%v30, 224(%%r1,%[y])\n\t" | |||||
| "vst %%v31, 240(%%r1,%[y])\n\t" | |||||
| "agfi %%r1,256\n\t" | |||||
| "brctg %[n],0b" | |||||
| : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) | |||||
| : [x] "a"(x),[y] "a"(y) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", | |||||
| "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |||||
| "v27", "v28", "v29", "v30", "v31"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT temp[2]; | |||||
| BLASLONG inc_x2, inc_y2; | |||||
| if ( n <= 0 ) return(0); | |||||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||||
| { | |||||
| BLASLONG n1 = n & -32; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| cswap_kernel_32(n1, x, y); | |||||
| i=n1; | |||||
| ix = 2* n1; | |||||
| iy = 2* n1; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| temp[0] = x[ix] ; | |||||
| temp[1] = x[ix+1] ; | |||||
| x[ix] = y[iy] ; | |||||
| x[ix+1] = y[iy+1] ; | |||||
| y[iy] = temp[0] ; | |||||
| y[iy+1] = temp[1] ; | |||||
| ix += 2 ; | |||||
| iy += 2 ; | |||||
| i++ ; | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, | |||||
| FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||||
| FLOAT *dummy, BLASLONG dummy2) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| FLOAT temp[2]; | |||||
| BLASLONG inc_x2, inc_y2; | |||||
| if (n <= 0) | |||||
| return (0); | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| cswap_kernel_32(n1, x, y); | |||||
| i = n1; | |||||
| ix = 2 * n1; | |||||
| iy = 2 * n1; | |||||
| } | |||||
| while (i < n) { | |||||
| } | |||||
| temp[0] = x[ix]; | |||||
| temp[1] = x[ix + 1]; | |||||
| x[ix] = y[iy]; | |||||
| x[ix + 1] = y[iy + 1]; | |||||
| y[iy] = temp[0]; | |||||
| y[iy + 1] = temp[1]; | |||||
| ix += 2; | |||||
| iy += 2; | |||||
| i++; | |||||
| } | } | ||||
| else | |||||
| { | |||||
| inc_x2 = 2 * inc_x; | |||||
| inc_y2 = 2 * inc_y; | |||||
| } else { | |||||
| while(i < n) | |||||
| { | |||||
| inc_x2 = 2 * inc_x; | |||||
| inc_y2 = 2 * inc_y; | |||||
| temp[0] = x[ix] ; | |||||
| temp[1] = x[ix+1] ; | |||||
| x[ix] = y[iy] ; | |||||
| x[ix+1] = y[iy+1] ; | |||||
| y[iy] = temp[0] ; | |||||
| y[iy+1] = temp[1] ; | |||||
| while (i < n) { | |||||
| ix += inc_x2 ; | |||||
| iy += inc_y2 ; | |||||
| i++ ; | |||||
| temp[0] = x[ix]; | |||||
| temp[1] = x[ix + 1]; | |||||
| x[ix] = y[iy]; | |||||
| x[ix + 1] = y[iy + 1]; | |||||
| y[iy] = temp[0]; | |||||
| y[iy + 1] = temp[1]; | |||||
| } | |||||
| ix += inc_x2; | |||||
| iy += inc_y2; | |||||
| i++; | |||||
| } | } | ||||
| return(0); | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,139 +28,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | #define ABS fabs | ||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT amax; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%2) \n\t" | |||||
| "srlg %%r0,%1,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vl %%v24,128(%%r1,%2) \n\t" | |||||
| "vl %%v25,144(%%r1,%2) \n\t" | |||||
| "vl %%v26,160(%%r1,%2) \n\t" | |||||
| "vl %%v27,176(%%r1,%2) \n\t" | |||||
| "vl %%v28,192(%%r1,%2) \n\t" | |||||
| "vl %%v29,208(%%r1,%2) \n\t" | |||||
| "vl %%v30,224(%%r1,%2) \n\t" | |||||
| "vl %%v31,240(%%r1,%2) \n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v24,8 \n\t" | |||||
| "vfmaxdb %%v17,%%v17,%%v25,8 \n\t" | |||||
| "vfmaxdb %%v18,%%v18,%%v26,8 \n\t" | |||||
| "vfmaxdb %%v19,%%v19,%%v27,8 \n\t" | |||||
| "vfmaxdb %%v20,%%v20,%%v28,8 \n\t" | |||||
| "vfmaxdb %%v21,%%v21,%%v29,8 \n\t" | |||||
| "vfmaxdb %%v22,%%v22,%%v30,8 \n\t" | |||||
| "vfmaxdb %%v23,%%v23,%%v31,8 \n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v20,8 \n\t" | |||||
| "vfmaxdb %%v17,%%v17,%%v21,8 \n\t" | |||||
| "vfmaxdb %%v18,%%v18,%%v22,8 \n\t" | |||||
| "vfmaxdb %%v19,%%v19,%%v23,8 \n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v18,8 \n\t" | |||||
| "vfmaxdb %%v17,%%v17,%%v19,8 \n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v17,8 \n\t" | |||||
| "vfmaxdb %%v0,%%v0,%%v16,8 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v16,%%v0,1 \n\t" | |||||
| "wfmaxdb %%v0,%%v0,%%v16,8 \n\t" | |||||
| "lpdr %0,%%f0 " | |||||
| :"=f"(amax) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return amax; | |||||
| } | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return (maxf); | |||||
| static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| FLOAT amax; | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vl %%v24,128(%%r1,%[x])\n\t" | |||||
| "vl %%v25,144(%%r1,%[x])\n\t" | |||||
| "vl %%v26,160(%%r1,%[x])\n\t" | |||||
| "vl %%v27,176(%%r1,%[x])\n\t" | |||||
| "vl %%v28,192(%%r1,%[x])\n\t" | |||||
| "vl %%v29,208(%%r1,%[x])\n\t" | |||||
| "vl %%v30,224(%%r1,%[x])\n\t" | |||||
| "vl %%v31,240(%%r1,%[x])\n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v24,8\n\t" | |||||
| "vfmaxdb %%v17,%%v17,%%v25,8\n\t" | |||||
| "vfmaxdb %%v18,%%v18,%%v26,8\n\t" | |||||
| "vfmaxdb %%v19,%%v19,%%v27,8\n\t" | |||||
| "vfmaxdb %%v20,%%v20,%%v28,8\n\t" | |||||
| "vfmaxdb %%v21,%%v21,%%v29,8\n\t" | |||||
| "vfmaxdb %%v22,%%v22,%%v30,8\n\t" | |||||
| "vfmaxdb %%v23,%%v23,%%v31,8\n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v20,8\n\t" | |||||
| "vfmaxdb %%v17,%%v17,%%v21,8\n\t" | |||||
| "vfmaxdb %%v18,%%v18,%%v22,8\n\t" | |||||
| "vfmaxdb %%v19,%%v19,%%v23,8\n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v18,8\n\t" | |||||
| "vfmaxdb %%v17,%%v17,%%v19,8\n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v17,8\n\t" | |||||
| "vfmaxdb %%v0,%%v0,%%v16,8\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "vrepg %%v16,%%v0,1\n\t" | |||||
| "wfmaxdb %%v0,%%v0,%%v16,8\n\t" | |||||
| "lpdr %[amax],%%f0" | |||||
| : [amax] "=f"(amax),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return amax; | |||||
| } | |||||
| if (inc_x == 1) { | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (maxf); | |||||
| maxf = damax_kernel_32(n1, x); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| maxf=ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| maxf = damax_kernel_32(n1, x); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| maxf = ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| maxf=ABS(x[0]); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| maxf = ABS(x[0]); | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| j += 4; | |||||
| i += inc_x * 4; | |||||
| } | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (maxf); | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (maxf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,177 +28,157 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | #define ABS fabs | ||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT amax; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%2) \n\t" | |||||
| "vflpdb %%v0,%%v0 \n\t" | |||||
| "srlg %%r0,%1,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfchdb %%v24,%%v16,%%v17 \n\t" | |||||
| "vfchdb %%v25,%%v18,%%v19 \n\t" | |||||
| "vfchdb %%v26,%%v20,%%v21 \n\t" | |||||
| "vfchdb %%v27,%%v22,%%v23 \n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24 \n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25 \n\t" | |||||
| "vsel %%v26,%%v20,%%v21,%%v26 \n\t" | |||||
| "vsel %%v27,%%v22,%%v23,%%v27 \n\t" | |||||
| "vfchdb %%v28,%%v24,%%v25 \n\t" | |||||
| "vfchdb %%v29,%%v26,%%v27 \n\t" | |||||
| "vsel %%v28,%%v24,%%v25,%%v28 \n\t" | |||||
| "vsel %%v29,%%v26,%%v27,%%v29 \n\t" | |||||
| "vfchdb %%v30,%%v28,%%v29 \n\t" | |||||
| "vsel %%v30,%%v28,%%v29,%%v30 \n\t" | |||||
| "vfchdb %%v31,%%v30,%%v0 \n\t" | |||||
| "vsel %%v0,%%v30,%%v0,%%v31 \n\t" | |||||
| "vl %%v16,128(%%r1,%2) \n\t" | |||||
| "vl %%v17,144(%%r1,%2) \n\t" | |||||
| "vl %%v18,160(%%r1,%2) \n\t" | |||||
| "vl %%v19,176(%%r1,%2) \n\t" | |||||
| "vl %%v20,192(%%r1,%2) \n\t" | |||||
| "vl %%v21,208(%%r1,%2) \n\t" | |||||
| "vl %%v22,224(%%r1,%2) \n\t" | |||||
| "vl %%v23,240(%%r1,%2) \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfchdb %%v24,%%v16,%%v17 \n\t" | |||||
| "vfchdb %%v25,%%v18,%%v19 \n\t" | |||||
| "vfchdb %%v26,%%v20,%%v21 \n\t" | |||||
| "vfchdb %%v27,%%v22,%%v23 \n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24 \n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25 \n\t" | |||||
| "vsel %%v26,%%v20,%%v21,%%v26 \n\t" | |||||
| "vsel %%v27,%%v22,%%v23,%%v27 \n\t" | |||||
| "vfchdb %%v28,%%v24,%%v25 \n\t" | |||||
| "vfchdb %%v29,%%v26,%%v27 \n\t" | |||||
| "vsel %%v28,%%v24,%%v25,%%v28 \n\t" | |||||
| "vsel %%v29,%%v26,%%v27,%%v29 \n\t" | |||||
| "vfchdb %%v30,%%v28,%%v29 \n\t" | |||||
| "vsel %%v30,%%v28,%%v29,%%v30 \n\t" | |||||
| "vfchdb %%v31,%%v30,%%v0 \n\t" | |||||
| "vsel %%v0,%%v30,%%v0,%%v31 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v16,%%v0,1 \n\t" | |||||
| "wfchdb %%v17,%%v0,%%v16 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v17 \n\t" | |||||
| "ldr %0,%%f0 " | |||||
| :"=f"(amax) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return amax; | |||||
| } | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return (maxf); | |||||
| static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| FLOAT amax; | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "vflpdb %%v0,%%v0\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfchdb %%v24,%%v16,%%v17\n\t" | |||||
| "vfchdb %%v25,%%v18,%%v19\n\t" | |||||
| "vfchdb %%v26,%%v20,%%v21\n\t" | |||||
| "vfchdb %%v27,%%v22,%%v23\n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24\n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25\n\t" | |||||
| "vsel %%v26,%%v20,%%v21,%%v26\n\t" | |||||
| "vsel %%v27,%%v22,%%v23,%%v27\n\t" | |||||
| "vfchdb %%v28,%%v24,%%v25\n\t" | |||||
| "vfchdb %%v29,%%v26,%%v27\n\t" | |||||
| "vsel %%v28,%%v24,%%v25,%%v28\n\t" | |||||
| "vsel %%v29,%%v26,%%v27,%%v29\n\t" | |||||
| "vfchdb %%v30,%%v28,%%v29\n\t" | |||||
| "vsel %%v30,%%v28,%%v29,%%v30\n\t" | |||||
| "vfchdb %%v31,%%v30,%%v0\n\t" | |||||
| "vsel %%v0,%%v30,%%v0,%%v31\n\t" | |||||
| "vl %%v16,128(%%r1,%[x])\n\t" | |||||
| "vl %%v17,144(%%r1,%[x])\n\t" | |||||
| "vl %%v18,160(%%r1,%[x])\n\t" | |||||
| "vl %%v19,176(%%r1,%[x])\n\t" | |||||
| "vl %%v20,192(%%r1,%[x])\n\t" | |||||
| "vl %%v21,208(%%r1,%[x])\n\t" | |||||
| "vl %%v22,224(%%r1,%[x])\n\t" | |||||
| "vl %%v23,240(%%r1,%[x])\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfchdb %%v24,%%v16,%%v17\n\t" | |||||
| "vfchdb %%v25,%%v18,%%v19\n\t" | |||||
| "vfchdb %%v26,%%v20,%%v21\n\t" | |||||
| "vfchdb %%v27,%%v22,%%v23\n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24\n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25\n\t" | |||||
| "vsel %%v26,%%v20,%%v21,%%v26\n\t" | |||||
| "vsel %%v27,%%v22,%%v23,%%v27\n\t" | |||||
| "vfchdb %%v28,%%v24,%%v25\n\t" | |||||
| "vfchdb %%v29,%%v26,%%v27\n\t" | |||||
| "vsel %%v28,%%v24,%%v25,%%v28\n\t" | |||||
| "vsel %%v29,%%v26,%%v27,%%v29\n\t" | |||||
| "vfchdb %%v30,%%v28,%%v29\n\t" | |||||
| "vsel %%v30,%%v28,%%v29,%%v30\n\t" | |||||
| "vfchdb %%v31,%%v30,%%v0\n\t" | |||||
| "vsel %%v0,%%v30,%%v0,%%v31\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "vrepg %%v16,%%v0,1\n\t" | |||||
| "wfchdb %%v17,%%v0,%%v16\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v17\n\t" | |||||
| "ldr %[amax],%%f0" | |||||
| : [amax] "=f"(amax),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return amax; | |||||
| } | |||||
| if (inc_x == 1) { | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (maxf); | |||||
| maxf = damax_kernel_32(n1, x); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| maxf=ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| maxf = damax_kernel_32(n1, x); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| maxf = ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| maxf=ABS(x[0]); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| maxf = ABS(x[0]); | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| j += 4; | |||||
| i += inc_x * 4; | |||||
| } | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (maxf); | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (maxf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,139 +28,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | #define ABS fabs | ||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT amin; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%2) \n\t" | |||||
| "srlg %%r0,%1,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vl %%v24,128(%%r1,%2) \n\t" | |||||
| "vl %%v25,144(%%r1,%2) \n\t" | |||||
| "vl %%v26,160(%%r1,%2) \n\t" | |||||
| "vl %%v27,176(%%r1,%2) \n\t" | |||||
| "vl %%v28,192(%%r1,%2) \n\t" | |||||
| "vl %%v29,208(%%r1,%2) \n\t" | |||||
| "vl %%v30,224(%%r1,%2) \n\t" | |||||
| "vl %%v31,240(%%r1,%2) \n\t" | |||||
| "vfmindb %%v16,%%v16,%%v24,8 \n\t" | |||||
| "vfmindb %%v17,%%v17,%%v25,8 \n\t" | |||||
| "vfmindb %%v18,%%v18,%%v26,8 \n\t" | |||||
| "vfmindb %%v19,%%v19,%%v27,8 \n\t" | |||||
| "vfmindb %%v20,%%v20,%%v28,8 \n\t" | |||||
| "vfmindb %%v21,%%v21,%%v29,8 \n\t" | |||||
| "vfmindb %%v22,%%v22,%%v30,8 \n\t" | |||||
| "vfmindb %%v23,%%v23,%%v31,8 \n\t" | |||||
| "vfmindb %%v16,%%v16,%%v20,8 \n\t" | |||||
| "vfmindb %%v17,%%v17,%%v21,8 \n\t" | |||||
| "vfmindb %%v18,%%v18,%%v22,8 \n\t" | |||||
| "vfmindb %%v19,%%v19,%%v23,8 \n\t" | |||||
| "vfmindb %%v16,%%v16,%%v18,8 \n\t" | |||||
| "vfmindb %%v17,%%v17,%%v19,8 \n\t" | |||||
| "vfmindb %%v16,%%v16,%%v17,8 \n\t" | |||||
| "vfmindb %%v0,%%v0,%%v16,8 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v16,%%v0,1 \n\t" | |||||
| "wfmindb %%v0,%%v0,%%v16,8 \n\t" | |||||
| "lpdr %0,%%f0 " | |||||
| :"=f"(amin) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return amin; | |||||
| } | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return (minf); | |||||
/*
 * damin_kernel_32: vectorised scan over x that returns a non-negative FLOAT.
 *
 * n is shifted right by 5 to form the loop count. Each pass loads sixteen
 * 16-byte vectors (byte offsets 0..240 from x + r1) and then advances r1 by
 * 256. The loop body runs before the count is tested (brctg sits at the
 * bottom), so a count of zero is not handled here.
 * NOTE(review): presumably callers pass a positive multiple of 32 and FLOAT
 * is an 8-byte double in this build -- confirm against the call site.
 */
| static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| FLOAT amin; | |||||
/* v0 holds the running result and is seeded from the first vector of x */
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
/* r1 = 0: byte offset into x */
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vl %%v24,128(%%r1,%[x])\n\t" | |||||
| "vl %%v25,144(%%r1,%[x])\n\t" | |||||
| "vl %%v26,160(%%r1,%[x])\n\t" | |||||
| "vl %%v27,176(%%r1,%[x])\n\t" | |||||
| "vl %%v28,192(%%r1,%[x])\n\t" | |||||
| "vl %%v29,208(%%r1,%[x])\n\t" | |||||
| "vl %%v30,224(%%r1,%[x])\n\t" | |||||
| "vl %%v31,240(%%r1,%[x])\n\t" | |||||
/*
 * Pairwise reduction: 16 vectors -> 8 -> 4 -> 2 -> 1 (v16), then into v0.
 * NOTE(review): the trailing ",8" mode presumably selects the minimum by
 * magnitude -- confirm in the z/Architecture vector FP documentation.
 */
| "vfmindb %%v16,%%v16,%%v24,8\n\t" | |||||
| "vfmindb %%v17,%%v17,%%v25,8\n\t" | |||||
| "vfmindb %%v18,%%v18,%%v26,8\n\t" | |||||
| "vfmindb %%v19,%%v19,%%v27,8\n\t" | |||||
| "vfmindb %%v20,%%v20,%%v28,8\n\t" | |||||
| "vfmindb %%v21,%%v21,%%v29,8\n\t" | |||||
| "vfmindb %%v22,%%v22,%%v30,8\n\t" | |||||
| "vfmindb %%v23,%%v23,%%v31,8\n\t" | |||||
| "vfmindb %%v16,%%v16,%%v20,8\n\t" | |||||
| "vfmindb %%v17,%%v17,%%v21,8\n\t" | |||||
| "vfmindb %%v18,%%v18,%%v22,8\n\t" | |||||
| "vfmindb %%v19,%%v19,%%v23,8\n\t" | |||||
| "vfmindb %%v16,%%v16,%%v18,8\n\t" | |||||
| "vfmindb %%v17,%%v17,%%v19,8\n\t" | |||||
| "vfmindb %%v16,%%v16,%%v17,8\n\t" | |||||
| "vfmindb %%v0,%%v0,%%v16,8\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
/* fold lane 1 of v0 into lane 0; lpdr then loads the result with a positive sign */
| "vrepg %%v16,%%v0,1\n\t" | |||||
| "wfmindb %%v0,%%v0,%%v16,8\n\t" | |||||
| "lpdr %[amin],%%f0" | |||||
/* n is read-write because srlg/brctg modify it; x is covered by the "m" input */
| : [amin] "=f"(amin),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return amin; | |||||
| } | |||||
| if (inc_x == 1) { | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (minf); | |||||
| minf = damin_kernel_32(n1, x); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| minf=ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| minf = damin_kernel_32(n1, x); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| minf = ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| minf=ABS(x[0]); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| minf = ABS(x[0]); | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) < minf) { | |||||
| minf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) < minf) { | |||||
| minf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) < minf) { | |||||
| minf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) < minf) { | |||||
| minf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) < minf) { | |||||
| minf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) < minf) { | |||||
| minf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| j += 4; | |||||
| i += inc_x * 4; | |||||
| } | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (minf); | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (minf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,177 +28,157 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | #define ABS fabs | ||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT amin; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%2) \n\t" | |||||
| "vflpdb %%v0,%%v0 \n\t" | |||||
| "srlg %%r0,%1,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfchdb %%v24,%%v17,%%v16 \n\t" | |||||
| "vfchdb %%v25,%%v19,%%v18 \n\t" | |||||
| "vfchdb %%v26,%%v21,%%v20 \n\t" | |||||
| "vfchdb %%v27,%%v23,%%v22 \n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24 \n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25 \n\t" | |||||
| "vsel %%v26,%%v20,%%v21,%%v26 \n\t" | |||||
| "vsel %%v27,%%v22,%%v23,%%v27 \n\t" | |||||
| "vfchdb %%v28,%%v25,%%v24 \n\t" | |||||
| "vfchdb %%v29,%%v27,%%v26 \n\t" | |||||
| "vsel %%v28,%%v24,%%v25,%%v28 \n\t" | |||||
| "vsel %%v29,%%v26,%%v27,%%v29 \n\t" | |||||
| "vfchdb %%v30,%%v29,%%v28 \n\t" | |||||
| "vsel %%v30,%%v28,%%v29,%%v30 \n\t" | |||||
| "vfchdb %%v31,%%v0,%%v30 \n\t" | |||||
| "vsel %%v0,%%v30,%%v0,%%v31 \n\t" | |||||
| "vl %%v16,128(%%r1,%2) \n\t" | |||||
| "vl %%v17,144(%%r1,%2) \n\t" | |||||
| "vl %%v18,160(%%r1,%2) \n\t" | |||||
| "vl %%v19,176(%%r1,%2) \n\t" | |||||
| "vl %%v20,192(%%r1,%2) \n\t" | |||||
| "vl %%v21,208(%%r1,%2) \n\t" | |||||
| "vl %%v22,224(%%r1,%2) \n\t" | |||||
| "vl %%v23,240(%%r1,%2) \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfchdb %%v24,%%v17,%%v16 \n\t" | |||||
| "vfchdb %%v25,%%v19,%%v18 \n\t" | |||||
| "vfchdb %%v26,%%v21,%%v20 \n\t" | |||||
| "vfchdb %%v27,%%v23,%%v22 \n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24 \n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25 \n\t" | |||||
| "vsel %%v26,%%v20,%%v21,%%v26 \n\t" | |||||
| "vsel %%v27,%%v22,%%v23,%%v27 \n\t" | |||||
| "vfchdb %%v28,%%v25,%%v24 \n\t" | |||||
| "vfchdb %%v29,%%v27,%%v26 \n\t" | |||||
| "vsel %%v28,%%v24,%%v25,%%v28 \n\t" | |||||
| "vsel %%v29,%%v26,%%v27,%%v29 \n\t" | |||||
| "vfchdb %%v30,%%v29,%%v28 \n\t" | |||||
| "vsel %%v30,%%v28,%%v29,%%v30 \n\t" | |||||
| "vfchdb %%v31,%%v0,%%v30 \n\t" | |||||
| "vsel %%v0,%%v30,%%v0,%%v31 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v16,%%v0,1 \n\t" | |||||
| "wfchdb %%v17,%%v16,%%v0 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v17 \n\t" | |||||
| "ldr %0,%%f0 " | |||||
| :"=f"(amin) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return amin; | |||||
| } | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return (minf); | |||||
/*
 * damin_kernel_32: vectorised scan over x built from load-positive (vflpdb),
 * compare-high (vfchdb) and select (vsel) steps, keeping the smaller
 * absolute value at each step.
 *
 * n is shifted right by 5 to form the loop count. Each pass handles two
 * groups of eight 16-byte vectors (byte offsets 0..112 and 128..240 from
 * x + r1) and then advances r1 by 256. The loop body runs before the count
 * is tested, so a count of zero is not handled here.
 * NOTE(review): presumably callers pass a positive multiple of 32 and FLOAT
 * is an 8-byte double in this build -- confirm against the call site.
 */
| static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| FLOAT amin; | |||||
/* v0 holds the running result, seeded with the absolute value of the first vector */
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "vflpdb %%v0,%%v0\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
/* r1 = 0: byte offset into x */
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
/* first group: vectors at offsets 0..112 */
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
/* mask = (b > a); vsel then takes a where the mask is set, otherwise b */
| "vfchdb %%v24,%%v17,%%v16\n\t" | |||||
| "vfchdb %%v25,%%v19,%%v18\n\t" | |||||
| "vfchdb %%v26,%%v21,%%v20\n\t" | |||||
| "vfchdb %%v27,%%v23,%%v22\n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24\n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25\n\t" | |||||
| "vsel %%v26,%%v20,%%v21,%%v26\n\t" | |||||
| "vsel %%v27,%%v22,%%v23,%%v27\n\t" | |||||
| "vfchdb %%v28,%%v25,%%v24\n\t" | |||||
| "vfchdb %%v29,%%v27,%%v26\n\t" | |||||
| "vsel %%v28,%%v24,%%v25,%%v28\n\t" | |||||
| "vsel %%v29,%%v26,%%v27,%%v29\n\t" | |||||
| "vfchdb %%v30,%%v29,%%v28\n\t" | |||||
| "vsel %%v30,%%v28,%%v29,%%v30\n\t" | |||||
/* merge the group result (v30) into the running result v0 */
| "vfchdb %%v31,%%v0,%%v30\n\t" | |||||
| "vsel %%v0,%%v30,%%v0,%%v31\n\t" | |||||
/* second group: vectors at offsets 128..240, same reduction */
| "vl %%v16,128(%%r1,%[x])\n\t" | |||||
| "vl %%v17,144(%%r1,%[x])\n\t" | |||||
| "vl %%v18,160(%%r1,%[x])\n\t" | |||||
| "vl %%v19,176(%%r1,%[x])\n\t" | |||||
| "vl %%v20,192(%%r1,%[x])\n\t" | |||||
| "vl %%v21,208(%%r1,%[x])\n\t" | |||||
| "vl %%v22,224(%%r1,%[x])\n\t" | |||||
| "vl %%v23,240(%%r1,%[x])\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfchdb %%v24,%%v17,%%v16\n\t" | |||||
| "vfchdb %%v25,%%v19,%%v18\n\t" | |||||
| "vfchdb %%v26,%%v21,%%v20\n\t" | |||||
| "vfchdb %%v27,%%v23,%%v22\n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24\n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25\n\t" | |||||
| "vsel %%v26,%%v20,%%v21,%%v26\n\t" | |||||
| "vsel %%v27,%%v22,%%v23,%%v27\n\t" | |||||
| "vfchdb %%v28,%%v25,%%v24\n\t" | |||||
| "vfchdb %%v29,%%v27,%%v26\n\t" | |||||
| "vsel %%v28,%%v24,%%v25,%%v28\n\t" | |||||
| "vsel %%v29,%%v26,%%v27,%%v29\n\t" | |||||
| "vfchdb %%v30,%%v29,%%v28\n\t" | |||||
| "vsel %%v30,%%v28,%%v29,%%v30\n\t" | |||||
| "vfchdb %%v31,%%v0,%%v30\n\t" | |||||
| "vsel %%v0,%%v30,%%v0,%%v31\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
/* horizontal step: compare lane 1 (replicated into v16) with lane 0 and keep the smaller */
| "vrepg %%v16,%%v0,1\n\t" | |||||
| "wfchdb %%v17,%%v16,%%v0\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v17\n\t" | |||||
| "ldr %[amin],%%f0" | |||||
/* n is read-write because srlg/brctg modify it; x is covered by the "m" input */
| : [amin] "=f"(amin),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return amin; | |||||
| } | |||||
| if (inc_x == 1) { | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (minf); | |||||
| minf = damin_kernel_32(n1, x); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| minf=ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| minf = damin_kernel_32(n1, x); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| minf = ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| minf=ABS(x[0]); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| minf = ABS(x[0]); | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) < minf) { | |||||
| minf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) < minf) { | |||||
| minf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) < minf) { | |||||
| minf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) < minf) { | |||||
| minf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) < minf) { | |||||
| minf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) < minf) { | |||||
| minf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| j += 4; | |||||
| i += inc_x * 4; | |||||
| } | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (minf); | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (minf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,145 +28,139 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT asum; | |||||
| __asm__ ( | |||||
| "vzero %%v0 \n\t" | |||||
| "vzero %%v1 \n\t" | |||||
| "vzero %%v2 \n\t" | |||||
| "vzero %%v3 \n\t" | |||||
| "srlg %%r0,%1,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vl %%v20, 64(%%r1,%2) \n\t" | |||||
| "vl %%v21, 80(%%r1,%2) \n\t" | |||||
| "vl %%v22, 96(%%r1,%2) \n\t" | |||||
| "vl %%v23, 112(%%r1,%2) \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v16 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v17 \n\t" | |||||
| "vfadb %%v2,%%v2,%%v18 \n\t" | |||||
| "vfadb %%v3,%%v3,%%v19 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v20 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v21 \n\t" | |||||
| "vfadb %%v2,%%v2,%%v22 \n\t" | |||||
| "vfadb %%v3,%%v3,%%v23 \n\t" | |||||
| "vl %%v16, 128(%%r1,%2) \n\t" | |||||
| "vl %%v17, 144(%%r1,%2) \n\t" | |||||
| "vl %%v18, 160(%%r1,%2) \n\t" | |||||
| "vl %%v19, 176(%%r1,%2) \n\t" | |||||
| "vl %%v20, 192(%%r1,%2) \n\t" | |||||
| "vl %%v21, 208(%%r1,%2) \n\t" | |||||
| "vl %%v22, 224(%%r1,%2) \n\t" | |||||
| "vl %%v23, 240(%%r1,%2) \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v16 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v17 \n\t" | |||||
| "vfadb %%v2,%%v2,%%v18 \n\t" | |||||
| "vfadb %%v3,%%v3,%%v19 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v20 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v21 \n\t" | |||||
| "vfadb %%v2,%%v2,%%v22 \n\t" | |||||
| "vfadb %%v3,%%v3,%%v23 \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "brctg %%r0,0b \n\t" | |||||
| "vfadb %%v0,%%v0,%%v1 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v2 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v3 \n\t" | |||||
| "vrepg %%v1,%%v0,1 \n\t" | |||||
| "adbr %%f0,%%f1 \n\t" | |||||
| "ldr %0,%%f0 " | |||||
| :"=f"(asum) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" | |||||
| ); | |||||
| return asum; | |||||
| #define ABS fabs | |||||
/*
 * dasum_kernel_32: vectorised sum of absolute values over x.
 *
 * Eight vector accumulators (v24..v31) are zeroed, then each loop pass adds
 * the absolute values (vflpdb) of sixteen 16-byte vectors into them and
 * advances the byte offset r1 by 256. n is shifted right by 5 to form the
 * loop count; the body runs before the count is tested, so a count of zero
 * is not handled here.
 * NOTE(review): presumably callers pass a positive multiple of 32 and FLOAT
 * is an 8-byte double in this build -- confirm against the call site.
 */
| static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| FLOAT asum; | |||||
| __asm__("vzero %%v24\n\t" | |||||
| "vzero %%v25\n\t" | |||||
| "vzero %%v26\n\t" | |||||
| "vzero %%v27\n\t" | |||||
| "vzero %%v28\n\t" | |||||
| "vzero %%v29\n\t" | |||||
| "vzero %%v30\n\t" | |||||
| "vzero %%v31\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
/* r1 = 0: byte offset into x */
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
/* first group: vectors at offsets 0..112, one accumulator per vector */
| "vl %%v16, 0(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 64(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 112(%%r1,%[x])\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfadb %%v24,%%v24,%%v16\n\t" | |||||
| "vfadb %%v25,%%v25,%%v17\n\t" | |||||
| "vfadb %%v26,%%v26,%%v18\n\t" | |||||
| "vfadb %%v27,%%v27,%%v19\n\t" | |||||
| "vfadb %%v28,%%v28,%%v20\n\t" | |||||
| "vfadb %%v29,%%v29,%%v21\n\t" | |||||
| "vfadb %%v30,%%v30,%%v22\n\t" | |||||
| "vfadb %%v31,%%v31,%%v23\n\t" | |||||
/* second group: vectors at offsets 128..240 */
| "vl %%v16, 128(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 144(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 160(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 176(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 192(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 208(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 224(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 240(%%r1,%[x])\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfadb %%v24,%%v24,%%v16\n\t" | |||||
| "vfadb %%v25,%%v25,%%v17\n\t" | |||||
| "vfadb %%v26,%%v26,%%v18\n\t" | |||||
| "vfadb %%v27,%%v27,%%v19\n\t" | |||||
| "vfadb %%v28,%%v28,%%v20\n\t" | |||||
| "vfadb %%v29,%%v29,%%v21\n\t" | |||||
| "vfadb %%v30,%%v30,%%v22\n\t" | |||||
| "vfadb %%v31,%%v31,%%v23\n\t" | |||||
| "agfi %%r1,256\n\t" | |||||
| "brctg %[n],0b\n\t" | |||||
/* collapse the eight accumulators into v24 */
| "vfadb %%v24,%%v24,%%v25\n\t" | |||||
| "vfadb %%v24,%%v24,%%v26\n\t" | |||||
| "vfadb %%v24,%%v24,%%v27\n\t" | |||||
| "vfadb %%v24,%%v24,%%v28\n\t" | |||||
| "vfadb %%v24,%%v24,%%v29\n\t" | |||||
| "vfadb %%v24,%%v24,%%v30\n\t" | |||||
| "vfadb %%v24,%%v24,%%v31\n\t" | |||||
/* add lane 1 to lane 0, then store lane 0 straight to asum in memory */
| "vrepg %%v25,%%v24,1\n\t" | |||||
| "vfadb %%v24,%%v24,%%v25\n\t" | |||||
| "vsteg %%v24,%[asum],0" | |||||
/* asum is a memory output ("=m") because vsteg writes it directly */
| : [asum] "=m"(asum),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||||
| "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return asum; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | ||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT sumf = 0.0; | |||||
| BLASLONG n1; | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT sumf = 0.0; | |||||
| BLASLONG n1; | |||||
| if (n <= 0 || inc_x <= 0) return sumf; | |||||
| if (inc_x == 1) { | |||||
| n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return sumf; | |||||
| sumf = dasum_kernel_32(n1, x); | |||||
| i = n1; | |||||
| } | |||||
| if (inc_x == 1) { | |||||
| while (i < n) { | |||||
| sumf += ABS(x[i]); | |||||
| i++; | |||||
| } | |||||
| n1 = n & -32; | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| register FLOAT sum1, sum2; | |||||
| sum1 = 0.0; | |||||
| sum2 = 0.0; | |||||
| while (j < n1) { | |||||
| if (n1 > 0) { | |||||
| sum1 += ABS(x[i]); | |||||
| sum2 += ABS(x[i + inc_x]); | |||||
| sum1 += ABS(x[i + 2 * inc_x]); | |||||
| sum2 += ABS(x[i + 3 * inc_x]); | |||||
| sumf = dasum_kernel_32(n1, x); | |||||
| i = n1; | |||||
| } | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| while (i < n) { | |||||
| sumf += ABS(x[i]); | |||||
| i++; | |||||
| } | |||||
| } | |||||
| sumf = sum1 + sum2; | |||||
| while (j < n) { | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| register FLOAT sum1, sum2; | |||||
| sum1 = 0.0; | |||||
| sum2 = 0.0; | |||||
| while (j < n1) { | |||||
| sumf += ABS(x[i]); | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| sum1 += ABS(x[i]); | |||||
| sum2 += ABS(x[i + inc_x]); | |||||
| sum1 += ABS(x[i + 2 * inc_x]); | |||||
| sum2 += ABS(x[i + 3 * inc_x]); | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| } | } | ||||
| return sumf; | |||||
| } | |||||
| sumf = sum1 + sum2; | |||||
| while (j < n) { | |||||
| sumf += ABS(x[i]); | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| return sumf; | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,158 +27,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "vlrepg %%v0,%3 \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%1) \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%1) \n\t" | |||||
| "vl %%v17,16(%%r1,%1) \n\t" | |||||
| "vl %%v18,32(%%r1,%1) \n\t" | |||||
| "vl %%v19,48(%%r1,%1) \n\t" | |||||
| "vl %%v20,0(%%r1,%2) \n\t" | |||||
| "vl %%v21,16(%%r1,%2) \n\t" | |||||
| "vl %%v22,32(%%r1,%2) \n\t" | |||||
| "vl %%v23,48(%%r1,%2) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v20 \n\t" | |||||
| "vfmadb %%v17,%%v0,%%v17,%%v21 \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v22 \n\t" | |||||
| "vfmadb %%v19,%%v0,%%v19,%%v23 \n\t" | |||||
| "vl %%v24,64(%%r1,%1) \n\t" | |||||
| "vl %%v25,80(%%r1,%1) \n\t" | |||||
| "vl %%v26,96(%%r1,%1) \n\t" | |||||
| "vl %%v27,112(%%r1,%1) \n\t" | |||||
| "vl %%v28,64(%%r1,%2) \n\t" | |||||
| "vl %%v29,80(%%r1,%2) \n\t" | |||||
| "vl %%v30,96(%%r1,%2) \n\t" | |||||
| "vl %%v31,112(%%r1,%2) \n\t" | |||||
| "vfmadb %%v20,%%v0,%%v24,%%v28 \n\t" | |||||
| "vfmadb %%v21,%%v0,%%v25,%%v29 \n\t" | |||||
| "vfmadb %%v22,%%v0,%%v26,%%v30 \n\t" | |||||
| "vfmadb %%v23,%%v0,%%v27,%%v31 \n\t" | |||||
| "vst %%v16,0(%%r1,%2) \n\t" | |||||
| "vst %%v17,16(%%r1,%2) \n\t" | |||||
| "vst %%v18,32(%%r1,%2) \n\t" | |||||
| "vst %%v19,48(%%r1,%2) \n\t" | |||||
| "vst %%v20,64(%%r1,%2) \n\t" | |||||
| "vst %%v21,80(%%r1,%2) \n\t" | |||||
| "vst %%v22,96(%%r1,%2) \n\t" | |||||
| "vst %%v23,112(%%r1,%2) \n\t" | |||||
| "vl %%v16,128(%%r1,%1) \n\t" | |||||
| "vl %%v17,144(%%r1,%1) \n\t" | |||||
| "vl %%v18,160(%%r1,%1) \n\t" | |||||
| "vl %%v19,176(%%r1,%1) \n\t" | |||||
| "vl %%v20,128(%%r1,%2) \n\t" | |||||
| "vl %%v21,144(%%r1,%2) \n\t" | |||||
| "vl %%v22,160(%%r1,%2) \n\t" | |||||
| "vl %%v23,176(%%r1,%2) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v20 \n\t" | |||||
| "vfmadb %%v17,%%v0,%%v17,%%v21 \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v22 \n\t" | |||||
| "vfmadb %%v19,%%v0,%%v19,%%v23 \n\t" | |||||
| "vl %%v24,192(%%r1,%1) \n\t" | |||||
| "vl %%v25,208(%%r1,%1) \n\t" | |||||
| "vl %%v26,224(%%r1,%1) \n\t" | |||||
| "vl %%v27,240(%%r1,%1) \n\t" | |||||
| "vl %%v28,192(%%r1,%2) \n\t" | |||||
| "vl %%v29,208(%%r1,%2) \n\t" | |||||
| "vl %%v30,224(%%r1,%2) \n\t" | |||||
| "vl %%v31,240(%%r1,%2) \n\t" | |||||
| "vfmadb %%v20,%%v0,%%v24,%%v28 \n\t" | |||||
| "vfmadb %%v21,%%v0,%%v25,%%v29 \n\t" | |||||
| "vfmadb %%v22,%%v0,%%v26,%%v30 \n\t" | |||||
| "vfmadb %%v23,%%v0,%%v27,%%v31 \n\t" | |||||
| "vst %%v16,128(%%r1,%2) \n\t" | |||||
| "vst %%v17,144(%%r1,%2) \n\t" | |||||
| "vst %%v18,160(%%r1,%2) \n\t" | |||||
| "vst %%v19,176(%%r1,%2) \n\t" | |||||
| "vst %%v20,192(%%r1,%2) \n\t" | |||||
| "vst %%v21,208(%%r1,%2) \n\t" | |||||
| "vst %%v22,224(%%r1,%2) \n\t" | |||||
| "vst %%v23,240(%%r1,%2) \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
/*
 * daxpy_kernel_32: vectorised y += alpha * x, written back in place to y.
 *
 * *alpha is loaded and replicated across v0. n is shifted right by 5 to
 * form the loop count. Each pass processes 256 bytes of x and y in two
 * 128-byte halves: load eight x and eight y vectors, fused multiply-add
 * (v0 * x + y), then store the eight results to y. The loop body runs before
 * the count is tested, so a count of zero is not handled here.
 * NOTE(review): presumably callers pass a positive multiple of 32 and FLOAT
 * is an 8-byte double in this build -- confirm against the call site.
 */
| static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { | |||||
| __asm__("vlrepg %%v0,%[alpha]\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
/* r1 = 0: byte offset shared by x and y */
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "pfd 2, 1024(%%r1,%[y])\n\t" | |||||
/* first half: x in v16..v19 and v24..v27, y in v20..v23 and v28..v31 */
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,0(%%r1,%[y])\n\t" | |||||
| "vl %%v21,16(%%r1,%[y])\n\t" | |||||
| "vl %%v22,32(%%r1,%[y])\n\t" | |||||
| "vl %%v23,48(%%r1,%[y])\n\t" | |||||
| "vl %%v24,64(%%r1,%[x])\n\t" | |||||
| "vl %%v25,80(%%r1,%[x])\n\t" | |||||
| "vl %%v26,96(%%r1,%[x])\n\t" | |||||
| "vl %%v27,112(%%r1,%[x])\n\t" | |||||
| "vl %%v28,64(%%r1,%[y])\n\t" | |||||
| "vl %%v29,80(%%r1,%[y])\n\t" | |||||
| "vl %%v30,96(%%r1,%[y])\n\t" | |||||
| "vl %%v31,112(%%r1,%[y])\n\t" | |||||
/* result = v0 * x + y, overwriting the register that held x */
| "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" | |||||
| "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" | |||||
| "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" | |||||
| "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" | |||||
| "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" | |||||
| "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" | |||||
| "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" | |||||
| "vst %%v16,0(%%r1,%[y])\n\t" | |||||
| "vst %%v17,16(%%r1,%[y])\n\t" | |||||
| "vst %%v18,32(%%r1,%[y])\n\t" | |||||
| "vst %%v19,48(%%r1,%[y])\n\t" | |||||
| "vst %%v24,64(%%r1,%[y])\n\t" | |||||
| "vst %%v25,80(%%r1,%[y])\n\t" | |||||
| "vst %%v26,96(%%r1,%[y])\n\t" | |||||
| "vst %%v27,112(%%r1,%[y])\n\t" | |||||
/* second half: same pattern at offsets 128..240 */
| "vl %%v16,128(%%r1,%[x])\n\t" | |||||
| "vl %%v17,144(%%r1,%[x])\n\t" | |||||
| "vl %%v18,160(%%r1,%[x])\n\t" | |||||
| "vl %%v19,176(%%r1,%[x])\n\t" | |||||
| "vl %%v20,128(%%r1,%[y])\n\t" | |||||
| "vl %%v21,144(%%r1,%[y])\n\t" | |||||
| "vl %%v22,160(%%r1,%[y])\n\t" | |||||
| "vl %%v23,176(%%r1,%[y])\n\t" | |||||
| "vl %%v24,192(%%r1,%[x])\n\t" | |||||
| "vl %%v25,208(%%r1,%[x])\n\t" | |||||
| "vl %%v26,224(%%r1,%[x])\n\t" | |||||
| "vl %%v27,240(%%r1,%[x])\n\t" | |||||
| "vl %%v28,192(%%r1,%[y])\n\t" | |||||
| "vl %%v29,208(%%r1,%[y])\n\t" | |||||
| "vl %%v30,224(%%r1,%[y])\n\t" | |||||
| "vl %%v31,240(%%r1,%[y])\n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" | |||||
| "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" | |||||
| "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" | |||||
| "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" | |||||
| "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" | |||||
| "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" | |||||
| "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" | |||||
| "vst %%v16,128(%%r1,%[y])\n\t" | |||||
| "vst %%v17,144(%%r1,%[y])\n\t" | |||||
| "vst %%v18,160(%%r1,%[y])\n\t" | |||||
| "vst %%v19,176(%%r1,%[y])\n\t" | |||||
| "vst %%v24,192(%%r1,%[y])\n\t" | |||||
| "vst %%v25,208(%%r1,%[y])\n\t" | |||||
| "vst %%v26,224(%%r1,%[y])\n\t" | |||||
| "vst %%v27,240(%%r1,%[y])\n\t" | |||||
| "agfi %%r1,256\n\t" | |||||
| "brctg %[n],0b" | |||||
/* y is declared "+m" (read and written); n is read-write because srlg/brctg modify it */
| : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) | |||||
| : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), | |||||
| [alpha] "m"(*alpha) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||||
| BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||||
| BLASLONG dummy2) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| if ( n <= 0 ) return 0 ; | |||||
| if (n <= 0) | |||||
| return 0; | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -32; | |||||
| BLASLONG n1 = n & -32; | |||||
| if ( n1 ) | |||||
| daxpy_kernel_32(n1, x, y , &da); | |||||
| if (n1) | |||||
| daxpy_kernel_32(n1, x, y, &da); | |||||
| i = n1; | |||||
| while(i < n) | |||||
| { | |||||
| y[i] += da * x[i] ; | |||||
| i++ ; | |||||
| } | |||||
| return 0 ; | |||||
| i = n1; | |||||
| while (i < n) { | |||||
| y[i] += da * x[i]; | |||||
| i++; | |||||
| } | } | ||||
| return 0; | |||||
| BLASLONG n1 = n & -4; | |||||
| } | |||||
| while(i < n1) | |||||
| { | |||||
| BLASLONG n1 = n & -4; | |||||
| FLOAT m1 = da * x[ix] ; | |||||
| FLOAT m2 = da * x[ix+inc_x] ; | |||||
| FLOAT m3 = da * x[ix+2*inc_x] ; | |||||
| FLOAT m4 = da * x[ix+3*inc_x] ; | |||||
| while (i < n1) { | |||||
| y[iy] += m1 ; | |||||
| y[iy+inc_y] += m2 ; | |||||
| y[iy+2*inc_y] += m3 ; | |||||
| y[iy+3*inc_y] += m4 ; | |||||
| FLOAT m1 = da * x[ix]; | |||||
| FLOAT m2 = da * x[ix + inc_x]; | |||||
| FLOAT m3 = da * x[ix + 2 * inc_x]; | |||||
| FLOAT m4 = da * x[ix + 3 * inc_x]; | |||||
| ix += inc_x*4 ; | |||||
| iy += inc_y*4 ; | |||||
| i+=4 ; | |||||
| y[iy] += m1; | |||||
| y[iy + inc_y] += m2; | |||||
| y[iy + 2 * inc_y] += m3; | |||||
| y[iy + 3 * inc_y] += m4; | |||||
| } | |||||
| ix += inc_x * 4; | |||||
| iy += inc_y * 4; | |||||
| i += 4; | |||||
| while(i < n) | |||||
| { | |||||
| } | |||||
| y[iy] += da * x[ix] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| while (i < n) { | |||||
| } | |||||
| return 0 ; | |||||
| } | |||||
| y[iy] += da * x[ix]; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,59 +27,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| __asm__ volatile ( | |||||
| "lgr %%r1,%1 \n\t" | |||||
| "lgr %%r2,%2 \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1) \n\t" | |||||
| "pfd 2, 1024(%%r2) \n\t" | |||||
| "mvc 0(256,%%r2),0(%%r1) \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "agfi %%r2,256 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) | |||||
| :"memory","cc","r0","r1","r2" | |||||
| ); | |||||
| static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| __asm__("srlg %[n],%[n],5\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%[x])\n\t" | |||||
| "pfd 2, 1024(%[y])\n\t" | |||||
| "mvc 0(256,%[y]),0(%[x])\n\t" | |||||
| "la %[x],256(%[x])\n\t" | |||||
| "la %[y],256(%[y])\n\t" | |||||
| "brctg %[n],0b" | |||||
| : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y) | |||||
| : "m"(*(const FLOAT (*)[n]) x) | |||||
| : "cc"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | ||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| if (n <= 0) return 0; | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| dcopy_kernel_32(n1, x, y); | |||||
| i = n1; | |||||
| } | |||||
| if (n <= 0) | |||||
| return 0; | |||||
| while (i < n) { | |||||
| y[i] = x[i]; | |||||
| i++; | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| } | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| dcopy_kernel_32(n1, x, y); | |||||
| i = n1; | |||||
| } | |||||
| while (i < n) { | |||||
| y[i] = x[i]; | |||||
| i++; | |||||
| } else { | |||||
| } | |||||
| while (i < n) { | |||||
| } else { | |||||
| y[iy] = x[ix]; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| while (i < n) { | |||||
| } | |||||
| y[iy] = x[ix]; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| } | } | ||||
| return 0; | |||||
| } | |||||
| return 0; | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,123 +27,127 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| FLOAT dot; | |||||
| __asm__ volatile ( | |||||
| "vzero %%v0 \n\t" | |||||
| "srlg %%r0,%1,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1,1024(%%r1,%2) \n\t" | |||||
| "pfd 1,1024(%%r1,%3) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vl %%v24,0(%%r1,%3) \n\t" | |||||
| "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" | |||||
| "vl %%v25,16(%%r1,%3) \n\t" | |||||
| "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" | |||||
| "vl %%v26,32(%%r1,%3) \n\t" | |||||
| "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" | |||||
| "vl %%v27,48(%%r1,%3) \n\t" | |||||
| "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" | |||||
| "vl %%v28,64(%%r1,%3) \n\t" | |||||
| "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" | |||||
| "vl %%v29,80(%%r1,%3) \n\t" | |||||
| "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" | |||||
| "vl %%v30,96(%%r1,%3) \n\t" | |||||
| "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" | |||||
| "vl %%v31,112(%%r1,%3) \n\t" | |||||
| "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" | |||||
| "agfi %%r1,128 \n\t" | |||||
| "brctg %%r0,0b \n\t" | |||||
| "vrepg %%v1,%%v0,1 \n\t" | |||||
| "adbr %%f0,%%f1 \n\t" | |||||
| "ldr %0,%%f0 " | |||||
| :"=f"(dot) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) | |||||
| :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return dot; | |||||
| static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| FLOAT dot; | |||||
| __asm__("vzero %%v0\n\t" | |||||
| "vzero %%v1\n\t" | |||||
| "vzero %%v2\n\t" | |||||
| "vzero %%v3\n\t" | |||||
| "vzero %%v4\n\t" | |||||
| "vzero %%v5\n\t" | |||||
| "vzero %%v6\n\t" | |||||
| "vzero %%v7\n\t" | |||||
| "srlg %[n],%[n],4\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1,1024(%%r1,%[x])\n\t" | |||||
| "pfd 1,1024(%%r1,%[y])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vl %%v24,0(%%r1,%[y])\n\t" | |||||
| "vl %%v25,16(%%r1,%[y])\n\t" | |||||
| "vl %%v26,32(%%r1,%[y])\n\t" | |||||
| "vl %%v27,48(%%r1,%[y])\n\t" | |||||
| "vl %%v28,64(%%r1,%[y])\n\t" | |||||
| "vl %%v29,80(%%r1,%[y])\n\t" | |||||
| "vl %%v30,96(%%r1,%[y])\n\t" | |||||
| "vl %%v31,112(%%r1,%[y])\n\t" | |||||
| "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" | |||||
| "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" | |||||
| "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" | |||||
| "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" | |||||
| "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" | |||||
| "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" | |||||
| "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" | |||||
| "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" | |||||
| "agfi %%r1,128\n\t" | |||||
| "brctg %[n],0b\n\t" | |||||
| "vfadb %%v0,%%v0,%%v1\n\t" | |||||
| "vfadb %%v0,%%v0,%%v2\n\t" | |||||
| "vfadb %%v0,%%v0,%%v3\n\t" | |||||
| "vfadb %%v0,%%v0,%%v4\n\t" | |||||
| "vfadb %%v0,%%v0,%%v5\n\t" | |||||
| "vfadb %%v0,%%v0,%%v6\n\t" | |||||
| "vfadb %%v0,%%v0,%%v7\n\t" | |||||
| "vrepg %%v1,%%v0,1\n\t" | |||||
| "adbr %%f0,%%f1\n\t" | |||||
| "ldr %[dot],%%f0" | |||||
| : [dot] "=f"(dot),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y), | |||||
| [y] "a"(y) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", | |||||
| "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |||||
| "v27", "v28", "v29", "v30", "v31"); | |||||
| return dot; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| FLOAT dot = 0.0 ; | |||||
| FLOAT dot = 0.0; | |||||
| if ( n <= 0 ) return(dot); | |||||
| if (n <= 0) | |||||
| return (dot); | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -16; | |||||
| BLASLONG n1 = n & -16; | |||||
| if ( n1 ) | |||||
| dot = ddot_kernel_16(n1, x, y); | |||||
| if (n1) | |||||
| dot = ddot_kernel_16(n1, x, y); | |||||
| i = n1; | |||||
| while(i < n) | |||||
| { | |||||
| dot += y[i] * x[i] ; | |||||
| i++ ; | |||||
| } | |||||
| return(dot); | |||||
| i = n1; | |||||
| while (i < n) { | |||||
| dot += y[i] * x[i]; | |||||
| i++; | |||||
| } | } | ||||
| return (dot); | |||||
| FLOAT temp1 = 0.0; | |||||
| FLOAT temp2 = 0.0; | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| FLOAT temp1 = 0.0; | |||||
| FLOAT temp2 = 0.0; | |||||
| while(i < n1) | |||||
| { | |||||
| BLASLONG n1 = n & -4; | |||||
| FLOAT m1 = y[iy] * x[ix] ; | |||||
| FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; | |||||
| while (i < n1) { | |||||
| FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; | |||||
| FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; | |||||
| FLOAT m1 = y[iy] * x[ix]; | |||||
| FLOAT m2 = y[iy + inc_y] * x[ix + inc_x]; | |||||
| ix += inc_x*4 ; | |||||
| iy += inc_y*4 ; | |||||
| FLOAT m3 = y[iy + 2 * inc_y] * x[ix + 2 * inc_x]; | |||||
| FLOAT m4 = y[iy + 3 * inc_y] * x[ix + 3 * inc_x]; | |||||
| temp1 += m1+m3; | |||||
| temp2 += m2+m4; | |||||
| ix += inc_x * 4; | |||||
| iy += inc_y * 4; | |||||
| i+=4 ; | |||||
| temp1 += m1 + m3; | |||||
| temp2 += m2 + m4; | |||||
| } | |||||
| i += 4; | |||||
| while(i < n) | |||||
| { | |||||
| } | |||||
| temp1 += y[iy] * x[ix] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| while (i < n) { | |||||
| } | |||||
| dot = temp1 + temp2; | |||||
| return(dot); | |||||
| } | |||||
| temp1 += y[iy] * x[ix]; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| } | |||||
| dot = temp1 + temp2; | |||||
| return (dot); | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,133 +27,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT max; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%2) \n\t" | |||||
| "srlg %%r0,%1,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vl %%v24,128(%%r1,%2) \n\t" | |||||
| "vl %%v25,144(%%r1,%2) \n\t" | |||||
| "vl %%v26,160(%%r1,%2) \n\t" | |||||
| "vl %%v27,176(%%r1,%2) \n\t" | |||||
| "vl %%v28,192(%%r1,%2) \n\t" | |||||
| "vl %%v29,208(%%r1,%2) \n\t" | |||||
| "vl %%v30,224(%%r1,%2) \n\t" | |||||
| "vl %%v31,240(%%r1,%2) \n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v24,0 \n\t" | |||||
| "vfmaxdb %%v17,%%v17,%%v25,0 \n\t" | |||||
| "vfmaxdb %%v18,%%v18,%%v26,0 \n\t" | |||||
| "vfmaxdb %%v19,%%v19,%%v27,0 \n\t" | |||||
| "vfmaxdb %%v20,%%v20,%%v28,0 \n\t" | |||||
| "vfmaxdb %%v21,%%v21,%%v29,0 \n\t" | |||||
| "vfmaxdb %%v22,%%v22,%%v30,0 \n\t" | |||||
| "vfmaxdb %%v23,%%v23,%%v31,0 \n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v20,0 \n\t" | |||||
| "vfmaxdb %%v17,%%v17,%%v21,0 \n\t" | |||||
| "vfmaxdb %%v18,%%v18,%%v22,0 \n\t" | |||||
| "vfmaxdb %%v19,%%v19,%%v23,0 \n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v18,0 \n\t" | |||||
| "vfmaxdb %%v17,%%v17,%%v19,0 \n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v17,0 \n\t" | |||||
| "vfmaxdb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v16,%%v0,1 \n\t" | |||||
| "wfmaxdb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "ldr %0,%%f0 " | |||||
| :"=f"(max) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return max; | |||||
| static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| FLOAT max; | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vl %%v24,128(%%r1,%[x])\n\t" | |||||
| "vl %%v25,144(%%r1,%[x])\n\t" | |||||
| "vl %%v26,160(%%r1,%[x])\n\t" | |||||
| "vl %%v27,176(%%r1,%[x])\n\t" | |||||
| "vl %%v28,192(%%r1,%[x])\n\t" | |||||
| "vl %%v29,208(%%r1,%[x])\n\t" | |||||
| "vl %%v30,224(%%r1,%[x])\n\t" | |||||
| "vl %%v31,240(%%r1,%[x])\n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v24,0\n\t" | |||||
| "vfmaxdb %%v17,%%v17,%%v25,0\n\t" | |||||
| "vfmaxdb %%v18,%%v18,%%v26,0\n\t" | |||||
| "vfmaxdb %%v19,%%v19,%%v27,0\n\t" | |||||
| "vfmaxdb %%v20,%%v20,%%v28,0\n\t" | |||||
| "vfmaxdb %%v21,%%v21,%%v29,0\n\t" | |||||
| "vfmaxdb %%v22,%%v22,%%v30,0\n\t" | |||||
| "vfmaxdb %%v23,%%v23,%%v31,0\n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v20,0\n\t" | |||||
| "vfmaxdb %%v17,%%v17,%%v21,0\n\t" | |||||
| "vfmaxdb %%v18,%%v18,%%v22,0\n\t" | |||||
| "vfmaxdb %%v19,%%v19,%%v23,0\n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v18,0\n\t" | |||||
| "vfmaxdb %%v17,%%v17,%%v19,0\n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v17,0\n\t" | |||||
| "vfmaxdb %%v0,%%v0,%%v16,0\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "vrepg %%v16,%%v0,1\n\t" | |||||
| "wfmaxdb %%v0,%%v0,%%v16,0\n\t" | |||||
| "ldr %[max],%%f0" | |||||
| : [max] "=f"(max),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return max; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return (maxf); | |||||
| if (inc_x == 1) { | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (maxf); | |||||
| maxf = dmax_kernel_32(n1, x); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| maxf=x[0]; | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| maxf = dmax_kernel_32(n1, x); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| maxf = x[0]; | |||||
| i++; | |||||
| } | |||||
| maxf=x[0]; | |||||
| while (i < n) { | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| maxf = x[0]; | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] > maxf) { | |||||
| maxf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] > maxf) { | |||||
| maxf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] > maxf) { | |||||
| maxf = x[i + 3 * inc_x]; | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] > maxf) { | |||||
| maxf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] > maxf) { | |||||
| maxf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] > maxf) { | |||||
| maxf = x[i + 3 * inc_x]; | |||||
| } | |||||
| j += 4; | |||||
| i += inc_x * 4; | |||||
| } | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (maxf); | |||||
| while (j < n) { | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (maxf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,154 +27,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT max; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%2) \n\t" | |||||
| "srlg %%r0,%1,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vfchdb %%v24,%%v16,%%v17 \n\t" | |||||
| "vfchdb %%v25,%%v18,%%v19 \n\t" | |||||
| "vfchdb %%v26,%%v20,%%v21 \n\t" | |||||
| "vfchdb %%v27,%%v22,%%v23 \n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24 \n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25 \n\t" | |||||
| "vsel %%v26,%%v20,%%v21,%%v26 \n\t" | |||||
| "vsel %%v27,%%v22,%%v23,%%v27 \n\t" | |||||
| "vfchdb %%v28,%%v24,%%v25 \n\t" | |||||
| "vfchdb %%v29,%%v26,%%v27 \n\t" | |||||
| "vsel %%v28,%%v24,%%v25,%%v28 \n\t" | |||||
| "vsel %%v29,%%v26,%%v27,%%v29 \n\t" | |||||
| "vfchdb %%v30,%%v28,%%v29 \n\t" | |||||
| "vsel %%v30,%%v28,%%v29,%%v30 \n\t" | |||||
| "vfchdb %%v31,%%v30,%%v0 \n\t" | |||||
| "vsel %%v0,%%v30,%%v0,%%v31 \n\t" | |||||
| "vl %%v16,128(%%r1,%2) \n\t" | |||||
| "vl %%v17,144(%%r1,%2) \n\t" | |||||
| "vl %%v18,160(%%r1,%2) \n\t" | |||||
| "vl %%v19,176(%%r1,%2) \n\t" | |||||
| "vl %%v20,192(%%r1,%2) \n\t" | |||||
| "vl %%v21,208(%%r1,%2) \n\t" | |||||
| "vl %%v22,224(%%r1,%2) \n\t" | |||||
| "vl %%v23,240(%%r1,%2) \n\t" | |||||
| "vfchdb %%v24,%%v16,%%v17 \n\t" | |||||
| "vfchdb %%v25,%%v18,%%v19 \n\t" | |||||
| "vfchdb %%v26,%%v20,%%v21 \n\t" | |||||
| "vfchdb %%v27,%%v22,%%v23 \n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24 \n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25 \n\t" | |||||
| "vsel %%v26,%%v20,%%v21,%%v26 \n\t" | |||||
| "vsel %%v27,%%v22,%%v23,%%v27 \n\t" | |||||
| "vfchdb %%v28,%%v24,%%v25 \n\t" | |||||
| "vfchdb %%v29,%%v26,%%v27 \n\t" | |||||
| "vsel %%v28,%%v24,%%v25,%%v28 \n\t" | |||||
| "vsel %%v29,%%v26,%%v27,%%v29 \n\t" | |||||
| "vfchdb %%v30,%%v28,%%v29 \n\t" | |||||
| "vsel %%v30,%%v28,%%v29,%%v30 \n\t" | |||||
| "vfchdb %%v31,%%v30,%%v0 \n\t" | |||||
| "vsel %%v0,%%v30,%%v0,%%v31 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v16,%%v0,1 \n\t" | |||||
| "wfchdb %%v17,%%v0,%%v16 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v17 \n\t" | |||||
| "ldr %0,%%f0 " | |||||
| :"=f"(max) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return max; | |||||
| static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| FLOAT max; | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vfchdb %%v24,%%v16,%%v17\n\t" | |||||
| "vfchdb %%v25,%%v18,%%v19\n\t" | |||||
| "vfchdb %%v26,%%v20,%%v21\n\t" | |||||
| "vfchdb %%v27,%%v22,%%v23\n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24\n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25\n\t" | |||||
| "vsel %%v26,%%v20,%%v21,%%v26\n\t" | |||||
| "vsel %%v27,%%v22,%%v23,%%v27\n\t" | |||||
| "vfchdb %%v28,%%v24,%%v25\n\t" | |||||
| "vfchdb %%v29,%%v26,%%v27\n\t" | |||||
| "vsel %%v28,%%v24,%%v25,%%v28\n\t" | |||||
| "vsel %%v29,%%v26,%%v27,%%v29\n\t" | |||||
| "vfchdb %%v30,%%v28,%%v29\n\t" | |||||
| "vsel %%v30,%%v28,%%v29,%%v30\n\t" | |||||
| "vfchdb %%v31,%%v30,%%v0\n\t" | |||||
| "vsel %%v0,%%v30,%%v0,%%v31\n\t" | |||||
| "vl %%v16,128(%%r1,%[x])\n\t" | |||||
| "vl %%v17,144(%%r1,%[x])\n\t" | |||||
| "vl %%v18,160(%%r1,%[x])\n\t" | |||||
| "vl %%v19,176(%%r1,%[x])\n\t" | |||||
| "vl %%v20,192(%%r1,%[x])\n\t" | |||||
| "vl %%v21,208(%%r1,%[x])\n\t" | |||||
| "vl %%v22,224(%%r1,%[x])\n\t" | |||||
| "vl %%v23,240(%%r1,%[x])\n\t" | |||||
| "vfchdb %%v24,%%v16,%%v17\n\t" | |||||
| "vfchdb %%v25,%%v18,%%v19\n\t" | |||||
| "vfchdb %%v26,%%v20,%%v21\n\t" | |||||
| "vfchdb %%v27,%%v22,%%v23\n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24\n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25\n\t" | |||||
| "vsel %%v26,%%v20,%%v21,%%v26\n\t" | |||||
| "vsel %%v27,%%v22,%%v23,%%v27\n\t" | |||||
| "vfchdb %%v28,%%v24,%%v25\n\t" | |||||
| "vfchdb %%v29,%%v26,%%v27\n\t" | |||||
| "vsel %%v28,%%v24,%%v25,%%v28\n\t" | |||||
| "vsel %%v29,%%v26,%%v27,%%v29\n\t" | |||||
| "vfchdb %%v30,%%v28,%%v29\n\t" | |||||
| "vsel %%v30,%%v28,%%v29,%%v30\n\t" | |||||
| "vfchdb %%v31,%%v30,%%v0\n\t" | |||||
| "vsel %%v0,%%v30,%%v0,%%v31\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "vrepg %%v16,%%v0,1\n\t" | |||||
| "wfchdb %%v17,%%v0,%%v16\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v17\n\t" | |||||
| "ldr %[max],%%f0" | |||||
| : [max] "=f"(max),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return max; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return (maxf); | |||||
| if (inc_x == 1) { | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (maxf); | |||||
| maxf = dmax_kernel_32(n1, x); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| maxf=x[0]; | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| maxf = dmax_kernel_32(n1, x); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| maxf = x[0]; | |||||
| i++; | |||||
| } | |||||
| maxf=x[0]; | |||||
| while (i < n) { | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| maxf = x[0]; | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] > maxf) { | |||||
| maxf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] > maxf) { | |||||
| maxf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] > maxf) { | |||||
| maxf = x[i + 3 * inc_x]; | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] > maxf) { | |||||
| maxf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] > maxf) { | |||||
| maxf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] > maxf) { | |||||
| maxf = x[i + 3 * inc_x]; | |||||
| } | |||||
| j += 4; | |||||
| i += inc_x * 4; | |||||
| } | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (maxf); | |||||
| while (j < n) { | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (maxf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,133 +27,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT min; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%2) \n\t" | |||||
| "srlg %%r0,%1,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vl %%v24,128(%%r1,%2) \n\t" | |||||
| "vl %%v25,144(%%r1,%2) \n\t" | |||||
| "vl %%v26,160(%%r1,%2) \n\t" | |||||
| "vl %%v27,176(%%r1,%2) \n\t" | |||||
| "vl %%v28,192(%%r1,%2) \n\t" | |||||
| "vl %%v29,208(%%r1,%2) \n\t" | |||||
| "vl %%v30,224(%%r1,%2) \n\t" | |||||
| "vl %%v31,240(%%r1,%2) \n\t" | |||||
| "vfmindb %%v16,%%v16,%%v24,0 \n\t" | |||||
| "vfmindb %%v17,%%v17,%%v25,0 \n\t" | |||||
| "vfmindb %%v18,%%v18,%%v26,0 \n\t" | |||||
| "vfmindb %%v19,%%v19,%%v27,0 \n\t" | |||||
| "vfmindb %%v20,%%v20,%%v28,0 \n\t" | |||||
| "vfmindb %%v21,%%v21,%%v29,0 \n\t" | |||||
| "vfmindb %%v22,%%v22,%%v30,0 \n\t" | |||||
| "vfmindb %%v23,%%v23,%%v31,0 \n\t" | |||||
| "vfmindb %%v16,%%v16,%%v20,0 \n\t" | |||||
| "vfmindb %%v17,%%v17,%%v21,0 \n\t" | |||||
| "vfmindb %%v18,%%v18,%%v22,0 \n\t" | |||||
| "vfmindb %%v19,%%v19,%%v23,0 \n\t" | |||||
| "vfmindb %%v16,%%v16,%%v18,0 \n\t" | |||||
| "vfmindb %%v17,%%v17,%%v19,0 \n\t" | |||||
| "vfmindb %%v16,%%v16,%%v17,0 \n\t" | |||||
| "vfmindb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v16,%%v0,1 \n\t" | |||||
| "wfmindb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "ldr %0,%%f0 " | |||||
| :"=f"(min) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return min; | |||||
| static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| FLOAT min; | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vl %%v24,128(%%r1,%[x])\n\t" | |||||
| "vl %%v25,144(%%r1,%[x])\n\t" | |||||
| "vl %%v26,160(%%r1,%[x])\n\t" | |||||
| "vl %%v27,176(%%r1,%[x])\n\t" | |||||
| "vl %%v28,192(%%r1,%[x])\n\t" | |||||
| "vl %%v29,208(%%r1,%[x])\n\t" | |||||
| "vl %%v30,224(%%r1,%[x])\n\t" | |||||
| "vl %%v31,240(%%r1,%[x])\n\t" | |||||
| "vfmindb %%v16,%%v16,%%v24,0\n\t" | |||||
| "vfmindb %%v17,%%v17,%%v25,0\n\t" | |||||
| "vfmindb %%v18,%%v18,%%v26,0\n\t" | |||||
| "vfmindb %%v19,%%v19,%%v27,0\n\t" | |||||
| "vfmindb %%v20,%%v20,%%v28,0\n\t" | |||||
| "vfmindb %%v21,%%v21,%%v29,0\n\t" | |||||
| "vfmindb %%v22,%%v22,%%v30,0\n\t" | |||||
| "vfmindb %%v23,%%v23,%%v31,0\n\t" | |||||
| "vfmindb %%v16,%%v16,%%v20,0\n\t" | |||||
| "vfmindb %%v17,%%v17,%%v21,0\n\t" | |||||
| "vfmindb %%v18,%%v18,%%v22,0\n\t" | |||||
| "vfmindb %%v19,%%v19,%%v23,0\n\t" | |||||
| "vfmindb %%v16,%%v16,%%v18,0\n\t" | |||||
| "vfmindb %%v17,%%v17,%%v19,0\n\t" | |||||
| "vfmindb %%v16,%%v16,%%v17,0\n\t" | |||||
| "vfmindb %%v0,%%v0,%%v16,0\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "vrepg %%v16,%%v0,1\n\t" | |||||
| "wfmindb %%v0,%%v0,%%v16,0\n\t" | |||||
| "ldr %[min],%%f0" | |||||
| : [min] "=f"(min),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return min; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return (minf); | |||||
| if (inc_x == 1) { | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (minf); | |||||
| minf = dmin_kernel_32(n1, x); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| minf=x[0]; | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| minf = dmin_kernel_32(n1, x); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| minf = x[0]; | |||||
| i++; | |||||
| } | |||||
| minf=x[0]; | |||||
| while (i < n) { | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| minf = x[0]; | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] < minf) { | |||||
| minf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] < minf) { | |||||
| minf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] < minf) { | |||||
| minf = x[i + 3 * inc_x]; | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] < minf) { | |||||
| minf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] < minf) { | |||||
| minf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] < minf) { | |||||
| minf = x[i + 3 * inc_x]; | |||||
| } | |||||
| j += 4; | |||||
| i += inc_x * 4; | |||||
| } | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (minf); | |||||
| while (j < n) { | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (minf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,154 +27,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT min; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%2) \n\t" | |||||
| "srlg %%r0,%1,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vfchdb %%v24,%%v17,%%v16 \n\t" | |||||
| "vfchdb %%v25,%%v19,%%v18 \n\t" | |||||
| "vfchdb %%v26,%%v21,%%v20 \n\t" | |||||
| "vfchdb %%v27,%%v23,%%v22 \n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24 \n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25 \n\t" | |||||
| "vsel %%v26,%%v20,%%v21,%%v26 \n\t" | |||||
| "vsel %%v27,%%v22,%%v23,%%v27 \n\t" | |||||
| "vfchdb %%v28,%%v25,%%v24 \n\t" | |||||
| "vfchdb %%v29,%%v27,%%v26 \n\t" | |||||
| "vsel %%v28,%%v24,%%v25,%%v28 \n\t" | |||||
| "vsel %%v29,%%v26,%%v27,%%v29 \n\t" | |||||
| "vfchdb %%v30,%%v29,%%v28 \n\t" | |||||
| "vsel %%v30,%%v28,%%v29,%%v30 \n\t" | |||||
| "vfchdb %%v31,%%v0,%%v30 \n\t" | |||||
| "vsel %%v0,%%v30,%%v0,%%v31 \n\t" | |||||
| "vl %%v16,128(%%r1,%2) \n\t" | |||||
| "vl %%v17,144(%%r1,%2) \n\t" | |||||
| "vl %%v18,160(%%r1,%2) \n\t" | |||||
| "vl %%v19,176(%%r1,%2) \n\t" | |||||
| "vl %%v20,192(%%r1,%2) \n\t" | |||||
| "vl %%v21,208(%%r1,%2) \n\t" | |||||
| "vl %%v22,224(%%r1,%2) \n\t" | |||||
| "vl %%v23,240(%%r1,%2) \n\t" | |||||
| "vfchdb %%v24,%%v17,%%v16 \n\t" | |||||
| "vfchdb %%v25,%%v19,%%v18 \n\t" | |||||
| "vfchdb %%v26,%%v21,%%v20 \n\t" | |||||
| "vfchdb %%v27,%%v23,%%v22 \n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24 \n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25 \n\t" | |||||
| "vsel %%v26,%%v20,%%v21,%%v26 \n\t" | |||||
| "vsel %%v27,%%v22,%%v23,%%v27 \n\t" | |||||
| "vfchdb %%v28,%%v25,%%v24 \n\t" | |||||
| "vfchdb %%v29,%%v27,%%v26 \n\t" | |||||
| "vsel %%v28,%%v24,%%v25,%%v28 \n\t" | |||||
| "vsel %%v29,%%v26,%%v27,%%v29 \n\t" | |||||
| "vfchdb %%v30,%%v29,%%v28 \n\t" | |||||
| "vsel %%v30,%%v28,%%v29,%%v30 \n\t" | |||||
| "vfchdb %%v31,%%v0,%%v30 \n\t" | |||||
| "vsel %%v0,%%v30,%%v0,%%v31 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v16,%%v0,1 \n\t" | |||||
| "wfchdb %%v17,%%v16,%%v0 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v17 \n\t" | |||||
| "ldr %0,%%f0 " | |||||
| :"=f"(min) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return min; | |||||
| static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| FLOAT min; | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vfchdb %%v24,%%v17,%%v16\n\t" | |||||
| "vfchdb %%v25,%%v19,%%v18\n\t" | |||||
| "vfchdb %%v26,%%v21,%%v20\n\t" | |||||
| "vfchdb %%v27,%%v23,%%v22\n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24\n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25\n\t" | |||||
| "vsel %%v26,%%v20,%%v21,%%v26\n\t" | |||||
| "vsel %%v27,%%v22,%%v23,%%v27\n\t" | |||||
| "vfchdb %%v28,%%v25,%%v24\n\t" | |||||
| "vfchdb %%v29,%%v27,%%v26\n\t" | |||||
| "vsel %%v28,%%v24,%%v25,%%v28\n\t" | |||||
| "vsel %%v29,%%v26,%%v27,%%v29\n\t" | |||||
| "vfchdb %%v30,%%v29,%%v28\n\t" | |||||
| "vsel %%v30,%%v28,%%v29,%%v30\n\t" | |||||
| "vfchdb %%v31,%%v0,%%v30\n\t" | |||||
| "vsel %%v0,%%v30,%%v0,%%v31\n\t" | |||||
| "vl %%v16,128(%%r1,%[x])\n\t" | |||||
| "vl %%v17,144(%%r1,%[x])\n\t" | |||||
| "vl %%v18,160(%%r1,%[x])\n\t" | |||||
| "vl %%v19,176(%%r1,%[x])\n\t" | |||||
| "vl %%v20,192(%%r1,%[x])\n\t" | |||||
| "vl %%v21,208(%%r1,%[x])\n\t" | |||||
| "vl %%v22,224(%%r1,%[x])\n\t" | |||||
| "vl %%v23,240(%%r1,%[x])\n\t" | |||||
| "vfchdb %%v24,%%v17,%%v16\n\t" | |||||
| "vfchdb %%v25,%%v19,%%v18\n\t" | |||||
| "vfchdb %%v26,%%v21,%%v20\n\t" | |||||
| "vfchdb %%v27,%%v23,%%v22\n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24\n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25\n\t" | |||||
| "vsel %%v26,%%v20,%%v21,%%v26\n\t" | |||||
| "vsel %%v27,%%v22,%%v23,%%v27\n\t" | |||||
| "vfchdb %%v28,%%v25,%%v24\n\t" | |||||
| "vfchdb %%v29,%%v27,%%v26\n\t" | |||||
| "vsel %%v28,%%v24,%%v25,%%v28\n\t" | |||||
| "vsel %%v29,%%v26,%%v27,%%v29\n\t" | |||||
| "vfchdb %%v30,%%v29,%%v28\n\t" | |||||
| "vsel %%v30,%%v28,%%v29,%%v30\n\t" | |||||
| "vfchdb %%v31,%%v0,%%v30\n\t" | |||||
| "vsel %%v0,%%v30,%%v0,%%v31\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "vrepg %%v16,%%v0,1\n\t" | |||||
| "wfchdb %%v17,%%v16,%%v0\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v17\n\t" | |||||
| "ldr %[min],%%f0" | |||||
| : [min] "=f"(min),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return min; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return (minf); | |||||
| if (inc_x == 1) { | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (minf); | |||||
| minf = dmin_kernel_32(n1, x); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| minf=x[0]; | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| minf = dmin_kernel_32(n1, x); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| minf = x[0]; | |||||
| i++; | |||||
| } | |||||
| minf=x[0]; | |||||
| while (i < n) { | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| minf = x[0]; | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] < minf) { | |||||
| minf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] < minf) { | |||||
| minf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] < minf) { | |||||
| minf = x[i + 3 * inc_x]; | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] < minf) { | |||||
| minf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] < minf) { | |||||
| minf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] < minf) { | |||||
| minf = x[i + 3 * inc_x]; | |||||
| } | |||||
| j += 4; | |||||
| i += inc_x * 4; | |||||
| } | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (minf); | |||||
| while (j < n) { | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (minf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,220 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) | |||||
| { | |||||
| __asm__ ( | |||||
| "vlrepg %%v0,%3 \n\t" | |||||
| "vlrepg %%v1,%4 \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%1) \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 0(%%r1,%1) \n\t" | |||||
| "vst %%v29, 16(%%r1,%1) \n\t" | |||||
| "vst %%v30, 32(%%r1,%1) \n\t" | |||||
| "vst %%v31, 48(%%r1,%1) \n\t" | |||||
| "vst %%v20, 0(%%r1,%2) \n\t" | |||||
| "vst %%v21, 16(%%r1,%2) \n\t" | |||||
| "vst %%v22, 32(%%r1,%2) \n\t" | |||||
| "vst %%v23, 48(%%r1,%2) \n\t" | |||||
| "vl %%v24, 64(%%r1,%1) \n\t" | |||||
| "vl %%v25, 80(%%r1,%1) \n\t" | |||||
| "vl %%v26, 96(%%r1,%1) \n\t" | |||||
| "vl %%v27, 112(%%r1,%1) \n\t" | |||||
| "vl %%v16, 64(%%r1,%2) \n\t" | |||||
| "vl %%v17, 80(%%r1,%2) \n\t" | |||||
| "vl %%v18, 96(%%r1,%2) \n\t" | |||||
| "vl %%v19, 112(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 64(%%r1,%1) \n\t" | |||||
| "vst %%v29, 80(%%r1,%1) \n\t" | |||||
| "vst %%v30, 96(%%r1,%1) \n\t" | |||||
| "vst %%v31, 112(%%r1,%1) \n\t" | |||||
| "vst %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%2) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v16, 128(%%r1,%2) \n\t" | |||||
| "vl %%v17, 144(%%r1,%2) \n\t" | |||||
| "vl %%v18, 160(%%r1,%2) \n\t" | |||||
| "vl %%v19, 176(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 128(%%r1,%1) \n\t" | |||||
| "vst %%v29, 144(%%r1,%1) \n\t" | |||||
| "vst %%v30, 160(%%r1,%1) \n\t" | |||||
| "vst %%v31, 176(%%r1,%1) \n\t" | |||||
| "vst %%v20, 128(%%r1,%2) \n\t" | |||||
| "vst %%v21, 144(%%r1,%2) \n\t" | |||||
| "vst %%v22, 160(%%r1,%2) \n\t" | |||||
| "vst %%v23, 176(%%r1,%2) \n\t" | |||||
| "vl %%v24, 192(%%r1,%1) \n\t" | |||||
| "vl %%v25, 208(%%r1,%1) \n\t" | |||||
| "vl %%v26, 224(%%r1,%1) \n\t" | |||||
| "vl %%v27, 240(%%r1,%1) \n\t" | |||||
| "vl %%v16, 192(%%r1,%2) \n\t" | |||||
| "vl %%v17, 208(%%r1,%2) \n\t" | |||||
| "vl %%v18, 224(%%r1,%2) \n\t" | |||||
| "vl %%v19, 240(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 192(%%r1,%1) \n\t" | |||||
| "vst %%v29, 208(%%r1,%1) \n\t" | |||||
| "vst %%v30, 224(%%r1,%1) \n\t" | |||||
| "vst %%v31, 240(%%r1,%1) \n\t" | |||||
| "vst %%v20, 192(%%r1,%2) \n\t" | |||||
| "vst %%v21, 208(%%r1,%2) \n\t" | |||||
| "vst %%v22, 224(%%r1,%2) \n\t" | |||||
| "vst %%v23, 240(%%r1,%2) \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) | |||||
| :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { | |||||
| __asm__("vlrepg %%v0,%[c]\n\t" | |||||
| "vlrepg %%v1,%[s]\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" | |||||
| "pfd 2, 1024(%%r1,%[y])\n\t" | |||||
| "vl %%v24, 0(%%r1,%[x])\n\t" | |||||
| "vl %%v25, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 0(%%r1,%[y])\n\t" | |||||
| "vl %%v17, 16(%%r1,%[y])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[y])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[y])\n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0\n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0\n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0\n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0\n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ | |||||
| /* 2nd parts */ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 0(%%r1,%[x])\n\t" | |||||
| "vst %%v29, 16(%%r1,%[x])\n\t" | |||||
| "vst %%v30, 32(%%r1,%[x])\n\t" | |||||
| "vst %%v31, 48(%%r1,%[x])\n\t" | |||||
| "vst %%v20, 0(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 16(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 32(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 48(%%r1,%[y])\n\t" | |||||
| "vl %%v24, 64(%%r1,%[x])\n\t" | |||||
| "vl %%v25, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 112(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 64(%%r1,%[y])\n\t" | |||||
| "vl %%v17, 80(%%r1,%[y])\n\t" | |||||
| "vl %%v18, 96(%%r1,%[y])\n\t" | |||||
| "vl %%v19, 112(%%r1,%[y])\n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0\n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0\n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0\n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0\n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ | |||||
| /* 2nd parts */ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 64(%%r1,%[x])\n\t" | |||||
| "vst %%v29, 80(%%r1,%[x])\n\t" | |||||
| "vst %%v30, 96(%%r1,%[x])\n\t" | |||||
| "vst %%v31, 112(%%r1,%[x])\n\t" | |||||
| "vst %%v20, 64(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 80(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 96(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 112(%%r1,%[y])\n\t" | |||||
| "vl %%v24, 128(%%r1,%[x])\n\t" | |||||
| "vl %%v25, 144(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 160(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 176(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 128(%%r1,%[y])\n\t" | |||||
| "vl %%v17, 144(%%r1,%[y])\n\t" | |||||
| "vl %%v18, 160(%%r1,%[y])\n\t" | |||||
| "vl %%v19, 176(%%r1,%[y])\n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0\n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0\n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0\n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0\n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ | |||||
| /* 2nd parts */ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 128(%%r1,%[x])\n\t" | |||||
| "vst %%v29, 144(%%r1,%[x])\n\t" | |||||
| "vst %%v30, 160(%%r1,%[x])\n\t" | |||||
| "vst %%v31, 176(%%r1,%[x])\n\t" | |||||
| "vst %%v20, 128(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 144(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 160(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 176(%%r1,%[y])\n\t" | |||||
| "vl %%v24, 192(%%r1,%[x])\n\t" | |||||
| "vl %%v25, 208(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 224(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 240(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 192(%%r1,%[y])\n\t" | |||||
| "vl %%v17, 208(%%r1,%[y])\n\t" | |||||
| "vl %%v18, 224(%%r1,%[y])\n\t" | |||||
| "vl %%v19, 240(%%r1,%[y])\n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0\n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0\n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0\n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0\n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ | |||||
| /* 2nd parts */ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 192(%%r1,%[x])\n\t" | |||||
| "vst %%v29, 208(%%r1,%[x])\n\t" | |||||
| "vst %%v30, 224(%%r1,%[x])\n\t" | |||||
| "vst %%v31, 240(%%r1,%[x])\n\t" | |||||
| "vst %%v20, 192(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 208(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 224(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 240(%%r1,%[y])\n\t" | |||||
| "agfi %%r1,256\n\t" | |||||
| "brctg %[n],0b" | |||||
| : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) | |||||
| : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) | |||||
| : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", | |||||
| "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", | |||||
| "v31"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT temp; | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||||
| FLOAT c, FLOAT s) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| if ( n <= 0 ) return(0); | |||||
| FLOAT temp; | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| if (n <= 0) | |||||
| return (0); | |||||
| BLASLONG n1 = n & -32; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| FLOAT cosa,sina; | |||||
| cosa=c; | |||||
| sina=s; | |||||
| drot_kernel_32(n1, x, y, &cosa, &sina); | |||||
| i=n1; | |||||
| } | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| while(i < n) | |||||
| { | |||||
| temp = c*x[i] + s*y[i] ; | |||||
| y[i] = c*y[i] - s*x[i] ; | |||||
| x[i] = temp ; | |||||
| i++ ; | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| FLOAT cosa, sina; | |||||
| cosa = c; | |||||
| sina = s; | |||||
| drot_kernel_32(n1, x, y, &cosa, &sina); | |||||
| i = n1; | |||||
| } | |||||
| } | |||||
| while (i < n) { | |||||
| temp = c * x[i] + s * y[i]; | |||||
| y[i] = c * y[i] - s * x[i]; | |||||
| x[i] = temp; | |||||
| i++; | |||||
| } | } | ||||
| else | |||||
| { | |||||
| while(i < n) | |||||
| { | |||||
| temp = c*x[ix] + s*y[iy] ; | |||||
| y[iy] = c*y[iy] - s*x[ix] ; | |||||
| x[ix] = temp ; | |||||
| } else { | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| while (i < n) { | |||||
| temp = c * x[ix] + s * y[iy]; | |||||
| y[iy] = c * y[iy] - s * x[ix]; | |||||
| x[ix] = temp; | |||||
| } | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| } | } | ||||
| return(0); | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,179 +27,151 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) | |||||
| { | |||||
| __asm__ volatile ( | |||||
| "vlrepg %%v0,%1 \n\t" | |||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%2) \n\t" | |||||
| "vfmdb %%v24,%%v24,%%v0 \n\t" | |||||
| "vst %%v24, 0(%%r1,%2) \n\t" | |||||
| "vl %%v25, 16(%%r1,%2) \n\t" | |||||
| "vfmdb %%v25,%%v25,%%v0 \n\t" | |||||
| "vst %%v25, 16(%%r1,%2) \n\t" | |||||
| "vl %%v26, 32(%%r1,%2) \n\t" | |||||
| "vfmdb %%v26,%%v26,%%v0 \n\t" | |||||
| "vst %%v26, 32(%%r1,%2) \n\t" | |||||
| "vl %%v27, 48(%%r1,%2) \n\t" | |||||
| "vfmdb %%v27,%%v27,%%v0 \n\t" | |||||
| "vst %%v27, 48(%%r1,%2) \n\t" | |||||
| "vl %%v24, 64(%%r1,%2) \n\t" | |||||
| "vfmdb %%v24,%%v24,%%v0 \n\t" | |||||
| "vst %%v24, 64(%%r1,%2) \n\t" | |||||
| "vl %%v25, 80(%%r1,%2) \n\t" | |||||
| "vfmdb %%v25,%%v25,%%v0 \n\t" | |||||
| "vst %%v25, 80(%%r1,%2) \n\t" | |||||
| "vl %%v26, 96(%%r1,%2) \n\t" | |||||
| "vfmdb %%v26,%%v26,%%v0 \n\t" | |||||
| "vst %%v26, 96(%%r1,%2) \n\t" | |||||
| "vl %%v27, 112(%%r1,%2) \n\t" | |||||
| "vfmdb %%v27,%%v27,%%v0 \n\t" | |||||
| "vst %%v27, 112(%%r1,%2) \n\t" | |||||
| "agfi %%r1,128 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v24","v25","v26","v27" | |||||
| ); | |||||
| static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { | |||||
| __asm__("vlrepg %%v0,%[da]\n\t" | |||||
| "srlg %[n],%[n],4\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v24,0(%%r1,%[x])\n\t" | |||||
| "vfmdb %%v24,%%v24,%%v0\n\t" | |||||
| "vst %%v24,0(%%r1,%[x])\n\t" | |||||
| "vl %%v25,16(%%r1,%[x])\n\t" | |||||
| "vfmdb %%v25,%%v25,%%v0\n\t" | |||||
| "vst %%v25,16(%%r1,%[x])\n\t" | |||||
| "vl %%v26,32(%%r1,%[x])\n\t" | |||||
| "vfmdb %%v26,%%v26,%%v0\n\t" | |||||
| "vst %%v26,32(%%r1,%[x])\n\t" | |||||
| "vl %%v27,48(%%r1,%[x])\n\t" | |||||
| "vfmdb %%v27,%%v27,%%v0\n\t" | |||||
| "vst %%v27,48(%%r1,%[x])\n\t" | |||||
| "vl %%v28,64(%%r1,%[x])\n\t" | |||||
| "vfmdb %%v28,%%v28,%%v0\n\t" | |||||
| "vst %%v28,64(%%r1,%[x])\n\t" | |||||
| "vl %%v29,80(%%r1,%[x])\n\t" | |||||
| "vfmdb %%v29,%%v29,%%v0\n\t" | |||||
| "vst %%v29,80(%%r1,%[x])\n\t" | |||||
| "vl %%v30,96(%%r1,%[x])\n\t" | |||||
| "vfmdb %%v30,%%v30,%%v0\n\t" | |||||
| "vst %%v30,96(%%r1,%[x])\n\t" | |||||
| "vl %%v31,112(%%r1,%[x])\n\t" | |||||
| "vfmdb %%v31,%%v31,%%v0\n\t" | |||||
| "vst %%v31,112(%%r1,%[x])\n\t" | |||||
| "agfi %%r1,128\n\t" | |||||
| "brctg %[n],0b" | |||||
| : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) | |||||
| : [x] "a"(x),[da] "m"(da) | |||||
| : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", | |||||
| "v31"); | |||||
| } | } | ||||
| static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "vzero %%v24 \n\t" | |||||
| "vzero %%v25 \n\t" | |||||
| "vzero %%v26 \n\t" | |||||
| "vzero %%v27 \n\t" | |||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%1) \n\t" | |||||
| "vst %%v24,0(%%r1,%1) \n\t" | |||||
| "vst %%v25,16(%%r1,%1) \n\t" | |||||
| "vst %%v26,32(%%r1,%1) \n\t" | |||||
| "vst %%v27,48(%%r1,%1) \n\t" | |||||
| "vst %%v24,64(%%r1,%1) \n\t" | |||||
| "vst %%v25,80(%%r1,%1) \n\t" | |||||
| "vst %%v26,96(%%r1,%1) \n\t" | |||||
| "vst %%v27,112(%%r1,%1) \n\t" | |||||
| "agfi %%r1,128 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZR"((FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v24","v25","v26","v27" | |||||
| ); | |||||
| static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { | |||||
| __asm__("vzero %%v0\n\t" | |||||
| "srlg %[n],%[n],4\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" | |||||
| "vst %%v0,0(%%r1,%[x])\n\t" | |||||
| "vst %%v0,16(%%r1,%[x])\n\t" | |||||
| "vst %%v0,32(%%r1,%[x])\n\t" | |||||
| "vst %%v0,48(%%r1,%[x])\n\t" | |||||
| "vst %%v0,64(%%r1,%[x])\n\t" | |||||
| "vst %%v0,80(%%r1,%[x])\n\t" | |||||
| "vst %%v0,96(%%r1,%[x])\n\t" | |||||
| "vst %%v0,112(%%r1,%[x])\n\t" | |||||
| "agfi %%r1,128\n\t" | |||||
| "brctg %[n],0b" | |||||
| : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) | |||||
| : [x] "a"(x) | |||||
| : "cc", "r1", "v0"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i=0,j=0; | |||||
| if ( n <= 0 || inc_x <=0 ) | |||||
| return(0); | |||||
| if ( inc_x == 1 ) | |||||
| { | |||||
| if ( da == 0.0 ) | |||||
| { | |||||
| BLASLONG n1 = n & -16; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| dscal_kernel_16_zero(n1, x); | |||||
| j=n1; | |||||
| } | |||||
| while(j < n) | |||||
| { | |||||
| x[j]=0.0; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| BLASLONG n1 = n & -16; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| dscal_kernel_16(n1, da, x); | |||||
| j=n1; | |||||
| } | |||||
| while(j < n) | |||||
| { | |||||
| x[j] = da * x[j] ; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||||
| BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||||
| BLASLONG dummy2) { | |||||
| BLASLONG i = 0, j = 0; | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (0); | |||||
| if (inc_x == 1) { | |||||
| if (da == 0.0) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| dscal_kernel_16_zero(n1, x); | |||||
| j = n1; | |||||
| } | |||||
| while (j < n) { | |||||
| x[j] = 0.0; | |||||
| j++; | |||||
| } | |||||
| } else { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| dscal_kernel_16(n1, da, x); | |||||
| j = n1; | |||||
| } | |||||
| while (j < n) { | |||||
| x[j] = da * x[j]; | |||||
| j++; | |||||
| } | |||||
| } | } | ||||
| else | |||||
| { | |||||
| if ( da == 0.0 ) | |||||
| { | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| if (da == 0.0) { | |||||
| while (j < n1) { | |||||
| BLASLONG n1 = n & -4; | |||||
| x[i]=0.0; | |||||
| x[i + inc_x]=0.0; | |||||
| x[i + 2 * inc_x]=0.0; | |||||
| x[i + 3 * inc_x]=0.0; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| x[i] = 0.0; | |||||
| x[i + inc_x] = 0.0; | |||||
| x[i + 2 * inc_x] = 0.0; | |||||
| x[i + 3 * inc_x] = 0.0; | |||||
| } | |||||
| while(j < n) | |||||
| { | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| x[i]=0.0; | |||||
| i += inc_x ; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| while (j < n) { | |||||
| } | |||||
| else | |||||
| { | |||||
| BLASLONG n1 = n & -4; | |||||
| x[i] = 0.0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| while (j < n1) { | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| x[i] = da * x[i] ; | |||||
| x[i + inc_x] = da * x[i + inc_x]; | |||||
| x[i + 2 * inc_x] = da * x[i + 2 * inc_x]; | |||||
| x[i + 3 * inc_x] = da * x[i + 3 * inc_x]; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| x[i] = da * x[i]; | |||||
| x[i + inc_x] = da * x[i + inc_x]; | |||||
| x[i + 2 * inc_x] = da * x[i + 2 * inc_x]; | |||||
| x[i + 3 * inc_x] = da * x[i + 3 * inc_x]; | |||||
| } | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| while(j < n) | |||||
| { | |||||
| } | |||||
| x[i] = da * x[i] ; | |||||
| i += inc_x ; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| while (j < n) { | |||||
| x[i] = da * x[i]; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| } | } | ||||
| return 0; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2018,The OpenBLAS Project | |||||
| Copyright (c) 2013-2019,The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms,with or without | Redistribution and use in source and binary forms,with or without | ||||
| modification,are permitted provided that the following conditions are | modification,are permitted provided that the following conditions are | ||||
| @@ -27,144 +27,146 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| double dot; | |||||
| __asm__ volatile ( | |||||
| "vzero %%v0 \n\t" | |||||
| "srlg %%r0,%1,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1,1024(%%r1,%2) \n\t" | |||||
| "pfd 1,1024(%%r1,%3) \n\t" | |||||
| "vlef %%v16,0(%%r1,%2),0 \n\t" | |||||
| "vlef %%v16,4(%%r1,%2),2 \n\t" | |||||
| "vlef %%v17,8(%%r1,%2),0 \n\t" | |||||
| "vlef %%v17,12(%%r1,%2),2 \n\t" | |||||
| "vlef %%v18,16(%%r1,%2),0 \n\t" | |||||
| "vlef %%v18,20(%%r1,%2),2 \n\t" | |||||
| "vlef %%v19,24(%%r1,%2),0 \n\t" | |||||
| "vlef %%v19,28(%%r1,%2),2 \n\t" | |||||
| "vlef %%v20,32(%%r1,%2),0 \n\t" | |||||
| "vlef %%v20,36(%%r1,%2),2 \n\t" | |||||
| "vlef %%v21,40(%%r1,%2),0 \n\t" | |||||
| "vlef %%v21,44(%%r1,%2),2 \n\t" | |||||
| "vlef %%v22,48(%%r1,%2),0 \n\t" | |||||
| "vlef %%v22,52(%%r1,%2),2 \n\t" | |||||
| "vlef %%v23,56(%%r1,%2),0 \n\t" | |||||
| "vlef %%v23,60(%%r1,%2),2 \n\t" | |||||
| "vflls %%v16,%%v16 \n\t" | |||||
| "vflls %%v17,%%v17 \n\t" | |||||
| "vflls %%v18,%%v18 \n\t" | |||||
| "vflls %%v19,%%v19 \n\t" | |||||
| "vflls %%v20,%%v20 \n\t" | |||||
| "vflls %%v21,%%v21 \n\t" | |||||
| "vflls %%v22,%%v22 \n\t" | |||||
| "vflls %%v23,%%v23 \n\t" | |||||
| "vlef %%v24,0(%%r1,%3),0 \n\t" | |||||
| "vlef %%v24,4(%%r1,%3),2 \n\t" | |||||
| "vflls %%v24,%%v24 \n\t" | |||||
| "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" | |||||
| "vlef %%v25,8(%%r1,%3),0 \n\t" | |||||
| "vlef %%v25,12(%%r1,%3),2 \n\t" | |||||
| "vflls %%v25,%%v25 \n\t" | |||||
| "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" | |||||
| "vlef %%v26,16(%%r1,%3),0 \n\t" | |||||
| "vlef %%v26,20(%%r1,%3),2 \n\t" | |||||
| "vflls %%v26,%%v26 \n\t" | |||||
| "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" | |||||
| "vlef %%v27,24(%%r1,%3),0 \n\t" | |||||
| "vlef %%v27,28(%%r1,%3),2 \n\t" | |||||
| "vflls %%v27,%%v27 \n\t" | |||||
| "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" | |||||
| "vlef %%v28,32(%%r1,%3),0 \n\t" | |||||
| "vlef %%v28,36(%%r1,%3),2 \n\t" | |||||
| "vflls %%v28,%%v28 \n\t" | |||||
| "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" | |||||
| "vlef %%v29,40(%%r1,%3),0 \n\t" | |||||
| "vlef %%v29,44(%%r1,%3),2 \n\t" | |||||
| "vflls %%v29,%%v29 \n\t" | |||||
| "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" | |||||
| "vlef %%v30,48(%%r1,%3),0 \n\t" | |||||
| "vlef %%v30,52(%%r1,%3),2 \n\t" | |||||
| "vflls %%v30,%%v30 \n\t" | |||||
| "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" | |||||
| "vlef %%v31,56(%%r1,%3),0 \n\t" | |||||
| "vlef %%v31,60(%%r1,%3),2 \n\t" | |||||
| "vflls %%v31,%%v31 \n\t" | |||||
| "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" | |||||
| "agfi %%r1,64 \n\t" | |||||
| "brctg %%r0,0b \n\t" | |||||
| "vrepg %%v1,%%v0,1 \n\t" | |||||
| "adbr %%f0,%%f1 \n\t" | |||||
| "ldr %0,%%f0 " | |||||
| :"=f"(dot) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return dot; | |||||
| static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| double dot; | |||||
| __asm__("vzero %%v0\n\t" | |||||
| "vzero %%v1\n\t" | |||||
| "vzero %%v2\n\t" | |||||
| "vzero %%v3\n\t" | |||||
| "vzero %%v4\n\t" | |||||
| "vzero %%v5\n\t" | |||||
| "vzero %%v6\n\t" | |||||
| "vzero %%v7\n\t" | |||||
| "srlg %[n],%[n],4\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1,1024(%%r1,%[x])\n\t" | |||||
| "pfd 1,1024(%%r1,%[y])\n\t" | |||||
| "vlef %%v16,0(%%r1,%[x]),0\n\t" | |||||
| "vlef %%v16,4(%%r1,%[x]),2\n\t" | |||||
| "vlef %%v17,8(%%r1,%[x]),0\n\t" | |||||
| "vlef %%v17,12(%%r1,%[x]),2\n\t" | |||||
| "vlef %%v18,16(%%r1,%[x]),0\n\t" | |||||
| "vlef %%v18,20(%%r1,%[x]),2\n\t" | |||||
| "vlef %%v19,24(%%r1,%[x]),0\n\t" | |||||
| "vlef %%v19,28(%%r1,%[x]),2\n\t" | |||||
| "vlef %%v20,32(%%r1,%[x]),0\n\t" | |||||
| "vlef %%v20,36(%%r1,%[x]),2\n\t" | |||||
| "vlef %%v21,40(%%r1,%[x]),0\n\t" | |||||
| "vlef %%v21,44(%%r1,%[x]),2\n\t" | |||||
| "vlef %%v22,48(%%r1,%[x]),0\n\t" | |||||
| "vlef %%v22,52(%%r1,%[x]),2\n\t" | |||||
| "vlef %%v23,56(%%r1,%[x]),0\n\t" | |||||
| "vlef %%v23,60(%%r1,%[x]),2\n\t" | |||||
| "vflls %%v16,%%v16\n\t" | |||||
| "vflls %%v17,%%v17\n\t" | |||||
| "vflls %%v18,%%v18\n\t" | |||||
| "vflls %%v19,%%v19\n\t" | |||||
| "vflls %%v20,%%v20\n\t" | |||||
| "vflls %%v21,%%v21\n\t" | |||||
| "vflls %%v22,%%v22\n\t" | |||||
| "vflls %%v23,%%v23\n\t" | |||||
| "vlef %%v24,0(%%r1,%[y]),0\n\t" | |||||
| "vlef %%v24,4(%%r1,%[y]),2\n\t" | |||||
| "vflls %%v24,%%v24\n\t" | |||||
| "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" | |||||
| "vlef %%v25,8(%%r1,%[y]),0\n\t" | |||||
| "vlef %%v25,12(%%r1,%[y]),2\n\t" | |||||
| "vflls %%v25,%%v25\n\t" | |||||
| "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" | |||||
| "vlef %%v26,16(%%r1,%[y]),0\n\t" | |||||
| "vlef %%v26,20(%%r1,%[y]),2\n\t" | |||||
| "vflls %%v26,%%v26\n\t" | |||||
| "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" | |||||
| "vlef %%v27,24(%%r1,%[y]),0\n\t" | |||||
| "vlef %%v27,28(%%r1,%[y]),2\n\t" | |||||
| "vflls %%v27,%%v27\n\t" | |||||
| "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" | |||||
| "vlef %%v28,32(%%r1,%[y]),0\n\t" | |||||
| "vlef %%v28,36(%%r1,%[y]),2\n\t" | |||||
| "vflls %%v28,%%v28\n\t" | |||||
| "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" | |||||
| "vlef %%v29,40(%%r1,%[y]),0\n\t" | |||||
| "vlef %%v29,44(%%r1,%[y]),2\n\t" | |||||
| "vflls %%v29,%%v29\n\t" | |||||
| "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" | |||||
| "vlef %%v30,48(%%r1,%[y]),0\n\t" | |||||
| "vlef %%v30,52(%%r1,%[y]),2\n\t" | |||||
| "vflls %%v30,%%v30\n\t" | |||||
| "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" | |||||
| "vlef %%v31,56(%%r1,%[y]),0\n\t" | |||||
| "vlef %%v31,60(%%r1,%[y]),2\n\t" | |||||
| "vflls %%v31,%%v31\n\t" | |||||
| "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" | |||||
| "agfi %%r1,64\n\t" | |||||
| "brctg %[n],0b\n\t" | |||||
| "vfadb %%v0,%%v0,%%v1\n\t" | |||||
| "vfadb %%v0,%%v0,%%v2\n\t" | |||||
| "vfadb %%v0,%%v0,%%v3\n\t" | |||||
| "vfadb %%v0,%%v0,%%v4\n\t" | |||||
| "vfadb %%v0,%%v0,%%v5\n\t" | |||||
| "vfadb %%v0,%%v0,%%v6\n\t" | |||||
| "vfadb %%v0,%%v0,%%v7\n\t" | |||||
| "vrepg %%v1,%%v0,1\n\t" | |||||
| "adbr %%f0,%%f1\n\t" | |||||
| "ldr %[dot],%%f0" | |||||
| : [dot] "=f"(dot),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y),[y] "a"(y) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", | |||||
| "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |||||
| "v27", "v28", "v29", "v30", "v31"); | |||||
| return dot; | |||||
| } | } | ||||
| double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| double dot = 0.0 ; | |||||
| double dot = 0.0; | |||||
| if ( n <= 0 ) return(dot); | |||||
| if (n <= 0) | |||||
| return (dot); | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -16; | |||||
| BLASLONG n1 = n & -16; | |||||
| if ( n1 ) | |||||
| dot = dsdot_kernel_16(n1,x,y); | |||||
| if (n1) | |||||
| dot = dsdot_kernel_16(n1, x, y); | |||||
| i = n1; | |||||
| while(i < n) | |||||
| { | |||||
| i = n1; | |||||
| while (i < n) { | |||||
| dot += (double) y[i] * (double) x[i] ; | |||||
| i++ ; | |||||
| dot += (double) y[i] * (double) x[i]; | |||||
| i++; | |||||
| } | |||||
| return(dot); | |||||
| } | |||||
| return (dot); | |||||
| } | |||||
| } | |||||
| BLASLONG n1 = n & -2; | |||||
| BLASLONG n1 = n & -2; | |||||
| while (i < n1) { | |||||
| while(i < n1) | |||||
| { | |||||
| dot += (double) y[iy] * (double) x[ix]; | |||||
| dot += (double) y[iy + inc_y] * (double) x[ix + inc_x]; | |||||
| ix += inc_x * 2; | |||||
| iy += inc_y * 2; | |||||
| i += 2; | |||||
| dot += (double) y[iy] * (double) x[ix]; | |||||
| dot += (double) y[iy+inc_y] * (double) x[ix+inc_x]; | |||||
| ix += inc_x*2 ; | |||||
| iy += inc_y*2 ; | |||||
| i+=2 ; | |||||
| } | |||||
| } | |||||
| while (i < n) { | |||||
| while(i < n) | |||||
| { | |||||
| dot += (double) y[iy] * (double) x[ix]; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| dot += (double) y[iy] * (double) x[ix] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| return(dot); | |||||
| } | |||||
| return (dot); | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,136 +27,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%1) \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%1) \n\t" | |||||
| "vl %%v17, 16(%%r1,%1) \n\t" | |||||
| "vl %%v18, 32(%%r1,%1) \n\t" | |||||
| "vl %%v19, 48(%%r1,%1) \n\t" | |||||
| "vl %%v20, 64(%%r1,%1) \n\t" | |||||
| "vl %%v21, 80(%%r1,%1) \n\t" | |||||
| "vl %%v22, 96(%%r1,%1) \n\t" | |||||
| "vl %%v23, 112(%%r1,%1) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v28, 192(%%r1,%1) \n\t" | |||||
| "vl %%v29, 208(%%r1,%1) \n\t" | |||||
| "vl %%v30, 224(%%r1,%1) \n\t" | |||||
| "vl %%v31, 240(%%r1,%1) \n\t" | |||||
| "vl %%v0, 0(%%r1,%2) \n\t" | |||||
| "vl %%v1, 16(%%r1,%2) \n\t" | |||||
| "vl %%v2, 32(%%r1,%2) \n\t" | |||||
| "vl %%v3, 48(%%r1,%2) \n\t" | |||||
| "vl %%v4, 64(%%r1,%2) \n\t" | |||||
| "vl %%v5, 80(%%r1,%2) \n\t" | |||||
| "vl %%v6, 96(%%r1,%2) \n\t" | |||||
| "vl %%v7, 112(%%r1,%2) \n\t" | |||||
| "vst %%v0, 0(%%r1,%1) \n\t" | |||||
| "vst %%v1, 16(%%r1,%1) \n\t" | |||||
| "vst %%v2, 32(%%r1,%1) \n\t" | |||||
| "vst %%v3, 48(%%r1,%1) \n\t" | |||||
| "vst %%v4, 64(%%r1,%1) \n\t" | |||||
| "vst %%v5, 80(%%r1,%1) \n\t" | |||||
| "vst %%v6, 96(%%r1,%1) \n\t" | |||||
| "vst %%v7, 112(%%r1,%1) \n\t" | |||||
| "vl %%v0, 128(%%r1,%2) \n\t" | |||||
| "vl %%v1, 144(%%r1,%2) \n\t" | |||||
| "vl %%v2, 160(%%r1,%2) \n\t" | |||||
| "vl %%v3, 176(%%r1,%2) \n\t" | |||||
| "vl %%v4, 192(%%r1,%2) \n\t" | |||||
| "vl %%v5, 208(%%r1,%2) \n\t" | |||||
| "vl %%v6, 224(%%r1,%2) \n\t" | |||||
| "vl %%v7, 240(%%r1,%2) \n\t" | |||||
| "vst %%v0, 128(%%r1,%1) \n\t" | |||||
| "vst %%v1, 144(%%r1,%1) \n\t" | |||||
| "vst %%v2, 160(%%r1,%1) \n\t" | |||||
| "vst %%v3, 176(%%r1,%1) \n\t" | |||||
| "vst %%v4, 192(%%r1,%1) \n\t" | |||||
| "vst %%v5, 208(%%r1,%1) \n\t" | |||||
| "vst %%v6, 224(%%r1,%1) \n\t" | |||||
| "vst %%v7, 240(%%r1,%1) \n\t" | |||||
| "vst %%v16, 0(%%r1,%2) \n\t" | |||||
| "vst %%v17, 16(%%r1,%2) \n\t" | |||||
| "vst %%v18, 32(%%r1,%2) \n\t" | |||||
| "vst %%v19, 48(%%r1,%2) \n\t" | |||||
| "vst %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%2) \n\t" | |||||
| "vst %%v24, 128(%%r1,%2) \n\t" | |||||
| "vst %%v25, 144(%%r1,%2) \n\t" | |||||
| "vst %%v26, 160(%%r1,%2) \n\t" | |||||
| "vst %%v27, 176(%%r1,%2) \n\t" | |||||
| "vst %%v28, 192(%%r1,%2) \n\t" | |||||
| "vst %%v29, 208(%%r1,%2) \n\t" | |||||
| "vst %%v30, 224(%%r1,%2) \n\t" | |||||
| "vst %%v31, 240(%%r1,%2) \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| __asm__("srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" | |||||
| "pfd 2, 1024(%%r1,%[y])\n\t" | |||||
| "vl %%v16, 0(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 64(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 112(%%r1,%[x])\n\t" | |||||
| "vl %%v24, 128(%%r1,%[x])\n\t" | |||||
| "vl %%v25, 144(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 160(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 176(%%r1,%[x])\n\t" | |||||
| "vl %%v28, 192(%%r1,%[x])\n\t" | |||||
| "vl %%v29, 208(%%r1,%[x])\n\t" | |||||
| "vl %%v30, 224(%%r1,%[x])\n\t" | |||||
| "vl %%v31, 240(%%r1,%[x])\n\t" | |||||
| "vl %%v0, 0(%%r1,%[y])\n\t" | |||||
| "vl %%v1, 16(%%r1,%[y])\n\t" | |||||
| "vl %%v2, 32(%%r1,%[y])\n\t" | |||||
| "vl %%v3, 48(%%r1,%[y])\n\t" | |||||
| "vl %%v4, 64(%%r1,%[y])\n\t" | |||||
| "vl %%v5, 80(%%r1,%[y])\n\t" | |||||
| "vl %%v6, 96(%%r1,%[y])\n\t" | |||||
| "vl %%v7, 112(%%r1,%[y])\n\t" | |||||
| "vst %%v0, 0(%%r1,%[x])\n\t" | |||||
| "vst %%v1, 16(%%r1,%[x])\n\t" | |||||
| "vst %%v2, 32(%%r1,%[x])\n\t" | |||||
| "vst %%v3, 48(%%r1,%[x])\n\t" | |||||
| "vst %%v4, 64(%%r1,%[x])\n\t" | |||||
| "vst %%v5, 80(%%r1,%[x])\n\t" | |||||
| "vst %%v6, 96(%%r1,%[x])\n\t" | |||||
| "vst %%v7, 112(%%r1,%[x])\n\t" | |||||
| "vl %%v0, 128(%%r1,%[y])\n\t" | |||||
| "vl %%v1, 144(%%r1,%[y])\n\t" | |||||
| "vl %%v2, 160(%%r1,%[y])\n\t" | |||||
| "vl %%v3, 176(%%r1,%[y])\n\t" | |||||
| "vl %%v4, 192(%%r1,%[y])\n\t" | |||||
| "vl %%v5, 208(%%r1,%[y])\n\t" | |||||
| "vl %%v6, 224(%%r1,%[y])\n\t" | |||||
| "vl %%v7, 240(%%r1,%[y])\n\t" | |||||
| "vst %%v0, 128(%%r1,%[x])\n\t" | |||||
| "vst %%v1, 144(%%r1,%[x])\n\t" | |||||
| "vst %%v2, 160(%%r1,%[x])\n\t" | |||||
| "vst %%v3, 176(%%r1,%[x])\n\t" | |||||
| "vst %%v4, 192(%%r1,%[x])\n\t" | |||||
| "vst %%v5, 208(%%r1,%[x])\n\t" | |||||
| "vst %%v6, 224(%%r1,%[x])\n\t" | |||||
| "vst %%v7, 240(%%r1,%[x])\n\t" | |||||
| "vst %%v16, 0(%%r1,%[y])\n\t" | |||||
| "vst %%v17, 16(%%r1,%[y])\n\t" | |||||
| "vst %%v18, 32(%%r1,%[y])\n\t" | |||||
| "vst %%v19, 48(%%r1,%[y])\n\t" | |||||
| "vst %%v20, 64(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 80(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 96(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 112(%%r1,%[y])\n\t" | |||||
| "vst %%v24, 128(%%r1,%[y])\n\t" | |||||
| "vst %%v25, 144(%%r1,%[y])\n\t" | |||||
| "vst %%v26, 160(%%r1,%[y])\n\t" | |||||
| "vst %%v27, 176(%%r1,%[y])\n\t" | |||||
| "vst %%v28, 192(%%r1,%[y])\n\t" | |||||
| "vst %%v29, 208(%%r1,%[y])\n\t" | |||||
| "vst %%v30, 224(%%r1,%[y])\n\t" | |||||
| "vst %%v31, 240(%%r1,%[y])\n\t" | |||||
| "agfi %%r1,256\n\t" | |||||
| "brctg %[n],0b" | |||||
| : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) | |||||
| : [x] "a"(x),[y] "a"(y) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", | |||||
| "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |||||
| "v27", "v28", "v29", "v30", "v31"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT temp; | |||||
| if ( n <= 0 ) return(0); | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||||
| BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||||
| BLASLONG dummy2) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| FLOAT temp; | |||||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||||
| { | |||||
| if (n <= 0) | |||||
| return (0); | |||||
| BLASLONG n1 = n & -32; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| dswap_kernel_32(n1, x, y); | |||||
| i=n1; | |||||
| } | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| while(i < n) | |||||
| { | |||||
| temp = y[i]; | |||||
| y[i] = x[i] ; | |||||
| x[i] = temp; | |||||
| i++ ; | |||||
| } | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| dswap_kernel_32(n1, x, y); | |||||
| i = n1; | |||||
| } | |||||
| while (i < n) { | |||||
| temp = y[i]; | |||||
| y[i] = x[i]; | |||||
| x[i] = temp; | |||||
| i++; | |||||
| } | } | ||||
| else | |||||
| { | |||||
| while(i < n) | |||||
| { | |||||
| temp = y[iy]; | |||||
| y[iy] = x[ix] ; | |||||
| x[ix] = temp; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } else { | |||||
| } | |||||
| while (i < n) { | |||||
| temp = y[iy]; | |||||
| y[iy] = x[ix]; | |||||
| x[ix] = temp; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| } | } | ||||
| return(0); | |||||
| } | |||||
| return (0); | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| Copyright (c) 2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,285 +27,276 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| #define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) | |||||
| static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) | |||||
| { | |||||
| BLASLONG iamax; | |||||
| #define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) | |||||
| __asm__ volatile ( | |||||
| "vlef %%v0,0(%3),0 \n\t" | |||||
| "vlef %%v1,4(%3),0 \n\t" | |||||
| "vlef %%v0,8(%3),1 \n\t" | |||||
| "vlef %%v1,12(%3),1 \n\t" | |||||
| "vlef %%v0,16(%3),2 \n\t" | |||||
| "vlef %%v1,20(%3),2 \n\t" | |||||
| "vlef %%v0,24(%3),3 \n\t" | |||||
| "vlef %%v1,28(%3),3 \n\t" | |||||
| "vflpsb %%v0,%%v0 \n\t" | |||||
| "vflpsb %%v1,%%v1 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v1 \n\t" | |||||
| "vleig %%v1,0,0 \n\t" | |||||
| "vleig %%v1,2,1 \n\t" | |||||
| "vleig %%v2,1,0 \n\t" | |||||
| "vleig %%v2,3,1 \n\t" | |||||
| "vrepig %%v3,16 \n\t" | |||||
| "vzero %%v4 \n\t" | |||||
| "vleib %%v9,0,0 \n\t" | |||||
| "vleib %%v9,1,1 \n\t" | |||||
| "vleib %%v9,2,2 \n\t" | |||||
| "vleib %%v9,3,3 \n\t" | |||||
| "vleib %%v9,8,4 \n\t" | |||||
| "vleib %%v9,9,5 \n\t" | |||||
| "vleib %%v9,10,6 \n\t" | |||||
| "vleib %%v9,11,7 \n\t" | |||||
| "vleib %%v9,16,8 \n\t" | |||||
| "vleib %%v9,17,9 \n\t" | |||||
| "vleib %%v9,18,10 \n\t" | |||||
| "vleib %%v9,19,11 \n\t" | |||||
| "vleib %%v9,24,12 \n\t" | |||||
| "vleib %%v9,25,13 \n\t" | |||||
| "vleib %%v9,26,14 \n\t" | |||||
| "vleib %%v9,27,15 \n\t" | |||||
| "vleif %%v24,0,0 \n\t" | |||||
| "vleif %%v24,1,1 \n\t" | |||||
| "vleif %%v24,2,2 \n\t" | |||||
| "vleif %%v24,3,3 \n\t" | |||||
| "vleif %%v25,4,0 \n\t" | |||||
| "vleif %%v25,5,1 \n\t" | |||||
| "vleif %%v25,6,2 \n\t" | |||||
| "vleif %%v25,7,3 \n\t" | |||||
| "vleif %%v26,8,0 \n\t" | |||||
| "vleif %%v26,9,1 \n\t" | |||||
| "vleif %%v26,10,2 \n\t" | |||||
| "vleif %%v26,11,3 \n\t" | |||||
| "vleif %%v27,12,0 \n\t" | |||||
| "vleif %%v27,13,1 \n\t" | |||||
| "vleif %%v27,14,2 \n\t" | |||||
| "vleif %%v27,15,3 \n\t" | |||||
| "srlg %%r0,%2,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%3) \n\t" | |||||
| static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { | |||||
| BLASLONG iamax; | |||||
| "vl %%v16,0(%%r1,%3) \n\t" | |||||
| "vl %%v28,16(%%r1,%3) \n\t" | |||||
| "vpkg %%v17,%%v16,%%v28 \n\t" | |||||
| "vperm %%v16,%%v16,%%v28,%%v9 \n\t" | |||||
| __asm__("vlef %%v0,0(%[x]),0\n\t" | |||||
| "vlef %%v1,4(%[x]),0\n\t" | |||||
| "vlef %%v0,8(%[x]),1\n\t" | |||||
| "vlef %%v1,12(%[x]),1\n\t" | |||||
| "vlef %%v0,16(%[x]),2\n\t" | |||||
| "vlef %%v1,20(%[x]),2\n\t" | |||||
| "vlef %%v0,24(%[x]),3\n\t" | |||||
| "vlef %%v1,28(%[x]),3\n\t" | |||||
| "vflpsb %%v0,%%v0\n\t" | |||||
| "vflpsb %%v1,%%v1\n\t" | |||||
| "vfasb %%v0,%%v0,%%v1\n\t" | |||||
| "vleig %%v1,0,0\n\t" | |||||
| "vleig %%v1,2,1\n\t" | |||||
| "vleig %%v2,1,0\n\t" | |||||
| "vleig %%v2,3,1\n\t" | |||||
| "vrepig %%v3,16\n\t" | |||||
| "vzero %%v4\n\t" | |||||
| "vleib %%v9,0,0\n\t" | |||||
| "vleib %%v9,1,1\n\t" | |||||
| "vleib %%v9,2,2\n\t" | |||||
| "vleib %%v9,3,3\n\t" | |||||
| "vleib %%v9,8,4\n\t" | |||||
| "vleib %%v9,9,5\n\t" | |||||
| "vleib %%v9,10,6\n\t" | |||||
| "vleib %%v9,11,7\n\t" | |||||
| "vleib %%v9,16,8\n\t" | |||||
| "vleib %%v9,17,9\n\t" | |||||
| "vleib %%v9,18,10\n\t" | |||||
| "vleib %%v9,19,11\n\t" | |||||
| "vleib %%v9,24,12\n\t" | |||||
| "vleib %%v9,25,13\n\t" | |||||
| "vleib %%v9,26,14\n\t" | |||||
| "vleib %%v9,27,15\n\t" | |||||
| "vleif %%v24,0,0\n\t" | |||||
| "vleif %%v24,1,1\n\t" | |||||
| "vleif %%v24,2,2\n\t" | |||||
| "vleif %%v24,3,3\n\t" | |||||
| "vleif %%v25,4,0\n\t" | |||||
| "vleif %%v25,5,1\n\t" | |||||
| "vleif %%v25,6,2\n\t" | |||||
| "vleif %%v25,7,3\n\t" | |||||
| "vleif %%v26,8,0\n\t" | |||||
| "vleif %%v26,9,1\n\t" | |||||
| "vleif %%v26,10,2\n\t" | |||||
| "vleif %%v26,11,3\n\t" | |||||
| "vleif %%v27,12,0\n\t" | |||||
| "vleif %%v27,13,1\n\t" | |||||
| "vleif %%v27,14,2\n\t" | |||||
| "vleif %%v27,15,3\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v28,16(%%r1,%[x])\n\t" | |||||
| "vpkg %%v17,%%v16,%%v28\n\t" | |||||
| "vperm %%v16,%%v16,%%v28,%%v9\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v29,48(%%r1,%[x])\n\t" | |||||
| "vpkg %%v19,%%v18,%%v29\n\t" | |||||
| "vperm %%v18,%%v18,%%v29,%%v9\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v30,80(%%r1,%[x])\n\t" | |||||
| "vpkg %%v21,%%v20,%%v30\n\t" | |||||
| "vperm %%v20,%%v20,%%v30,%%v9\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v31,112(%%r1,%[x])\n\t" | |||||
| "vpkg %%v23,%%v22,%%v31\n\t" | |||||
| "vperm %%v22,%%v22,%%v31,%%v9\n\t" | |||||
| "vflpsb %%v16, %%v16\n\t" | |||||
| "vflpsb %%v17, %%v17\n\t" | |||||
| "vflpsb %%v18, %%v18\n\t" | |||||
| "vflpsb %%v19, %%v19\n\t" | |||||
| "vflpsb %%v20, %%v20\n\t" | |||||
| "vflpsb %%v21, %%v21\n\t" | |||||
| "vflpsb %%v22, %%v22\n\t" | |||||
| "vflpsb %%v23, %%v23\n\t" | |||||
| "vfasb %%v16,%%v16,%%v17\n\t" | |||||
| "vfasb %%v17,%%v18,%%v19\n\t" | |||||
| "vfasb %%v18,%%v20,%%v21\n\t" | |||||
| "vfasb %%v19,%%v22,%%v23\n\t" | |||||
| "vfchesb %%v5,%%v16,%%v17\n\t" | |||||
| "vfchesb %%v6,%%v18,%%v19\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5\n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6\n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6\n\t" | |||||
| "vfchesb %%v18,%%v16,%%v17\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18\n\t" | |||||
| "vsegf %%v6,%%v5\n\t" | |||||
| "vesrlg %%v5,%%v5,32\n\t" | |||||
| "vag %%v5,%%v5,%%v4\n\t" | |||||
| "vag %%v6,%%v6,%%v4\n\t" | |||||
| "vfchesb %%v7,%%v0,%%v16\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7\n\t" | |||||
| "vsegf %%v8,%%v7\n\t" | |||||
| "vesrlg %%v7,%%v7,32\n\t" | |||||
| "vsegf %%v7,%%v7\n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7\n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vl %%v16,128(%%r1,%[x])\n\t" | |||||
| "vl %%v28,144(%%r1,%[x])\n\t" | |||||
| "vpkg %%v17,%%v16,%%v28\n\t" | |||||
| "vperm %%v16,%%v16,%%v28,%%v9\n\t" | |||||
| "vl %%v18,160(%%r1,%[x])\n\t" | |||||
| "vl %%v29,176(%%r1,%[x])\n\t" | |||||
| "vpkg %%v19,%%v18,%%v29\n\t" | |||||
| "vperm %%v18,%%v18,%%v29,%%v9\n\t" | |||||
| "vl %%v20,192(%%r1,%[x])\n\t" | |||||
| "vl %%v30,208(%%r1,%[x])\n\t" | |||||
| "vpkg %%v21,%%v20,%%v30\n\t" | |||||
| "vperm %%v20,%%v20,%%v30,%%v9\n\t" | |||||
| "vl %%v22,224(%%r1,%[x])\n\t" | |||||
| "vl %%v31,240(%%r1,%[x])\n\t" | |||||
| "vpkg %%v23,%%v22,%%v31\n\t" | |||||
| "vperm %%v22,%%v22,%%v31,%%v9\n\t" | |||||
| "vflpsb %%v16, %%v16\n\t" | |||||
| "vflpsb %%v17, %%v17\n\t" | |||||
| "vflpsb %%v18, %%v18\n\t" | |||||
| "vflpsb %%v19, %%v19\n\t" | |||||
| "vflpsb %%v20, %%v20\n\t" | |||||
| "vflpsb %%v21, %%v21\n\t" | |||||
| "vflpsb %%v22, %%v22\n\t" | |||||
| "vflpsb %%v23, %%v23\n\t" | |||||
| "vfasb %%v16,%%v16,%%v17\n\t" | |||||
| "vfasb %%v17,%%v18,%%v19\n\t" | |||||
| "vfasb %%v18,%%v20,%%v21\n\t" | |||||
| "vfasb %%v19,%%v22,%%v23\n\t" | |||||
| "vfchesb %%v5,%%v16,%%v17\n\t" | |||||
| "vfchesb %%v6,%%v18,%%v19\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5\n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6\n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6\n\t" | |||||
| "vfchesb %%v18,%%v16,%%v17\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18\n\t" | |||||
| "vsegf %%v6,%%v5\n\t" | |||||
| "vesrlg %%v5,%%v5,32\n\t" | |||||
| "vag %%v5,%%v5,%%v4\n\t" | |||||
| "vag %%v6,%%v6,%%v4\n\t" | |||||
| "vfchesb %%v7,%%v0,%%v16\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7\n\t" | |||||
| "vsegf %%v8,%%v7\n\t" | |||||
| "vesrlg %%v7,%%v7,32\n\t" | |||||
| "vsegf %%v7,%%v7\n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7\n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "veslg %%v3,%%v0,32\n\t" | |||||
| "vfchsb %%v4,%%v0,%%v3\n\t" | |||||
| "vchlg %%v5,%%v2,%%v1\n\t" | |||||
| "vfcesb %%v6,%%v0,%%v3\n\t" | |||||
| "vn %%v5,%%v5,%%v6\n\t" | |||||
| "vo %%v4,%%v4,%%v5\n\t" | |||||
| "vsel %%v0,%%v0,%%v3,%%v4\n\t" | |||||
| "vesrlg %%v4,%%v4,32\n\t" | |||||
| "vsegf %%v4,%%v4\n\t" | |||||
| "vsel %%v1,%%v1,%%v2,%%v4\n\t" | |||||
| "vrepf %%v2,%%v0,2\n\t" | |||||
| "vrepg %%v3,%%v1,1\n\t" | |||||
| "wfcsb %%v2,%%v0\n\t" | |||||
| "jne 1f\n\t" | |||||
| "vstef %%v0,%[amax],0\n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3\n\t" | |||||
| "vlgvg %[iamax],%%v0,0\n\t" | |||||
| "j 2f\n\t" | |||||
| "1:\n\t" | |||||
| "wfchsb %%v4,%%v2,%%v0\n\t" | |||||
| "vesrlg %%v4,%%v4,32\n\t" | |||||
| "vsegf %%v4,%%v4\n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4\n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4\n\t" | |||||
| "ste %%f0,%[amax]\n\t" | |||||
| "vlgvg %[iamax],%%v1,0\n\t" | |||||
| "2:\n\t" | |||||
| "nop" | |||||
| : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", | |||||
| "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", | |||||
| "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| "vl %%v18,32(%%r1,%3) \n\t" | |||||
| "vl %%v29,48(%%r1,%3) \n\t" | |||||
| "vpkg %%v19,%%v18,%%v29 \n\t" | |||||
| "vperm %%v18,%%v18,%%v29,%%v9 \n\t" | |||||
| "vl %%v20,64(%%r1,%3) \n\t" | |||||
| "vl %%v30,80(%%r1,%3) \n\t" | |||||
| "vpkg %%v21,%%v20,%%v30 \n\t" | |||||
| "vperm %%v20,%%v20,%%v30,%%v9 \n\t" | |||||
| "vl %%v22,96(%%r1,%3) \n\t" | |||||
| "vl %%v31,112(%%r1,%3) \n\t" | |||||
| "vpkg %%v23,%%v22,%%v31 \n\t" | |||||
| "vperm %%v22,%%v22,%%v31,%%v9 \n\t" | |||||
| "vflpsb %%v16, %%v16 \n\t" | |||||
| "vflpsb %%v17, %%v17 \n\t" | |||||
| "vflpsb %%v18, %%v18 \n\t" | |||||
| "vflpsb %%v19, %%v19 \n\t" | |||||
| "vflpsb %%v20, %%v20 \n\t" | |||||
| "vflpsb %%v21, %%v21 \n\t" | |||||
| "vflpsb %%v22, %%v22 \n\t" | |||||
| "vflpsb %%v23, %%v23 \n\t" | |||||
| "vfasb %%v16,%%v16,%%v17 \n\t" | |||||
| "vfasb %%v17,%%v18,%%v19 \n\t" | |||||
| "vfasb %%v18,%%v20,%%v21 \n\t" | |||||
| "vfasb %%v19,%%v22,%%v23 \n\t" | |||||
| "vfchesb %%v5,%%v16,%%v17 \n\t" | |||||
| "vfchesb %%v6,%%v18,%%v19 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5 \n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6 \n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6 \n\t" | |||||
| "vfchesb %%v18,%%v16,%%v17 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18 \n\t" | |||||
| "vsegf %%v6,%%v5 \n\t" | |||||
| "vesrlg %%v5,%%v5,32 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vag %%v6,%%v6,%%v4 \n\t" | |||||
| "vfchesb %%v7,%%v0,%%v16 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7 \n\t" | |||||
| "vsegf %%v8,%%v7 \n\t" | |||||
| "vesrlg %%v7,%%v7,32 \n\t" | |||||
| "vsegf %%v7,%%v7 \n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7 \n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vl %%v16,128(%%r1,%3) \n\t" | |||||
| "vl %%v28,144(%%r1,%3) \n\t" | |||||
| "vpkg %%v17,%%v16,%%v28 \n\t" | |||||
| "vperm %%v16,%%v16,%%v28,%%v9 \n\t" | |||||
| "vl %%v18,160(%%r1,%3) \n\t" | |||||
| "vl %%v29,176(%%r1,%3) \n\t" | |||||
| "vpkg %%v19,%%v18,%%v29 \n\t" | |||||
| "vperm %%v18,%%v18,%%v29,%%v9 \n\t" | |||||
| "vl %%v20,192(%%r1,%3) \n\t" | |||||
| "vl %%v30,208(%%r1,%3) \n\t" | |||||
| "vpkg %%v21,%%v20,%%v30 \n\t" | |||||
| "vperm %%v20,%%v20,%%v30,%%v9 \n\t" | |||||
| "vl %%v22,224(%%r1,%3) \n\t" | |||||
| "vl %%v31,240(%%r1,%3) \n\t" | |||||
| "vpkg %%v23,%%v22,%%v31 \n\t" | |||||
| "vperm %%v22,%%v22,%%v31,%%v9 \n\t" | |||||
| "vflpsb %%v16, %%v16 \n\t" | |||||
| "vflpsb %%v17, %%v17 \n\t" | |||||
| "vflpsb %%v18, %%v18 \n\t" | |||||
| "vflpsb %%v19, %%v19 \n\t" | |||||
| "vflpsb %%v20, %%v20 \n\t" | |||||
| "vflpsb %%v21, %%v21 \n\t" | |||||
| "vflpsb %%v22, %%v22 \n\t" | |||||
| "vflpsb %%v23, %%v23 \n\t" | |||||
| "vfasb %%v16,%%v16,%%v17 \n\t" | |||||
| "vfasb %%v17,%%v18,%%v19 \n\t" | |||||
| "vfasb %%v18,%%v20,%%v21 \n\t" | |||||
| "vfasb %%v19,%%v22,%%v23 \n\t" | |||||
| "vfchesb %%v5,%%v16,%%v17 \n\t" | |||||
| "vfchesb %%v6,%%v18,%%v19 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5 \n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6 \n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6 \n\t" | |||||
| return iamax; | |||||
| } | |||||
| "vfchesb %%v18,%%v16,%%v17 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18 \n\t" | |||||
| "vsegf %%v6,%%v5 \n\t" | |||||
| "vesrlg %%v5,%%v5,32 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vag %%v6,%%v6,%%v4 \n\t" | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT maxf = 0; | |||||
| BLASLONG max = 0; | |||||
| BLASLONG inc_x2; | |||||
| "vfchesb %%v7,%%v0,%%v16 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7 \n\t" | |||||
| "vsegf %%v8,%%v7 \n\t" | |||||
| "vesrlg %%v7,%%v7,32 \n\t" | |||||
| "vsegf %%v7,%%v7 \n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7 \n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (max); | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| if (inc_x == 1) { | |||||
| "veslg %%v3,%%v0,32 \n\t" | |||||
| "vfchsb %%v4,%%v0,%%v3 \n\t" | |||||
| "vchlg %%v5,%%v2,%%v1 \n\t" | |||||
| "vfcesb %%v6,%%v0,%%v3 \n\t" | |||||
| "vn %%v5,%%v5,%%v6 \n\t" | |||||
| "vo %%v4,%%v4,%%v5 \n\t" | |||||
| "vsel %%v0,%%v0,%%v3,%%v4 \n\t" | |||||
| "vesrlg %%v4,%%v4,32 \n\t" | |||||
| "vsegf %%v4,%%v4 \n\t" | |||||
| "vsel %%v1,%%v1,%%v2,%%v4 \n\t" | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| "vrepf %%v2,%%v0,2 \n\t" | |||||
| "vrepg %%v3,%%v1,1 \n\t" | |||||
| "wfcsb %%v2,%%v0 \n\t" | |||||
| "jne 1f \n\t" | |||||
| "vstef %%v0,%1,0 \n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3 \n\t" | |||||
| "vlgvg %0,%%v0,0 \n\t" | |||||
| "j 2f \n\t" | |||||
| "1: \n\t" | |||||
| "wfchsb %%v4,%%v2,%%v0 \n\t" | |||||
| "vesrlg %%v4,%%v4,32 \n\t" | |||||
| "vsegf %%v4,%%v4 \n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4 \n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4 \n\t" | |||||
| "ste %%f0,%1 \n\t" | |||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "2: \n\t" | |||||
| "nop " | |||||
| :"=r"(iamax),"=m"(*amax) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| max = icamax_kernel_32(n1, x, &maxf); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| } else { | |||||
| maxf = CABS1(x, 0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return iamax; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) > maxf) { | |||||
| max = i; | |||||
| maxf = CABS1(x, ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (max + 1); | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT maxf = 0; | |||||
| BLASLONG max = 0; | |||||
| BLASLONG inc_x2; | |||||
| } else { | |||||
| if (n <= 0 || inc_x <= 0) return(max); | |||||
| if (inc_x == 1) { | |||||
| max = 0; | |||||
| maxf = CABS1(x, 0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| max = icamax_kernel_32(n1, x, &maxf); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| if (CABS1(x, ix) > maxf) { | |||||
| max = i; | |||||
| maxf = CABS1(x, ix); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2) > maxf) { | |||||
| max = i + 1; | |||||
| maxf = CABS1(x, ix + inc_x2); | |||||
| } | |||||
| if (CABS1(x, ix + 2 * inc_x2) > maxf) { | |||||
| max = i + 2; | |||||
| maxf = CABS1(x, ix + 2 * inc_x2); | |||||
| } | |||||
| if (CABS1(x, ix + 3 * inc_x2) > maxf) { | |||||
| max = i + 3; | |||||
| maxf = CABS1(x, ix + 3 * inc_x2); | |||||
| } | } | ||||
| else | |||||
| { | |||||
| maxf = CABS1(x,0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) > maxf ) | |||||
| { | |||||
| max = i; | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (max + 1); | |||||
| ix += inc_x2 * 4; | |||||
| } else { | |||||
| max = 0; | |||||
| maxf = CABS1(x,0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| i += 4; | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) > maxf ) | |||||
| { | |||||
| max = i; | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | } | ||||
| return (max + 1); | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) > maxf) { | |||||
| max = i; | |||||
| maxf = CABS1(x, ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | } | ||||
| return (max + 1); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| Copyright (c) 2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,285 +27,276 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| #define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) | |||||
| static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) | |||||
| { | |||||
| BLASLONG iamin; | |||||
| #define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) | |||||
| __asm__ volatile ( | |||||
| "vlef %%v0,0(%3),0 \n\t" | |||||
| "vlef %%v1,4(%3),0 \n\t" | |||||
| "vlef %%v0,8(%3),1 \n\t" | |||||
| "vlef %%v1,12(%3),1 \n\t" | |||||
| "vlef %%v0,16(%3),2 \n\t" | |||||
| "vlef %%v1,20(%3),2 \n\t" | |||||
| "vlef %%v0,24(%3),3 \n\t" | |||||
| "vlef %%v1,28(%3),3 \n\t" | |||||
| "vflpsb %%v0,%%v0 \n\t" | |||||
| "vflpsb %%v1,%%v1 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v1 \n\t" | |||||
| "vleig %%v1,0,0 \n\t" | |||||
| "vleig %%v1,2,1 \n\t" | |||||
| "vleig %%v2,1,0 \n\t" | |||||
| "vleig %%v2,3,1 \n\t" | |||||
| "vrepig %%v3,16 \n\t" | |||||
| "vzero %%v4 \n\t" | |||||
| "vleib %%v9,0,0 \n\t" | |||||
| "vleib %%v9,1,1 \n\t" | |||||
| "vleib %%v9,2,2 \n\t" | |||||
| "vleib %%v9,3,3 \n\t" | |||||
| "vleib %%v9,8,4 \n\t" | |||||
| "vleib %%v9,9,5 \n\t" | |||||
| "vleib %%v9,10,6 \n\t" | |||||
| "vleib %%v9,11,7 \n\t" | |||||
| "vleib %%v9,16,8 \n\t" | |||||
| "vleib %%v9,17,9 \n\t" | |||||
| "vleib %%v9,18,10 \n\t" | |||||
| "vleib %%v9,19,11 \n\t" | |||||
| "vleib %%v9,24,12 \n\t" | |||||
| "vleib %%v9,25,13 \n\t" | |||||
| "vleib %%v9,26,14 \n\t" | |||||
| "vleib %%v9,27,15 \n\t" | |||||
| "vleif %%v24,0,0 \n\t" | |||||
| "vleif %%v24,1,1 \n\t" | |||||
| "vleif %%v24,2,2 \n\t" | |||||
| "vleif %%v24,3,3 \n\t" | |||||
| "vleif %%v25,4,0 \n\t" | |||||
| "vleif %%v25,5,1 \n\t" | |||||
| "vleif %%v25,6,2 \n\t" | |||||
| "vleif %%v25,7,3 \n\t" | |||||
| "vleif %%v26,8,0 \n\t" | |||||
| "vleif %%v26,9,1 \n\t" | |||||
| "vleif %%v26,10,2 \n\t" | |||||
| "vleif %%v26,11,3 \n\t" | |||||
| "vleif %%v27,12,0 \n\t" | |||||
| "vleif %%v27,13,1 \n\t" | |||||
| "vleif %%v27,14,2 \n\t" | |||||
| "vleif %%v27,15,3 \n\t" | |||||
| "srlg %%r0,%2,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%3) \n\t" | |||||
| static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { | |||||
| BLASLONG iamin; | |||||
| "vl %%v16,0(%%r1,%3) \n\t" | |||||
| "vl %%v28,16(%%r1,%3) \n\t" | |||||
| "vpkg %%v17,%%v16,%%v28 \n\t" | |||||
| "vperm %%v16,%%v16,%%v28,%%v9 \n\t" | |||||
| __asm__("vlef %%v0,0(%[x]),0\n\t" | |||||
| "vlef %%v1,4(%[x]),0\n\t" | |||||
| "vlef %%v0,8(%[x]),1\n\t" | |||||
| "vlef %%v1,12(%[x]),1\n\t" | |||||
| "vlef %%v0,16(%[x]),2\n\t" | |||||
| "vlef %%v1,20(%[x]),2\n\t" | |||||
| "vlef %%v0,24(%[x]),3\n\t" | |||||
| "vlef %%v1,28(%[x]),3\n\t" | |||||
| "vflpsb %%v0,%%v0\n\t" | |||||
| "vflpsb %%v1,%%v1\n\t" | |||||
| "vfasb %%v0,%%v0,%%v1\n\t" | |||||
| "vleig %%v1,0,0\n\t" | |||||
| "vleig %%v1,2,1\n\t" | |||||
| "vleig %%v2,1,0\n\t" | |||||
| "vleig %%v2,3,1\n\t" | |||||
| "vrepig %%v3,16\n\t" | |||||
| "vzero %%v4\n\t" | |||||
| "vleib %%v9,0,0\n\t" | |||||
| "vleib %%v9,1,1\n\t" | |||||
| "vleib %%v9,2,2\n\t" | |||||
| "vleib %%v9,3,3\n\t" | |||||
| "vleib %%v9,8,4\n\t" | |||||
| "vleib %%v9,9,5\n\t" | |||||
| "vleib %%v9,10,6\n\t" | |||||
| "vleib %%v9,11,7\n\t" | |||||
| "vleib %%v9,16,8\n\t" | |||||
| "vleib %%v9,17,9\n\t" | |||||
| "vleib %%v9,18,10\n\t" | |||||
| "vleib %%v9,19,11\n\t" | |||||
| "vleib %%v9,24,12\n\t" | |||||
| "vleib %%v9,25,13\n\t" | |||||
| "vleib %%v9,26,14\n\t" | |||||
| "vleib %%v9,27,15\n\t" | |||||
| "vleif %%v24,0,0\n\t" | |||||
| "vleif %%v24,1,1\n\t" | |||||
| "vleif %%v24,2,2\n\t" | |||||
| "vleif %%v24,3,3\n\t" | |||||
| "vleif %%v25,4,0\n\t" | |||||
| "vleif %%v25,5,1\n\t" | |||||
| "vleif %%v25,6,2\n\t" | |||||
| "vleif %%v25,7,3\n\t" | |||||
| "vleif %%v26,8,0\n\t" | |||||
| "vleif %%v26,9,1\n\t" | |||||
| "vleif %%v26,10,2\n\t" | |||||
| "vleif %%v26,11,3\n\t" | |||||
| "vleif %%v27,12,0\n\t" | |||||
| "vleif %%v27,13,1\n\t" | |||||
| "vleif %%v27,14,2\n\t" | |||||
| "vleif %%v27,15,3\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v28,16(%%r1,%[x])\n\t" | |||||
| "vpkg %%v17,%%v16,%%v28\n\t" | |||||
| "vperm %%v16,%%v16,%%v28,%%v9\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v29,48(%%r1,%[x])\n\t" | |||||
| "vpkg %%v19,%%v18,%%v29\n\t" | |||||
| "vperm %%v18,%%v18,%%v29,%%v9\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v30,80(%%r1,%[x])\n\t" | |||||
| "vpkg %%v21,%%v20,%%v30\n\t" | |||||
| "vperm %%v20,%%v20,%%v30,%%v9\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v31,112(%%r1,%[x])\n\t" | |||||
| "vpkg %%v23,%%v22,%%v31\n\t" | |||||
| "vperm %%v22,%%v22,%%v31,%%v9\n\t" | |||||
| "vflpsb %%v16, %%v16\n\t" | |||||
| "vflpsb %%v17, %%v17\n\t" | |||||
| "vflpsb %%v18, %%v18\n\t" | |||||
| "vflpsb %%v19, %%v19\n\t" | |||||
| "vflpsb %%v20, %%v20\n\t" | |||||
| "vflpsb %%v21, %%v21\n\t" | |||||
| "vflpsb %%v22, %%v22\n\t" | |||||
| "vflpsb %%v23, %%v23\n\t" | |||||
| "vfasb %%v16,%%v16,%%v17\n\t" | |||||
| "vfasb %%v17,%%v18,%%v19\n\t" | |||||
| "vfasb %%v18,%%v20,%%v21\n\t" | |||||
| "vfasb %%v19,%%v22,%%v23\n\t" | |||||
| "vfchesb %%v5,%%v17,%%v16\n\t" | |||||
| "vfchesb %%v6,%%v19,%%v18\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5\n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6\n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6\n\t" | |||||
| "vfchesb %%v18,%%v17,%%v16\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18\n\t" | |||||
| "vsegf %%v6,%%v5\n\t" | |||||
| "vesrlg %%v5,%%v5,32\n\t" | |||||
| "vag %%v5,%%v5,%%v4\n\t" | |||||
| "vag %%v6,%%v6,%%v4\n\t" | |||||
| "vfchesb %%v7,%%v16,%%v0\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7\n\t" | |||||
| "vsegf %%v8,%%v7\n\t" | |||||
| "vesrlg %%v7,%%v7,32\n\t" | |||||
| "vsegf %%v7,%%v7\n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7\n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vl %%v16,128(%%r1,%[x])\n\t" | |||||
| "vl %%v28,144(%%r1,%[x])\n\t" | |||||
| "vpkg %%v17,%%v16,%%v28\n\t" | |||||
| "vperm %%v16,%%v16,%%v28,%%v9\n\t" | |||||
| "vl %%v18,160(%%r1,%[x])\n\t" | |||||
| "vl %%v29,176(%%r1,%[x])\n\t" | |||||
| "vpkg %%v19,%%v18,%%v29\n\t" | |||||
| "vperm %%v18,%%v18,%%v29,%%v9\n\t" | |||||
| "vl %%v20,192(%%r1,%[x])\n\t" | |||||
| "vl %%v30,208(%%r1,%[x])\n\t" | |||||
| "vpkg %%v21,%%v20,%%v30\n\t" | |||||
| "vperm %%v20,%%v20,%%v30,%%v9\n\t" | |||||
| "vl %%v22,224(%%r1,%[x])\n\t" | |||||
| "vl %%v31,240(%%r1,%[x])\n\t" | |||||
| "vpkg %%v23,%%v22,%%v31\n\t" | |||||
| "vperm %%v22,%%v22,%%v31,%%v9\n\t" | |||||
| "vflpsb %%v16, %%v16\n\t" | |||||
| "vflpsb %%v17, %%v17\n\t" | |||||
| "vflpsb %%v18, %%v18\n\t" | |||||
| "vflpsb %%v19, %%v19\n\t" | |||||
| "vflpsb %%v20, %%v20\n\t" | |||||
| "vflpsb %%v21, %%v21\n\t" | |||||
| "vflpsb %%v22, %%v22\n\t" | |||||
| "vflpsb %%v23, %%v23\n\t" | |||||
| "vfasb %%v16,%%v16,%%v17\n\t" | |||||
| "vfasb %%v17,%%v18,%%v19\n\t" | |||||
| "vfasb %%v18,%%v20,%%v21\n\t" | |||||
| "vfasb %%v19,%%v22,%%v23\n\t" | |||||
| "vfchesb %%v5,%%v17,%%v16\n\t" | |||||
| "vfchesb %%v6,%%v19,%%v18\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5\n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6\n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6\n\t" | |||||
| "vfchesb %%v18,%%v17,%%v16\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18\n\t" | |||||
| "vsegf %%v6,%%v5\n\t" | |||||
| "vesrlg %%v5,%%v5,32\n\t" | |||||
| "vag %%v5,%%v5,%%v4\n\t" | |||||
| "vag %%v6,%%v6,%%v4\n\t" | |||||
| "vfchesb %%v7,%%v16,%%v0\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7\n\t" | |||||
| "vsegf %%v8,%%v7\n\t" | |||||
| "vesrlg %%v7,%%v7,32\n\t" | |||||
| "vsegf %%v7,%%v7\n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7\n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "veslg %%v3,%%v0,32\n\t" | |||||
| "vfchsb %%v4,%%v3,%%v0\n\t" | |||||
| "vchlg %%v5,%%v2,%%v1\n\t" | |||||
| "vfcesb %%v6,%%v0,%%v3\n\t" | |||||
| "vn %%v5,%%v5,%%v6\n\t" | |||||
| "vo %%v4,%%v4,%%v5\n\t" | |||||
| "vsel %%v0,%%v0,%%v3,%%v4\n\t" | |||||
| "vesrlg %%v4,%%v4,32\n\t" | |||||
| "vsegf %%v4,%%v4\n\t" | |||||
| "vsel %%v1,%%v1,%%v2,%%v4\n\t" | |||||
| "vrepf %%v2,%%v0,2\n\t" | |||||
| "vrepg %%v3,%%v1,1\n\t" | |||||
| "wfcsb %%v2,%%v0\n\t" | |||||
| "jne 1f\n\t" | |||||
| "vstef %%v0,%[amin],0\n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3\n\t" | |||||
| "vlgvg %[iamin],%%v0,0\n\t" | |||||
| "j 2f\n\t" | |||||
| "1:\n\t" | |||||
| "wfchsb %%v4,%%v0,%%v2\n\t" | |||||
| "vesrlg %%v4,%%v4,32\n\t" | |||||
| "vsegf %%v4,%%v4\n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4\n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4\n\t" | |||||
| "ste %%f0,%[amin]\n\t" | |||||
| "vlgvg %[iamin],%%v1,0\n\t" | |||||
| "2:\n\t" | |||||
| "nop" | |||||
| : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", | |||||
| "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", | |||||
| "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| "vl %%v18,32(%%r1,%3) \n\t" | |||||
| "vl %%v29,48(%%r1,%3) \n\t" | |||||
| "vpkg %%v19,%%v18,%%v29 \n\t" | |||||
| "vperm %%v18,%%v18,%%v29,%%v9 \n\t" | |||||
| "vl %%v20,64(%%r1,%3) \n\t" | |||||
| "vl %%v30,80(%%r1,%3) \n\t" | |||||
| "vpkg %%v21,%%v20,%%v30 \n\t" | |||||
| "vperm %%v20,%%v20,%%v30,%%v9 \n\t" | |||||
| "vl %%v22,96(%%r1,%3) \n\t" | |||||
| "vl %%v31,112(%%r1,%3) \n\t" | |||||
| "vpkg %%v23,%%v22,%%v31 \n\t" | |||||
| "vperm %%v22,%%v22,%%v31,%%v9 \n\t" | |||||
| "vflpsb %%v16, %%v16 \n\t" | |||||
| "vflpsb %%v17, %%v17 \n\t" | |||||
| "vflpsb %%v18, %%v18 \n\t" | |||||
| "vflpsb %%v19, %%v19 \n\t" | |||||
| "vflpsb %%v20, %%v20 \n\t" | |||||
| "vflpsb %%v21, %%v21 \n\t" | |||||
| "vflpsb %%v22, %%v22 \n\t" | |||||
| "vflpsb %%v23, %%v23 \n\t" | |||||
| "vfasb %%v16,%%v16,%%v17 \n\t" | |||||
| "vfasb %%v17,%%v18,%%v19 \n\t" | |||||
| "vfasb %%v18,%%v20,%%v21 \n\t" | |||||
| "vfasb %%v19,%%v22,%%v23 \n\t" | |||||
| "vfchesb %%v5,%%v17,%%v16 \n\t" | |||||
| "vfchesb %%v6,%%v19,%%v18 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5 \n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6 \n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6 \n\t" | |||||
| "vfchesb %%v18,%%v17,%%v16 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18 \n\t" | |||||
| "vsegf %%v6,%%v5 \n\t" | |||||
| "vesrlg %%v5,%%v5,32 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vag %%v6,%%v6,%%v4 \n\t" | |||||
| "vfchesb %%v7,%%v16,%%v0 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7 \n\t" | |||||
| "vsegf %%v8,%%v7 \n\t" | |||||
| "vesrlg %%v7,%%v7,32 \n\t" | |||||
| "vsegf %%v7,%%v7 \n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7 \n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vl %%v16,128(%%r1,%3) \n\t" | |||||
| "vl %%v28,144(%%r1,%3) \n\t" | |||||
| "vpkg %%v17,%%v16,%%v28 \n\t" | |||||
| "vperm %%v16,%%v16,%%v28,%%v9 \n\t" | |||||
| "vl %%v18,160(%%r1,%3) \n\t" | |||||
| "vl %%v29,176(%%r1,%3) \n\t" | |||||
| "vpkg %%v19,%%v18,%%v29 \n\t" | |||||
| "vperm %%v18,%%v18,%%v29,%%v9 \n\t" | |||||
| "vl %%v20,192(%%r1,%3) \n\t" | |||||
| "vl %%v30,208(%%r1,%3) \n\t" | |||||
| "vpkg %%v21,%%v20,%%v30 \n\t" | |||||
| "vperm %%v20,%%v20,%%v30,%%v9 \n\t" | |||||
| "vl %%v22,224(%%r1,%3) \n\t" | |||||
| "vl %%v31,240(%%r1,%3) \n\t" | |||||
| "vpkg %%v23,%%v22,%%v31 \n\t" | |||||
| "vperm %%v22,%%v22,%%v31,%%v9 \n\t" | |||||
| "vflpsb %%v16, %%v16 \n\t" | |||||
| "vflpsb %%v17, %%v17 \n\t" | |||||
| "vflpsb %%v18, %%v18 \n\t" | |||||
| "vflpsb %%v19, %%v19 \n\t" | |||||
| "vflpsb %%v20, %%v20 \n\t" | |||||
| "vflpsb %%v21, %%v21 \n\t" | |||||
| "vflpsb %%v22, %%v22 \n\t" | |||||
| "vflpsb %%v23, %%v23 \n\t" | |||||
| "vfasb %%v16,%%v16,%%v17 \n\t" | |||||
| "vfasb %%v17,%%v18,%%v19 \n\t" | |||||
| "vfasb %%v18,%%v20,%%v21 \n\t" | |||||
| "vfasb %%v19,%%v22,%%v23 \n\t" | |||||
| "vfchesb %%v5,%%v17,%%v16 \n\t" | |||||
| "vfchesb %%v6,%%v19,%%v18 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5 \n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6 \n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6 \n\t" | |||||
| return iamin; | |||||
| } | |||||
| "vfchesb %%v18,%%v17,%%v16 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18 \n\t" | |||||
| "vsegf %%v6,%%v5 \n\t" | |||||
| "vesrlg %%v5,%%v5,32 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vag %%v6,%%v6,%%v4 \n\t" | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT minf = 0; | |||||
| BLASLONG min = 0; | |||||
| BLASLONG inc_x2; | |||||
| "vfchesb %%v7,%%v16,%%v0 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7 \n\t" | |||||
| "vsegf %%v8,%%v7 \n\t" | |||||
| "vesrlg %%v7,%%v7,32 \n\t" | |||||
| "vsegf %%v7,%%v7 \n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7 \n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (min); | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| if (inc_x == 1) { | |||||
| "veslg %%v3,%%v0,32 \n\t" | |||||
| "vfchsb %%v4,%%v3,%%v0 \n\t" | |||||
| "vchlg %%v5,%%v2,%%v1 \n\t" | |||||
| "vfcesb %%v6,%%v0,%%v3 \n\t" | |||||
| "vn %%v5,%%v5,%%v6 \n\t" | |||||
| "vo %%v4,%%v4,%%v5 \n\t" | |||||
| "vsel %%v0,%%v0,%%v3,%%v4 \n\t" | |||||
| "vesrlg %%v4,%%v4,32 \n\t" | |||||
| "vsegf %%v4,%%v4 \n\t" | |||||
| "vsel %%v1,%%v1,%%v2,%%v4 \n\t" | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| "vrepf %%v2,%%v0,2 \n\t" | |||||
| "vrepg %%v3,%%v1,1 \n\t" | |||||
| "wfcsb %%v2,%%v0 \n\t" | |||||
| "jne 1f \n\t" | |||||
| "vstef %%v0,%1,0 \n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3 \n\t" | |||||
| "vlgvg %0,%%v0,0 \n\t" | |||||
| "j 2f \n\t" | |||||
| "1: \n\t" | |||||
| "wfchsb %%v4,%%v0,%%v2 \n\t" | |||||
| "vesrlg %%v4,%%v4,32 \n\t" | |||||
| "vsegf %%v4,%%v4 \n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4 \n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4 \n\t" | |||||
| "ste %%f0,%1 \n\t" | |||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "2: \n\t" | |||||
| "nop " | |||||
| :"=r"(iamin),"=m"(*amin) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| min = icamin_kernel_32(n1, x, &minf); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| } else { | |||||
| minf = CABS1(x, 0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return iamin; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) < minf) { | |||||
| min = i; | |||||
| minf = CABS1(x, ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (min + 1); | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT minf = 0; | |||||
| BLASLONG min = 0; | |||||
| BLASLONG inc_x2; | |||||
| } else { | |||||
| if (n <= 0 || inc_x <= 0) return(min); | |||||
| if (inc_x == 1) { | |||||
| min = 0; | |||||
| minf = CABS1(x, 0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| min = icamin_kernel_32(n1, x, &minf); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| if (CABS1(x, ix) < minf) { | |||||
| min = i; | |||||
| minf = CABS1(x, ix); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2) < minf) { | |||||
| min = i + 1; | |||||
| minf = CABS1(x, ix + inc_x2); | |||||
| } | |||||
| if (CABS1(x, ix + 2 * inc_x2) < minf) { | |||||
| min = i + 2; | |||||
| minf = CABS1(x, ix + 2 * inc_x2); | |||||
| } | |||||
| if (CABS1(x, ix + 3 * inc_x2) < minf) { | |||||
| min = i + 3; | |||||
| minf = CABS1(x, ix + 3 * inc_x2); | |||||
| } | } | ||||
| else | |||||
| { | |||||
| minf = CABS1(x,0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) < minf ) | |||||
| { | |||||
| min = i; | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (min + 1); | |||||
| ix += inc_x2 * 4; | |||||
| } else { | |||||
| min = 0; | |||||
| minf = CABS1(x,0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| i += 4; | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) < minf ) | |||||
| { | |||||
| min = i; | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | } | ||||
| return (min + 1); | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) < minf) { | |||||
| min = i; | |||||
| minf = CABS1(x, ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | } | ||||
| return (min + 1); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,237 +28,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | #define ABS fabs | ||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) | |||||
| { | |||||
| BLASLONG iamax; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%3) \n\t" | |||||
| "vflpdb %%v0,%%v0 \n\t" | |||||
| "vleig %%v1,0,0 \n\t" | |||||
| "vleig %%v1,1,1 \n\t" | |||||
| "vrepig %%v2,16 \n\t" | |||||
| "vzero %%v3 \n\t" | |||||
| "vleig %%v24,0,0 \n\t" | |||||
| "vleig %%v24,1,1 \n\t" | |||||
| "vleig %%v25,2,0 \n\t" | |||||
| "vleig %%v25,3,1 \n\t" | |||||
| "vleig %%v26,4,0 \n\t" | |||||
| "vleig %%v26,5,1 \n\t" | |||||
| "vleig %%v27,6,0 \n\t" | |||||
| "vleig %%v27,7,1 \n\t" | |||||
| "vleig %%v28,8,0 \n\t" | |||||
| "vleig %%v28,9,1 \n\t" | |||||
| "vleig %%v29,10,0 \n\t" | |||||
| "vleig %%v29,11,1 \n\t" | |||||
| "vleig %%v30,12,0 \n\t" | |||||
| "vleig %%v30,13,1 \n\t" | |||||
| "vleig %%v31,14,0 \n\t" | |||||
| "vleig %%v31,15,1 \n\t" | |||||
| "srlg %%r0,%2,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%3) \n\t" | |||||
| "vl %%v16,0(%%r1,%3) \n\t" | |||||
| "vl %%v17,16(%%r1,%3) \n\t" | |||||
| "vl %%v18,32(%%r1,%3) \n\t" | |||||
| "vl %%v19,48(%%r1,%3) \n\t" | |||||
| "vl %%v20,64(%%r1,%3) \n\t" | |||||
| "vl %%v21,80(%%r1,%3) \n\t" | |||||
| "vl %%v22,96(%%r1,%3) \n\t" | |||||
| "vl %%v23,112(%%r1,%3) \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfchedb %%v4,%%v16,%%v17 \n\t" | |||||
| "vfchedb %%v5,%%v18,%%v19 \n\t" | |||||
| "vfchedb %%v6,%%v20,%%v21 \n\t" | |||||
| "vfchedb %%v7,%%v22,%%v23 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4 \n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5 \n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5 \n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v6 \n\t" | |||||
| "vsel %%v6,%%v28,%%v29,%%v6 \n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v7 \n\t" | |||||
| "vsel %%v7,%%v30,%%v31,%%v7 \n\t" | |||||
| "vfchedb %%v20,%%v16,%%v17 \n\t" | |||||
| "vfchedb %%v21,%%v18,%%v19 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v20 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21 \n\t" | |||||
| "vsel %%v5,%%v6,%%v7,%%v21 \n\t" | |||||
| "vfchedb %%v18,%%v16,%%v17 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vfchedb %%v5,%%v0,%%v16 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5 \n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5 \n\t" | |||||
| "vag %%v3,%%v3,%%v2 \n\t" | |||||
| "vl %%v16,128(%%r1,%3) \n\t" | |||||
| "vl %%v17,144(%%r1,%3) \n\t" | |||||
| "vl %%v18,160(%%r1,%3) \n\t" | |||||
| "vl %%v19,176(%%r1,%3) \n\t" | |||||
| "vl %%v20,192(%%r1,%3) \n\t" | |||||
| "vl %%v21,208(%%r1,%3) \n\t" | |||||
| "vl %%v22,224(%%r1,%3) \n\t" | |||||
| "vl %%v23,240(%%r1,%3) \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfchedb %%v4,%%v16,%%v17 \n\t" | |||||
| "vfchedb %%v5,%%v18,%%v19 \n\t" | |||||
| "vfchedb %%v6,%%v20,%%v21 \n\t" | |||||
| "vfchedb %%v7,%%v22,%%v23 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4 \n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5 \n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5 \n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v6 \n\t" | |||||
| "vsel %%v6,%%v28,%%v29,%%v6 \n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v7 \n\t" | |||||
| "vsel %%v7,%%v30,%%v31,%%v7 \n\t" | |||||
| "vfchedb %%v20,%%v16,%%v17 \n\t" | |||||
| "vfchedb %%v21,%%v18,%%v19 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v20 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21 \n\t" | |||||
| "vsel %%v5,%%v6,%%v7,%%v21 \n\t" | |||||
| "vfchedb %%v18,%%v16,%%v17 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vfchedb %%v5,%%v0,%%v16 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5 \n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5 \n\t" | |||||
| "vag %%v3,%%v3,%%v2 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v2,%%v0,1 \n\t" | |||||
| "vrepg %%v3,%%v1,1 \n\t" | |||||
| "wfcdb %%v2,%%v0 \n\t" | |||||
| "jne 1f \n\t" | |||||
| "vsteg %%v0,%1,0 \n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3 \n\t" | |||||
| "vlgvg %0,%%v0,0 \n\t" | |||||
| "j 2f \n\t" | |||||
| "1: \n\t" | |||||
| "wfchdb %%v4,%%v2,%%v0 \n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4 \n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4 \n\t" | |||||
| "std %%f0,%1 \n\t" | |||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "2: \n\t" | |||||
| "nop " | |||||
| :"=r"(iamax),"=m"(*amax) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return iamax; | |||||
| static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { | |||||
| BLASLONG iamax; | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "vflpdb %%v0,%%v0\n\t" | |||||
| "vleig %%v1,0,0\n\t" | |||||
| "vleig %%v1,1,1\n\t" | |||||
| "vrepig %%v2,16\n\t" | |||||
| "vzero %%v3\n\t" | |||||
| "vleig %%v24,0,0\n\t" | |||||
| "vleig %%v24,1,1\n\t" | |||||
| "vleig %%v25,2,0\n\t" | |||||
| "vleig %%v25,3,1\n\t" | |||||
| "vleig %%v26,4,0\n\t" | |||||
| "vleig %%v26,5,1\n\t" | |||||
| "vleig %%v27,6,0\n\t" | |||||
| "vleig %%v27,7,1\n\t" | |||||
| "vleig %%v28,8,0\n\t" | |||||
| "vleig %%v28,9,1\n\t" | |||||
| "vleig %%v29,10,0\n\t" | |||||
| "vleig %%v29,11,1\n\t" | |||||
| "vleig %%v30,12,0\n\t" | |||||
| "vleig %%v30,13,1\n\t" | |||||
| "vleig %%v31,14,0\n\t" | |||||
| "vleig %%v31,15,1\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfchedb %%v4,%%v16,%%v17\n\t" | |||||
| "vfchedb %%v5,%%v18,%%v19\n\t" | |||||
| "vfchedb %%v6,%%v20,%%v21\n\t" | |||||
| "vfchedb %%v7,%%v22,%%v23\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4\n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5\n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5\n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v6\n\t" | |||||
| "vsel %%v6,%%v28,%%v29,%%v6\n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v7\n\t" | |||||
| "vsel %%v7,%%v30,%%v31,%%v7\n\t" | |||||
| "vfchedb %%v20,%%v16,%%v17\n\t" | |||||
| "vfchedb %%v21,%%v18,%%v19\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v20\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21\n\t" | |||||
| "vsel %%v5,%%v6,%%v7,%%v21\n\t" | |||||
| "vfchedb %%v18,%%v16,%%v17\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vfchedb %%v5,%%v0,%%v16\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5\n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5\n\t" | |||||
| "vag %%v3,%%v3,%%v2\n\t" | |||||
| "vl %%v16,128(%%r1,%[x])\n\t" | |||||
| "vl %%v17,144(%%r1,%[x])\n\t" | |||||
| "vl %%v18,160(%%r1,%[x])\n\t" | |||||
| "vl %%v19,176(%%r1,%[x])\n\t" | |||||
| "vl %%v20,192(%%r1,%[x])\n\t" | |||||
| "vl %%v21,208(%%r1,%[x])\n\t" | |||||
| "vl %%v22,224(%%r1,%[x])\n\t" | |||||
| "vl %%v23,240(%%r1,%[x])\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfchedb %%v4,%%v16,%%v17\n\t" | |||||
| "vfchedb %%v5,%%v18,%%v19\n\t" | |||||
| "vfchedb %%v6,%%v20,%%v21\n\t" | |||||
| "vfchedb %%v7,%%v22,%%v23\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4\n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5\n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5\n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v6\n\t" | |||||
| "vsel %%v6,%%v28,%%v29,%%v6\n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v7\n\t" | |||||
| "vsel %%v7,%%v30,%%v31,%%v7\n\t" | |||||
| "vfchedb %%v20,%%v16,%%v17\n\t" | |||||
| "vfchedb %%v21,%%v18,%%v19\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v20\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21\n\t" | |||||
| "vsel %%v5,%%v6,%%v7,%%v21\n\t" | |||||
| "vfchedb %%v18,%%v16,%%v17\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vfchedb %%v5,%%v0,%%v16\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5\n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5\n\t" | |||||
| "vag %%v3,%%v3,%%v2\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "vrepg %%v2,%%v0,1\n\t" | |||||
| "vrepg %%v3,%%v1,1\n\t" | |||||
| "wfcdb %%v2,%%v0\n\t" | |||||
| "jne 1f\n\t" | |||||
| "vsteg %%v0,%[amax],0\n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3\n\t" | |||||
| "vlgvg %[iamax],%%v0,0\n\t" | |||||
| "j 2f\n\t" | |||||
| "1:\n\t" | |||||
| "wfchdb %%v4,%%v2,%%v0\n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4\n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4\n\t" | |||||
| "std %%f0,%[amax]\n\t" | |||||
| "vlgvg %[iamax],%%v1,0\n\t" | |||||
| "2:\n\t" | |||||
| "nop" | |||||
| : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", | |||||
| "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |||||
| "v27", "v28", "v29", "v30", "v31"); | |||||
| return iamax; | |||||
| } | } | ||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG max = 0; | |||||
| if (n <= 0 || inc_x <= 0) return (max); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG max = 0; | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (max); | |||||
| max = idamax_kernel_32(n1, x, &maxf); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| maxf = ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = i; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (max + 1); | |||||
| max = idamax_kernel_32(n1, x, &maxf); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| maxf = ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| max = 0; | |||||
| maxf = ABS(x[0]); | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = i; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (max + 1); | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| } else { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = j; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) > maxf) { | |||||
| max = j + 1; | |||||
| maxf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) > maxf) { | |||||
| max = j + 2; | |||||
| maxf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) > maxf) { | |||||
| max = j + 3; | |||||
| maxf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| max = 0; | |||||
| maxf = ABS(x[0]); | |||||
| i += inc_x * 4; | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| j += 4; | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = j; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) > maxf) { | |||||
| max = j + 1; | |||||
| maxf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) > maxf) { | |||||
| max = j + 2; | |||||
| maxf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) > maxf) { | |||||
| max = j + 3; | |||||
| maxf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| } | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = j; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (max + 1); | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = j; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (max + 1); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,237 +28,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | #define ABS fabs | ||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) | |||||
| { | |||||
| BLASLONG iamin; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%3) \n\t" | |||||
| "vflpdb %%v0,%%v0 \n\t" | |||||
| "vleig %%v1,0,0 \n\t" | |||||
| "vleig %%v1,1,1 \n\t" | |||||
| "vrepig %%v2,16 \n\t" | |||||
| "vzero %%v3 \n\t" | |||||
| "vleig %%v24,0,0 \n\t" | |||||
| "vleig %%v24,1,1 \n\t" | |||||
| "vleig %%v25,2,0 \n\t" | |||||
| "vleig %%v25,3,1 \n\t" | |||||
| "vleig %%v26,4,0 \n\t" | |||||
| "vleig %%v26,5,1 \n\t" | |||||
| "vleig %%v27,6,0 \n\t" | |||||
| "vleig %%v27,7,1 \n\t" | |||||
| "vleig %%v28,8,0 \n\t" | |||||
| "vleig %%v28,9,1 \n\t" | |||||
| "vleig %%v29,10,0 \n\t" | |||||
| "vleig %%v29,11,1 \n\t" | |||||
| "vleig %%v30,12,0 \n\t" | |||||
| "vleig %%v30,13,1 \n\t" | |||||
| "vleig %%v31,14,0 \n\t" | |||||
| "vleig %%v31,15,1 \n\t" | |||||
| "srlg %%r0,%2,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%3) \n\t" | |||||
| "vl %%v16,0(%%r1,%3) \n\t" | |||||
| "vl %%v17,16(%%r1,%3) \n\t" | |||||
| "vl %%v18,32(%%r1,%3) \n\t" | |||||
| "vl %%v19,48(%%r1,%3) \n\t" | |||||
| "vl %%v20,64(%%r1,%3) \n\t" | |||||
| "vl %%v21,80(%%r1,%3) \n\t" | |||||
| "vl %%v22,96(%%r1,%3) \n\t" | |||||
| "vl %%v23,112(%%r1,%3) \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfchedb %%v4,%%v17,%%v16 \n\t" | |||||
| "vfchedb %%v5,%%v19,%%v18 \n\t" | |||||
| "vfchedb %%v6,%%v21,%%v20 \n\t" | |||||
| "vfchedb %%v7,%%v23,%%v22 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4 \n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5 \n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5 \n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v6 \n\t" | |||||
| "vsel %%v6,%%v28,%%v29,%%v6 \n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v7 \n\t" | |||||
| "vsel %%v7,%%v30,%%v31,%%v7 \n\t" | |||||
| "vfchedb %%v20,%%v17,%%v16 \n\t" | |||||
| "vfchedb %%v21,%%v19,%%v18 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v20 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21 \n\t" | |||||
| "vsel %%v5,%%v6,%%v7,%%v21 \n\t" | |||||
| "vfchedb %%v18,%%v17,%%v16 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vfchedb %%v5,%%v16,%%v0 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5 \n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5 \n\t" | |||||
| "vag %%v3,%%v3,%%v2 \n\t" | |||||
| "vl %%v16,128(%%r1,%3) \n\t" | |||||
| "vl %%v17,144(%%r1,%3) \n\t" | |||||
| "vl %%v18,160(%%r1,%3) \n\t" | |||||
| "vl %%v19,176(%%r1,%3) \n\t" | |||||
| "vl %%v20,192(%%r1,%3) \n\t" | |||||
| "vl %%v21,208(%%r1,%3) \n\t" | |||||
| "vl %%v22,224(%%r1,%3) \n\t" | |||||
| "vl %%v23,240(%%r1,%3) \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfchedb %%v4,%%v17,%%v16 \n\t" | |||||
| "vfchedb %%v5,%%v19,%%v18 \n\t" | |||||
| "vfchedb %%v6,%%v21,%%v20 \n\t" | |||||
| "vfchedb %%v7,%%v23,%%v22 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4 \n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5 \n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5 \n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v6 \n\t" | |||||
| "vsel %%v6,%%v28,%%v29,%%v6 \n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v7 \n\t" | |||||
| "vsel %%v7,%%v30,%%v31,%%v7 \n\t" | |||||
| "vfchedb %%v20,%%v17,%%v16 \n\t" | |||||
| "vfchedb %%v21,%%v19,%%v18 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v20 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21 \n\t" | |||||
| "vsel %%v5,%%v6,%%v7,%%v21 \n\t" | |||||
| "vfchedb %%v18,%%v17,%%v16 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vfchedb %%v5,%%v16,%%v0 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5 \n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5 \n\t" | |||||
| "vag %%v3,%%v3,%%v2 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v2,%%v0,1 \n\t" | |||||
| "vrepg %%v3,%%v1,1 \n\t" | |||||
| "wfcdb %%v2,%%v0 \n\t" | |||||
| "jne 1f \n\t" | |||||
| "vsteg %%v0,%1,0 \n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3 \n\t" | |||||
| "vlgvg %0,%%v0,0 \n\t" | |||||
| "j 2f \n\t" | |||||
| "1: \n\t" | |||||
| "wfchdb %%v4,%%v0,%%v2 \n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4 \n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4 \n\t" | |||||
| "std %%f0,%1 \n\t" | |||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "2: \n\t" | |||||
| "nop " | |||||
| :"=r"(iamin),"=m"(*amin) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return iamin; | |||||
| static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { | |||||
| BLASLONG iamin; | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "vflpdb %%v0,%%v0\n\t" | |||||
| "vleig %%v1,0,0\n\t" | |||||
| "vleig %%v1,1,1\n\t" | |||||
| "vrepig %%v2,16\n\t" | |||||
| "vzero %%v3\n\t" | |||||
| "vleig %%v24,0,0\n\t" | |||||
| "vleig %%v24,1,1\n\t" | |||||
| "vleig %%v25,2,0\n\t" | |||||
| "vleig %%v25,3,1\n\t" | |||||
| "vleig %%v26,4,0\n\t" | |||||
| "vleig %%v26,5,1\n\t" | |||||
| "vleig %%v27,6,0\n\t" | |||||
| "vleig %%v27,7,1\n\t" | |||||
| "vleig %%v28,8,0\n\t" | |||||
| "vleig %%v28,9,1\n\t" | |||||
| "vleig %%v29,10,0\n\t" | |||||
| "vleig %%v29,11,1\n\t" | |||||
| "vleig %%v30,12,0\n\t" | |||||
| "vleig %%v30,13,1\n\t" | |||||
| "vleig %%v31,14,0\n\t" | |||||
| "vleig %%v31,15,1\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfchedb %%v4,%%v17,%%v16\n\t" | |||||
| "vfchedb %%v5,%%v19,%%v18\n\t" | |||||
| "vfchedb %%v6,%%v21,%%v20\n\t" | |||||
| "vfchedb %%v7,%%v23,%%v22\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4\n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5\n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5\n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v6\n\t" | |||||
| "vsel %%v6,%%v28,%%v29,%%v6\n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v7\n\t" | |||||
| "vsel %%v7,%%v30,%%v31,%%v7\n\t" | |||||
| "vfchedb %%v20,%%v17,%%v16\n\t" | |||||
| "vfchedb %%v21,%%v19,%%v18\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v20\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21\n\t" | |||||
| "vsel %%v5,%%v6,%%v7,%%v21\n\t" | |||||
| "vfchedb %%v18,%%v17,%%v16\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vfchedb %%v5,%%v16,%%v0\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5\n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5\n\t" | |||||
| "vag %%v3,%%v3,%%v2\n\t" | |||||
| "vl %%v16,128(%%r1,%[x])\n\t" | |||||
| "vl %%v17,144(%%r1,%[x])\n\t" | |||||
| "vl %%v18,160(%%r1,%[x])\n\t" | |||||
| "vl %%v19,176(%%r1,%[x])\n\t" | |||||
| "vl %%v20,192(%%r1,%[x])\n\t" | |||||
| "vl %%v21,208(%%r1,%[x])\n\t" | |||||
| "vl %%v22,224(%%r1,%[x])\n\t" | |||||
| "vl %%v23,240(%%r1,%[x])\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfchedb %%v4,%%v17,%%v16\n\t" | |||||
| "vfchedb %%v5,%%v19,%%v18\n\t" | |||||
| "vfchedb %%v6,%%v21,%%v20\n\t" | |||||
| "vfchedb %%v7,%%v23,%%v22\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4\n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5\n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5\n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v6\n\t" | |||||
| "vsel %%v6,%%v28,%%v29,%%v6\n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v7\n\t" | |||||
| "vsel %%v7,%%v30,%%v31,%%v7\n\t" | |||||
| "vfchedb %%v20,%%v17,%%v16\n\t" | |||||
| "vfchedb %%v21,%%v19,%%v18\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v20\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21\n\t" | |||||
| "vsel %%v5,%%v6,%%v7,%%v21\n\t" | |||||
| "vfchedb %%v18,%%v17,%%v16\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vfchedb %%v5,%%v16,%%v0\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5\n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5\n\t" | |||||
| "vag %%v3,%%v3,%%v2\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "vrepg %%v2,%%v0,1\n\t" | |||||
| "vrepg %%v3,%%v1,1\n\t" | |||||
| "wfcdb %%v2,%%v0\n\t" | |||||
| "jne 1f\n\t" | |||||
| "vsteg %%v0,%[amin],0\n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3\n\t" | |||||
| "vlgvg %[iamin],%%v0,0\n\t" | |||||
| "j 2f\n\t" | |||||
| "1:\n\t" | |||||
| "wfchdb %%v4,%%v0,%%v2\n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4\n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4\n\t" | |||||
| "std %%f0,%[amin]\n\t" | |||||
| "vlgvg %[iamin],%%v1,0\n\t" | |||||
| "2:\n\t" | |||||
| "nop" | |||||
| : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", | |||||
| "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |||||
| "v27", "v28", "v29", "v30", "v31"); | |||||
| return iamin; | |||||
| } | } | ||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG min = 0; | |||||
| if (n <= 0 || inc_x <= 0) return (min); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG min = 0; | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (min); | |||||
| min = idamin_kernel_32(n1, x, &minf); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| minf = ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = i; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (min + 1); | |||||
| min = idamin_kernel_32(n1, x, &minf); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| minf = ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| min = 0; | |||||
| minf = ABS(x[0]); | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = i; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (min + 1); | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| } else { | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = j; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) < minf) { | |||||
| min = j + 1; | |||||
| minf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) < minf) { | |||||
| min = j + 2; | |||||
| minf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) < minf) { | |||||
| min = j + 3; | |||||
| minf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| min = 0; | |||||
| minf = ABS(x[0]); | |||||
| i += inc_x * 4; | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| j += 4; | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = j; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) < minf) { | |||||
| min = j + 1; | |||||
| minf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) < minf) { | |||||
| min = j + 2; | |||||
| minf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) < minf) { | |||||
| min = j + 3; | |||||
| minf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| } | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = j; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (min + 1); | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = j; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (min + 1); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,214 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) | |||||
| { | |||||
| BLASLONG imax; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%3) \n\t" | |||||
| "vleig %%v1,0,0 \n\t" | |||||
| "vleig %%v1,1,1 \n\t" | |||||
| "vrepig %%v2,16 \n\t" | |||||
| "vzero %%v3 \n\t" | |||||
| "vleig %%v24,0,0 \n\t" | |||||
| "vleig %%v24,1,1 \n\t" | |||||
| "vleig %%v25,2,0 \n\t" | |||||
| "vleig %%v25,3,1 \n\t" | |||||
| "vleig %%v26,4,0 \n\t" | |||||
| "vleig %%v26,5,1 \n\t" | |||||
| "vleig %%v27,6,0 \n\t" | |||||
| "vleig %%v27,7,1 \n\t" | |||||
| "vleig %%v28,8,0 \n\t" | |||||
| "vleig %%v28,9,1 \n\t" | |||||
| "vleig %%v29,10,0 \n\t" | |||||
| "vleig %%v29,11,1 \n\t" | |||||
| "vleig %%v30,12,0 \n\t" | |||||
| "vleig %%v30,13,1 \n\t" | |||||
| "vleig %%v31,14,0 \n\t" | |||||
| "vleig %%v31,15,1 \n\t" | |||||
| "srlg %%r0,%2,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%3) \n\t" | |||||
| "vl %%v16,0(%%r1,%3) \n\t" | |||||
| "vl %%v17,16(%%r1,%3) \n\t" | |||||
| "vl %%v18,32(%%r1,%3) \n\t" | |||||
| "vl %%v19,48(%%r1,%3) \n\t" | |||||
| "vl %%v20,64(%%r1,%3) \n\t" | |||||
| "vl %%v21,80(%%r1,%3) \n\t" | |||||
| "vl %%v22,96(%%r1,%3) \n\t" | |||||
| "vl %%v23,112(%%r1,%3) \n\t" | |||||
| "vfchedb %%v4,%%v16,%%v17 \n\t" | |||||
| "vfchedb %%v5,%%v18,%%v19 \n\t" | |||||
| "vfchedb %%v6,%%v20,%%v21 \n\t" | |||||
| "vfchedb %%v7,%%v22,%%v23 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4 \n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5 \n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5 \n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v6 \n\t" | |||||
| "vsel %%v6,%%v28,%%v29,%%v6 \n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v7 \n\t" | |||||
| "vsel %%v7,%%v30,%%v31,%%v7 \n\t" | |||||
| "vfchedb %%v20,%%v16,%%v17 \n\t" | |||||
| "vfchedb %%v21,%%v18,%%v19 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v20 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21 \n\t" | |||||
| "vsel %%v5,%%v6,%%v7,%%v21 \n\t" | |||||
| "vfchedb %%v18,%%v16,%%v17 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vfchedb %%v5,%%v0,%%v16 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5 \n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5 \n\t" | |||||
| "vag %%v3,%%v3,%%v2 \n\t" | |||||
| "vl %%v16,128(%%r1,%3) \n\t" | |||||
| "vl %%v17,144(%%r1,%3) \n\t" | |||||
| "vl %%v18,160(%%r1,%3) \n\t" | |||||
| "vl %%v19,176(%%r1,%3) \n\t" | |||||
| "vl %%v20,192(%%r1,%3) \n\t" | |||||
| "vl %%v21,208(%%r1,%3) \n\t" | |||||
| "vl %%v22,224(%%r1,%3) \n\t" | |||||
| "vl %%v23,240(%%r1,%3) \n\t" | |||||
| "vfchedb %%v4,%%v16,%%v17 \n\t" | |||||
| "vfchedb %%v5,%%v18,%%v19 \n\t" | |||||
| "vfchedb %%v6,%%v20,%%v21 \n\t" | |||||
| "vfchedb %%v7,%%v22,%%v23 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4 \n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5 \n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5 \n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v6 \n\t" | |||||
| "vsel %%v6,%%v28,%%v29,%%v6 \n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v7 \n\t" | |||||
| "vsel %%v7,%%v30,%%v31,%%v7 \n\t" | |||||
| "vfchedb %%v20,%%v16,%%v17 \n\t" | |||||
| "vfchedb %%v21,%%v18,%%v19 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v20 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21 \n\t" | |||||
| "vsel %%v5,%%v6,%%v7,%%v21 \n\t" | |||||
| "vfchedb %%v18,%%v16,%%v17 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vfchedb %%v5,%%v0,%%v16 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5 \n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5 \n\t" | |||||
| "vag %%v3,%%v3,%%v2 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v2,%%v0,1 \n\t" | |||||
| "vrepg %%v3,%%v1,1 \n\t" | |||||
| "wfcdb %%v2,%%v0 \n\t" | |||||
| "jne 1f \n\t" | |||||
| "vsteg %%v0,%1,0 \n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3 \n\t" | |||||
| "vlgvg %0,%%v0,0 \n\t" | |||||
| "j 2f \n\t" | |||||
| "1: \n\t" | |||||
| "wfchdb %%v4,%%v2,%%v0 \n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4 \n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4 \n\t" | |||||
| "std %%f0,%1 \n\t" | |||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "2: \n\t" | |||||
| "nop " | |||||
| :"=r"(imax),"=m"(*max) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return imax; | |||||
| static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { | |||||
| BLASLONG imax; | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "vleig %%v1,0,0\n\t" | |||||
| "vleig %%v1,1,1\n\t" | |||||
| "vrepig %%v2,16\n\t" | |||||
| "vzero %%v3\n\t" | |||||
| "vleig %%v24,0,0\n\t" | |||||
| "vleig %%v24,1,1\n\t" | |||||
| "vleig %%v25,2,0\n\t" | |||||
| "vleig %%v25,3,1\n\t" | |||||
| "vleig %%v26,4,0\n\t" | |||||
| "vleig %%v26,5,1\n\t" | |||||
| "vleig %%v27,6,0\n\t" | |||||
| "vleig %%v27,7,1\n\t" | |||||
| "vleig %%v28,8,0\n\t" | |||||
| "vleig %%v28,9,1\n\t" | |||||
| "vleig %%v29,10,0\n\t" | |||||
| "vleig %%v29,11,1\n\t" | |||||
| "vleig %%v30,12,0\n\t" | |||||
| "vleig %%v30,13,1\n\t" | |||||
| "vleig %%v31,14,0\n\t" | |||||
| "vleig %%v31,15,1\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vfchedb %%v4,%%v16,%%v17\n\t" | |||||
| "vfchedb %%v5,%%v18,%%v19\n\t" | |||||
| "vfchedb %%v6,%%v20,%%v21\n\t" | |||||
| "vfchedb %%v7,%%v22,%%v23\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4\n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5\n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5\n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v6\n\t" | |||||
| "vsel %%v6,%%v28,%%v29,%%v6\n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v7\n\t" | |||||
| "vsel %%v7,%%v30,%%v31,%%v7\n\t" | |||||
| "vfchedb %%v20,%%v16,%%v17\n\t" | |||||
| "vfchedb %%v21,%%v18,%%v19\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v20\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21\n\t" | |||||
| "vsel %%v5,%%v6,%%v7,%%v21\n\t" | |||||
| "vfchedb %%v18,%%v16,%%v17\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vfchedb %%v5,%%v0,%%v16\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5\n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5\n\t" | |||||
| "vag %%v3,%%v3,%%v2\n\t" | |||||
| "vl %%v16,128(%%r1,%[x])\n\t" | |||||
| "vl %%v17,144(%%r1,%[x])\n\t" | |||||
| "vl %%v18,160(%%r1,%[x])\n\t" | |||||
| "vl %%v19,176(%%r1,%[x])\n\t" | |||||
| "vl %%v20,192(%%r1,%[x])\n\t" | |||||
| "vl %%v21,208(%%r1,%[x])\n\t" | |||||
| "vl %%v22,224(%%r1,%[x])\n\t" | |||||
| "vl %%v23,240(%%r1,%[x])\n\t" | |||||
| "vfchedb %%v4,%%v16,%%v17\n\t" | |||||
| "vfchedb %%v5,%%v18,%%v19\n\t" | |||||
| "vfchedb %%v6,%%v20,%%v21\n\t" | |||||
| "vfchedb %%v7,%%v22,%%v23\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4\n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5\n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5\n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v6\n\t" | |||||
| "vsel %%v6,%%v28,%%v29,%%v6\n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v7\n\t" | |||||
| "vsel %%v7,%%v30,%%v31,%%v7\n\t" | |||||
| "vfchedb %%v20,%%v16,%%v17\n\t" | |||||
| "vfchedb %%v21,%%v18,%%v19\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v20\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21\n\t" | |||||
| "vsel %%v5,%%v6,%%v7,%%v21\n\t" | |||||
| "vfchedb %%v18,%%v16,%%v17\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vfchedb %%v5,%%v0,%%v16\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5\n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5\n\t" | |||||
| "vag %%v3,%%v3,%%v2\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "vrepg %%v2,%%v0,1\n\t" | |||||
| "vrepg %%v3,%%v1,1\n\t" | |||||
| "wfcdb %%v2,%%v0\n\t" | |||||
| "jne 1f\n\t" | |||||
| "vsteg %%v0,%[max],0\n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3\n\t" | |||||
| "vlgvg %[imax],%%v0,0\n\t" | |||||
| "j 2f\n\t" | |||||
| "1:\n\t" | |||||
| "wfchdb %%v4,%%v2,%%v0\n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4\n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4\n\t" | |||||
| "std %%f0,%[max]\n\t" | |||||
| "vlgvg %[imax],%%v1,0\n\t" | |||||
| "2:\n\t" | |||||
| "nop" | |||||
| : [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", | |||||
| "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |||||
| "v27", "v28", "v29", "v30", "v31"); | |||||
| return imax; | |||||
| } | } | ||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | ||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG max = 0; | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG max = 0; | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (max); | |||||
| if (n <= 0 || inc_x <= 0) return (max); | |||||
| if (inc_x == 1) { | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| max = idmax_kernel_32(n1, x, &maxf); | |||||
| max = idmax_kernel_32(n1, x, &maxf); | |||||
| i = n1; | |||||
| } else { | |||||
| maxf = x[0]; | |||||
| i++; | |||||
| } | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| maxf = x[0]; | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (x[i] > maxf) { | |||||
| max = i; | |||||
| maxf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (max + 1); | |||||
| while (i < n) { | |||||
| if (x[i] > maxf) { | |||||
| max = i; | |||||
| maxf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (max + 1); | |||||
| } else { | |||||
| } else { | |||||
| max = 0; | |||||
| maxf = x[0]; | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| if (x[i] > maxf) { | |||||
| max = j; | |||||
| maxf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] > maxf) { | |||||
| max = j + 1; | |||||
| maxf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] > maxf) { | |||||
| max = j + 2; | |||||
| maxf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] > maxf) { | |||||
| max = j + 3; | |||||
| maxf = x[i + 3 * inc_x]; | |||||
| } | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| } | |||||
| max = 0; | |||||
| maxf = x[0]; | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| if (x[i] > maxf) { | |||||
| max = j; | |||||
| maxf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] > maxf) { | |||||
| max = j + 1; | |||||
| maxf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] > maxf) { | |||||
| max = j + 2; | |||||
| maxf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] > maxf) { | |||||
| max = j + 3; | |||||
| maxf = x[i + 3 * inc_x]; | |||||
| } | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (x[i] > maxf) { | |||||
| max = j; | |||||
| maxf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (max + 1); | |||||
| while (j < n) { | |||||
| if (x[i] > maxf) { | |||||
| max = j; | |||||
| maxf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (max + 1); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,214 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) | |||||
| { | |||||
| BLASLONG imin; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%3) \n\t" | |||||
| "vleig %%v1,0,0 \n\t" | |||||
| "vleig %%v1,1,1 \n\t" | |||||
| "vrepig %%v2,16 \n\t" | |||||
| "vzero %%v3 \n\t" | |||||
| "vleig %%v24,0,0 \n\t" | |||||
| "vleig %%v24,1,1 \n\t" | |||||
| "vleig %%v25,2,0 \n\t" | |||||
| "vleig %%v25,3,1 \n\t" | |||||
| "vleig %%v26,4,0 \n\t" | |||||
| "vleig %%v26,5,1 \n\t" | |||||
| "vleig %%v27,6,0 \n\t" | |||||
| "vleig %%v27,7,1 \n\t" | |||||
| "vleig %%v28,8,0 \n\t" | |||||
| "vleig %%v28,9,1 \n\t" | |||||
| "vleig %%v29,10,0 \n\t" | |||||
| "vleig %%v29,11,1 \n\t" | |||||
| "vleig %%v30,12,0 \n\t" | |||||
| "vleig %%v30,13,1 \n\t" | |||||
| "vleig %%v31,14,0 \n\t" | |||||
| "vleig %%v31,15,1 \n\t" | |||||
| "srlg %%r0,%2,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%3) \n\t" | |||||
| "vl %%v16,0(%%r1,%3) \n\t" | |||||
| "vl %%v17,16(%%r1,%3) \n\t" | |||||
| "vl %%v18,32(%%r1,%3) \n\t" | |||||
| "vl %%v19,48(%%r1,%3) \n\t" | |||||
| "vl %%v20,64(%%r1,%3) \n\t" | |||||
| "vl %%v21,80(%%r1,%3) \n\t" | |||||
| "vl %%v22,96(%%r1,%3) \n\t" | |||||
| "vl %%v23,112(%%r1,%3) \n\t" | |||||
| "vfchedb %%v4,%%v17,%%v16 \n\t" | |||||
| "vfchedb %%v5,%%v19,%%v18 \n\t" | |||||
| "vfchedb %%v6,%%v21,%%v20 \n\t" | |||||
| "vfchedb %%v7,%%v23,%%v22 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4 \n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5 \n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5 \n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v6 \n\t" | |||||
| "vsel %%v6,%%v28,%%v29,%%v6 \n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v7 \n\t" | |||||
| "vsel %%v7,%%v30,%%v31,%%v7 \n\t" | |||||
| "vfchedb %%v20,%%v17,%%v16 \n\t" | |||||
| "vfchedb %%v21,%%v19,%%v18 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v20 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21 \n\t" | |||||
| "vsel %%v5,%%v6,%%v7,%%v21 \n\t" | |||||
| "vfchedb %%v18,%%v17,%%v16 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vfchedb %%v5,%%v16,%%v0 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5 \n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5 \n\t" | |||||
| "vag %%v3,%%v3,%%v2 \n\t" | |||||
| "vl %%v16,128(%%r1,%3) \n\t" | |||||
| "vl %%v17,144(%%r1,%3) \n\t" | |||||
| "vl %%v18,160(%%r1,%3) \n\t" | |||||
| "vl %%v19,176(%%r1,%3) \n\t" | |||||
| "vl %%v20,192(%%r1,%3) \n\t" | |||||
| "vl %%v21,208(%%r1,%3) \n\t" | |||||
| "vl %%v22,224(%%r1,%3) \n\t" | |||||
| "vl %%v23,240(%%r1,%3) \n\t" | |||||
| "vfchedb %%v4,%%v17,%%v16 \n\t" | |||||
| "vfchedb %%v5,%%v19,%%v18 \n\t" | |||||
| "vfchedb %%v6,%%v21,%%v20 \n\t" | |||||
| "vfchedb %%v7,%%v23,%%v22 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4 \n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5 \n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5 \n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v6 \n\t" | |||||
| "vsel %%v6,%%v28,%%v29,%%v6 \n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v7 \n\t" | |||||
| "vsel %%v7,%%v30,%%v31,%%v7 \n\t" | |||||
| "vfchedb %%v20,%%v17,%%v16 \n\t" | |||||
| "vfchedb %%v21,%%v19,%%v18 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v20 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21 \n\t" | |||||
| "vsel %%v5,%%v6,%%v7,%%v21 \n\t" | |||||
| "vfchedb %%v18,%%v17,%%v16 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vfchedb %%v5,%%v16,%%v0 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5 \n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5 \n\t" | |||||
| "vag %%v3,%%v3,%%v2 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v2,%%v0,1 \n\t" | |||||
| "vrepg %%v3,%%v1,1 \n\t" | |||||
| "wfcdb %%v2,%%v0 \n\t" | |||||
| "jne 1f \n\t" | |||||
| "vsteg %%v0,%1,0 \n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3 \n\t" | |||||
| "vlgvg %0,%%v0,0 \n\t" | |||||
| "j 2f \n\t" | |||||
| "1: \n\t" | |||||
| "wfchdb %%v4,%%v0,%%v2 \n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4 \n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4 \n\t" | |||||
| "std %%f0,%1 \n\t" | |||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "2: \n\t" | |||||
| "nop " | |||||
| :"=r"(imin),"=m"(*min) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return imin; | |||||
| static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { | |||||
| BLASLONG imin; | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "vleig %%v1,0,0\n\t" | |||||
| "vleig %%v1,1,1\n\t" | |||||
| "vrepig %%v2,16\n\t" | |||||
| "vzero %%v3\n\t" | |||||
| "vleig %%v24,0,0\n\t" | |||||
| "vleig %%v24,1,1\n\t" | |||||
| "vleig %%v25,2,0\n\t" | |||||
| "vleig %%v25,3,1\n\t" | |||||
| "vleig %%v26,4,0\n\t" | |||||
| "vleig %%v26,5,1\n\t" | |||||
| "vleig %%v27,6,0\n\t" | |||||
| "vleig %%v27,7,1\n\t" | |||||
| "vleig %%v28,8,0\n\t" | |||||
| "vleig %%v28,9,1\n\t" | |||||
| "vleig %%v29,10,0\n\t" | |||||
| "vleig %%v29,11,1\n\t" | |||||
| "vleig %%v30,12,0\n\t" | |||||
| "vleig %%v30,13,1\n\t" | |||||
| "vleig %%v31,14,0\n\t" | |||||
| "vleig %%v31,15,1\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vfchedb %%v4,%%v17,%%v16\n\t" | |||||
| "vfchedb %%v5,%%v19,%%v18\n\t" | |||||
| "vfchedb %%v6,%%v21,%%v20\n\t" | |||||
| "vfchedb %%v7,%%v23,%%v22\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4\n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5\n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5\n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v6\n\t" | |||||
| "vsel %%v6,%%v28,%%v29,%%v6\n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v7\n\t" | |||||
| "vsel %%v7,%%v30,%%v31,%%v7\n\t" | |||||
| "vfchedb %%v20,%%v17,%%v16\n\t" | |||||
| "vfchedb %%v21,%%v19,%%v18\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v20\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21\n\t" | |||||
| "vsel %%v5,%%v6,%%v7,%%v21\n\t" | |||||
| "vfchedb %%v18,%%v17,%%v16\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vfchedb %%v5,%%v16,%%v0\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5\n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5\n\t" | |||||
| "vag %%v3,%%v3,%%v2\n\t" | |||||
| "vl %%v16,128(%%r1,%[x])\n\t" | |||||
| "vl %%v17,144(%%r1,%[x])\n\t" | |||||
| "vl %%v18,160(%%r1,%[x])\n\t" | |||||
| "vl %%v19,176(%%r1,%[x])\n\t" | |||||
| "vl %%v20,192(%%r1,%[x])\n\t" | |||||
| "vl %%v21,208(%%r1,%[x])\n\t" | |||||
| "vl %%v22,224(%%r1,%[x])\n\t" | |||||
| "vl %%v23,240(%%r1,%[x])\n\t" | |||||
| "vfchedb %%v4,%%v17,%%v16\n\t" | |||||
| "vfchedb %%v5,%%v19,%%v18\n\t" | |||||
| "vfchedb %%v6,%%v21,%%v20\n\t" | |||||
| "vfchedb %%v7,%%v23,%%v22\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4\n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5\n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5\n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v6\n\t" | |||||
| "vsel %%v6,%%v28,%%v29,%%v6\n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v7\n\t" | |||||
| "vsel %%v7,%%v30,%%v31,%%v7\n\t" | |||||
| "vfchedb %%v20,%%v17,%%v16\n\t" | |||||
| "vfchedb %%v21,%%v19,%%v18\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v20\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21\n\t" | |||||
| "vsel %%v5,%%v6,%%v7,%%v21\n\t" | |||||
| "vfchedb %%v18,%%v17,%%v16\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vfchedb %%v5,%%v16,%%v0\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5\n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5\n\t" | |||||
| "vag %%v3,%%v3,%%v2\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "vrepg %%v2,%%v0,1\n\t" | |||||
| "vrepg %%v3,%%v1,1\n\t" | |||||
| "wfcdb %%v2,%%v0\n\t" | |||||
| "jne 1f\n\t" | |||||
| "vsteg %%v0,%[min],0\n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3\n\t" | |||||
| "vlgvg %[imin],%%v0,0\n\t" | |||||
| "j 2f\n\t" | |||||
| "1:\n\t" | |||||
| "wfchdb %%v4,%%v0,%%v2\n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4\n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4\n\t" | |||||
| "std %%f0,%[min]\n\t" | |||||
| "vlgvg %[imin],%%v1,0\n\t" | |||||
| "2:\n\t" | |||||
| "nop" | |||||
| : [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", | |||||
| "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |||||
| "v27", "v28", "v29", "v30", "v31"); | |||||
| return imin; | |||||
| } | } | ||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | ||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG min = 0; | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG min = 0; | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (min); | |||||
| if (n <= 0 || inc_x <= 0) return (min); | |||||
| if (inc_x == 1) { | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| min = idmin_kernel_32(n1, x, &minf); | |||||
| min = idmin_kernel_32(n1, x, &minf); | |||||
| i = n1; | |||||
| } else { | |||||
| minf = x[0]; | |||||
| i++; | |||||
| } | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| minf = x[0]; | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (x[i] < minf) { | |||||
| min = i; | |||||
| minf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (min + 1); | |||||
| while (i < n) { | |||||
| if (x[i] < minf) { | |||||
| min = i; | |||||
| minf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (min + 1); | |||||
| } else { | |||||
| } else { | |||||
| min = 0; | |||||
| minf = x[0]; | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| if (x[i] < minf) { | |||||
| min = j; | |||||
| minf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] < minf) { | |||||
| min = j + 1; | |||||
| minf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] < minf) { | |||||
| min = j + 2; | |||||
| minf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] < minf) { | |||||
| min = j + 3; | |||||
| minf = x[i + 3 * inc_x]; | |||||
| } | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| } | |||||
| min = 0; | |||||
| minf = x[0]; | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| if (x[i] < minf) { | |||||
| min = j; | |||||
| minf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] < minf) { | |||||
| min = j + 1; | |||||
| minf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] < minf) { | |||||
| min = j + 2; | |||||
| minf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] < minf) { | |||||
| min = j + 3; | |||||
| minf = x[i + 3 * inc_x]; | |||||
| } | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (x[i] < minf) { | |||||
| min = j; | |||||
| minf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (min + 1); | |||||
| while (j < n) { | |||||
| if (x[i] < minf) { | |||||
| min = j; | |||||
| minf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (min + 1); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,282 +28,262 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | #define ABS fabsf | ||||
| #endif | |||||
| static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) | |||||
| { | |||||
| BLASLONG iamax; | |||||
| static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) { | |||||
| BLASLONG iamax; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%3) \n\t" | |||||
| "vflpsb %%v0,%%v0 \n\t" | |||||
| "vleig %%v1,0,0 \n\t" | |||||
| "vleig %%v1,2,1 \n\t" | |||||
| "vleig %%v2,1,0 \n\t" | |||||
| "vleig %%v2,3,1 \n\t" | |||||
| "vrepig %%v3,32 \n\t" | |||||
| "vzero %%v4 \n\t" | |||||
| "vleif %%v24,0,0 \n\t" | |||||
| "vleif %%v24,1,1 \n\t" | |||||
| "vleif %%v24,2,2 \n\t" | |||||
| "vleif %%v24,3,3 \n\t" | |||||
| "vleif %%v25,4,0 \n\t" | |||||
| "vleif %%v25,5,1 \n\t" | |||||
| "vleif %%v25,6,2 \n\t" | |||||
| "vleif %%v25,7,3 \n\t" | |||||
| "vleif %%v26,8,0 \n\t" | |||||
| "vleif %%v26,9,1 \n\t" | |||||
| "vleif %%v26,10,2 \n\t" | |||||
| "vleif %%v26,11,3 \n\t" | |||||
| "vleif %%v27,12,0 \n\t" | |||||
| "vleif %%v27,13,1 \n\t" | |||||
| "vleif %%v27,14,2 \n\t" | |||||
| "vleif %%v27,15,3 \n\t" | |||||
| "vleif %%v28,16,0 \n\t" | |||||
| "vleif %%v28,17,1 \n\t" | |||||
| "vleif %%v28,18,2 \n\t" | |||||
| "vleif %%v28,19,3 \n\t" | |||||
| "vleif %%v29,20,0 \n\t" | |||||
| "vleif %%v29,21,1 \n\t" | |||||
| "vleif %%v29,22,2 \n\t" | |||||
| "vleif %%v29,23,3 \n\t" | |||||
| "vleif %%v30,24,0 \n\t" | |||||
| "vleif %%v30,25,1 \n\t" | |||||
| "vleif %%v30,26,2 \n\t" | |||||
| "vleif %%v30,27,3 \n\t" | |||||
| "vleif %%v31,28,0 \n\t" | |||||
| "vleif %%v31,29,1 \n\t" | |||||
| "vleif %%v31,30,2 \n\t" | |||||
| "vleif %%v31,31,3 \n\t" | |||||
| "srlg %%r0,%2,6 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%3) \n\t" | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "vflpsb %%v0,%%v0\n\t" | |||||
| "vleig %%v1,0,0\n\t" | |||||
| "vleig %%v1,2,1\n\t" | |||||
| "vleig %%v2,1,0\n\t" | |||||
| "vleig %%v2,3,1\n\t" | |||||
| "vrepig %%v3,32\n\t" | |||||
| "vzero %%v4\n\t" | |||||
| "vleif %%v24,0,0\n\t" | |||||
| "vleif %%v24,1,1\n\t" | |||||
| "vleif %%v24,2,2\n\t" | |||||
| "vleif %%v24,3,3\n\t" | |||||
| "vleif %%v25,4,0\n\t" | |||||
| "vleif %%v25,5,1\n\t" | |||||
| "vleif %%v25,6,2\n\t" | |||||
| "vleif %%v25,7,3\n\t" | |||||
| "vleif %%v26,8,0\n\t" | |||||
| "vleif %%v26,9,1\n\t" | |||||
| "vleif %%v26,10,2\n\t" | |||||
| "vleif %%v26,11,3\n\t" | |||||
| "vleif %%v27,12,0\n\t" | |||||
| "vleif %%v27,13,1\n\t" | |||||
| "vleif %%v27,14,2\n\t" | |||||
| "vleif %%v27,15,3\n\t" | |||||
| "vleif %%v28,16,0\n\t" | |||||
| "vleif %%v28,17,1\n\t" | |||||
| "vleif %%v28,18,2\n\t" | |||||
| "vleif %%v28,19,3\n\t" | |||||
| "vleif %%v29,20,0\n\t" | |||||
| "vleif %%v29,21,1\n\t" | |||||
| "vleif %%v29,22,2\n\t" | |||||
| "vleif %%v29,23,3\n\t" | |||||
| "vleif %%v30,24,0\n\t" | |||||
| "vleif %%v30,25,1\n\t" | |||||
| "vleif %%v30,26,2\n\t" | |||||
| "vleif %%v30,27,3\n\t" | |||||
| "vleif %%v31,28,0\n\t" | |||||
| "vleif %%v31,29,1\n\t" | |||||
| "vleif %%v31,30,2\n\t" | |||||
| "vleif %%v31,31,3\n\t" | |||||
| "srlg %[n],%[n],6\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vflpsb %%v16, %%v16\n\t" | |||||
| "vflpsb %%v17, %%v17\n\t" | |||||
| "vflpsb %%v18, %%v18\n\t" | |||||
| "vflpsb %%v19, %%v19\n\t" | |||||
| "vflpsb %%v20, %%v20\n\t" | |||||
| "vflpsb %%v21, %%v21\n\t" | |||||
| "vflpsb %%v22, %%v22\n\t" | |||||
| "vflpsb %%v23, %%v23\n\t" | |||||
| "vfchesb %%v5,%%v16,%%v17\n\t" | |||||
| "vfchesb %%v6,%%v18,%%v19\n\t" | |||||
| "vfchesb %%v7,%%v20,%%v21\n\t" | |||||
| "vfchesb %%v8,%%v22,%%v23\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5\n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6\n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6\n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v7\n\t" | |||||
| "vsel %%v7,%%v28,%%v29,%%v7\n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v8\n\t" | |||||
| "vsel %%v8,%%v30,%%v31,%%v8\n\t" | |||||
| "vfchesb %%v20,%%v16,%%v17\n\t" | |||||
| "vfchesb %%v21,%%v18,%%v19\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v20\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21\n\t" | |||||
| "vsel %%v6,%%v7,%%v8,%%v21\n\t" | |||||
| "vfchesb %%v18,%%v16,%%v17\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18\n\t" | |||||
| "vsegf %%v6,%%v5\n\t" | |||||
| "vesrlg %%v5,%%v5,32\n\t" | |||||
| "vag %%v5,%%v5,%%v4\n\t" | |||||
| "vag %%v6,%%v6,%%v4\n\t" | |||||
| "vfchesb %%v7,%%v0,%%v16\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7\n\t" | |||||
| "vsegf %%v8,%%v7\n\t" | |||||
| "vesrlg %%v7,%%v7,32\n\t" | |||||
| "vsegf %%v7,%%v7\n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7\n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vl %%v16,128(%%r1,%[x])\n\t" | |||||
| "vl %%v17,144(%%r1,%[x])\n\t" | |||||
| "vl %%v18,160(%%r1,%[x])\n\t" | |||||
| "vl %%v19,176(%%r1,%[x])\n\t" | |||||
| "vl %%v20,192(%%r1,%[x])\n\t" | |||||
| "vl %%v21,208(%%r1,%[x])\n\t" | |||||
| "vl %%v22,224(%%r1,%[x])\n\t" | |||||
| "vl %%v23,240(%%r1,%[x])\n\t" | |||||
| "vflpsb %%v16, %%v16\n\t" | |||||
| "vflpsb %%v17, %%v17\n\t" | |||||
| "vflpsb %%v18, %%v18\n\t" | |||||
| "vflpsb %%v19, %%v19\n\t" | |||||
| "vflpsb %%v20, %%v20\n\t" | |||||
| "vflpsb %%v21, %%v21\n\t" | |||||
| "vflpsb %%v22, %%v22\n\t" | |||||
| "vflpsb %%v23, %%v23\n\t" | |||||
| "vfchesb %%v5,%%v16,%%v17\n\t" | |||||
| "vfchesb %%v6,%%v18,%%v19\n\t" | |||||
| "vfchesb %%v7,%%v20,%%v21\n\t" | |||||
| "vfchesb %%v8,%%v22,%%v23\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5\n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6\n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6\n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v7\n\t" | |||||
| "vsel %%v7,%%v28,%%v29,%%v7\n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v8\n\t" | |||||
| "vsel %%v8,%%v30,%%v31,%%v8\n\t" | |||||
| "vfchesb %%v20,%%v16,%%v17\n\t" | |||||
| "vfchesb %%v21,%%v18,%%v19\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v20\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21\n\t" | |||||
| "vsel %%v6,%%v7,%%v8,%%v21\n\t" | |||||
| "vfchesb %%v18,%%v16,%%v17\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18\n\t" | |||||
| "vsegf %%v6,%%v5\n\t" | |||||
| "vesrlg %%v5,%%v5,32\n\t" | |||||
| "vag %%v5,%%v5,%%v4\n\t" | |||||
| "vag %%v6,%%v6,%%v4\n\t" | |||||
| "vfchesb %%v7,%%v0,%%v16\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7\n\t" | |||||
| "vsegf %%v8,%%v7\n\t" | |||||
| "vesrlg %%v7,%%v7,32\n\t" | |||||
| "vsegf %%v7,%%v7\n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7\n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "veslg %%v3,%%v0,32\n\t" | |||||
| "vfchsb %%v4,%%v0,%%v3\n\t" | |||||
| "vchlg %%v5,%%v2,%%v1\n\t" | |||||
| "vfcesb %%v6,%%v0,%%v3\n\t" | |||||
| "vn %%v5,%%v5,%%v6\n\t" | |||||
| "vo %%v4,%%v4,%%v5\n\t" | |||||
| "vsel %%v0,%%v0,%%v3,%%v4\n\t" | |||||
| "vesrlg %%v4,%%v4,32\n\t" | |||||
| "vsegf %%v4,%%v4\n\t" | |||||
| "vsel %%v1,%%v1,%%v2,%%v4\n\t" | |||||
| "vrepf %%v2,%%v0,2\n\t" | |||||
| "vrepg %%v3,%%v1,1\n\t" | |||||
| "wfcsb %%v2,%%v0\n\t" | |||||
| "jne 1f\n\t" | |||||
| "vstef %%v0,%[amax],0\n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3\n\t" | |||||
| "vlgvg %[iamax],%%v0,0\n\t" | |||||
| "j 2f\n\t" | |||||
| "1:\n\t" | |||||
| "wfchsb %%v4,%%v2,%%v0\n\t" | |||||
| "vesrlg %%v4,%%v4,32\n\t" | |||||
| "vsegf %%v4,%%v4\n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4\n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4\n\t" | |||||
| "ste %%f0,%[amax]\n\t" | |||||
| "vlgvg %[iamax],%%v1,0\n\t" | |||||
| "2:\n\t" | |||||
| "nop" | |||||
| : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", | |||||
| "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |||||
| "v27", "v28", "v29", "v30", "v31"); | |||||
| "vl %%v16,0(%%r1,%3) \n\t" | |||||
| "vl %%v17,16(%%r1,%3) \n\t" | |||||
| "vl %%v18,32(%%r1,%3) \n\t" | |||||
| "vl %%v19,48(%%r1,%3) \n\t" | |||||
| "vl %%v20,64(%%r1,%3) \n\t" | |||||
| "vl %%v21,80(%%r1,%3) \n\t" | |||||
| "vl %%v22,96(%%r1,%3) \n\t" | |||||
| "vl %%v23,112(%%r1,%3) \n\t" | |||||
| "vflpsb %%v16, %%v16 \n\t" | |||||
| "vflpsb %%v17, %%v17 \n\t" | |||||
| "vflpsb %%v18, %%v18 \n\t" | |||||
| "vflpsb %%v19, %%v19 \n\t" | |||||
| "vflpsb %%v20, %%v20 \n\t" | |||||
| "vflpsb %%v21, %%v21 \n\t" | |||||
| "vflpsb %%v22, %%v22 \n\t" | |||||
| "vflpsb %%v23, %%v23 \n\t" | |||||
| "vfchesb %%v5,%%v16,%%v17 \n\t" | |||||
| "vfchesb %%v6,%%v18,%%v19 \n\t" | |||||
| "vfchesb %%v7,%%v20,%%v21 \n\t" | |||||
| "vfchesb %%v8,%%v22,%%v23 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5 \n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6 \n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6 \n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v7 \n\t" | |||||
| "vsel %%v7,%%v28,%%v29,%%v7 \n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v8 \n\t" | |||||
| "vsel %%v8,%%v30,%%v31,%%v8 \n\t" | |||||
| "vfchesb %%v20,%%v16,%%v17 \n\t" | |||||
| "vfchesb %%v21,%%v18,%%v19 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v20 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21 \n\t" | |||||
| "vsel %%v6,%%v7,%%v8,%%v21 \n\t" | |||||
| "vfchesb %%v18,%%v16,%%v17 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18 \n\t" | |||||
| "vsegf %%v6,%%v5 \n\t" | |||||
| "vesrlg %%v5,%%v5,32 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vag %%v6,%%v6,%%v4 \n\t" | |||||
| "vfchesb %%v7,%%v0,%%v16 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7 \n\t" | |||||
| "vsegf %%v8,%%v7 \n\t" | |||||
| "vesrlg %%v7,%%v7,32 \n\t" | |||||
| "vsegf %%v7,%%v7 \n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7 \n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vl %%v16,128(%%r1,%3) \n\t" | |||||
| "vl %%v17,144(%%r1,%3) \n\t" | |||||
| "vl %%v18,160(%%r1,%3) \n\t" | |||||
| "vl %%v19,176(%%r1,%3) \n\t" | |||||
| "vl %%v20,192(%%r1,%3) \n\t" | |||||
| "vl %%v21,208(%%r1,%3) \n\t" | |||||
| "vl %%v22,224(%%r1,%3) \n\t" | |||||
| "vl %%v23,240(%%r1,%3) \n\t" | |||||
| "vflpsb %%v16, %%v16 \n\t" | |||||
| "vflpsb %%v17, %%v17 \n\t" | |||||
| "vflpsb %%v18, %%v18 \n\t" | |||||
| "vflpsb %%v19, %%v19 \n\t" | |||||
| "vflpsb %%v20, %%v20 \n\t" | |||||
| "vflpsb %%v21, %%v21 \n\t" | |||||
| "vflpsb %%v22, %%v22 \n\t" | |||||
| "vflpsb %%v23, %%v23 \n\t" | |||||
| "vfchesb %%v5,%%v16,%%v17 \n\t" | |||||
| "vfchesb %%v6,%%v18,%%v19 \n\t" | |||||
| "vfchesb %%v7,%%v20,%%v21 \n\t" | |||||
| "vfchesb %%v8,%%v22,%%v23 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5 \n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6 \n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6 \n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v7 \n\t" | |||||
| "vsel %%v7,%%v28,%%v29,%%v7 \n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v8 \n\t" | |||||
| "vsel %%v8,%%v30,%%v31,%%v8 \n\t" | |||||
| "vfchesb %%v20,%%v16,%%v17 \n\t" | |||||
| "vfchesb %%v21,%%v18,%%v19 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v20 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21 \n\t" | |||||
| "vsel %%v6,%%v7,%%v8,%%v21 \n\t" | |||||
| "vfchesb %%v18,%%v16,%%v17 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18 \n\t" | |||||
| "vsegf %%v6,%%v5 \n\t" | |||||
| "vesrlg %%v5,%%v5,32 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vag %%v6,%%v6,%%v4 \n\t" | |||||
| "vfchesb %%v7,%%v0,%%v16 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7 \n\t" | |||||
| "vsegf %%v8,%%v7 \n\t" | |||||
| "vesrlg %%v7,%%v7,32 \n\t" | |||||
| "vsegf %%v7,%%v7 \n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7 \n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "veslg %%v3,%%v0,32 \n\t" | |||||
| "vfchsb %%v4,%%v0,%%v3 \n\t" | |||||
| "vchlg %%v5,%%v2,%%v1 \n\t" | |||||
| "vfcesb %%v6,%%v0,%%v3 \n\t" | |||||
| "vn %%v5,%%v5,%%v6 \n\t" | |||||
| "vo %%v4,%%v4,%%v5 \n\t" | |||||
| "vsel %%v0,%%v0,%%v3,%%v4 \n\t" | |||||
| "vesrlg %%v4,%%v4,32 \n\t" | |||||
| "vsegf %%v4,%%v4 \n\t" | |||||
| "vsel %%v1,%%v1,%%v2,%%v4 \n\t" | |||||
| "vrepf %%v2,%%v0,2 \n\t" | |||||
| "vrepg %%v3,%%v1,1 \n\t" | |||||
| "wfcsb %%v2,%%v0 \n\t" | |||||
| "jne 1f \n\t" | |||||
| "vstef %%v0,%1,0 \n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3 \n\t" | |||||
| "vlgvg %0,%%v0,0 \n\t" | |||||
| "j 2f \n\t" | |||||
| "1: \n\t" | |||||
| "wfchsb %%v4,%%v2,%%v0 \n\t" | |||||
| "vesrlg %%v4,%%v4,32 \n\t" | |||||
| "vsegf %%v4,%%v4 \n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4 \n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4 \n\t" | |||||
| "ste %%f0,%1 \n\t" | |||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "2: \n\t" | |||||
| "nop " | |||||
| :"=r"(iamax),"=m"(*amax) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return iamax; | |||||
| return iamax; | |||||
| } | } | ||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG max = 0; | |||||
| if (n <= 0 || inc_x <= 0) return (max); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG max = 0; | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (max); | |||||
| max = isamax_kernel_64(n1, x, &maxf); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| maxf = ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = i; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (max + 1); | |||||
| max = isamax_kernel_64(n1, x, &maxf); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| maxf = ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = i; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (max + 1); | |||||
| max = 0; | |||||
| maxf = ABS(x[0]); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| max = 0; | |||||
| maxf = ABS(x[0]); | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = j; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) > maxf) { | |||||
| max = j + 1; | |||||
| maxf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) > maxf) { | |||||
| max = j + 2; | |||||
| maxf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) > maxf) { | |||||
| max = j + 3; | |||||
| maxf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = j; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) > maxf) { | |||||
| max = j + 1; | |||||
| maxf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) > maxf) { | |||||
| max = j + 2; | |||||
| maxf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) > maxf) { | |||||
| max = j + 3; | |||||
| maxf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| j += 4; | |||||
| i += inc_x * 4; | |||||
| } | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = j; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (max + 1); | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| max = j; | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (max + 1); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,282 +28,262 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | #define ABS fabsf | ||||
| #endif | |||||
| static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) | |||||
| { | |||||
| BLASLONG iamin; | |||||
| static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) { | |||||
| BLASLONG iamin; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%3) \n\t" | |||||
| "vflpsb %%v0,%%v0 \n\t" | |||||
| "vleig %%v1,0,0 \n\t" | |||||
| "vleig %%v1,2,1 \n\t" | |||||
| "vleig %%v2,1,0 \n\t" | |||||
| "vleig %%v2,3,1 \n\t" | |||||
| "vrepig %%v3,32 \n\t" | |||||
| "vzero %%v4 \n\t" | |||||
| "vleif %%v24,0,0 \n\t" | |||||
| "vleif %%v24,1,1 \n\t" | |||||
| "vleif %%v24,2,2 \n\t" | |||||
| "vleif %%v24,3,3 \n\t" | |||||
| "vleif %%v25,4,0 \n\t" | |||||
| "vleif %%v25,5,1 \n\t" | |||||
| "vleif %%v25,6,2 \n\t" | |||||
| "vleif %%v25,7,3 \n\t" | |||||
| "vleif %%v26,8,0 \n\t" | |||||
| "vleif %%v26,9,1 \n\t" | |||||
| "vleif %%v26,10,2 \n\t" | |||||
| "vleif %%v26,11,3 \n\t" | |||||
| "vleif %%v27,12,0 \n\t" | |||||
| "vleif %%v27,13,1 \n\t" | |||||
| "vleif %%v27,14,2 \n\t" | |||||
| "vleif %%v27,15,3 \n\t" | |||||
| "vleif %%v28,16,0 \n\t" | |||||
| "vleif %%v28,17,1 \n\t" | |||||
| "vleif %%v28,18,2 \n\t" | |||||
| "vleif %%v28,19,3 \n\t" | |||||
| "vleif %%v29,20,0 \n\t" | |||||
| "vleif %%v29,21,1 \n\t" | |||||
| "vleif %%v29,22,2 \n\t" | |||||
| "vleif %%v29,23,3 \n\t" | |||||
| "vleif %%v30,24,0 \n\t" | |||||
| "vleif %%v30,25,1 \n\t" | |||||
| "vleif %%v30,26,2 \n\t" | |||||
| "vleif %%v30,27,3 \n\t" | |||||
| "vleif %%v31,28,0 \n\t" | |||||
| "vleif %%v31,29,1 \n\t" | |||||
| "vleif %%v31,30,2 \n\t" | |||||
| "vleif %%v31,31,3 \n\t" | |||||
| "srlg %%r0,%2,6 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%3) \n\t" | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "vflpsb %%v0,%%v0\n\t" | |||||
| "vleig %%v1,0,0\n\t" | |||||
| "vleig %%v1,2,1\n\t" | |||||
| "vleig %%v2,1,0\n\t" | |||||
| "vleig %%v2,3,1\n\t" | |||||
| "vrepig %%v3,32\n\t" | |||||
| "vzero %%v4\n\t" | |||||
| "vleif %%v24,0,0\n\t" | |||||
| "vleif %%v24,1,1\n\t" | |||||
| "vleif %%v24,2,2\n\t" | |||||
| "vleif %%v24,3,3\n\t" | |||||
| "vleif %%v25,4,0\n\t" | |||||
| "vleif %%v25,5,1\n\t" | |||||
| "vleif %%v25,6,2\n\t" | |||||
| "vleif %%v25,7,3\n\t" | |||||
| "vleif %%v26,8,0\n\t" | |||||
| "vleif %%v26,9,1\n\t" | |||||
| "vleif %%v26,10,2\n\t" | |||||
| "vleif %%v26,11,3\n\t" | |||||
| "vleif %%v27,12,0\n\t" | |||||
| "vleif %%v27,13,1\n\t" | |||||
| "vleif %%v27,14,2\n\t" | |||||
| "vleif %%v27,15,3\n\t" | |||||
| "vleif %%v28,16,0\n\t" | |||||
| "vleif %%v28,17,1\n\t" | |||||
| "vleif %%v28,18,2\n\t" | |||||
| "vleif %%v28,19,3\n\t" | |||||
| "vleif %%v29,20,0\n\t" | |||||
| "vleif %%v29,21,1\n\t" | |||||
| "vleif %%v29,22,2\n\t" | |||||
| "vleif %%v29,23,3\n\t" | |||||
| "vleif %%v30,24,0\n\t" | |||||
| "vleif %%v30,25,1\n\t" | |||||
| "vleif %%v30,26,2\n\t" | |||||
| "vleif %%v30,27,3\n\t" | |||||
| "vleif %%v31,28,0\n\t" | |||||
| "vleif %%v31,29,1\n\t" | |||||
| "vleif %%v31,30,2\n\t" | |||||
| "vleif %%v31,31,3\n\t" | |||||
| "srlg %[n],%[n],6\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vflpsb %%v16, %%v16\n\t" | |||||
| "vflpsb %%v17, %%v17\n\t" | |||||
| "vflpsb %%v18, %%v18\n\t" | |||||
| "vflpsb %%v19, %%v19\n\t" | |||||
| "vflpsb %%v20, %%v20\n\t" | |||||
| "vflpsb %%v21, %%v21\n\t" | |||||
| "vflpsb %%v22, %%v22\n\t" | |||||
| "vflpsb %%v23, %%v23\n\t" | |||||
| "vfchesb %%v5,%%v17,%%v16\n\t" | |||||
| "vfchesb %%v6,%%v19,%%v18\n\t" | |||||
| "vfchesb %%v7,%%v21,%%v20\n\t" | |||||
| "vfchesb %%v8,%%v23,%%v22\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5\n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6\n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6\n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v7\n\t" | |||||
| "vsel %%v7,%%v28,%%v29,%%v7\n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v8\n\t" | |||||
| "vsel %%v8,%%v30,%%v31,%%v8\n\t" | |||||
| "vfchesb %%v20,%%v17,%%v16\n\t" | |||||
| "vfchesb %%v21,%%v19,%%v18\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v20\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21\n\t" | |||||
| "vsel %%v6,%%v7,%%v8,%%v21\n\t" | |||||
| "vfchesb %%v18,%%v17,%%v16\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18\n\t" | |||||
| "vsegf %%v6,%%v5\n\t" | |||||
| "vesrlg %%v5,%%v5,32\n\t" | |||||
| "vag %%v5,%%v5,%%v4\n\t" | |||||
| "vag %%v6,%%v6,%%v4\n\t" | |||||
| "vfchesb %%v7,%%v16,%%v0\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7\n\t" | |||||
| "vsegf %%v8,%%v7\n\t" | |||||
| "vesrlg %%v7,%%v7,32\n\t" | |||||
| "vsegf %%v7,%%v7\n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7\n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vl %%v16,128(%%r1,%[x])\n\t" | |||||
| "vl %%v17,144(%%r1,%[x])\n\t" | |||||
| "vl %%v18,160(%%r1,%[x])\n\t" | |||||
| "vl %%v19,176(%%r1,%[x])\n\t" | |||||
| "vl %%v20,192(%%r1,%[x])\n\t" | |||||
| "vl %%v21,208(%%r1,%[x])\n\t" | |||||
| "vl %%v22,224(%%r1,%[x])\n\t" | |||||
| "vl %%v23,240(%%r1,%[x])\n\t" | |||||
| "vflpsb %%v16, %%v16\n\t" | |||||
| "vflpsb %%v17, %%v17\n\t" | |||||
| "vflpsb %%v18, %%v18\n\t" | |||||
| "vflpsb %%v19, %%v19\n\t" | |||||
| "vflpsb %%v20, %%v20\n\t" | |||||
| "vflpsb %%v21, %%v21\n\t" | |||||
| "vflpsb %%v22, %%v22\n\t" | |||||
| "vflpsb %%v23, %%v23\n\t" | |||||
| "vfchesb %%v5,%%v17,%%v16\n\t" | |||||
| "vfchesb %%v6,%%v19,%%v18\n\t" | |||||
| "vfchesb %%v7,%%v21,%%v20\n\t" | |||||
| "vfchesb %%v8,%%v23,%%v22\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5\n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6\n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6\n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v7\n\t" | |||||
| "vsel %%v7,%%v28,%%v29,%%v7\n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v8\n\t" | |||||
| "vsel %%v8,%%v30,%%v31,%%v8\n\t" | |||||
| "vfchesb %%v20,%%v17,%%v16\n\t" | |||||
| "vfchesb %%v21,%%v19,%%v18\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v20\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21\n\t" | |||||
| "vsel %%v6,%%v7,%%v8,%%v21\n\t" | |||||
| "vfchesb %%v18,%%v17,%%v16\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18\n\t" | |||||
| "vsegf %%v6,%%v5\n\t" | |||||
| "vesrlg %%v5,%%v5,32\n\t" | |||||
| "vag %%v5,%%v5,%%v4\n\t" | |||||
| "vag %%v6,%%v6,%%v4\n\t" | |||||
| "vfchesb %%v7,%%v16,%%v0\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7\n\t" | |||||
| "vsegf %%v8,%%v7\n\t" | |||||
| "vesrlg %%v7,%%v7,32\n\t" | |||||
| "vsegf %%v7,%%v7\n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7\n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "veslg %%v3,%%v0,32\n\t" | |||||
| "vfchsb %%v4,%%v3,%%v0\n\t" | |||||
| "vchlg %%v5,%%v2,%%v1\n\t" | |||||
| "vfcesb %%v6,%%v0,%%v3\n\t" | |||||
| "vn %%v5,%%v5,%%v6\n\t" | |||||
| "vo %%v4,%%v4,%%v5\n\t" | |||||
| "vsel %%v0,%%v0,%%v3,%%v4\n\t" | |||||
| "vesrlg %%v4,%%v4,32\n\t" | |||||
| "vsegf %%v4,%%v4\n\t" | |||||
| "vsel %%v1,%%v1,%%v2,%%v4\n\t" | |||||
| "vrepf %%v2,%%v0,2\n\t" | |||||
| "vrepg %%v3,%%v1,1\n\t" | |||||
| "wfcsb %%v2,%%v0\n\t" | |||||
| "jne 1f\n\t" | |||||
| "vstef %%v0,%[amin],0\n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3\n\t" | |||||
| "vlgvg %[iamin],%%v0,0\n\t" | |||||
| "j 2f\n\t" | |||||
| "1:\n\t" | |||||
| "wfchsb %%v4,%%v0,%%v2\n\t" | |||||
| "vesrlg %%v4,%%v4,32\n\t" | |||||
| "vsegf %%v4,%%v4\n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4\n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4\n\t" | |||||
| "ste %%f0,%[amin]\n\t" | |||||
| "vlgvg %[iamin],%%v1,0\n\t" | |||||
| "2:\n\t" | |||||
| "nop" | |||||
| : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", | |||||
| "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |||||
| "v27", "v28", "v29", "v30", "v31"); | |||||
| "vl %%v16,0(%%r1,%3) \n\t" | |||||
| "vl %%v17,16(%%r1,%3) \n\t" | |||||
| "vl %%v18,32(%%r1,%3) \n\t" | |||||
| "vl %%v19,48(%%r1,%3) \n\t" | |||||
| "vl %%v20,64(%%r1,%3) \n\t" | |||||
| "vl %%v21,80(%%r1,%3) \n\t" | |||||
| "vl %%v22,96(%%r1,%3) \n\t" | |||||
| "vl %%v23,112(%%r1,%3) \n\t" | |||||
| "vflpsb %%v16, %%v16 \n\t" | |||||
| "vflpsb %%v17, %%v17 \n\t" | |||||
| "vflpsb %%v18, %%v18 \n\t" | |||||
| "vflpsb %%v19, %%v19 \n\t" | |||||
| "vflpsb %%v20, %%v20 \n\t" | |||||
| "vflpsb %%v21, %%v21 \n\t" | |||||
| "vflpsb %%v22, %%v22 \n\t" | |||||
| "vflpsb %%v23, %%v23 \n\t" | |||||
| "vfchesb %%v5,%%v17,%%v16 \n\t" | |||||
| "vfchesb %%v6,%%v19,%%v18 \n\t" | |||||
| "vfchesb %%v7,%%v21,%%v20 \n\t" | |||||
| "vfchesb %%v8,%%v23,%%v22 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5 \n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6 \n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6 \n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v7 \n\t" | |||||
| "vsel %%v7,%%v28,%%v29,%%v7 \n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v8 \n\t" | |||||
| "vsel %%v8,%%v30,%%v31,%%v8 \n\t" | |||||
| "vfchesb %%v20,%%v17,%%v16 \n\t" | |||||
| "vfchesb %%v21,%%v19,%%v18 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v20 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21 \n\t" | |||||
| "vsel %%v6,%%v7,%%v8,%%v21 \n\t" | |||||
| "vfchesb %%v18,%%v17,%%v16 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18 \n\t" | |||||
| "vsegf %%v6,%%v5 \n\t" | |||||
| "vesrlg %%v5,%%v5,32 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vag %%v6,%%v6,%%v4 \n\t" | |||||
| "vfchesb %%v7,%%v16,%%v0 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7 \n\t" | |||||
| "vsegf %%v8,%%v7 \n\t" | |||||
| "vesrlg %%v7,%%v7,32 \n\t" | |||||
| "vsegf %%v7,%%v7 \n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7 \n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vl %%v16,128(%%r1,%3) \n\t" | |||||
| "vl %%v17,144(%%r1,%3) \n\t" | |||||
| "vl %%v18,160(%%r1,%3) \n\t" | |||||
| "vl %%v19,176(%%r1,%3) \n\t" | |||||
| "vl %%v20,192(%%r1,%3) \n\t" | |||||
| "vl %%v21,208(%%r1,%3) \n\t" | |||||
| "vl %%v22,224(%%r1,%3) \n\t" | |||||
| "vl %%v23,240(%%r1,%3) \n\t" | |||||
| "vflpsb %%v16, %%v16 \n\t" | |||||
| "vflpsb %%v17, %%v17 \n\t" | |||||
| "vflpsb %%v18, %%v18 \n\t" | |||||
| "vflpsb %%v19, %%v19 \n\t" | |||||
| "vflpsb %%v20, %%v20 \n\t" | |||||
| "vflpsb %%v21, %%v21 \n\t" | |||||
| "vflpsb %%v22, %%v22 \n\t" | |||||
| "vflpsb %%v23, %%v23 \n\t" | |||||
| "vfchesb %%v5,%%v17,%%v16 \n\t" | |||||
| "vfchesb %%v6,%%v19,%%v18 \n\t" | |||||
| "vfchesb %%v7,%%v21,%%v20 \n\t" | |||||
| "vfchesb %%v8,%%v23,%%v22 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5 \n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6 \n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6 \n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v7 \n\t" | |||||
| "vsel %%v7,%%v28,%%v29,%%v7 \n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v8 \n\t" | |||||
| "vsel %%v8,%%v30,%%v31,%%v8 \n\t" | |||||
| "vfchesb %%v20,%%v17,%%v16 \n\t" | |||||
| "vfchesb %%v21,%%v19,%%v18 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v20 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21 \n\t" | |||||
| "vsel %%v6,%%v7,%%v8,%%v21 \n\t" | |||||
| "vfchesb %%v18,%%v17,%%v16 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18 \n\t" | |||||
| "vsegf %%v6,%%v5 \n\t" | |||||
| "vesrlg %%v5,%%v5,32 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vag %%v6,%%v6,%%v4 \n\t" | |||||
| "vfchesb %%v7,%%v16,%%v0 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7 \n\t" | |||||
| "vsegf %%v8,%%v7 \n\t" | |||||
| "vesrlg %%v7,%%v7,32 \n\t" | |||||
| "vsegf %%v7,%%v7 \n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7 \n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "veslg %%v3,%%v0,32 \n\t" | |||||
| "vfchsb %%v4,%%v3,%%v0 \n\t" | |||||
| "vchlg %%v5,%%v2,%%v1 \n\t" | |||||
| "vfcesb %%v6,%%v0,%%v3 \n\t" | |||||
| "vn %%v5,%%v5,%%v6 \n\t" | |||||
| "vo %%v4,%%v4,%%v5 \n\t" | |||||
| "vsel %%v0,%%v0,%%v3,%%v4 \n\t" | |||||
| "vesrlg %%v4,%%v4,32 \n\t" | |||||
| "vsegf %%v4,%%v4 \n\t" | |||||
| "vsel %%v1,%%v1,%%v2,%%v4 \n\t" | |||||
| "vrepf %%v2,%%v0,2 \n\t" | |||||
| "vrepg %%v3,%%v1,1 \n\t" | |||||
| "wfcsb %%v2,%%v0 \n\t" | |||||
| "jne 1f \n\t" | |||||
| "vstef %%v0,%1,0 \n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3 \n\t" | |||||
| "vlgvg %0,%%v0,0 \n\t" | |||||
| "j 2f \n\t" | |||||
| "1: \n\t" | |||||
| "wfchsb %%v4,%%v0,%%v2 \n\t" | |||||
| "vesrlg %%v4,%%v4,32 \n\t" | |||||
| "vsegf %%v4,%%v4 \n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4 \n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4 \n\t" | |||||
| "ste %%f0,%1 \n\t" | |||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "2: \n\t" | |||||
| "nop " | |||||
| :"=r"(iamin),"=m"(*amin) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return iamin; | |||||
| return iamin; | |||||
| } | } | ||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG min = 0; | |||||
| if (n <= 0 || inc_x <= 0) return (min); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG min = 0; | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (min); | |||||
| min = isamin_kernel_64(n1, x, &minf); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| minf = ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = i; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (min + 1); | |||||
| min = isamin_kernel_64(n1, x, &minf); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| minf = ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = i; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (min + 1); | |||||
| min = 0; | |||||
| minf = ABS(x[0]); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| min = 0; | |||||
| minf = ABS(x[0]); | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = j; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) < minf) { | |||||
| min = j + 1; | |||||
| minf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) < minf) { | |||||
| min = j + 2; | |||||
| minf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) < minf) { | |||||
| min = j + 3; | |||||
| minf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = j; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) < minf) { | |||||
| min = j + 1; | |||||
| minf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) < minf) { | |||||
| min = j + 2; | |||||
| minf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) < minf) { | |||||
| min = j + 3; | |||||
| minf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| j += 4; | |||||
| i += inc_x * 4; | |||||
| } | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = j; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (min + 1); | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| min = j; | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (min + 1); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,259 +27,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) | |||||
| { | |||||
| BLASLONG imax; | |||||
| static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) { | |||||
| BLASLONG imax; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%3) \n\t" | |||||
| "vleig %%v1,0,0 \n\t" | |||||
| "vleig %%v1,2,1 \n\t" | |||||
| "vleig %%v2,1,0 \n\t" | |||||
| "vleig %%v2,3,1 \n\t" | |||||
| "vrepig %%v3,32 \n\t" | |||||
| "vzero %%v4 \n\t" | |||||
| "vleif %%v24,0,0 \n\t" | |||||
| "vleif %%v24,1,1 \n\t" | |||||
| "vleif %%v24,2,2 \n\t" | |||||
| "vleif %%v24,3,3 \n\t" | |||||
| "vleif %%v25,4,0 \n\t" | |||||
| "vleif %%v25,5,1 \n\t" | |||||
| "vleif %%v25,6,2 \n\t" | |||||
| "vleif %%v25,7,3 \n\t" | |||||
| "vleif %%v26,8,0 \n\t" | |||||
| "vleif %%v26,9,1 \n\t" | |||||
| "vleif %%v26,10,2 \n\t" | |||||
| "vleif %%v26,11,3 \n\t" | |||||
| "vleif %%v27,12,0 \n\t" | |||||
| "vleif %%v27,13,1 \n\t" | |||||
| "vleif %%v27,14,2 \n\t" | |||||
| "vleif %%v27,15,3 \n\t" | |||||
| "vleif %%v28,16,0 \n\t" | |||||
| "vleif %%v28,17,1 \n\t" | |||||
| "vleif %%v28,18,2 \n\t" | |||||
| "vleif %%v28,19,3 \n\t" | |||||
| "vleif %%v29,20,0 \n\t" | |||||
| "vleif %%v29,21,1 \n\t" | |||||
| "vleif %%v29,22,2 \n\t" | |||||
| "vleif %%v29,23,3 \n\t" | |||||
| "vleif %%v30,24,0 \n\t" | |||||
| "vleif %%v30,25,1 \n\t" | |||||
| "vleif %%v30,26,2 \n\t" | |||||
| "vleif %%v30,27,3 \n\t" | |||||
| "vleif %%v31,28,0 \n\t" | |||||
| "vleif %%v31,29,1 \n\t" | |||||
| "vleif %%v31,30,2 \n\t" | |||||
| "vleif %%v31,31,3 \n\t" | |||||
| "srlg %%r0,%2,6 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%3) \n\t" | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "vleig %%v1,0,0\n\t" | |||||
| "vleig %%v1,2,1\n\t" | |||||
| "vleig %%v2,1,0\n\t" | |||||
| "vleig %%v2,3,1\n\t" | |||||
| "vrepig %%v3,32\n\t" | |||||
| "vzero %%v4\n\t" | |||||
| "vleif %%v24,0,0\n\t" | |||||
| "vleif %%v24,1,1\n\t" | |||||
| "vleif %%v24,2,2\n\t" | |||||
| "vleif %%v24,3,3\n\t" | |||||
| "vleif %%v25,4,0\n\t" | |||||
| "vleif %%v25,5,1\n\t" | |||||
| "vleif %%v25,6,2\n\t" | |||||
| "vleif %%v25,7,3\n\t" | |||||
| "vleif %%v26,8,0\n\t" | |||||
| "vleif %%v26,9,1\n\t" | |||||
| "vleif %%v26,10,2\n\t" | |||||
| "vleif %%v26,11,3\n\t" | |||||
| "vleif %%v27,12,0\n\t" | |||||
| "vleif %%v27,13,1\n\t" | |||||
| "vleif %%v27,14,2\n\t" | |||||
| "vleif %%v27,15,3\n\t" | |||||
| "vleif %%v28,16,0\n\t" | |||||
| "vleif %%v28,17,1\n\t" | |||||
| "vleif %%v28,18,2\n\t" | |||||
| "vleif %%v28,19,3\n\t" | |||||
| "vleif %%v29,20,0\n\t" | |||||
| "vleif %%v29,21,1\n\t" | |||||
| "vleif %%v29,22,2\n\t" | |||||
| "vleif %%v29,23,3\n\t" | |||||
| "vleif %%v30,24,0\n\t" | |||||
| "vleif %%v30,25,1\n\t" | |||||
| "vleif %%v30,26,2\n\t" | |||||
| "vleif %%v30,27,3\n\t" | |||||
| "vleif %%v31,28,0\n\t" | |||||
| "vleif %%v31,29,1\n\t" | |||||
| "vleif %%v31,30,2\n\t" | |||||
| "vleif %%v31,31,3\n\t" | |||||
| "srlg %[n],%[n],6\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vfchesb %%v5,%%v16,%%v17\n\t" | |||||
| "vfchesb %%v6,%%v18,%%v19\n\t" | |||||
| "vfchesb %%v7,%%v20,%%v21\n\t" | |||||
| "vfchesb %%v8,%%v22,%%v23\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5\n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6\n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6\n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v7\n\t" | |||||
| "vsel %%v7,%%v28,%%v29,%%v7\n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v8\n\t" | |||||
| "vsel %%v8,%%v30,%%v31,%%v8\n\t" | |||||
| "vfchesb %%v20,%%v16,%%v17\n\t" | |||||
| "vfchesb %%v21,%%v18,%%v19\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v20\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21\n\t" | |||||
| "vsel %%v6,%%v7,%%v8,%%v21\n\t" | |||||
| "vfchesb %%v18,%%v16,%%v17\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18\n\t" | |||||
| "vsegf %%v6,%%v5\n\t" | |||||
| "vesrlg %%v5,%%v5,32\n\t" | |||||
| "vag %%v5,%%v5,%%v4\n\t" | |||||
| "vag %%v6,%%v6,%%v4\n\t" | |||||
| "vfchesb %%v7,%%v0,%%v16\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7\n\t" | |||||
| "vsegf %%v8,%%v7\n\t" | |||||
| "vesrlg %%v7,%%v7,32\n\t" | |||||
| "vsegf %%v7,%%v7\n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7\n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vl %%v16,128(%%r1,%[x])\n\t" | |||||
| "vl %%v17,144(%%r1,%[x])\n\t" | |||||
| "vl %%v18,160(%%r1,%[x])\n\t" | |||||
| "vl %%v19,176(%%r1,%[x])\n\t" | |||||
| "vl %%v20,192(%%r1,%[x])\n\t" | |||||
| "vl %%v21,208(%%r1,%[x])\n\t" | |||||
| "vl %%v22,224(%%r1,%[x])\n\t" | |||||
| "vl %%v23,240(%%r1,%[x])\n\t" | |||||
| "vfchesb %%v5,%%v16,%%v17\n\t" | |||||
| "vfchesb %%v6,%%v18,%%v19\n\t" | |||||
| "vfchesb %%v7,%%v20,%%v21\n\t" | |||||
| "vfchesb %%v8,%%v22,%%v23\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5\n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6\n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6\n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v7\n\t" | |||||
| "vsel %%v7,%%v28,%%v29,%%v7\n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v8\n\t" | |||||
| "vsel %%v8,%%v30,%%v31,%%v8\n\t" | |||||
| "vfchesb %%v20,%%v16,%%v17\n\t" | |||||
| "vfchesb %%v21,%%v18,%%v19\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v20\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21\n\t" | |||||
| "vsel %%v6,%%v7,%%v8,%%v21\n\t" | |||||
| "vfchesb %%v18,%%v16,%%v17\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18\n\t" | |||||
| "vsegf %%v6,%%v5\n\t" | |||||
| "vesrlg %%v5,%%v5,32\n\t" | |||||
| "vag %%v5,%%v5,%%v4\n\t" | |||||
| "vag %%v6,%%v6,%%v4\n\t" | |||||
| "vfchesb %%v7,%%v0,%%v16\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7\n\t" | |||||
| "vsegf %%v8,%%v7\n\t" | |||||
| "vesrlg %%v7,%%v7,32\n\t" | |||||
| "vsegf %%v7,%%v7\n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7\n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "veslg %%v3,%%v0,32\n\t" | |||||
| "vfchsb %%v4,%%v0,%%v3\n\t" | |||||
| "vchlg %%v5,%%v2,%%v1\n\t" | |||||
| "vfcesb %%v6,%%v0,%%v3\n\t" | |||||
| "vn %%v5,%%v5,%%v6\n\t" | |||||
| "vo %%v4,%%v4,%%v5\n\t" | |||||
| "vsel %%v0,%%v0,%%v3,%%v4\n\t" | |||||
| "vesrlg %%v4,%%v4,32\n\t" | |||||
| "vsegf %%v4,%%v4\n\t" | |||||
| "vsel %%v1,%%v1,%%v2,%%v4\n\t" | |||||
| "vrepf %%v2,%%v0,2\n\t" | |||||
| "vrepg %%v3,%%v1,1\n\t" | |||||
| "wfcsb %%v2,%%v0\n\t" | |||||
| "jne 1f\n\t" | |||||
| "vstef %%v0,%[max],0\n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3\n\t" | |||||
| "vlgvg %[imax],%%v0,0\n\t" | |||||
| "j 2f\n\t" | |||||
| "1:\n\t" | |||||
| "wfchsb %%v4,%%v2,%%v0\n\t" | |||||
| "vesrlg %%v4,%%v4,32\n\t" | |||||
| "vsegf %%v4,%%v4\n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4\n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4\n\t" | |||||
| "ste %%f0,%[max]\n\t" | |||||
| "vlgvg %[imax],%%v1,0\n\t" | |||||
| "2:\n\t" | |||||
| "nop" | |||||
| : [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", | |||||
| "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |||||
| "v27", "v28", "v29", "v30", "v31"); | |||||
| "vl %%v16,0(%%r1,%3) \n\t" | |||||
| "vl %%v17,16(%%r1,%3) \n\t" | |||||
| "vl %%v18,32(%%r1,%3) \n\t" | |||||
| "vl %%v19,48(%%r1,%3) \n\t" | |||||
| "vl %%v20,64(%%r1,%3) \n\t" | |||||
| "vl %%v21,80(%%r1,%3) \n\t" | |||||
| "vl %%v22,96(%%r1,%3) \n\t" | |||||
| "vl %%v23,112(%%r1,%3) \n\t" | |||||
| "vfchesb %%v5,%%v16,%%v17 \n\t" | |||||
| "vfchesb %%v6,%%v18,%%v19 \n\t" | |||||
| "vfchesb %%v7,%%v20,%%v21 \n\t" | |||||
| "vfchesb %%v8,%%v22,%%v23 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5 \n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6 \n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6 \n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v7 \n\t" | |||||
| "vsel %%v7,%%v28,%%v29,%%v7 \n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v8 \n\t" | |||||
| "vsel %%v8,%%v30,%%v31,%%v8 \n\t" | |||||
| "vfchesb %%v20,%%v16,%%v17 \n\t" | |||||
| "vfchesb %%v21,%%v18,%%v19 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v20 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21 \n\t" | |||||
| "vsel %%v6,%%v7,%%v8,%%v21 \n\t" | |||||
| "vfchesb %%v18,%%v16,%%v17 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18 \n\t" | |||||
| "vsegf %%v6,%%v5 \n\t" | |||||
| "vesrlg %%v5,%%v5,32 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vag %%v6,%%v6,%%v4 \n\t" | |||||
| "vfchesb %%v7,%%v0,%%v16 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7 \n\t" | |||||
| "vsegf %%v8,%%v7 \n\t" | |||||
| "vesrlg %%v7,%%v7,32 \n\t" | |||||
| "vsegf %%v7,%%v7 \n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7 \n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vl %%v16,128(%%r1,%3) \n\t" | |||||
| "vl %%v17,144(%%r1,%3) \n\t" | |||||
| "vl %%v18,160(%%r1,%3) \n\t" | |||||
| "vl %%v19,176(%%r1,%3) \n\t" | |||||
| "vl %%v20,192(%%r1,%3) \n\t" | |||||
| "vl %%v21,208(%%r1,%3) \n\t" | |||||
| "vl %%v22,224(%%r1,%3) \n\t" | |||||
| "vl %%v23,240(%%r1,%3) \n\t" | |||||
| "vfchesb %%v5,%%v16,%%v17 \n\t" | |||||
| "vfchesb %%v6,%%v18,%%v19 \n\t" | |||||
| "vfchesb %%v7,%%v20,%%v21 \n\t" | |||||
| "vfchesb %%v8,%%v22,%%v23 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5 \n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6 \n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6 \n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v7 \n\t" | |||||
| "vsel %%v7,%%v28,%%v29,%%v7 \n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v8 \n\t" | |||||
| "vsel %%v8,%%v30,%%v31,%%v8 \n\t" | |||||
| "vfchesb %%v20,%%v16,%%v17 \n\t" | |||||
| "vfchesb %%v21,%%v18,%%v19 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v20 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21 \n\t" | |||||
| "vsel %%v6,%%v7,%%v8,%%v21 \n\t" | |||||
| "vfchesb %%v18,%%v16,%%v17 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18 \n\t" | |||||
| "vsegf %%v6,%%v5 \n\t" | |||||
| "vesrlg %%v5,%%v5,32 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vag %%v6,%%v6,%%v4 \n\t" | |||||
| "vfchesb %%v7,%%v0,%%v16 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7 \n\t" | |||||
| "vsegf %%v8,%%v7 \n\t" | |||||
| "vesrlg %%v7,%%v7,32 \n\t" | |||||
| "vsegf %%v7,%%v7 \n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7 \n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "veslg %%v3,%%v0,32 \n\t" | |||||
| "vfchsb %%v4,%%v0,%%v3 \n\t" | |||||
| "vchlg %%v5,%%v2,%%v1 \n\t" | |||||
| "vfcesb %%v6,%%v0,%%v3 \n\t" | |||||
| "vn %%v5,%%v5,%%v6 \n\t" | |||||
| "vo %%v4,%%v4,%%v5 \n\t" | |||||
| "vsel %%v0,%%v0,%%v3,%%v4 \n\t" | |||||
| "vesrlg %%v4,%%v4,32 \n\t" | |||||
| "vsegf %%v4,%%v4 \n\t" | |||||
| "vsel %%v1,%%v1,%%v2,%%v4 \n\t" | |||||
| "vrepf %%v2,%%v0,2 \n\t" | |||||
| "vrepg %%v3,%%v1,1 \n\t" | |||||
| "wfcsb %%v2,%%v0 \n\t" | |||||
| "jne 1f \n\t" | |||||
| "vstef %%v0,%1,0 \n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3 \n\t" | |||||
| "vlgvg %0,%%v0,0 \n\t" | |||||
| "j 2f \n\t" | |||||
| "1: \n\t" | |||||
| "wfchsb %%v4,%%v2,%%v0 \n\t" | |||||
| "vesrlg %%v4,%%v4,32 \n\t" | |||||
| "vsegf %%v4,%%v4 \n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4 \n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4 \n\t" | |||||
| "ste %%f0,%1 \n\t" | |||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "2: \n\t" | |||||
| "nop " | |||||
| :"=r"(imax),"=m"(*max) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return imax; | |||||
| return imax; | |||||
| } | } | ||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG max = 0; | |||||
| if (n <= 0 || inc_x <= 0) return (max); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG max = 0; | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (max); | |||||
| max = ismax_kernel_64(n1, x, &maxf); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| maxf = x[0]; | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (x[i] > maxf) { | |||||
| max = i; | |||||
| maxf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (max + 1); | |||||
| max = ismax_kernel_64(n1, x, &maxf); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| maxf = x[0]; | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (x[i] > maxf) { | |||||
| max = i; | |||||
| maxf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (max + 1); | |||||
| max = 0; | |||||
| maxf = x[0]; | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| max = 0; | |||||
| maxf = x[0]; | |||||
| if (x[i] > maxf) { | |||||
| max = j; | |||||
| maxf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] > maxf) { | |||||
| max = j + 1; | |||||
| maxf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] > maxf) { | |||||
| max = j + 2; | |||||
| maxf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] > maxf) { | |||||
| max = j + 3; | |||||
| maxf = x[i + 3 * inc_x]; | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| if (x[i] > maxf) { | |||||
| max = j; | |||||
| maxf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] > maxf) { | |||||
| max = j + 1; | |||||
| maxf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] > maxf) { | |||||
| max = j + 2; | |||||
| maxf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] > maxf) { | |||||
| max = j + 3; | |||||
| maxf = x[i + 3 * inc_x]; | |||||
| } | |||||
| j += 4; | |||||
| i += inc_x * 4; | |||||
| } | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (x[i] > maxf) { | |||||
| max = j; | |||||
| maxf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (max + 1); | |||||
| while (j < n) { | |||||
| if (x[i] > maxf) { | |||||
| max = j; | |||||
| maxf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (max + 1); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,259 +27,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) | |||||
| { | |||||
| BLASLONG imin; | |||||
| static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) { | |||||
| BLASLONG imin; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%3) \n\t" | |||||
| "vleig %%v1,0,0 \n\t" | |||||
| "vleig %%v1,2,1 \n\t" | |||||
| "vleig %%v2,1,0 \n\t" | |||||
| "vleig %%v2,3,1 \n\t" | |||||
| "vrepig %%v3,32 \n\t" | |||||
| "vzero %%v4 \n\t" | |||||
| "vleif %%v24,0,0 \n\t" | |||||
| "vleif %%v24,1,1 \n\t" | |||||
| "vleif %%v24,2,2 \n\t" | |||||
| "vleif %%v24,3,3 \n\t" | |||||
| "vleif %%v25,4,0 \n\t" | |||||
| "vleif %%v25,5,1 \n\t" | |||||
| "vleif %%v25,6,2 \n\t" | |||||
| "vleif %%v25,7,3 \n\t" | |||||
| "vleif %%v26,8,0 \n\t" | |||||
| "vleif %%v26,9,1 \n\t" | |||||
| "vleif %%v26,10,2 \n\t" | |||||
| "vleif %%v26,11,3 \n\t" | |||||
| "vleif %%v27,12,0 \n\t" | |||||
| "vleif %%v27,13,1 \n\t" | |||||
| "vleif %%v27,14,2 \n\t" | |||||
| "vleif %%v27,15,3 \n\t" | |||||
| "vleif %%v28,16,0 \n\t" | |||||
| "vleif %%v28,17,1 \n\t" | |||||
| "vleif %%v28,18,2 \n\t" | |||||
| "vleif %%v28,19,3 \n\t" | |||||
| "vleif %%v29,20,0 \n\t" | |||||
| "vleif %%v29,21,1 \n\t" | |||||
| "vleif %%v29,22,2 \n\t" | |||||
| "vleif %%v29,23,3 \n\t" | |||||
| "vleif %%v30,24,0 \n\t" | |||||
| "vleif %%v30,25,1 \n\t" | |||||
| "vleif %%v30,26,2 \n\t" | |||||
| "vleif %%v30,27,3 \n\t" | |||||
| "vleif %%v31,28,0 \n\t" | |||||
| "vleif %%v31,29,1 \n\t" | |||||
| "vleif %%v31,30,2 \n\t" | |||||
| "vleif %%v31,31,3 \n\t" | |||||
| "srlg %%r0,%2,6 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%3) \n\t" | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "vleig %%v1,0,0\n\t" | |||||
| "vleig %%v1,2,1\n\t" | |||||
| "vleig %%v2,1,0\n\t" | |||||
| "vleig %%v2,3,1\n\t" | |||||
| "vrepig %%v3,32\n\t" | |||||
| "vzero %%v4\n\t" | |||||
| "vleif %%v24,0,0\n\t" | |||||
| "vleif %%v24,1,1\n\t" | |||||
| "vleif %%v24,2,2\n\t" | |||||
| "vleif %%v24,3,3\n\t" | |||||
| "vleif %%v25,4,0\n\t" | |||||
| "vleif %%v25,5,1\n\t" | |||||
| "vleif %%v25,6,2\n\t" | |||||
| "vleif %%v25,7,3\n\t" | |||||
| "vleif %%v26,8,0\n\t" | |||||
| "vleif %%v26,9,1\n\t" | |||||
| "vleif %%v26,10,2\n\t" | |||||
| "vleif %%v26,11,3\n\t" | |||||
| "vleif %%v27,12,0\n\t" | |||||
| "vleif %%v27,13,1\n\t" | |||||
| "vleif %%v27,14,2\n\t" | |||||
| "vleif %%v27,15,3\n\t" | |||||
| "vleif %%v28,16,0\n\t" | |||||
| "vleif %%v28,17,1\n\t" | |||||
| "vleif %%v28,18,2\n\t" | |||||
| "vleif %%v28,19,3\n\t" | |||||
| "vleif %%v29,20,0\n\t" | |||||
| "vleif %%v29,21,1\n\t" | |||||
| "vleif %%v29,22,2\n\t" | |||||
| "vleif %%v29,23,3\n\t" | |||||
| "vleif %%v30,24,0\n\t" | |||||
| "vleif %%v30,25,1\n\t" | |||||
| "vleif %%v30,26,2\n\t" | |||||
| "vleif %%v30,27,3\n\t" | |||||
| "vleif %%v31,28,0\n\t" | |||||
| "vleif %%v31,29,1\n\t" | |||||
| "vleif %%v31,30,2\n\t" | |||||
| "vleif %%v31,31,3\n\t" | |||||
| "srlg %[n],%[n],6\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vfchesb %%v5,%%v17,%%v16\n\t" | |||||
| "vfchesb %%v6,%%v19,%%v18\n\t" | |||||
| "vfchesb %%v7,%%v21,%%v20\n\t" | |||||
| "vfchesb %%v8,%%v23,%%v22\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5\n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6\n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6\n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v7\n\t" | |||||
| "vsel %%v7,%%v28,%%v29,%%v7\n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v8\n\t" | |||||
| "vsel %%v8,%%v30,%%v31,%%v8\n\t" | |||||
| "vfchesb %%v20,%%v17,%%v16\n\t" | |||||
| "vfchesb %%v21,%%v19,%%v18\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v20\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21\n\t" | |||||
| "vsel %%v6,%%v7,%%v8,%%v21\n\t" | |||||
| "vfchesb %%v18,%%v17,%%v16\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18\n\t" | |||||
| "vsegf %%v6,%%v5\n\t" | |||||
| "vesrlg %%v5,%%v5,32\n\t" | |||||
| "vag %%v5,%%v5,%%v4\n\t" | |||||
| "vag %%v6,%%v6,%%v4\n\t" | |||||
| "vfchesb %%v7,%%v16,%%v0\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7\n\t" | |||||
| "vsegf %%v8,%%v7\n\t" | |||||
| "vesrlg %%v7,%%v7,32\n\t" | |||||
| "vsegf %%v7,%%v7\n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7\n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vl %%v16,128(%%r1,%[x])\n\t" | |||||
| "vl %%v17,144(%%r1,%[x])\n\t" | |||||
| "vl %%v18,160(%%r1,%[x])\n\t" | |||||
| "vl %%v19,176(%%r1,%[x])\n\t" | |||||
| "vl %%v20,192(%%r1,%[x])\n\t" | |||||
| "vl %%v21,208(%%r1,%[x])\n\t" | |||||
| "vl %%v22,224(%%r1,%[x])\n\t" | |||||
| "vl %%v23,240(%%r1,%[x])\n\t" | |||||
| "vfchesb %%v5,%%v17,%%v16\n\t" | |||||
| "vfchesb %%v6,%%v19,%%v18\n\t" | |||||
| "vfchesb %%v7,%%v21,%%v20\n\t" | |||||
| "vfchesb %%v8,%%v23,%%v22\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5\n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6\n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6\n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v7\n\t" | |||||
| "vsel %%v7,%%v28,%%v29,%%v7\n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v8\n\t" | |||||
| "vsel %%v8,%%v30,%%v31,%%v8\n\t" | |||||
| "vfchesb %%v20,%%v17,%%v16\n\t" | |||||
| "vfchesb %%v21,%%v19,%%v18\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v20\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21\n\t" | |||||
| "vsel %%v6,%%v7,%%v8,%%v21\n\t" | |||||
| "vfchesb %%v18,%%v17,%%v16\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18\n\t" | |||||
| "vsegf %%v6,%%v5\n\t" | |||||
| "vesrlg %%v5,%%v5,32\n\t" | |||||
| "vag %%v5,%%v5,%%v4\n\t" | |||||
| "vag %%v6,%%v6,%%v4\n\t" | |||||
| "vfchesb %%v7,%%v16,%%v0\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7\n\t" | |||||
| "vsegf %%v8,%%v7\n\t" | |||||
| "vesrlg %%v7,%%v7,32\n\t" | |||||
| "vsegf %%v7,%%v7\n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7\n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "veslg %%v3,%%v0,32\n\t" | |||||
| "vfchsb %%v4,%%v3,%%v0\n\t" | |||||
| "vchlg %%v5,%%v2,%%v1\n\t" | |||||
| "vfcesb %%v6,%%v0,%%v3\n\t" | |||||
| "vn %%v5,%%v5,%%v6\n\t" | |||||
| "vo %%v4,%%v4,%%v5\n\t" | |||||
| "vsel %%v0,%%v0,%%v3,%%v4\n\t" | |||||
| "vesrlg %%v4,%%v4,32\n\t" | |||||
| "vsegf %%v4,%%v4\n\t" | |||||
| "vsel %%v1,%%v1,%%v2,%%v4\n\t" | |||||
| "vrepf %%v2,%%v0,2\n\t" | |||||
| "vrepg %%v3,%%v1,1\n\t" | |||||
| "wfcsb %%v2,%%v0\n\t" | |||||
| "jne 1f\n\t" | |||||
| "vstef %%v0,%[min],0\n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3\n\t" | |||||
| "vlgvg %[imin],%%v0,0\n\t" | |||||
| "j 2f\n\t" | |||||
| "1:\n\t" | |||||
| "wfchsb %%v4,%%v0,%%v2\n\t" | |||||
| "vesrlg %%v4,%%v4,32\n\t" | |||||
| "vsegf %%v4,%%v4\n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4\n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4\n\t" | |||||
| "ste %%f0,%[min]\n\t" | |||||
| "vlgvg %[imin],%%v1,0\n\t" | |||||
| "2:\n\t" | |||||
| "nop" | |||||
| : [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", | |||||
| "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |||||
| "v27", "v28", "v29", "v30", "v31"); | |||||
| "vl %%v16,0(%%r1,%3) \n\t" | |||||
| "vl %%v17,16(%%r1,%3) \n\t" | |||||
| "vl %%v18,32(%%r1,%3) \n\t" | |||||
| "vl %%v19,48(%%r1,%3) \n\t" | |||||
| "vl %%v20,64(%%r1,%3) \n\t" | |||||
| "vl %%v21,80(%%r1,%3) \n\t" | |||||
| "vl %%v22,96(%%r1,%3) \n\t" | |||||
| "vl %%v23,112(%%r1,%3) \n\t" | |||||
| "vfchesb %%v5,%%v17,%%v16 \n\t" | |||||
| "vfchesb %%v6,%%v19,%%v18 \n\t" | |||||
| "vfchesb %%v7,%%v21,%%v20 \n\t" | |||||
| "vfchesb %%v8,%%v23,%%v22 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5 \n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6 \n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6 \n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v7 \n\t" | |||||
| "vsel %%v7,%%v28,%%v29,%%v7 \n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v8 \n\t" | |||||
| "vsel %%v8,%%v30,%%v31,%%v8 \n\t" | |||||
| "vfchesb %%v20,%%v17,%%v16 \n\t" | |||||
| "vfchesb %%v21,%%v19,%%v18 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v20 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21 \n\t" | |||||
| "vsel %%v6,%%v7,%%v8,%%v21 \n\t" | |||||
| "vfchesb %%v18,%%v17,%%v16 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18 \n\t" | |||||
| "vsegf %%v6,%%v5 \n\t" | |||||
| "vesrlg %%v5,%%v5,32 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vag %%v6,%%v6,%%v4 \n\t" | |||||
| "vfchesb %%v7,%%v16,%%v0 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7 \n\t" | |||||
| "vsegf %%v8,%%v7 \n\t" | |||||
| "vesrlg %%v7,%%v7,32 \n\t" | |||||
| "vsegf %%v7,%%v7 \n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7 \n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vl %%v16,128(%%r1,%3) \n\t" | |||||
| "vl %%v17,144(%%r1,%3) \n\t" | |||||
| "vl %%v18,160(%%r1,%3) \n\t" | |||||
| "vl %%v19,176(%%r1,%3) \n\t" | |||||
| "vl %%v20,192(%%r1,%3) \n\t" | |||||
| "vl %%v21,208(%%r1,%3) \n\t" | |||||
| "vl %%v22,224(%%r1,%3) \n\t" | |||||
| "vl %%v23,240(%%r1,%3) \n\t" | |||||
| "vfchesb %%v5,%%v17,%%v16 \n\t" | |||||
| "vfchesb %%v6,%%v19,%%v18 \n\t" | |||||
| "vfchesb %%v7,%%v21,%%v20 \n\t" | |||||
| "vfchesb %%v8,%%v23,%%v22 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v5 \n\t" | |||||
| "vsel %%v5,%%v24,%%v25,%%v5 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v6 \n\t" | |||||
| "vsel %%v6,%%v26,%%v27,%%v6 \n\t" | |||||
| "vsel %%v18,%%v20,%%v21,%%v7 \n\t" | |||||
| "vsel %%v7,%%v28,%%v29,%%v7 \n\t" | |||||
| "vsel %%v19,%%v22,%%v23,%%v8 \n\t" | |||||
| "vsel %%v8,%%v30,%%v31,%%v8 \n\t" | |||||
| "vfchesb %%v20,%%v17,%%v16 \n\t" | |||||
| "vfchesb %%v21,%%v19,%%v18 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v20 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v20 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v21 \n\t" | |||||
| "vsel %%v6,%%v7,%%v8,%%v21 \n\t" | |||||
| "vfchesb %%v18,%%v17,%%v16 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v5,%%v5,%%v6,%%v18 \n\t" | |||||
| "vsegf %%v6,%%v5 \n\t" | |||||
| "vesrlg %%v5,%%v5,32 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vag %%v6,%%v6,%%v4 \n\t" | |||||
| "vfchesb %%v7,%%v16,%%v0 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v7 \n\t" | |||||
| "vsegf %%v8,%%v7 \n\t" | |||||
| "vesrlg %%v7,%%v7,32 \n\t" | |||||
| "vsegf %%v7,%%v7 \n\t" | |||||
| "vsel %%v1,%%v1,%%v5,%%v7 \n\t" | |||||
| "vsel %%v2,%%v2,%%v6,%%v8 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "veslg %%v3,%%v0,32 \n\t" | |||||
| "vfchsb %%v4,%%v3,%%v0 \n\t" | |||||
| "vchlg %%v5,%%v2,%%v1 \n\t" | |||||
| "vfcesb %%v6,%%v0,%%v3 \n\t" | |||||
| "vn %%v5,%%v5,%%v6 \n\t" | |||||
| "vo %%v4,%%v4,%%v5 \n\t" | |||||
| "vsel %%v0,%%v0,%%v3,%%v4 \n\t" | |||||
| "vesrlg %%v4,%%v4,32 \n\t" | |||||
| "vsegf %%v4,%%v4 \n\t" | |||||
| "vsel %%v1,%%v1,%%v2,%%v4 \n\t" | |||||
| "vrepf %%v2,%%v0,2 \n\t" | |||||
| "vrepg %%v3,%%v1,1 \n\t" | |||||
| "wfcsb %%v2,%%v0 \n\t" | |||||
| "jne 1f \n\t" | |||||
| "vstef %%v0,%1,0 \n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3 \n\t" | |||||
| "vlgvg %0,%%v0,0 \n\t" | |||||
| "j 2f \n\t" | |||||
| "1: \n\t" | |||||
| "wfchsb %%v4,%%v0,%%v2 \n\t" | |||||
| "vesrlg %%v4,%%v4,32 \n\t" | |||||
| "vsegf %%v4,%%v4 \n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4 \n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4 \n\t" | |||||
| "ste %%f0,%1 \n\t" | |||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "2: \n\t" | |||||
| "nop " | |||||
| :"=r"(imin),"=m"(*min) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return imin; | |||||
| return imin; | |||||
| } | } | ||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG min = 0; | |||||
| if (n <= 0 || inc_x <= 0) return (min); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG min = 0; | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (min); | |||||
| min = ismin_kernel_64(n1, x, &minf); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| minf = x[0]; | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (x[i] < minf) { | |||||
| min = i; | |||||
| minf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (min + 1); | |||||
| min = ismin_kernel_64(n1, x, &minf); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| minf = x[0]; | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (x[i] < minf) { | |||||
| min = i; | |||||
| minf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (min + 1); | |||||
| min = 0; | |||||
| minf = x[0]; | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| min = 0; | |||||
| minf = x[0]; | |||||
| if (x[i] < minf) { | |||||
| min = j; | |||||
| minf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] < minf) { | |||||
| min = j + 1; | |||||
| minf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] < minf) { | |||||
| min = j + 2; | |||||
| minf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] < minf) { | |||||
| min = j + 3; | |||||
| minf = x[i + 3 * inc_x]; | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| if (x[i] < minf) { | |||||
| min = j; | |||||
| minf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] < minf) { | |||||
| min = j + 1; | |||||
| minf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] < minf) { | |||||
| min = j + 2; | |||||
| minf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] < minf) { | |||||
| min = j + 3; | |||||
| minf = x[i + 3 * inc_x]; | |||||
| } | |||||
| j += 4; | |||||
| i += inc_x * 4; | |||||
| } | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (x[i] < minf) { | |||||
| min = j; | |||||
| minf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (min + 1); | |||||
| while (j < n) { | |||||
| if (x[i] < minf) { | |||||
| min = j; | |||||
| minf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (min + 1); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| Copyright (c) 2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,220 +27,219 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| #define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) | |||||
| static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) | |||||
| { | |||||
| BLASLONG iamax; | |||||
| __asm__ volatile ( | |||||
| "vleg %%v0,0(%3),0 \n\t" | |||||
| "vleg %%v1,8(%3),0 \n\t" | |||||
| "vleg %%v0,16(%3),1 \n\t" | |||||
| "vleg %%v1,24(%3),1 \n\t" | |||||
| "vflpdb %%v0,%%v0 \n\t" | |||||
| "vflpdb %%v1,%%v1 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v1 \n\t" | |||||
| "vleig %%v1,0,0 \n\t" | |||||
| "vleig %%v1,1,1 \n\t" | |||||
| "vrepig %%v2,8 \n\t" | |||||
| "vzero %%v3 \n\t" | |||||
| "vleig %%v24,0,0 \n\t" | |||||
| "vleig %%v24,1,1 \n\t" | |||||
| "vleig %%v25,2,0 \n\t" | |||||
| "vleig %%v25,3,1 \n\t" | |||||
| "vleig %%v26,4,0 \n\t" | |||||
| "vleig %%v26,5,1 \n\t" | |||||
| "vleig %%v27,6,0 \n\t" | |||||
| "vleig %%v27,7,1 \n\t" | |||||
| "srlg %%r0,%2,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%3) \n\t" | |||||
| "vleg %%v16,0(%%r1,%3),0 \n\t" | |||||
| "vleg %%v17,8(%%r1,%3),0 \n\t" | |||||
| "vleg %%v16,16(%%r1,%3),1 \n\t" | |||||
| "vleg %%v17,24(%%r1,%3),1 \n\t" | |||||
| "vleg %%v18,32(%%r1,%3),0 \n\t" | |||||
| "vleg %%v19,40(%%r1,%3),0 \n\t" | |||||
| "vleg %%v18,48(%%r1,%3),1 \n\t" | |||||
| "vleg %%v19,56(%%r1,%3),1 \n\t" | |||||
| "vleg %%v20,64(%%r1,%3),0 \n\t" | |||||
| "vleg %%v21,72(%%r1,%3),0 \n\t" | |||||
| "vleg %%v20,80(%%r1,%3),1 \n\t" | |||||
| "vleg %%v21,88(%%r1,%3),1 \n\t" | |||||
| "vleg %%v22,96(%%r1,%3),0 \n\t" | |||||
| "vleg %%v23,104(%%r1,%3),0 \n\t" | |||||
| "vleg %%v22,112(%%r1,%3),1 \n\t" | |||||
| "vleg %%v23,120(%%r1,%3),1 \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfadb %%v16,%%v16,%%v17 \n\t" | |||||
| "vfadb %%v17,%%v18,%%v19 \n\t" | |||||
| "vfadb %%v18,%%v20,%%v21 \n\t" | |||||
| "vfadb %%v19,%%v22,%%v23 \n\t" | |||||
| "vfchedb %%v4,%%v16,%%v17 \n\t" | |||||
| "vfchedb %%v5,%%v18,%%v19 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4 \n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5 \n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5 \n\t" | |||||
| "vfchedb %%v18,%%v16,%%v17 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vfchedb %%v5,%%v0,%%v16 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5 \n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5 \n\t" | |||||
| "vag %%v3,%%v3,%%v2 \n\t" | |||||
| "vleg %%v16,128(%%r1,%3),0 \n\t" | |||||
| "vleg %%v17,136(%%r1,%3),0 \n\t" | |||||
| "vleg %%v16,144(%%r1,%3),1 \n\t" | |||||
| "vleg %%v17,152(%%r1,%3),1 \n\t" | |||||
| "vleg %%v18,160(%%r1,%3),0 \n\t" | |||||
| "vleg %%v19,168(%%r1,%3),0 \n\t" | |||||
| "vleg %%v18,176(%%r1,%3),1 \n\t" | |||||
| "vleg %%v19,184(%%r1,%3),1 \n\t" | |||||
| "vleg %%v20,192(%%r1,%3),0 \n\t" | |||||
| "vleg %%v21,200(%%r1,%3),0 \n\t" | |||||
| "vleg %%v20,208(%%r1,%3),1 \n\t" | |||||
| "vleg %%v21,216(%%r1,%3),1 \n\t" | |||||
| "vleg %%v22,224(%%r1,%3),0 \n\t" | |||||
| "vleg %%v23,232(%%r1,%3),0 \n\t" | |||||
| "vleg %%v22,240(%%r1,%3),1 \n\t" | |||||
| "vleg %%v23,248(%%r1,%3),1 \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfadb %%v16,%%v16,%%v17 \n\t" | |||||
| "vfadb %%v17,%%v18,%%v19 \n\t" | |||||
| "vfadb %%v18,%%v20,%%v21 \n\t" | |||||
| "vfadb %%v19,%%v22,%%v23 \n\t" | |||||
| "vfchedb %%v4,%%v16,%%v17 \n\t" | |||||
| "vfchedb %%v5,%%v18,%%v19 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4 \n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5 \n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5 \n\t" | |||||
| "vfchedb %%v18,%%v16,%%v17 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vfchedb %%v5,%%v0,%%v16 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5 \n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5 \n\t" | |||||
| "vag %%v3,%%v3,%%v2 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v2,%%v0,1 \n\t" | |||||
| "vrepg %%v3,%%v1,1 \n\t" | |||||
| "wfcdb %%v2,%%v0 \n\t" | |||||
| "jne 1f \n\t" | |||||
| "vsteg %%v0,%1,0 \n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3 \n\t" | |||||
| "vlgvg %0,%%v0,0 \n\t" | |||||
| "j 2f \n\t" | |||||
| "1: \n\t" | |||||
| "wfchdb %%v4,%%v2,%%v0 \n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4 \n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4 \n\t" | |||||
| "std %%f0,%1 \n\t" | |||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "2: \n\t" | |||||
| "nop " | |||||
| :"=r"(iamax),"=m"(*amax) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" | |||||
| ); | |||||
| return iamax; | |||||
| #define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) | |||||
| static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) { | |||||
| BLASLONG iamax; | |||||
| __asm__("vleg %%v0,0(%[x]),0\n\t" | |||||
| "vleg %%v1,8(%[x]),0\n\t" | |||||
| "vleg %%v0,16(%[x]),1\n\t" | |||||
| "vleg %%v1,24(%[x]),1\n\t" | |||||
| "vflpdb %%v0,%%v0\n\t" | |||||
| "vflpdb %%v1,%%v1\n\t" | |||||
| "vfadb %%v0,%%v0,%%v1\n\t" | |||||
| "vleig %%v1,0,0\n\t" | |||||
| "vleig %%v1,1,1\n\t" | |||||
| "vrepig %%v2,8\n\t" | |||||
| "vzero %%v3\n\t" | |||||
| "vleig %%v24,0,0\n\t" | |||||
| "vleig %%v24,1,1\n\t" | |||||
| "vleig %%v25,2,0\n\t" | |||||
| "vleig %%v25,3,1\n\t" | |||||
| "vleig %%v26,4,0\n\t" | |||||
| "vleig %%v26,5,1\n\t" | |||||
| "vleig %%v27,6,0\n\t" | |||||
| "vleig %%v27,7,1\n\t" | |||||
| "srlg %[n],%[n],4\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vleg %%v16,0(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v17,8(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v16,16(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v17,24(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v18,32(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v19,40(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v18,48(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v19,56(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v20,64(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v21,72(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v20,80(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v21,88(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v22,96(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v23,104(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v22,112(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v23,120(%%r1,%[x]),1\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfadb %%v16,%%v16,%%v17\n\t" | |||||
| "vfadb %%v17,%%v18,%%v19\n\t" | |||||
| "vfadb %%v18,%%v20,%%v21\n\t" | |||||
| "vfadb %%v19,%%v22,%%v23\n\t" | |||||
| "vfchedb %%v4,%%v16,%%v17\n\t" | |||||
| "vfchedb %%v5,%%v18,%%v19\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4\n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5\n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5\n\t" | |||||
| "vfchedb %%v18,%%v16,%%v17\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vfchedb %%v5,%%v0,%%v16\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5\n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5\n\t" | |||||
| "vag %%v3,%%v3,%%v2\n\t" | |||||
| "vleg %%v16,128(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v17,136(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v16,144(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v17,152(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v18,160(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v19,168(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v18,176(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v19,184(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v20,192(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v21,200(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v20,208(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v21,216(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v22,224(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v23,232(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v22,240(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v23,248(%%r1,%[x]),1\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfadb %%v16,%%v16,%%v17\n\t" | |||||
| "vfadb %%v17,%%v18,%%v19\n\t" | |||||
| "vfadb %%v18,%%v20,%%v21\n\t" | |||||
| "vfadb %%v19,%%v22,%%v23\n\t" | |||||
| "vfchedb %%v4,%%v16,%%v17\n\t" | |||||
| "vfchedb %%v5,%%v18,%%v19\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4\n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5\n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5\n\t" | |||||
| "vfchedb %%v18,%%v16,%%v17\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vfchedb %%v5,%%v0,%%v16\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5\n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5\n\t" | |||||
| "vag %%v3,%%v3,%%v2\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "vrepg %%v2,%%v0,1\n\t" | |||||
| "vrepg %%v3,%%v1,1\n\t" | |||||
| "wfcdb %%v2,%%v0\n\t" | |||||
| "jne 1f\n\t" | |||||
| "vsteg %%v0,%[amax],0\n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3\n\t" | |||||
| "vlgvg %[iamax],%%v0,0\n\t" | |||||
| "j 2f\n\t" | |||||
| "1:\n\t" | |||||
| "wfchdb %%v4,%%v2,%%v0\n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4\n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4\n\t" | |||||
| "std %%f0,%[amax]\n\t" | |||||
| "vlgvg %[iamax],%%v1,0\n\t" | |||||
| "2:\n\t" | |||||
| "nop" | |||||
| : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", | |||||
| "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); | |||||
| return iamax; | |||||
| } | } | ||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT maxf = 0; | |||||
| BLASLONG max = 0; | |||||
| BLASLONG inc_x2; | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT maxf = 0; | |||||
| BLASLONG max = 0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (max); | |||||
| if (inc_x == 1) { | |||||
| if (n <= 0 || inc_x <= 0) return(max); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| max = izamax_kernel_16(n1, x, &maxf); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| } else { | |||||
| maxf = CABS1(x, 0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| max = izamax_kernel_16(n1, x, &maxf); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) > maxf) { | |||||
| max = i; | |||||
| maxf = CABS1(x, ix); | |||||
| } | } | ||||
| else | |||||
| { | |||||
| maxf = CABS1(x,0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) > maxf ) | |||||
| { | |||||
| max = i; | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| ix += 2; | |||||
| i++; | |||||
| } | } | ||||
| return (max + 1); | |||||
| return (max + 1); | |||||
| } else { | |||||
| } else { | |||||
| max = 0; | max = 0; | ||||
| maxf = CABS1(x,0); | |||||
| maxf = CABS1(x, 0); | |||||
| inc_x2 = 2 * inc_x; | inc_x2 = 2 * inc_x; | ||||
| ix += inc_x2; | |||||
| i++; | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) > maxf ) | |||||
| { | |||||
| max = i; | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| if (CABS1(x, ix) > maxf) { | |||||
| max = i; | |||||
| maxf = CABS1(x, ix); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2) > maxf) { | |||||
| max = i + 1; | |||||
| maxf = CABS1(x, ix + inc_x2); | |||||
| } | |||||
| if (CABS1(x, ix + 2 * inc_x2) > maxf) { | |||||
| max = i + 2; | |||||
| maxf = CABS1(x, ix + 2 * inc_x2); | |||||
| } | |||||
| if (CABS1(x, ix + 3 * inc_x2) > maxf) { | |||||
| max = i + 3; | |||||
| maxf = CABS1(x, ix + 3 * inc_x2); | |||||
| } | |||||
| ix += inc_x2 * 4; | |||||
| i += 4; | |||||
| } | } | ||||
| return (max + 1); | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) > maxf) { | |||||
| max = i; | |||||
| maxf = CABS1(x, ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | } | ||||
| return (max + 1); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| Copyright (c) 2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,220 +27,219 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| #define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) | |||||
| static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) | |||||
| { | |||||
| BLASLONG iamin; | |||||
| __asm__ volatile ( | |||||
| "vleg %%v0,0(%3),0 \n\t" | |||||
| "vleg %%v1,8(%3),0 \n\t" | |||||
| "vleg %%v0,16(%3),1 \n\t" | |||||
| "vleg %%v1,24(%3),1 \n\t" | |||||
| "vflpdb %%v0,%%v0 \n\t" | |||||
| "vflpdb %%v1,%%v1 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v1 \n\t" | |||||
| "vleig %%v1,0,0 \n\t" | |||||
| "vleig %%v1,1,1 \n\t" | |||||
| "vrepig %%v2,8 \n\t" | |||||
| "vzero %%v3 \n\t" | |||||
| "vleig %%v24,0,0 \n\t" | |||||
| "vleig %%v24,1,1 \n\t" | |||||
| "vleig %%v25,2,0 \n\t" | |||||
| "vleig %%v25,3,1 \n\t" | |||||
| "vleig %%v26,4,0 \n\t" | |||||
| "vleig %%v26,5,1 \n\t" | |||||
| "vleig %%v27,6,0 \n\t" | |||||
| "vleig %%v27,7,1 \n\t" | |||||
| "srlg %%r0,%2,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%3) \n\t" | |||||
| "vleg %%v16,0(%%r1,%3),0 \n\t" | |||||
| "vleg %%v17,8(%%r1,%3),0 \n\t" | |||||
| "vleg %%v16,16(%%r1,%3),1 \n\t" | |||||
| "vleg %%v17,24(%%r1,%3),1 \n\t" | |||||
| "vleg %%v18,32(%%r1,%3),0 \n\t" | |||||
| "vleg %%v19,40(%%r1,%3),0 \n\t" | |||||
| "vleg %%v18,48(%%r1,%3),1 \n\t" | |||||
| "vleg %%v19,56(%%r1,%3),1 \n\t" | |||||
| "vleg %%v20,64(%%r1,%3),0 \n\t" | |||||
| "vleg %%v21,72(%%r1,%3),0 \n\t" | |||||
| "vleg %%v20,80(%%r1,%3),1 \n\t" | |||||
| "vleg %%v21,88(%%r1,%3),1 \n\t" | |||||
| "vleg %%v22,96(%%r1,%3),0 \n\t" | |||||
| "vleg %%v23,104(%%r1,%3),0 \n\t" | |||||
| "vleg %%v22,112(%%r1,%3),1 \n\t" | |||||
| "vleg %%v23,120(%%r1,%3),1 \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfadb %%v16,%%v16,%%v17 \n\t" | |||||
| "vfadb %%v17,%%v18,%%v19 \n\t" | |||||
| "vfadb %%v18,%%v20,%%v21 \n\t" | |||||
| "vfadb %%v19,%%v22,%%v23 \n\t" | |||||
| "vfchedb %%v4,%%v17,%%v16 \n\t" | |||||
| "vfchedb %%v5,%%v19,%%v18 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4 \n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5 \n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5 \n\t" | |||||
| "vfchedb %%v18,%%v17,%%v16 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vfchedb %%v5,%%v16,%%v0 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5 \n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5 \n\t" | |||||
| "vag %%v3,%%v3,%%v2 \n\t" | |||||
| "vleg %%v16,128(%%r1,%3),0 \n\t" | |||||
| "vleg %%v17,136(%%r1,%3),0 \n\t" | |||||
| "vleg %%v16,144(%%r1,%3),1 \n\t" | |||||
| "vleg %%v17,152(%%r1,%3),1 \n\t" | |||||
| "vleg %%v18,160(%%r1,%3),0 \n\t" | |||||
| "vleg %%v19,168(%%r1,%3),0 \n\t" | |||||
| "vleg %%v18,176(%%r1,%3),1 \n\t" | |||||
| "vleg %%v19,184(%%r1,%3),1 \n\t" | |||||
| "vleg %%v20,192(%%r1,%3),0 \n\t" | |||||
| "vleg %%v21,200(%%r1,%3),0 \n\t" | |||||
| "vleg %%v20,208(%%r1,%3),1 \n\t" | |||||
| "vleg %%v21,216(%%r1,%3),1 \n\t" | |||||
| "vleg %%v22,224(%%r1,%3),0 \n\t" | |||||
| "vleg %%v23,232(%%r1,%3),0 \n\t" | |||||
| "vleg %%v22,240(%%r1,%3),1 \n\t" | |||||
| "vleg %%v23,248(%%r1,%3),1 \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfadb %%v16,%%v16,%%v17 \n\t" | |||||
| "vfadb %%v17,%%v18,%%v19 \n\t" | |||||
| "vfadb %%v18,%%v20,%%v21 \n\t" | |||||
| "vfadb %%v19,%%v22,%%v23 \n\t" | |||||
| "vfchedb %%v4,%%v17,%%v16 \n\t" | |||||
| "vfchedb %%v5,%%v19,%%v18 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4 \n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4 \n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5 \n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5 \n\t" | |||||
| "vfchedb %%v18,%%v17,%%v16 \n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18 \n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18 \n\t" | |||||
| "vag %%v4,%%v4,%%v3 \n\t" | |||||
| "vfchedb %%v5,%%v16,%%v0 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5 \n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5 \n\t" | |||||
| "vag %%v3,%%v3,%%v2 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v2,%%v0,1 \n\t" | |||||
| "vrepg %%v3,%%v1,1 \n\t" | |||||
| "wfcdb %%v2,%%v0 \n\t" | |||||
| "jne 1f \n\t" | |||||
| "vsteg %%v0,%1,0 \n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3 \n\t" | |||||
| "vlgvg %0,%%v0,0 \n\t" | |||||
| "j 2f \n\t" | |||||
| "1: \n\t" | |||||
| "wfchdb %%v4,%%v0,%%v2 \n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4 \n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4 \n\t" | |||||
| "std %%f0,%1 \n\t" | |||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "2: \n\t" | |||||
| "nop " | |||||
| :"=r"(iamin),"=m"(*amin) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" | |||||
| ); | |||||
| return iamin; | |||||
| #define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) | |||||
| static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) { | |||||
| BLASLONG iamin; | |||||
| __asm__("vleg %%v0,0(%[x]),0\n\t" | |||||
| "vleg %%v1,8(%[x]),0\n\t" | |||||
| "vleg %%v0,16(%[x]),1\n\t" | |||||
| "vleg %%v1,24(%[x]),1\n\t" | |||||
| "vflpdb %%v0,%%v0\n\t" | |||||
| "vflpdb %%v1,%%v1\n\t" | |||||
| "vfadb %%v0,%%v0,%%v1\n\t" | |||||
| "vleig %%v1,0,0\n\t" | |||||
| "vleig %%v1,1,1\n\t" | |||||
| "vrepig %%v2,8\n\t" | |||||
| "vzero %%v3\n\t" | |||||
| "vleig %%v24,0,0\n\t" | |||||
| "vleig %%v24,1,1\n\t" | |||||
| "vleig %%v25,2,0\n\t" | |||||
| "vleig %%v25,3,1\n\t" | |||||
| "vleig %%v26,4,0\n\t" | |||||
| "vleig %%v26,5,1\n\t" | |||||
| "vleig %%v27,6,0\n\t" | |||||
| "vleig %%v27,7,1\n\t" | |||||
| "srlg %[n],%[n],4\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vleg %%v16,0(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v17,8(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v16,16(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v17,24(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v18,32(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v19,40(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v18,48(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v19,56(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v20,64(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v21,72(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v20,80(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v21,88(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v22,96(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v23,104(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v22,112(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v23,120(%%r1,%[x]),1\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfadb %%v16,%%v16,%%v17\n\t" | |||||
| "vfadb %%v17,%%v18,%%v19\n\t" | |||||
| "vfadb %%v18,%%v20,%%v21\n\t" | |||||
| "vfadb %%v19,%%v22,%%v23\n\t" | |||||
| "vfchedb %%v4,%%v17,%%v16\n\t" | |||||
| "vfchedb %%v5,%%v19,%%v18\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4\n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5\n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5\n\t" | |||||
| "vfchedb %%v18,%%v17,%%v16\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vfchedb %%v5,%%v16,%%v0\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5\n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5\n\t" | |||||
| "vag %%v3,%%v3,%%v2\n\t" | |||||
| "vleg %%v16,128(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v17,136(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v16,144(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v17,152(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v18,160(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v19,168(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v18,176(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v19,184(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v20,192(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v21,200(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v20,208(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v21,216(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v22,224(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v23,232(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v22,240(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v23,248(%%r1,%[x]),1\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfadb %%v16,%%v16,%%v17\n\t" | |||||
| "vfadb %%v17,%%v18,%%v19\n\t" | |||||
| "vfadb %%v18,%%v20,%%v21\n\t" | |||||
| "vfadb %%v19,%%v22,%%v23\n\t" | |||||
| "vfchedb %%v4,%%v17,%%v16\n\t" | |||||
| "vfchedb %%v5,%%v19,%%v18\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v4\n\t" | |||||
| "vsel %%v4,%%v24,%%v25,%%v4\n\t" | |||||
| "vsel %%v17,%%v18,%%v19,%%v5\n\t" | |||||
| "vsel %%v5,%%v26,%%v27,%%v5\n\t" | |||||
| "vfchedb %%v18,%%v17,%%v16\n\t" | |||||
| "vsel %%v16,%%v16,%%v17,%%v18\n\t" | |||||
| "vsel %%v4,%%v4,%%v5,%%v18\n\t" | |||||
| "vag %%v4,%%v4,%%v3\n\t" | |||||
| "vfchedb %%v5,%%v16,%%v0\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v5\n\t" | |||||
| "vsel %%v1,%%v1,%%v4,%%v5\n\t" | |||||
| "vag %%v3,%%v3,%%v2\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "vrepg %%v2,%%v0,1\n\t" | |||||
| "vrepg %%v3,%%v1,1\n\t" | |||||
| "wfcdb %%v2,%%v0\n\t" | |||||
| "jne 1f\n\t" | |||||
| "vsteg %%v0,%[amin],0\n\t" | |||||
| "vmnlg %%v0,%%v1,%%v3\n\t" | |||||
| "vlgvg %[iamin],%%v0,0\n\t" | |||||
| "j 2f\n\t" | |||||
| "1:\n\t" | |||||
| "wfchdb %%v4,%%v0,%%v2\n\t" | |||||
| "vsel %%v1,%%v3,%%v1,%%v4\n\t" | |||||
| "vsel %%v0,%%v2,%%v0,%%v4\n\t" | |||||
| "std %%f0,%[amin]\n\t" | |||||
| "vlgvg %[iamin],%%v1,0\n\t" | |||||
| "2:\n\t" | |||||
| "nop" | |||||
| : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", | |||||
| "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); | |||||
| return iamin; | |||||
| } | } | ||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT minf = 0; | |||||
| BLASLONG min = 0; | |||||
| BLASLONG inc_x2; | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT minf = 0; | |||||
| BLASLONG min = 0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (min); | |||||
| if (inc_x == 1) { | |||||
| if (n <= 0 || inc_x <= 0) return(min); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| min = izamin_kernel_16(n1, x, &minf); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| } else { | |||||
| minf = CABS1(x, 0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| min = izamin_kernel_16(n1, x, &minf); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) < minf) { | |||||
| min = i; | |||||
| minf = CABS1(x, ix); | |||||
| } | } | ||||
| else | |||||
| { | |||||
| minf = CABS1(x,0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) < minf ) | |||||
| { | |||||
| min = i; | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| ix += 2; | |||||
| i++; | |||||
| } | } | ||||
| return (min + 1); | |||||
| return (min + 1); | |||||
| } else { | |||||
| } else { | |||||
| min = 0; | min = 0; | ||||
| minf = CABS1(x,0); | |||||
| minf = CABS1(x, 0); | |||||
| inc_x2 = 2 * inc_x; | inc_x2 = 2 * inc_x; | ||||
| ix += inc_x2; | |||||
| i++; | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) < minf ) | |||||
| { | |||||
| min = i; | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| if (CABS1(x, ix) < minf) { | |||||
| min = i; | |||||
| minf = CABS1(x, ix); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2) < minf) { | |||||
| min = i + 1; | |||||
| minf = CABS1(x, ix + inc_x2); | |||||
| } | |||||
| if (CABS1(x, ix + 2 * inc_x2) < minf) { | |||||
| min = i + 2; | |||||
| minf = CABS1(x, ix + 2 * inc_x2); | |||||
| } | |||||
| if (CABS1(x, ix + 3 * inc_x2) < minf) { | |||||
| min = i + 3; | |||||
| minf = CABS1(x, ix + 3 * inc_x2); | |||||
| } | |||||
| ix += inc_x2 * 4; | |||||
| i += 4; | |||||
| } | } | ||||
| return (min + 1); | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) < minf) { | |||||
| min = i; | |||||
| minf = CABS1(x, ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | } | ||||
| return (min + 1); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,142 +28,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | #define ABS fabsf | ||||
| #endif | |||||
| static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT amax; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%2) \n\t" | |||||
| "srlg %%r0,%1,6 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vl %%v24,128(%%r1,%2) \n\t" | |||||
| "vl %%v25,144(%%r1,%2) \n\t" | |||||
| "vl %%v26,160(%%r1,%2) \n\t" | |||||
| "vl %%v27,176(%%r1,%2) \n\t" | |||||
| "vl %%v28,192(%%r1,%2) \n\t" | |||||
| "vl %%v29,208(%%r1,%2) \n\t" | |||||
| "vl %%v30,224(%%r1,%2) \n\t" | |||||
| "vl %%v31,240(%%r1,%2) \n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v24,8 \n\t" | |||||
| "vfmaxsb %%v17,%%v17,%%v25,8 \n\t" | |||||
| "vfmaxsb %%v18,%%v18,%%v26,8 \n\t" | |||||
| "vfmaxsb %%v19,%%v19,%%v27,8 \n\t" | |||||
| "vfmaxsb %%v20,%%v20,%%v28,8 \n\t" | |||||
| "vfmaxsb %%v21,%%v21,%%v29,8 \n\t" | |||||
| "vfmaxsb %%v22,%%v22,%%v30,8 \n\t" | |||||
| "vfmaxsb %%v23,%%v23,%%v31,8 \n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v20,8 \n\t" | |||||
| "vfmaxsb %%v17,%%v17,%%v21,8 \n\t" | |||||
| "vfmaxsb %%v18,%%v18,%%v22,8 \n\t" | |||||
| "vfmaxsb %%v19,%%v19,%%v23,8 \n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v18,8 \n\t" | |||||
| "vfmaxsb %%v17,%%v17,%%v19,8 \n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v17,8 \n\t" | |||||
| "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "veslg %%v16,%%v0,32 \n\t" | |||||
| "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" | |||||
| "vrepf %%v16,%%v0,2 \n\t" | |||||
| "wfmaxsb %%v0,%%v0,%%v16,8 \n\t" | |||||
| "lper %0,%%f0 " | |||||
| :"=f"(amax) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return amax; | |||||
| } | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return (maxf); | |||||
| static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) { | |||||
| FLOAT amax; | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "srlg %[n],%[n],6\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vl %%v24,128(%%r1,%[x])\n\t" | |||||
| "vl %%v25,144(%%r1,%[x])\n\t" | |||||
| "vl %%v26,160(%%r1,%[x])\n\t" | |||||
| "vl %%v27,176(%%r1,%[x])\n\t" | |||||
| "vl %%v28,192(%%r1,%[x])\n\t" | |||||
| "vl %%v29,208(%%r1,%[x])\n\t" | |||||
| "vl %%v30,224(%%r1,%[x])\n\t" | |||||
| "vl %%v31,240(%%r1,%[x])\n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v24,8\n\t" | |||||
| "vfmaxsb %%v17,%%v17,%%v25,8\n\t" | |||||
| "vfmaxsb %%v18,%%v18,%%v26,8\n\t" | |||||
| "vfmaxsb %%v19,%%v19,%%v27,8\n\t" | |||||
| "vfmaxsb %%v20,%%v20,%%v28,8\n\t" | |||||
| "vfmaxsb %%v21,%%v21,%%v29,8\n\t" | |||||
| "vfmaxsb %%v22,%%v22,%%v30,8\n\t" | |||||
| "vfmaxsb %%v23,%%v23,%%v31,8\n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v20,8\n\t" | |||||
| "vfmaxsb %%v17,%%v17,%%v21,8\n\t" | |||||
| "vfmaxsb %%v18,%%v18,%%v22,8\n\t" | |||||
| "vfmaxsb %%v19,%%v19,%%v23,8\n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v18,8\n\t" | |||||
| "vfmaxsb %%v17,%%v17,%%v19,8\n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v17,8\n\t" | |||||
| "vfmaxsb %%v0,%%v0,%%v16,8\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "veslg %%v16,%%v0,32\n\t" | |||||
| "vfmaxsb %%v0,%%v0,%%v16,8\n\t" | |||||
| "vrepf %%v16,%%v0,2\n\t" | |||||
| "wfmaxsb %%v0,%%v0,%%v16,8\n\t" | |||||
| "lper %[amax],%%f0" | |||||
| : [amax] "=f"(amax),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return amax; | |||||
| } | |||||
| if (inc_x == 1) { | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (maxf); | |||||
| maxf = samax_kernel_64(n1, x); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| maxf=ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| maxf = samax_kernel_64(n1, x); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| maxf = ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| maxf=ABS(x[0]); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| maxf = ABS(x[0]); | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) > maxf) { | |||||
| maxf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| j += 4; | |||||
| i += inc_x * 4; | |||||
| } | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (maxf); | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) > maxf) { | |||||
| maxf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (maxf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,142 +28,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | #define ABS fabsf | ||||
| #endif | |||||
| static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT amin; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%2) \n\t" | |||||
| "srlg %%r0,%1,6 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vl %%v24,128(%%r1,%2) \n\t" | |||||
| "vl %%v25,144(%%r1,%2) \n\t" | |||||
| "vl %%v26,160(%%r1,%2) \n\t" | |||||
| "vl %%v27,176(%%r1,%2) \n\t" | |||||
| "vl %%v28,192(%%r1,%2) \n\t" | |||||
| "vl %%v29,208(%%r1,%2) \n\t" | |||||
| "vl %%v30,224(%%r1,%2) \n\t" | |||||
| "vl %%v31,240(%%r1,%2) \n\t" | |||||
| "vfminsb %%v16,%%v16,%%v24,8 \n\t" | |||||
| "vfminsb %%v17,%%v17,%%v25,8 \n\t" | |||||
| "vfminsb %%v18,%%v18,%%v26,8 \n\t" | |||||
| "vfminsb %%v19,%%v19,%%v27,8 \n\t" | |||||
| "vfminsb %%v20,%%v20,%%v28,8 \n\t" | |||||
| "vfminsb %%v21,%%v21,%%v29,8 \n\t" | |||||
| "vfminsb %%v22,%%v22,%%v30,8 \n\t" | |||||
| "vfminsb %%v23,%%v23,%%v31,8 \n\t" | |||||
| "vfminsb %%v16,%%v16,%%v20,8 \n\t" | |||||
| "vfminsb %%v17,%%v17,%%v21,8 \n\t" | |||||
| "vfminsb %%v18,%%v18,%%v22,8 \n\t" | |||||
| "vfminsb %%v19,%%v19,%%v23,8 \n\t" | |||||
| "vfminsb %%v16,%%v16,%%v18,8 \n\t" | |||||
| "vfminsb %%v17,%%v17,%%v19,8 \n\t" | |||||
| "vfminsb %%v16,%%v16,%%v17,8 \n\t" | |||||
| "vfminsb %%v0,%%v0,%%v16,8 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "veslg %%v16,%%v0,32 \n\t" | |||||
| "vfminsb %%v0,%%v0,%%v16,8 \n\t" | |||||
| "vrepf %%v16,%%v0,2 \n\t" | |||||
| "wfminsb %%v0,%%v0,%%v16,8 \n\t" | |||||
| "lper %0,%%f0 " | |||||
| :"=f"(amin) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return amin; | |||||
| } | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return (minf); | |||||
| static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) { | |||||
| FLOAT amin; | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "srlg %[n],%[n],6\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vl %%v24,128(%%r1,%[x])\n\t" | |||||
| "vl %%v25,144(%%r1,%[x])\n\t" | |||||
| "vl %%v26,160(%%r1,%[x])\n\t" | |||||
| "vl %%v27,176(%%r1,%[x])\n\t" | |||||
| "vl %%v28,192(%%r1,%[x])\n\t" | |||||
| "vl %%v29,208(%%r1,%[x])\n\t" | |||||
| "vl %%v30,224(%%r1,%[x])\n\t" | |||||
| "vl %%v31,240(%%r1,%[x])\n\t" | |||||
| "vfminsb %%v16,%%v16,%%v24,8\n\t" | |||||
| "vfminsb %%v17,%%v17,%%v25,8\n\t" | |||||
| "vfminsb %%v18,%%v18,%%v26,8\n\t" | |||||
| "vfminsb %%v19,%%v19,%%v27,8\n\t" | |||||
| "vfminsb %%v20,%%v20,%%v28,8\n\t" | |||||
| "vfminsb %%v21,%%v21,%%v29,8\n\t" | |||||
| "vfminsb %%v22,%%v22,%%v30,8\n\t" | |||||
| "vfminsb %%v23,%%v23,%%v31,8\n\t" | |||||
| "vfminsb %%v16,%%v16,%%v20,8\n\t" | |||||
| "vfminsb %%v17,%%v17,%%v21,8\n\t" | |||||
| "vfminsb %%v18,%%v18,%%v22,8\n\t" | |||||
| "vfminsb %%v19,%%v19,%%v23,8\n\t" | |||||
| "vfminsb %%v16,%%v16,%%v18,8\n\t" | |||||
| "vfminsb %%v17,%%v17,%%v19,8\n\t" | |||||
| "vfminsb %%v16,%%v16,%%v17,8\n\t" | |||||
| "vfminsb %%v0,%%v0,%%v16,8\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "veslg %%v16,%%v0,32\n\t" | |||||
| "vfminsb %%v0,%%v0,%%v16,8\n\t" | |||||
| "vrepf %%v16,%%v0,2\n\t" | |||||
| "wfminsb %%v0,%%v0,%%v16,8\n\t" | |||||
| "lper %[amin],%%f0" | |||||
| : [amin] "=f"(amin),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return amin; | |||||
| } | |||||
| if (inc_x == 1) { | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (minf); | |||||
| minf = samin_kernel_64(n1, x); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| minf=ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| minf = samin_kernel_64(n1, x); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| minf = ABS(x[0]); | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| minf=ABS(x[0]); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| minf = ABS(x[0]); | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) < minf) { | |||||
| minf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) < minf) { | |||||
| minf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) < minf) { | |||||
| minf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| if (ABS(x[i + inc_x]) < minf) { | |||||
| minf = ABS(x[i + inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 2 * inc_x]) < minf) { | |||||
| minf = ABS(x[i + 2 * inc_x]); | |||||
| } | |||||
| if (ABS(x[i + 3 * inc_x]) < minf) { | |||||
| minf = ABS(x[i + 3 * inc_x]); | |||||
| } | |||||
| j += 4; | |||||
| i += inc_x * 4; | |||||
| } | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (minf); | |||||
| while (j < n) { | |||||
| if (ABS(x[i]) < minf) { | |||||
| minf = ABS(x[i]); | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (minf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,147 +28,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT asum; | |||||
| __asm__ ( | |||||
| "vzero %%v0 \n\t" | |||||
| "vzero %%v1 \n\t" | |||||
| "vzero %%v2 \n\t" | |||||
| "vzero %%v3 \n\t" | |||||
| "srlg %%r0,%1,6 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vl %%v20, 64(%%r1,%2) \n\t" | |||||
| "vl %%v21, 80(%%r1,%2) \n\t" | |||||
| "vl %%v22, 96(%%r1,%2) \n\t" | |||||
| "vl %%v23, 112(%%r1,%2) \n\t" | |||||
| "vflpsb %%v16, %%v16 \n\t" | |||||
| "vflpsb %%v17, %%v17 \n\t" | |||||
| "vflpsb %%v18, %%v18 \n\t" | |||||
| "vflpsb %%v19, %%v19 \n\t" | |||||
| "vflpsb %%v20, %%v20 \n\t" | |||||
| "vflpsb %%v21, %%v21 \n\t" | |||||
| "vflpsb %%v22, %%v22 \n\t" | |||||
| "vflpsb %%v23, %%v23 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v16 \n\t" | |||||
| "vfasb %%v1,%%v1,%%v17 \n\t" | |||||
| "vfasb %%v2,%%v2,%%v18 \n\t" | |||||
| "vfasb %%v3,%%v3,%%v19 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v20 \n\t" | |||||
| "vfasb %%v1,%%v1,%%v21 \n\t" | |||||
| "vfasb %%v2,%%v2,%%v22 \n\t" | |||||
| "vfasb %%v3,%%v3,%%v23 \n\t" | |||||
| "vl %%v16, 128(%%r1,%2) \n\t" | |||||
| "vl %%v17, 144(%%r1,%2) \n\t" | |||||
| "vl %%v18, 160(%%r1,%2) \n\t" | |||||
| "vl %%v19, 176(%%r1,%2) \n\t" | |||||
| "vl %%v20, 192(%%r1,%2) \n\t" | |||||
| "vl %%v21, 208(%%r1,%2) \n\t" | |||||
| "vl %%v22, 224(%%r1,%2) \n\t" | |||||
| "vl %%v23, 240(%%r1,%2) \n\t" | |||||
| "vflpsb %%v16, %%v16 \n\t" | |||||
| "vflpsb %%v17, %%v17 \n\t" | |||||
| "vflpsb %%v18, %%v18 \n\t" | |||||
| "vflpsb %%v19, %%v19 \n\t" | |||||
| "vflpsb %%v20, %%v20 \n\t" | |||||
| "vflpsb %%v21, %%v21 \n\t" | |||||
| "vflpsb %%v22, %%v22 \n\t" | |||||
| "vflpsb %%v23, %%v23 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v16 \n\t" | |||||
| "vfasb %%v1,%%v1,%%v17 \n\t" | |||||
| "vfasb %%v2,%%v2,%%v18 \n\t" | |||||
| "vfasb %%v3,%%v3,%%v19 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v20 \n\t" | |||||
| "vfasb %%v1,%%v1,%%v21 \n\t" | |||||
| "vfasb %%v2,%%v2,%%v22 \n\t" | |||||
| "vfasb %%v3,%%v3,%%v23 \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "brctg %%r0,0b \n\t" | |||||
| "vfasb %%v0,%%v0,%%v1 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v2 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v3 \n\t" | |||||
| "veslg %%v1,%%v0,32 \n\t" | |||||
| "vfasb %%v0,%%v0,%%v1 \n\t" | |||||
| "vrepf %%v1,%%v0,2 \n\t" | |||||
| "aebr %%f0,%%f1 \n\t" | |||||
| "ler %0,%%f0 " | |||||
| :"=f"(asum) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" | |||||
| ); | |||||
| return asum; | |||||
| #define ABS fabsf | |||||
| static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) { | |||||
| FLOAT asum; | |||||
| __asm__("vzero %%v24\n\t" | |||||
| "vzero %%v25\n\t" | |||||
| "vzero %%v26\n\t" | |||||
| "vzero %%v27\n\t" | |||||
| "vzero %%v28\n\t" | |||||
| "vzero %%v29\n\t" | |||||
| "vzero %%v30\n\t" | |||||
| "vzero %%v31\n\t" | |||||
| "srlg %[n],%[n],6\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 0(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 64(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 112(%%r1,%[x])\n\t" | |||||
| "vflpsb %%v16, %%v16\n\t" | |||||
| "vflpsb %%v17, %%v17\n\t" | |||||
| "vflpsb %%v18, %%v18\n\t" | |||||
| "vflpsb %%v19, %%v19\n\t" | |||||
| "vflpsb %%v20, %%v20\n\t" | |||||
| "vflpsb %%v21, %%v21\n\t" | |||||
| "vflpsb %%v22, %%v22\n\t" | |||||
| "vflpsb %%v23, %%v23\n\t" | |||||
| "vfasb %%v24,%%v24,%%v16\n\t" | |||||
| "vfasb %%v25,%%v25,%%v17\n\t" | |||||
| "vfasb %%v26,%%v26,%%v18\n\t" | |||||
| "vfasb %%v27,%%v27,%%v19\n\t" | |||||
| "vfasb %%v28,%%v28,%%v20\n\t" | |||||
| "vfasb %%v29,%%v29,%%v21\n\t" | |||||
| "vfasb %%v30,%%v30,%%v22\n\t" | |||||
| "vfasb %%v31,%%v31,%%v23\n\t" | |||||
| "vl %%v16, 128(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 144(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 160(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 176(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 192(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 208(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 224(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 240(%%r1,%[x])\n\t" | |||||
| "vflpsb %%v16, %%v16\n\t" | |||||
| "vflpsb %%v17, %%v17\n\t" | |||||
| "vflpsb %%v18, %%v18\n\t" | |||||
| "vflpsb %%v19, %%v19\n\t" | |||||
| "vflpsb %%v20, %%v20\n\t" | |||||
| "vflpsb %%v21, %%v21\n\t" | |||||
| "vflpsb %%v22, %%v22\n\t" | |||||
| "vflpsb %%v23, %%v23\n\t" | |||||
| "vfasb %%v24,%%v24,%%v16\n\t" | |||||
| "vfasb %%v25,%%v25,%%v17\n\t" | |||||
| "vfasb %%v26,%%v26,%%v18\n\t" | |||||
| "vfasb %%v27,%%v27,%%v19\n\t" | |||||
| "vfasb %%v28,%%v28,%%v20\n\t" | |||||
| "vfasb %%v29,%%v29,%%v21\n\t" | |||||
| "vfasb %%v30,%%v30,%%v22\n\t" | |||||
| "vfasb %%v31,%%v31,%%v23\n\t" | |||||
| "agfi %%r1,256\n\t" | |||||
| "brctg %[n],0b\n\t" | |||||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||||
| "vfasb %%v24,%%v24,%%v26\n\t" | |||||
| "vfasb %%v24,%%v24,%%v27\n\t" | |||||
| "vfasb %%v24,%%v24,%%v28\n\t" | |||||
| "vfasb %%v24,%%v24,%%v29\n\t" | |||||
| "vfasb %%v24,%%v24,%%v30\n\t" | |||||
| "vfasb %%v24,%%v24,%%v31\n\t" | |||||
| "veslg %%v25,%%v24,32\n\t" | |||||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||||
| "vrepf %%v25,%%v24,2\n\t" | |||||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||||
| "vstef %%v24,%[asum],0" | |||||
| : [asum] "=m"(asum),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||||
| "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return asum; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | ||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT sumf = 0.0; | |||||
| BLASLONG n1; | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT sumf = 0.0; | |||||
| BLASLONG n1; | |||||
| if (n <= 0 || inc_x <= 0) return sumf; | |||||
| if (inc_x == 1) { | |||||
| n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return sumf; | |||||
| sumf = sasum_kernel_64(n1, x); | |||||
| i = n1; | |||||
| } | |||||
| if (inc_x == 1) { | |||||
| while (i < n) { | |||||
| sumf += ABS(x[i]); | |||||
| i++; | |||||
| } | |||||
| n1 = n & -64; | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| register FLOAT sum1, sum2; | |||||
| sum1 = 0.0; | |||||
| sum2 = 0.0; | |||||
| while (j < n1) { | |||||
| if (n1 > 0) { | |||||
| sum1 += ABS(x[i]); | |||||
| sum2 += ABS(x[i + inc_x]); | |||||
| sum1 += ABS(x[i + 2 * inc_x]); | |||||
| sum2 += ABS(x[i + 3 * inc_x]); | |||||
| sumf = sasum_kernel_64(n1, x); | |||||
| i = n1; | |||||
| } | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| while (i < n) { | |||||
| sumf += ABS(x[i]); | |||||
| i++; | |||||
| } | |||||
| } | |||||
| sumf = sum1 + sum2; | |||||
| while (j < n) { | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| register FLOAT sum1, sum2; | |||||
| sum1 = 0.0; | |||||
| sum2 = 0.0; | |||||
| while (j < n1) { | |||||
| sumf += ABS(x[i]); | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| sum1 += ABS(x[i]); | |||||
| sum2 += ABS(x[i + inc_x]); | |||||
| sum1 += ABS(x[i + 2 * inc_x]); | |||||
| sum2 += ABS(x[i + 3 * inc_x]); | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| } | } | ||||
| return sumf; | |||||
| } | |||||
| sumf = sum1 + sum2; | |||||
| while (j < n) { | |||||
| sumf += ABS(x[i]); | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| return sumf; | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,158 +27,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "vlrepf %%v0,%3 \n\t" | |||||
| "srlg %%r0,%0,6 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%1) \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%1) \n\t" | |||||
| "vl %%v17,16(%%r1,%1) \n\t" | |||||
| "vl %%v18,32(%%r1,%1) \n\t" | |||||
| "vl %%v19,48(%%r1,%1) \n\t" | |||||
| "vl %%v20,0(%%r1,%2) \n\t" | |||||
| "vl %%v21,16(%%r1,%2) \n\t" | |||||
| "vl %%v22,32(%%r1,%2) \n\t" | |||||
| "vl %%v23,48(%%r1,%2) \n\t" | |||||
| "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" | |||||
| "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" | |||||
| "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" | |||||
| "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" | |||||
| "vl %%v24,64(%%r1,%1) \n\t" | |||||
| "vl %%v25,80(%%r1,%1) \n\t" | |||||
| "vl %%v26,96(%%r1,%1) \n\t" | |||||
| "vl %%v27,112(%%r1,%1) \n\t" | |||||
| "vl %%v28,64(%%r1,%2) \n\t" | |||||
| "vl %%v29,80(%%r1,%2) \n\t" | |||||
| "vl %%v30,96(%%r1,%2) \n\t" | |||||
| "vl %%v31,112(%%r1,%2) \n\t" | |||||
| "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" | |||||
| "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" | |||||
| "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" | |||||
| "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" | |||||
| "vst %%v16,0(%%r1,%2) \n\t" | |||||
| "vst %%v17,16(%%r1,%2) \n\t" | |||||
| "vst %%v18,32(%%r1,%2) \n\t" | |||||
| "vst %%v19,48(%%r1,%2) \n\t" | |||||
| "vst %%v20,64(%%r1,%2) \n\t" | |||||
| "vst %%v21,80(%%r1,%2) \n\t" | |||||
| "vst %%v22,96(%%r1,%2) \n\t" | |||||
| "vst %%v23,112(%%r1,%2) \n\t" | |||||
| "vl %%v16,128(%%r1,%1) \n\t" | |||||
| "vl %%v17,144(%%r1,%1) \n\t" | |||||
| "vl %%v18,160(%%r1,%1) \n\t" | |||||
| "vl %%v19,176(%%r1,%1) \n\t" | |||||
| "vl %%v20,128(%%r1,%2) \n\t" | |||||
| "vl %%v21,144(%%r1,%2) \n\t" | |||||
| "vl %%v22,160(%%r1,%2) \n\t" | |||||
| "vl %%v23,176(%%r1,%2) \n\t" | |||||
| "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" | |||||
| "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" | |||||
| "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" | |||||
| "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" | |||||
| "vl %%v24,192(%%r1,%1) \n\t" | |||||
| "vl %%v25,208(%%r1,%1) \n\t" | |||||
| "vl %%v26,224(%%r1,%1) \n\t" | |||||
| "vl %%v27,240(%%r1,%1) \n\t" | |||||
| "vl %%v28,192(%%r1,%2) \n\t" | |||||
| "vl %%v29,208(%%r1,%2) \n\t" | |||||
| "vl %%v30,224(%%r1,%2) \n\t" | |||||
| "vl %%v31,240(%%r1,%2) \n\t" | |||||
| "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" | |||||
| "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" | |||||
| "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" | |||||
| "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" | |||||
| "vst %%v16,128(%%r1,%2) \n\t" | |||||
| "vst %%v17,144(%%r1,%2) \n\t" | |||||
| "vst %%v18,160(%%r1,%2) \n\t" | |||||
| "vst %%v19,176(%%r1,%2) \n\t" | |||||
| "vst %%v20,192(%%r1,%2) \n\t" | |||||
| "vst %%v21,208(%%r1,%2) \n\t" | |||||
| "vst %%v22,224(%%r1,%2) \n\t" | |||||
| "vst %%v23,240(%%r1,%2) \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { | |||||
| __asm__("vlrepf %%v0,%[alpha]\n\t" | |||||
| "srlg %[n],%[n],6\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "pfd 2, 1024(%%r1,%[y])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,0(%%r1,%[y])\n\t" | |||||
| "vl %%v21,16(%%r1,%[y])\n\t" | |||||
| "vl %%v22,32(%%r1,%[y])\n\t" | |||||
| "vl %%v23,48(%%r1,%[y])\n\t" | |||||
| "vl %%v24,64(%%r1,%[x])\n\t" | |||||
| "vl %%v25,80(%%r1,%[x])\n\t" | |||||
| "vl %%v26,96(%%r1,%[x])\n\t" | |||||
| "vl %%v27,112(%%r1,%[x])\n\t" | |||||
| "vl %%v28,64(%%r1,%[y])\n\t" | |||||
| "vl %%v29,80(%%r1,%[y])\n\t" | |||||
| "vl %%v30,96(%%r1,%[y])\n\t" | |||||
| "vl %%v31,112(%%r1,%[y])\n\t" | |||||
| "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" | |||||
| "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" | |||||
| "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" | |||||
| "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" | |||||
| "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" | |||||
| "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" | |||||
| "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" | |||||
| "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" | |||||
| "vst %%v16,0(%%r1,%[y])\n\t" | |||||
| "vst %%v17,16(%%r1,%[y])\n\t" | |||||
| "vst %%v18,32(%%r1,%[y])\n\t" | |||||
| "vst %%v19,48(%%r1,%[y])\n\t" | |||||
| "vst %%v24,64(%%r1,%[y])\n\t" | |||||
| "vst %%v25,80(%%r1,%[y])\n\t" | |||||
| "vst %%v26,96(%%r1,%[y])\n\t" | |||||
| "vst %%v27,112(%%r1,%[y])\n\t" | |||||
| "vl %%v16,128(%%r1,%[x])\n\t" | |||||
| "vl %%v17,144(%%r1,%[x])\n\t" | |||||
| "vl %%v18,160(%%r1,%[x])\n\t" | |||||
| "vl %%v19,176(%%r1,%[x])\n\t" | |||||
| "vl %%v20,128(%%r1,%[y])\n\t" | |||||
| "vl %%v21,144(%%r1,%[y])\n\t" | |||||
| "vl %%v22,160(%%r1,%[y])\n\t" | |||||
| "vl %%v23,176(%%r1,%[y])\n\t" | |||||
| "vl %%v24,192(%%r1,%[x])\n\t" | |||||
| "vl %%v25,208(%%r1,%[x])\n\t" | |||||
| "vl %%v26,224(%%r1,%[x])\n\t" | |||||
| "vl %%v27,240(%%r1,%[x])\n\t" | |||||
| "vl %%v28,192(%%r1,%[y])\n\t" | |||||
| "vl %%v29,208(%%r1,%[y])\n\t" | |||||
| "vl %%v30,224(%%r1,%[y])\n\t" | |||||
| "vl %%v31,240(%%r1,%[y])\n\t" | |||||
| "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" | |||||
| "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" | |||||
| "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" | |||||
| "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" | |||||
| "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" | |||||
| "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" | |||||
| "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" | |||||
| "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" | |||||
| "vst %%v16,128(%%r1,%[y])\n\t" | |||||
| "vst %%v17,144(%%r1,%[y])\n\t" | |||||
| "vst %%v18,160(%%r1,%[y])\n\t" | |||||
| "vst %%v19,176(%%r1,%[y])\n\t" | |||||
| "vst %%v24,192(%%r1,%[y])\n\t" | |||||
| "vst %%v25,208(%%r1,%[y])\n\t" | |||||
| "vst %%v26,224(%%r1,%[y])\n\t" | |||||
| "vst %%v27,240(%%r1,%[y])\n\t" | |||||
| "agfi %%r1,256\n\t" | |||||
| "brctg %[n],0b" | |||||
| : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) | |||||
| : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), | |||||
| [alpha] "m"(*alpha) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||||
| BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||||
| BLASLONG dummy2) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| if ( n <= 0 ) return 0 ; | |||||
| if (n <= 0) | |||||
| return 0; | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -64; | |||||
| BLASLONG n1 = n & -64; | |||||
| if ( n1 ) | |||||
| saxpy_kernel_64(n1, x, y , &da); | |||||
| if (n1) | |||||
| saxpy_kernel_64(n1, x, y, &da); | |||||
| i = n1; | |||||
| while(i < n) | |||||
| { | |||||
| y[i] += da * x[i] ; | |||||
| i++ ; | |||||
| } | |||||
| return 0 ; | |||||
| i = n1; | |||||
| while (i < n) { | |||||
| y[i] += da * x[i]; | |||||
| i++; | |||||
| } | } | ||||
| return 0; | |||||
| BLASLONG n1 = n & -4; | |||||
| } | |||||
| while(i < n1) | |||||
| { | |||||
| BLASLONG n1 = n & -4; | |||||
| FLOAT m1 = da * x[ix] ; | |||||
| FLOAT m2 = da * x[ix+inc_x] ; | |||||
| FLOAT m3 = da * x[ix+2*inc_x] ; | |||||
| FLOAT m4 = da * x[ix+3*inc_x] ; | |||||
| while (i < n1) { | |||||
| y[iy] += m1 ; | |||||
| y[iy+inc_y] += m2 ; | |||||
| y[iy+2*inc_y] += m3 ; | |||||
| y[iy+3*inc_y] += m4 ; | |||||
| FLOAT m1 = da * x[ix]; | |||||
| FLOAT m2 = da * x[ix + inc_x]; | |||||
| FLOAT m3 = da * x[ix + 2 * inc_x]; | |||||
| FLOAT m4 = da * x[ix + 3 * inc_x]; | |||||
| ix += inc_x*4 ; | |||||
| iy += inc_y*4 ; | |||||
| i+=4 ; | |||||
| y[iy] += m1; | |||||
| y[iy + inc_y] += m2; | |||||
| y[iy + 2 * inc_y] += m3; | |||||
| y[iy + 3 * inc_y] += m4; | |||||
| } | |||||
| ix += inc_x * 4; | |||||
| iy += inc_y * 4; | |||||
| i += 4; | |||||
| while(i < n) | |||||
| { | |||||
| } | |||||
| y[iy] += da * x[ix] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| while (i < n) { | |||||
| } | |||||
| return 0 ; | |||||
| } | |||||
| y[iy] += da * x[ix]; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,59 +27,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| __asm__ volatile ( | |||||
| "lgr %%r1,%1 \n\t" | |||||
| "lgr %%r2,%2 \n\t" | |||||
| "srlg %%r0,%0,6 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1) \n\t" | |||||
| "pfd 2, 1024(%%r2) \n\t" | |||||
| "mvc 0(256,%%r2),0(%%r1) \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "agfi %%r2,256 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) | |||||
| :"memory","cc","r0","r1","r2" | |||||
| ); | |||||
| static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| __asm__("srlg %[n],%[n],6\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%[x])\n\t" | |||||
| "pfd 2, 1024(%[y])\n\t" | |||||
| "mvc 0(256,%[y]),0(%[x])\n\t" | |||||
| "la %[x],256(%[x])\n\t" | |||||
| "la %[y],256(%[y])\n\t" | |||||
| "brctg %[n],0b" | |||||
| : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x) | |||||
| : "cc"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | ||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| if (n <= 0) return 0; | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| scopy_kernel_64(n1, x, y); | |||||
| i = n1; | |||||
| } | |||||
| if (n <= 0) | |||||
| return 0; | |||||
| while (i < n) { | |||||
| y[i] = x[i]; | |||||
| i++; | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| } | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| scopy_kernel_64(n1, x, y); | |||||
| i = n1; | |||||
| } | |||||
| while (i < n) { | |||||
| y[i] = x[i]; | |||||
| i++; | |||||
| } else { | |||||
| } | |||||
| while (i < n) { | |||||
| } else { | |||||
| y[iy] = x[ix]; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| while (i < n) { | |||||
| } | |||||
| y[iy] = x[ix]; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| } | } | ||||
| return 0; | |||||
| } | |||||
| return 0; | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2018,The OpenBLAS Project | |||||
| Copyright (c) 2013-2019,The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms,with or without | Redistribution and use in source and binary forms,with or without | ||||
| modification,are permitted provided that the following conditions are | modification,are permitted provided that the following conditions are | ||||
| @@ -27,114 +27,118 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| FLOAT dot; | |||||
| __asm__ volatile ( | |||||
| "vzero %%v0 \n\t" | |||||
| "srlg %%r0,%1,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1,1024(%%r1,%2) \n\t" | |||||
| "pfd 1,1024(%%r1,%3) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vl %%v24,0(%%r1,%3) \n\t" | |||||
| "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" | |||||
| "vl %%v25,16(%%r1,%3) \n\t" | |||||
| "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" | |||||
| "vl %%v26,32(%%r1,%3) \n\t" | |||||
| "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" | |||||
| "vl %%v27,48(%%r1,%3) \n\t" | |||||
| "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" | |||||
| "vl %%v28,64(%%r1,%3) \n\t" | |||||
| "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" | |||||
| "vl %%v29,80(%%r1,%3) \n\t" | |||||
| "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" | |||||
| "vl %%v30,96(%%r1,%3) \n\t" | |||||
| "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" | |||||
| "vl %%v31,112(%%r1,%3) \n\t" | |||||
| "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" | |||||
| "agfi %%r1,128 \n\t" | |||||
| "brctg %%r0,0b \n\t" | |||||
| "vrepf %%v1,%%v0,1 \n\t" | |||||
| "vrepf %%v2,%%v0,2 \n\t" | |||||
| "vrepf %%v3,%%v0,3 \n\t" | |||||
| "aebr %%f0,%%f1 \n\t" | |||||
| "aebr %%f0,%%f2 \n\t" | |||||
| "aebr %%f0,%%f3 \n\t" | |||||
| "ler %0,%%f0 " | |||||
| :"=f"(dot) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return dot; | |||||
| static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| FLOAT dot; | |||||
| __asm__("vzero %%v0\n\t" | |||||
| "vzero %%v1\n\t" | |||||
| "vzero %%v2\n\t" | |||||
| "vzero %%v3\n\t" | |||||
| "vzero %%v4\n\t" | |||||
| "vzero %%v5\n\t" | |||||
| "vzero %%v6\n\t" | |||||
| "vzero %%v7\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1,1024(%%r1,%[x])\n\t" | |||||
| "pfd 1,1024(%%r1,%[y])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vl %%v24,0(%%r1,%[y])\n\t" | |||||
| "vl %%v25,16(%%r1,%[y])\n\t" | |||||
| "vl %%v26,32(%%r1,%[y])\n\t" | |||||
| "vl %%v27,48(%%r1,%[y])\n\t" | |||||
| "vl %%v28,64(%%r1,%[y])\n\t" | |||||
| "vl %%v29,80(%%r1,%[y])\n\t" | |||||
| "vl %%v30,96(%%r1,%[y])\n\t" | |||||
| "vl %%v31,112(%%r1,%[y])\n\t" | |||||
| "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" | |||||
| "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" | |||||
| "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" | |||||
| "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" | |||||
| "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" | |||||
| "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" | |||||
| "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" | |||||
| "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" | |||||
| "agfi %%r1,128\n\t" | |||||
| "brctg %[n],0b\n\t" | |||||
| "vfasb %%v0,%%v0,%%v1\n\t" | |||||
| "vfasb %%v0,%%v0,%%v2\n\t" | |||||
| "vfasb %%v0,%%v0,%%v3\n\t" | |||||
| "vfasb %%v0,%%v0,%%v4\n\t" | |||||
| "vfasb %%v0,%%v0,%%v5\n\t" | |||||
| "vfasb %%v0,%%v0,%%v6\n\t" | |||||
| "vfasb %%v0,%%v0,%%v7\n\t" | |||||
| "vrepf %%v1,%%v0,1\n\t" | |||||
| "vrepf %%v2,%%v0,2\n\t" | |||||
| "vrepf %%v3,%%v0,3\n\t" | |||||
| "aebr %%f0,%%f1\n\t" | |||||
| "aebr %%f0,%%f2\n\t" | |||||
| "aebr %%f0,%%f3\n\t" | |||||
| "ler %[dot],%%f0" | |||||
| : [dot] "=f"(dot),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y), | |||||
| [y] "a"(y) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", | |||||
| "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |||||
| "v27", "v28", "v29", "v30", "v31"); | |||||
| return dot; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| FLOAT dot = 0.0 ; | |||||
| FLOAT dot = 0.0; | |||||
| if ( n <= 0 ) return(dot); | |||||
| if (n <= 0) | |||||
| return (dot); | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -32; | |||||
| BLASLONG n1 = n & -32; | |||||
| if ( n1 ) | |||||
| dot = sdot_kernel_32(n1,x,y); | |||||
| if (n1) | |||||
| dot = sdot_kernel_32(n1, x, y); | |||||
| i = n1; | |||||
| while(i < n) | |||||
| { | |||||
| i = n1; | |||||
| while (i < n) { | |||||
| dot += y[i] * x[i] ; | |||||
| i++ ; | |||||
| dot += y[i] * x[i]; | |||||
| i++; | |||||
| } | |||||
| return(dot); | |||||
| } | |||||
| return (dot); | |||||
| } | |||||
| } | |||||
| BLASLONG n1 = n & -2; | |||||
| BLASLONG n1 = n & -2; | |||||
| while (i < n1) { | |||||
| while(i < n1) | |||||
| { | |||||
| dot += y[iy] * x[ix] + y[iy + inc_y] * x[ix + inc_x]; | |||||
| ix += inc_x * 2; | |||||
| iy += inc_y * 2; | |||||
| i += 2; | |||||
| dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; | |||||
| ix += inc_x*2 ; | |||||
| iy += inc_y*2 ; | |||||
| i+=2 ; | |||||
| } | |||||
| } | |||||
| while (i < n) { | |||||
| while(i < n) | |||||
| { | |||||
| dot += y[iy] * x[ix]; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| dot += y[iy] * x[ix] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| return(dot); | |||||
| } | |||||
| return (dot); | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,136 +27,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT max; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%2) \n\t" | |||||
| "srlg %%r0,%1,6 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vl %%v24,128(%%r1,%2) \n\t" | |||||
| "vl %%v25,144(%%r1,%2) \n\t" | |||||
| "vl %%v26,160(%%r1,%2) \n\t" | |||||
| "vl %%v27,176(%%r1,%2) \n\t" | |||||
| "vl %%v28,192(%%r1,%2) \n\t" | |||||
| "vl %%v29,208(%%r1,%2) \n\t" | |||||
| "vl %%v30,224(%%r1,%2) \n\t" | |||||
| "vl %%v31,240(%%r1,%2) \n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v24,0 \n\t" | |||||
| "vfmaxsb %%v17,%%v17,%%v25,0 \n\t" | |||||
| "vfmaxsb %%v18,%%v18,%%v26,0 \n\t" | |||||
| "vfmaxsb %%v19,%%v19,%%v27,0 \n\t" | |||||
| "vfmaxsb %%v20,%%v20,%%v28,0 \n\t" | |||||
| "vfmaxsb %%v21,%%v21,%%v29,0 \n\t" | |||||
| "vfmaxsb %%v22,%%v22,%%v30,0 \n\t" | |||||
| "vfmaxsb %%v23,%%v23,%%v31,0 \n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v20,0 \n\t" | |||||
| "vfmaxsb %%v17,%%v17,%%v21,0 \n\t" | |||||
| "vfmaxsb %%v18,%%v18,%%v22,0 \n\t" | |||||
| "vfmaxsb %%v19,%%v19,%%v23,0 \n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v18,0 \n\t" | |||||
| "vfmaxsb %%v17,%%v17,%%v19,0 \n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v17,0 \n\t" | |||||
| "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "veslg %%v16,%%v0,32 \n\t" | |||||
| "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "vrepf %%v16,%%v0,2 \n\t" | |||||
| "wfmaxsb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "ler %0,%%f0 " | |||||
| :"=f"(max) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return max; | |||||
| static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) { | |||||
| FLOAT max; | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "srlg %[n],%[n],6\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vl %%v24,128(%%r1,%[x])\n\t" | |||||
| "vl %%v25,144(%%r1,%[x])\n\t" | |||||
| "vl %%v26,160(%%r1,%[x])\n\t" | |||||
| "vl %%v27,176(%%r1,%[x])\n\t" | |||||
| "vl %%v28,192(%%r1,%[x])\n\t" | |||||
| "vl %%v29,208(%%r1,%[x])\n\t" | |||||
| "vl %%v30,224(%%r1,%[x])\n\t" | |||||
| "vl %%v31,240(%%r1,%[x])\n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v24,0\n\t" | |||||
| "vfmaxsb %%v17,%%v17,%%v25,0\n\t" | |||||
| "vfmaxsb %%v18,%%v18,%%v26,0\n\t" | |||||
| "vfmaxsb %%v19,%%v19,%%v27,0\n\t" | |||||
| "vfmaxsb %%v20,%%v20,%%v28,0\n\t" | |||||
| "vfmaxsb %%v21,%%v21,%%v29,0\n\t" | |||||
| "vfmaxsb %%v22,%%v22,%%v30,0\n\t" | |||||
| "vfmaxsb %%v23,%%v23,%%v31,0\n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v20,0\n\t" | |||||
| "vfmaxsb %%v17,%%v17,%%v21,0\n\t" | |||||
| "vfmaxsb %%v18,%%v18,%%v22,0\n\t" | |||||
| "vfmaxsb %%v19,%%v19,%%v23,0\n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v18,0\n\t" | |||||
| "vfmaxsb %%v17,%%v17,%%v19,0\n\t" | |||||
| "vfmaxsb %%v16,%%v16,%%v17,0\n\t" | |||||
| "vfmaxsb %%v0,%%v0,%%v16,0\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "veslg %%v16,%%v0,32\n\t" | |||||
| "vfmaxsb %%v0,%%v0,%%v16,0\n\t" | |||||
| "vrepf %%v16,%%v0,2\n\t" | |||||
| "wfmaxsb %%v0,%%v0,%%v16,0\n\t" | |||||
| "ler %[max],%%f0" | |||||
| : [max] "=f"(max),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return max; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return (maxf); | |||||
| if (inc_x == 1) { | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (maxf); | |||||
| maxf = smax_kernel_64(n1, x); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| maxf=x[0]; | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| maxf = smax_kernel_64(n1, x); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| maxf = x[0]; | |||||
| i++; | |||||
| } | |||||
| maxf=x[0]; | |||||
| while (i < n) { | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| maxf = x[0]; | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] > maxf) { | |||||
| maxf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] > maxf) { | |||||
| maxf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] > maxf) { | |||||
| maxf = x[i + 3 * inc_x]; | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] > maxf) { | |||||
| maxf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] > maxf) { | |||||
| maxf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] > maxf) { | |||||
| maxf = x[i + 3 * inc_x]; | |||||
| } | |||||
| j += 4; | |||||
| i += inc_x * 4; | |||||
| } | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (maxf); | |||||
| while (j < n) { | |||||
| if (x[i] > maxf) { | |||||
| maxf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (maxf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,136 +27,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT min; | |||||
| __asm__ volatile ( | |||||
| "vl %%v0,0(%2) \n\t" | |||||
| "srlg %%r0,%1,6 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vl %%v24,128(%%r1,%2) \n\t" | |||||
| "vl %%v25,144(%%r1,%2) \n\t" | |||||
| "vl %%v26,160(%%r1,%2) \n\t" | |||||
| "vl %%v27,176(%%r1,%2) \n\t" | |||||
| "vl %%v28,192(%%r1,%2) \n\t" | |||||
| "vl %%v29,208(%%r1,%2) \n\t" | |||||
| "vl %%v30,224(%%r1,%2) \n\t" | |||||
| "vl %%v31,240(%%r1,%2) \n\t" | |||||
| "vfminsb %%v16,%%v16,%%v24,0 \n\t" | |||||
| "vfminsb %%v17,%%v17,%%v25,0 \n\t" | |||||
| "vfminsb %%v18,%%v18,%%v26,0 \n\t" | |||||
| "vfminsb %%v19,%%v19,%%v27,0 \n\t" | |||||
| "vfminsb %%v20,%%v20,%%v28,0 \n\t" | |||||
| "vfminsb %%v21,%%v21,%%v29,0 \n\t" | |||||
| "vfminsb %%v22,%%v22,%%v30,0 \n\t" | |||||
| "vfminsb %%v23,%%v23,%%v31,0 \n\t" | |||||
| "vfminsb %%v16,%%v16,%%v20,0 \n\t" | |||||
| "vfminsb %%v17,%%v17,%%v21,0 \n\t" | |||||
| "vfminsb %%v18,%%v18,%%v22,0 \n\t" | |||||
| "vfminsb %%v19,%%v19,%%v23,0 \n\t" | |||||
| "vfminsb %%v16,%%v16,%%v18,0 \n\t" | |||||
| "vfminsb %%v17,%%v17,%%v19,0 \n\t" | |||||
| "vfminsb %%v16,%%v16,%%v17,0 \n\t" | |||||
| "vfminsb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "veslg %%v16,%%v0,32 \n\t" | |||||
| "vfminsb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "vrepf %%v16,%%v0,2 \n\t" | |||||
| "wfminsb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "ler %0,%%f0 " | |||||
| :"=f"(min) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return min; | |||||
| static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) { | |||||
| FLOAT min; | |||||
| __asm__("vl %%v0,0(%[x])\n\t" | |||||
| "srlg %[n],%[n],6\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vl %%v24,128(%%r1,%[x])\n\t" | |||||
| "vl %%v25,144(%%r1,%[x])\n\t" | |||||
| "vl %%v26,160(%%r1,%[x])\n\t" | |||||
| "vl %%v27,176(%%r1,%[x])\n\t" | |||||
| "vl %%v28,192(%%r1,%[x])\n\t" | |||||
| "vl %%v29,208(%%r1,%[x])\n\t" | |||||
| "vl %%v30,224(%%r1,%[x])\n\t" | |||||
| "vl %%v31,240(%%r1,%[x])\n\t" | |||||
| "vfminsb %%v16,%%v16,%%v24,0\n\t" | |||||
| "vfminsb %%v17,%%v17,%%v25,0\n\t" | |||||
| "vfminsb %%v18,%%v18,%%v26,0\n\t" | |||||
| "vfminsb %%v19,%%v19,%%v27,0\n\t" | |||||
| "vfminsb %%v20,%%v20,%%v28,0\n\t" | |||||
| "vfminsb %%v21,%%v21,%%v29,0\n\t" | |||||
| "vfminsb %%v22,%%v22,%%v30,0\n\t" | |||||
| "vfminsb %%v23,%%v23,%%v31,0\n\t" | |||||
| "vfminsb %%v16,%%v16,%%v20,0\n\t" | |||||
| "vfminsb %%v17,%%v17,%%v21,0\n\t" | |||||
| "vfminsb %%v18,%%v18,%%v22,0\n\t" | |||||
| "vfminsb %%v19,%%v19,%%v23,0\n\t" | |||||
| "vfminsb %%v16,%%v16,%%v18,0\n\t" | |||||
| "vfminsb %%v17,%%v17,%%v19,0\n\t" | |||||
| "vfminsb %%v16,%%v16,%%v17,0\n\t" | |||||
| "vfminsb %%v0,%%v0,%%v16,0\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "veslg %%v16,%%v0,32\n\t" | |||||
| "vfminsb %%v0,%%v0,%%v16,0\n\t" | |||||
| "vrepf %%v16,%%v0,2\n\t" | |||||
| "wfminsb %%v0,%%v0,%%v16,0\n\t" | |||||
| "ler %[min],%%f0" | |||||
| : [min] "=f"(min),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return min; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return (minf); | |||||
| if (inc_x == 1) { | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (minf); | |||||
| minf = smin_kernel_64(n1, x); | |||||
| if (inc_x == 1) { | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| minf=x[0]; | |||||
| i++; | |||||
| } | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| while (i < n) { | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| minf = smin_kernel_64(n1, x); | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| minf = x[0]; | |||||
| i++; | |||||
| } | |||||
| minf=x[0]; | |||||
| while (i < n) { | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| minf = x[0]; | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] < minf) { | |||||
| minf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] < minf) { | |||||
| minf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] < minf) { | |||||
| minf = x[i + 3 * inc_x]; | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (j < n1) { | |||||
| i += inc_x * 4; | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| if (x[i + inc_x] < minf) { | |||||
| minf = x[i + inc_x]; | |||||
| } | |||||
| if (x[i + 2 * inc_x] < minf) { | |||||
| minf = x[i + 2 * inc_x]; | |||||
| } | |||||
| if (x[i + 3 * inc_x] < minf) { | |||||
| minf = x[i + 3 * inc_x]; | |||||
| } | |||||
| j += 4; | |||||
| i += inc_x * 4; | |||||
| } | |||||
| j += 4; | |||||
| } | |||||
| while (j < n) { | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| return (minf); | |||||
| while (j < n) { | |||||
| if (x[i] < minf) { | |||||
| minf = x[i]; | |||||
| } | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (minf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,220 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) | |||||
| { | |||||
| __asm__ ( | |||||
| "vlrepf %%v0,%3 \n\t" | |||||
| "vlrepf %%v1,%4 \n\t" | |||||
| "srlg %%r0,%0,6 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%1) \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vfmsb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmsb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 0(%%r1,%1) \n\t" | |||||
| "vst %%v29, 16(%%r1,%1) \n\t" | |||||
| "vst %%v30, 32(%%r1,%1) \n\t" | |||||
| "vst %%v31, 48(%%r1,%1) \n\t" | |||||
| "vst %%v20, 0(%%r1,%2) \n\t" | |||||
| "vst %%v21, 16(%%r1,%2) \n\t" | |||||
| "vst %%v22, 32(%%r1,%2) \n\t" | |||||
| "vst %%v23, 48(%%r1,%2) \n\t" | |||||
| "vl %%v24, 64(%%r1,%1) \n\t" | |||||
| "vl %%v25, 80(%%r1,%1) \n\t" | |||||
| "vl %%v26, 96(%%r1,%1) \n\t" | |||||
| "vl %%v27, 112(%%r1,%1) \n\t" | |||||
| "vl %%v16, 64(%%r1,%2) \n\t" | |||||
| "vl %%v17, 80(%%r1,%2) \n\t" | |||||
| "vl %%v18, 96(%%r1,%2) \n\t" | |||||
| "vl %%v19, 112(%%r1,%2) \n\t" | |||||
| "vfmsb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmsb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 64(%%r1,%1) \n\t" | |||||
| "vst %%v29, 80(%%r1,%1) \n\t" | |||||
| "vst %%v30, 96(%%r1,%1) \n\t" | |||||
| "vst %%v31, 112(%%r1,%1) \n\t" | |||||
| "vst %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%2) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v16, 128(%%r1,%2) \n\t" | |||||
| "vl %%v17, 144(%%r1,%2) \n\t" | |||||
| "vl %%v18, 160(%%r1,%2) \n\t" | |||||
| "vl %%v19, 176(%%r1,%2) \n\t" | |||||
| "vfmsb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmsb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 128(%%r1,%1) \n\t" | |||||
| "vst %%v29, 144(%%r1,%1) \n\t" | |||||
| "vst %%v30, 160(%%r1,%1) \n\t" | |||||
| "vst %%v31, 176(%%r1,%1) \n\t" | |||||
| "vst %%v20, 128(%%r1,%2) \n\t" | |||||
| "vst %%v21, 144(%%r1,%2) \n\t" | |||||
| "vst %%v22, 160(%%r1,%2) \n\t" | |||||
| "vst %%v23, 176(%%r1,%2) \n\t" | |||||
| "vl %%v24, 192(%%r1,%1) \n\t" | |||||
| "vl %%v25, 208(%%r1,%1) \n\t" | |||||
| "vl %%v26, 224(%%r1,%1) \n\t" | |||||
| "vl %%v27, 240(%%r1,%1) \n\t" | |||||
| "vl %%v16, 192(%%r1,%2) \n\t" | |||||
| "vl %%v17, 208(%%r1,%2) \n\t" | |||||
| "vl %%v18, 224(%%r1,%2) \n\t" | |||||
| "vl %%v19, 240(%%r1,%2) \n\t" | |||||
| "vfmsb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmsb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmsb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 192(%%r1,%1) \n\t" | |||||
| "vst %%v29, 208(%%r1,%1) \n\t" | |||||
| "vst %%v30, 224(%%r1,%1) \n\t" | |||||
| "vst %%v31, 240(%%r1,%1) \n\t" | |||||
| "vst %%v20, 192(%%r1,%2) \n\t" | |||||
| "vst %%v21, 208(%%r1,%2) \n\t" | |||||
| "vst %%v22, 224(%%r1,%2) \n\t" | |||||
| "vst %%v23, 240(%%r1,%2) \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) | |||||
| :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { | |||||
| __asm__("vlrepf %%v0,%[c]\n\t" | |||||
| "vlrepf %%v1,%[s]\n\t" | |||||
| "srlg %[n],%[n],6\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" | |||||
| "pfd 2, 1024(%%r1,%[y])\n\t" | |||||
| "vl %%v24, 0(%%r1,%[x])\n\t" | |||||
| "vl %%v25, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 0(%%r1,%[y])\n\t" | |||||
| "vl %%v17, 16(%%r1,%[y])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[y])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[y])\n\t" | |||||
| "vfmsb %%v28,%%v24,%%v0\n\t" | |||||
| "vfmsb %%v29,%%v25,%%v0\n\t" | |||||
| "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v30,%%v26,%%v0\n\t" | |||||
| "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v31,%%v27,%%v0\n\t" | |||||
| "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ | |||||
| /* 2nd parts */ | |||||
| "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" | |||||
| "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" | |||||
| "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" | |||||
| "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" | |||||
| "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 0(%%r1,%[x])\n\t" | |||||
| "vst %%v29, 16(%%r1,%[x])\n\t" | |||||
| "vst %%v30, 32(%%r1,%[x])\n\t" | |||||
| "vst %%v31, 48(%%r1,%[x])\n\t" | |||||
| "vst %%v20, 0(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 16(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 32(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 48(%%r1,%[y])\n\t" | |||||
| "vl %%v24, 64(%%r1,%[x])\n\t" | |||||
| "vl %%v25, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 112(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 64(%%r1,%[y])\n\t" | |||||
| "vl %%v17, 80(%%r1,%[y])\n\t" | |||||
| "vl %%v18, 96(%%r1,%[y])\n\t" | |||||
| "vl %%v19, 112(%%r1,%[y])\n\t" | |||||
| "vfmsb %%v28,%%v24,%%v0\n\t" | |||||
| "vfmsb %%v29,%%v25,%%v0\n\t" | |||||
| "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v30,%%v26,%%v0\n\t" | |||||
| "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v31,%%v27,%%v0\n\t" | |||||
| "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ | |||||
| /* 2nd parts */ | |||||
| "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" | |||||
| "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" | |||||
| "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" | |||||
| "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" | |||||
| "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 64(%%r1,%[x])\n\t" | |||||
| "vst %%v29, 80(%%r1,%[x])\n\t" | |||||
| "vst %%v30, 96(%%r1,%[x])\n\t" | |||||
| "vst %%v31, 112(%%r1,%[x])\n\t" | |||||
| "vst %%v20, 64(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 80(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 96(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 112(%%r1,%[y])\n\t" | |||||
| "vl %%v24, 128(%%r1,%[x])\n\t" | |||||
| "vl %%v25, 144(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 160(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 176(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 128(%%r1,%[y])\n\t" | |||||
| "vl %%v17, 144(%%r1,%[y])\n\t" | |||||
| "vl %%v18, 160(%%r1,%[y])\n\t" | |||||
| "vl %%v19, 176(%%r1,%[y])\n\t" | |||||
| "vfmsb %%v28,%%v24,%%v0\n\t" | |||||
| "vfmsb %%v29,%%v25,%%v0\n\t" | |||||
| "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v30,%%v26,%%v0\n\t" | |||||
| "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v31,%%v27,%%v0\n\t" | |||||
| "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ | |||||
| /* 2nd parts */ | |||||
| "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" | |||||
| "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" | |||||
| "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" | |||||
| "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" | |||||
| "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 128(%%r1,%[x])\n\t" | |||||
| "vst %%v29, 144(%%r1,%[x])\n\t" | |||||
| "vst %%v30, 160(%%r1,%[x])\n\t" | |||||
| "vst %%v31, 176(%%r1,%[x])\n\t" | |||||
| "vst %%v20, 128(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 144(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 160(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 176(%%r1,%[y])\n\t" | |||||
| "vl %%v24, 192(%%r1,%[x])\n\t" | |||||
| "vl %%v25, 208(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 224(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 240(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 192(%%r1,%[y])\n\t" | |||||
| "vl %%v17, 208(%%r1,%[y])\n\t" | |||||
| "vl %%v18, 224(%%r1,%[y])\n\t" | |||||
| "vl %%v19, 240(%%r1,%[y])\n\t" | |||||
| "vfmsb %%v28,%%v24,%%v0\n\t" | |||||
| "vfmsb %%v29,%%v25,%%v0\n\t" | |||||
| "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v30,%%v26,%%v0\n\t" | |||||
| "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmsb %%v31,%%v27,%%v0\n\t" | |||||
| "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ | |||||
| /* 2nd parts */ | |||||
| "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" | |||||
| "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" | |||||
| "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" | |||||
| "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ | |||||
| "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" | |||||
| "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 192(%%r1,%[x])\n\t" | |||||
| "vst %%v29, 208(%%r1,%[x])\n\t" | |||||
| "vst %%v30, 224(%%r1,%[x])\n\t" | |||||
| "vst %%v31, 240(%%r1,%[x])\n\t" | |||||
| "vst %%v20, 192(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 208(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 224(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 240(%%r1,%[y])\n\t" | |||||
| "agfi %%r1,256\n\t" | |||||
| "brctg %[n],0b" | |||||
| : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) | |||||
| : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) | |||||
| : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", | |||||
| "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", | |||||
| "v31"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT temp; | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||||
| FLOAT c, FLOAT s) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| if ( n <= 0 ) return(0); | |||||
| FLOAT temp; | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| if (n <= 0) | |||||
| return (0); | |||||
| BLASLONG n1 = n & -64; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| FLOAT cosa,sina; | |||||
| cosa=c; | |||||
| sina=s; | |||||
| srot_kernel_64(n1, x, y, &cosa, &sina); | |||||
| i=n1; | |||||
| } | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| while(i < n) | |||||
| { | |||||
| temp = c*x[i] + s*y[i] ; | |||||
| y[i] = c*y[i] - s*x[i] ; | |||||
| x[i] = temp ; | |||||
| i++ ; | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| FLOAT cosa, sina; | |||||
| cosa = c; | |||||
| sina = s; | |||||
| srot_kernel_64(n1, x, y, &cosa, &sina); | |||||
| i = n1; | |||||
| } | |||||
| } | |||||
| while (i < n) { | |||||
| temp = c * x[i] + s * y[i]; | |||||
| y[i] = c * y[i] - s * x[i]; | |||||
| x[i] = temp; | |||||
| i++; | |||||
| } | } | ||||
| else | |||||
| { | |||||
| while(i < n) | |||||
| { | |||||
| temp = c*x[ix] + s*y[iy] ; | |||||
| y[iy] = c*y[iy] - s*x[ix] ; | |||||
| x[ix] = temp ; | |||||
| } else { | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| while (i < n) { | |||||
| temp = c * x[ix] + s * y[iy]; | |||||
| y[iy] = c * y[iy] - s * x[ix]; | |||||
| x[ix] = temp; | |||||
| } | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| } | } | ||||
| return(0); | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,175 +27,147 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) | |||||
| { | |||||
| __asm__ volatile ( | |||||
| "vlrepf %%v0,%1 \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%2) \n\t" | |||||
| "vfmsb %%v24,%%v24,%%v0 \n\t" | |||||
| "vst %%v24, 0(%%r1,%2) \n\t" | |||||
| "vl %%v25, 16(%%r1,%2) \n\t" | |||||
| "vfmsb %%v25,%%v25,%%v0 \n\t" | |||||
| "vst %%v25, 16(%%r1,%2) \n\t" | |||||
| "vl %%v26, 32(%%r1,%2) \n\t" | |||||
| "vfmsb %%v26,%%v26,%%v0 \n\t" | |||||
| "vst %%v26, 32(%%r1,%2) \n\t" | |||||
| "vl %%v27, 48(%%r1,%2) \n\t" | |||||
| "vfmsb %%v27,%%v27,%%v0 \n\t" | |||||
| "vst %%v27, 48(%%r1,%2) \n\t" | |||||
| "vl %%v24, 64(%%r1,%2) \n\t" | |||||
| "vfmsb %%v24,%%v24,%%v0 \n\t" | |||||
| "vst %%v24, 64(%%r1,%2) \n\t" | |||||
| "vl %%v25, 80(%%r1,%2) \n\t" | |||||
| "vfmsb %%v25,%%v25,%%v0 \n\t" | |||||
| "vst %%v25, 80(%%r1,%2) \n\t" | |||||
| "vl %%v26, 96(%%r1,%2) \n\t" | |||||
| "vfmsb %%v26,%%v26,%%v0 \n\t" | |||||
| "vst %%v26, 96(%%r1,%2) \n\t" | |||||
| "vl %%v27, 112(%%r1,%2) \n\t" | |||||
| "vfmsb %%v27,%%v27,%%v0 \n\t" | |||||
| "vst %%v27, 112(%%r1,%2) \n\t" | |||||
| "agfi %%r1,128 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v24","v25","v26","v27" | |||||
| ); | |||||
| static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) { | |||||
| __asm__("vlrepf %%v0,%[da]\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v24,0(%%r1,%[x])\n\t" | |||||
| "vfmsb %%v24,%%v24,%%v0\n\t" | |||||
| "vst %%v24,0(%%r1,%[x])\n\t" | |||||
| "vl %%v25,16(%%r1,%[x])\n\t" | |||||
| "vfmsb %%v25,%%v25,%%v0\n\t" | |||||
| "vst %%v25,16(%%r1,%[x])\n\t" | |||||
| "vl %%v26,32(%%r1,%[x])\n\t" | |||||
| "vfmsb %%v26,%%v26,%%v0\n\t" | |||||
| "vst %%v26,32(%%r1,%[x])\n\t" | |||||
| "vl %%v27,48(%%r1,%[x])\n\t" | |||||
| "vfmsb %%v27,%%v27,%%v0\n\t" | |||||
| "vst %%v27,48(%%r1,%[x])\n\t" | |||||
| "vl %%v28,64(%%r1,%[x])\n\t" | |||||
| "vfmsb %%v28,%%v28,%%v0\n\t" | |||||
| "vst %%v28,64(%%r1,%[x])\n\t" | |||||
| "vl %%v29,80(%%r1,%[x])\n\t" | |||||
| "vfmsb %%v29,%%v29,%%v0\n\t" | |||||
| "vst %%v29,80(%%r1,%[x])\n\t" | |||||
| "vl %%v30,96(%%r1,%[x])\n\t" | |||||
| "vfmsb %%v30,%%v30,%%v0\n\t" | |||||
| "vst %%v30,96(%%r1,%[x])\n\t" | |||||
| "vl %%v31,112(%%r1,%[x])\n\t" | |||||
| "vfmsb %%v31,%%v31,%%v0\n\t" | |||||
| "vst %%v31,112(%%r1,%[x])\n\t" | |||||
| "agfi %%r1,128\n\t" | |||||
| "brctg %[n],0b" | |||||
| : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) | |||||
| : [x] "a"(x),[da] "m"(da) | |||||
| : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", | |||||
| "v31"); | |||||
| } | } | ||||
| static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "vzero %%v24 \n\t" | |||||
| "vzero %%v25 \n\t" | |||||
| "vzero %%v26 \n\t" | |||||
| "vzero %%v27 \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%1) \n\t" | |||||
| "vst %%v24,0(%%r1,%1) \n\t" | |||||
| "vst %%v25,16(%%r1,%1) \n\t" | |||||
| "vst %%v26,32(%%r1,%1) \n\t" | |||||
| "vst %%v27,48(%%r1,%1) \n\t" | |||||
| "vst %%v24,64(%%r1,%1) \n\t" | |||||
| "vst %%v25,80(%%r1,%1) \n\t" | |||||
| "vst %%v26,96(%%r1,%1) \n\t" | |||||
| "vst %%v27,112(%%r1,%1) \n\t" | |||||
| "agfi %%r1,128 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZR"((FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v24","v25","v26","v27" | |||||
| ); | |||||
| static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) { | |||||
| __asm__("vzero %%v0\n\t" | |||||
| "srlg %[n],%[n],5\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" | |||||
| "vst %%v0,0(%%r1,%[x])\n\t" | |||||
| "vst %%v0,16(%%r1,%[x])\n\t" | |||||
| "vst %%v0,32(%%r1,%[x])\n\t" | |||||
| "vst %%v0,48(%%r1,%[x])\n\t" | |||||
| "vst %%v0,64(%%r1,%[x])\n\t" | |||||
| "vst %%v0,80(%%r1,%[x])\n\t" | |||||
| "vst %%v0,96(%%r1,%[x])\n\t" | |||||
| "vst %%v0,112(%%r1,%[x])\n\t" | |||||
| "agfi %%r1,128\n\t" | |||||
| "brctg %[n],0b" | |||||
| : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) | |||||
| : [x] "a"(x) | |||||
| : "cc", "r1", "v0"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i=0,j=0; | |||||
| if ( n <= 0 || inc_x <=0 ) | |||||
| return(0); | |||||
| if ( inc_x == 1 ) | |||||
| { | |||||
| if ( da == 0.0 ) | |||||
| { | |||||
| BLASLONG n1 = n & -32; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| sscal_kernel_32_zero(n1, x); | |||||
| j=n1; | |||||
| } | |||||
| while(j < n) | |||||
| { | |||||
| x[j]=0.0; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| BLASLONG n1 = n & -32; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| sscal_kernel_32(n1, da, x); | |||||
| j=n1; | |||||
| } | |||||
| while(j < n) | |||||
| { | |||||
| x[j] = da * x[j] ; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||||
| BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||||
| BLASLONG dummy2) { | |||||
| BLASLONG i = 0, j = 0; | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (0); | |||||
| if (inc_x == 1) { | |||||
| if (da == 0.0) { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| sscal_kernel_32_zero(n1, x); | |||||
| j = n1; | |||||
| } | |||||
| while (j < n) { | |||||
| x[j] = 0.0; | |||||
| j++; | |||||
| } | |||||
| } else { | |||||
| BLASLONG n1 = n & -32; | |||||
| if (n1 > 0) { | |||||
| sscal_kernel_32(n1, da, x); | |||||
| j = n1; | |||||
| } | |||||
| while (j < n) { | |||||
| x[j] = da * x[j]; | |||||
| j++; | |||||
| } | |||||
| } | } | ||||
| else | |||||
| { | |||||
| if ( da == 0.0 ) | |||||
| { | |||||
| } else { | |||||
| BLASLONG n1 = n & -2; | |||||
| if (da == 0.0) { | |||||
| while (j < n1) { | |||||
| BLASLONG n1 = n & -2; | |||||
| x[i]=0.0; | |||||
| x[i + inc_x]=0.0; | |||||
| while (j < n1) { | |||||
| i += inc_x * 2; | |||||
| j += 2; | |||||
| x[i] = 0.0; | |||||
| x[i + inc_x] = 0.0; | |||||
| } | |||||
| while(j < n) | |||||
| { | |||||
| i += inc_x * 2; | |||||
| j += 2; | |||||
| x[i]=0.0; | |||||
| i += inc_x ; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| while (j < n) { | |||||
| } | |||||
| else | |||||
| { | |||||
| BLASLONG n1 = n & -2; | |||||
| x[i] = 0.0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| while (j < n1) { | |||||
| } else { | |||||
| BLASLONG n1 = n & -2; | |||||
| x[i] = da * x[i] ; | |||||
| x[i + inc_x] = da * x[i + inc_x]; | |||||
| while (j < n1) { | |||||
| i += inc_x * 2; | |||||
| j += 2; | |||||
| x[i] = da * x[i]; | |||||
| x[i + inc_x] = da * x[i + inc_x]; | |||||
| } | |||||
| i += inc_x * 2; | |||||
| j += 2; | |||||
| while(j < n) | |||||
| { | |||||
| } | |||||
| x[i] = da * x[i] ; | |||||
| i += inc_x ; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| while (j < n) { | |||||
| x[i] = da * x[i]; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| } | } | ||||
| return 0; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,138 +27,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "srlg %%r0,%0,6 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%1) \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%1) \n\t" | |||||
| "vl %%v17, 16(%%r1,%1) \n\t" | |||||
| "vl %%v18, 32(%%r1,%1) \n\t" | |||||
| "vl %%v19, 48(%%r1,%1) \n\t" | |||||
| "vl %%v20, 64(%%r1,%1) \n\t" | |||||
| "vl %%v21, 80(%%r1,%1) \n\t" | |||||
| "vl %%v22, 96(%%r1,%1) \n\t" | |||||
| "vl %%v23, 112(%%r1,%1) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v28, 192(%%r1,%1) \n\t" | |||||
| "vl %%v29, 208(%%r1,%1) \n\t" | |||||
| "vl %%v30, 224(%%r1,%1) \n\t" | |||||
| "vl %%v31, 240(%%r1,%1) \n\t" | |||||
| "vl %%v0, 0(%%r1,%2) \n\t" | |||||
| "vl %%v1, 16(%%r1,%2) \n\t" | |||||
| "vl %%v2, 32(%%r1,%2) \n\t" | |||||
| "vl %%v3, 48(%%r1,%2) \n\t" | |||||
| "vl %%v4, 64(%%r1,%2) \n\t" | |||||
| "vl %%v5, 80(%%r1,%2) \n\t" | |||||
| "vl %%v6, 96(%%r1,%2) \n\t" | |||||
| "vl %%v7, 112(%%r1,%2) \n\t" | |||||
| "vst %%v0, 0(%%r1,%1) \n\t" | |||||
| "vst %%v1, 16(%%r1,%1) \n\t" | |||||
| "vst %%v2, 32(%%r1,%1) \n\t" | |||||
| "vst %%v3, 48(%%r1,%1) \n\t" | |||||
| "vst %%v4, 64(%%r1,%1) \n\t" | |||||
| "vst %%v5, 80(%%r1,%1) \n\t" | |||||
| "vst %%v6, 96(%%r1,%1) \n\t" | |||||
| "vst %%v7, 112(%%r1,%1) \n\t" | |||||
| "vl %%v0, 128(%%r1,%2) \n\t" | |||||
| "vl %%v1, 144(%%r1,%2) \n\t" | |||||
| "vl %%v2, 160(%%r1,%2) \n\t" | |||||
| "vl %%v3, 176(%%r1,%2) \n\t" | |||||
| "vl %%v4, 192(%%r1,%2) \n\t" | |||||
| "vl %%v5, 208(%%r1,%2) \n\t" | |||||
| "vl %%v6, 224(%%r1,%2) \n\t" | |||||
| "vl %%v7, 240(%%r1,%2) \n\t" | |||||
| "vst %%v0, 128(%%r1,%1) \n\t" | |||||
| "vst %%v1, 144(%%r1,%1) \n\t" | |||||
| "vst %%v2, 160(%%r1,%1) \n\t" | |||||
| "vst %%v3, 176(%%r1,%1) \n\t" | |||||
| "vst %%v4, 192(%%r1,%1) \n\t" | |||||
| "vst %%v5, 208(%%r1,%1) \n\t" | |||||
| "vst %%v6, 224(%%r1,%1) \n\t" | |||||
| "vst %%v7, 240(%%r1,%1) \n\t" | |||||
| "vst %%v16, 0(%%r1,%2) \n\t" | |||||
| "vst %%v17, 16(%%r1,%2) \n\t" | |||||
| "vst %%v18, 32(%%r1,%2) \n\t" | |||||
| "vst %%v19, 48(%%r1,%2) \n\t" | |||||
| "vst %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%2) \n\t" | |||||
| "vst %%v24, 128(%%r1,%2) \n\t" | |||||
| "vst %%v25, 144(%%r1,%2) \n\t" | |||||
| "vst %%v26, 160(%%r1,%2) \n\t" | |||||
| "vst %%v27, 176(%%r1,%2) \n\t" | |||||
| "vst %%v28, 192(%%r1,%2) \n\t" | |||||
| "vst %%v29, 208(%%r1,%2) \n\t" | |||||
| "vst %%v30, 224(%%r1,%2) \n\t" | |||||
| "vst %%v31, 240(%%r1,%2) \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| /* Exchange the contents of x and y, 256 bytes of each array per loop pass; the pass count is n >> 6 and the driver in this file only calls with n a positive multiple of 64. */ static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| __asm__("srlg %[n],%[n],6\n\t" /* n >>= 6: number of loop passes */ | |||||
| "xgr %%r1,%%r1\n\t" /* r1 = 0: running byte offset applied to both x and y */ | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" /* prefetch hint for the data 1024 bytes ahead in x */ | |||||
| "pfd 2, 1024(%%r1,%[y])\n\t" /* same for y */ | |||||
| "vl %%v16, 0(%%r1,%[x])\n\t" /* v16..v31 <- current 256 bytes of x */ | |||||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 64(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 112(%%r1,%[x])\n\t" | |||||
| "vl %%v24, 128(%%r1,%[x])\n\t" | |||||
| "vl %%v25, 144(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 160(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 176(%%r1,%[x])\n\t" | |||||
| "vl %%v28, 192(%%r1,%[x])\n\t" | |||||
| "vl %%v29, 208(%%r1,%[x])\n\t" | |||||
| "vl %%v30, 224(%%r1,%[x])\n\t" | |||||
| "vl %%v31, 240(%%r1,%[x])\n\t" | |||||
| "vl %%v0, 0(%%r1,%[y])\n\t" /* v0..v7 <- first 128 bytes of y */ | |||||
| "vl %%v1, 16(%%r1,%[y])\n\t" | |||||
| "vl %%v2, 32(%%r1,%[y])\n\t" | |||||
| "vl %%v3, 48(%%r1,%[y])\n\t" | |||||
| "vl %%v4, 64(%%r1,%[y])\n\t" | |||||
| "vl %%v5, 80(%%r1,%[y])\n\t" | |||||
| "vl %%v6, 96(%%r1,%[y])\n\t" | |||||
| "vl %%v7, 112(%%r1,%[y])\n\t" | |||||
| "vst %%v0, 0(%%r1,%[x])\n\t" /* first 128 bytes of x <- y (x was already saved in v16..v23) */ | |||||
| "vst %%v1, 16(%%r1,%[x])\n\t" | |||||
| "vst %%v2, 32(%%r1,%[x])\n\t" | |||||
| "vst %%v3, 48(%%r1,%[x])\n\t" | |||||
| "vst %%v4, 64(%%r1,%[x])\n\t" | |||||
| "vst %%v5, 80(%%r1,%[x])\n\t" | |||||
| "vst %%v6, 96(%%r1,%[x])\n\t" | |||||
| "vst %%v7, 112(%%r1,%[x])\n\t" | |||||
| "vl %%v0, 128(%%r1,%[y])\n\t" /* v0..v7 reused for the second 128 bytes of y */ | |||||
| "vl %%v1, 144(%%r1,%[y])\n\t" | |||||
| "vl %%v2, 160(%%r1,%[y])\n\t" | |||||
| "vl %%v3, 176(%%r1,%[y])\n\t" | |||||
| "vl %%v4, 192(%%r1,%[y])\n\t" | |||||
| "vl %%v5, 208(%%r1,%[y])\n\t" | |||||
| "vl %%v6, 224(%%r1,%[y])\n\t" | |||||
| "vl %%v7, 240(%%r1,%[y])\n\t" | |||||
| "vst %%v0, 128(%%r1,%[x])\n\t" /* second 128 bytes of x <- y */ | |||||
| "vst %%v1, 144(%%r1,%[x])\n\t" | |||||
| "vst %%v2, 160(%%r1,%[x])\n\t" | |||||
| "vst %%v3, 176(%%r1,%[x])\n\t" | |||||
| "vst %%v4, 192(%%r1,%[x])\n\t" | |||||
| "vst %%v5, 208(%%r1,%[x])\n\t" | |||||
| "vst %%v6, 224(%%r1,%[x])\n\t" | |||||
| "vst %%v7, 240(%%r1,%[x])\n\t" | |||||
| "vst %%v16, 0(%%r1,%[y])\n\t" /* y <- the 256 bytes of x saved in v16..v31 */ | |||||
| "vst %%v17, 16(%%r1,%[y])\n\t" | |||||
| "vst %%v18, 32(%%r1,%[y])\n\t" | |||||
| "vst %%v19, 48(%%r1,%[y])\n\t" | |||||
| "vst %%v20, 64(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 80(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 96(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 112(%%r1,%[y])\n\t" | |||||
| "vst %%v24, 128(%%r1,%[y])\n\t" | |||||
| "vst %%v25, 144(%%r1,%[y])\n\t" | |||||
| "vst %%v26, 160(%%r1,%[y])\n\t" | |||||
| "vst %%v27, 176(%%r1,%[y])\n\t" | |||||
| "vst %%v28, 192(%%r1,%[y])\n\t" | |||||
| "vst %%v29, 208(%%r1,%[y])\n\t" | |||||
| "vst %%v30, 224(%%r1,%[y])\n\t" | |||||
| "vst %%v31, 240(%%r1,%[y])\n\t" | |||||
| "agfi %%r1,256\n\t" /* advance the byte offset by one 256-byte chunk */ | |||||
| "brctg %[n],0b" /* decrement the pass counter and loop while it is non-zero */ | |||||
| : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) /* both arrays are read and written over n FLOATs; n doubles as the loop counter */ | |||||
| : [x] "a"(x),[y] "a"(y) /* base addresses, in address registers */ | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", | |||||
| "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |||||
| "v27", "v28", "v29", "v30", "v31"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT temp; | |||||
| if ( n <= 0 ) return(0); | |||||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||||
| { | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||||
| BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||||
| BLASLONG dummy2) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| FLOAT temp; | |||||
| BLASLONG n1 = n & -64; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| sswap_kernel_64(n1, x, y); | |||||
| i=n1; | |||||
| } | |||||
| if (n <= 0) | |||||
| return (0); | |||||
| while(i < n) | |||||
| { | |||||
| temp = y[i]; | |||||
| y[i] = x[i] ; | |||||
| x[i] = temp; | |||||
| i++ ; | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| } | |||||
| BLASLONG n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| sswap_kernel_64(n1, x, y); | |||||
| i = n1; | |||||
| } | |||||
| while (i < n) { | |||||
| temp = y[i]; | |||||
| y[i] = x[i]; | |||||
| x[i] = temp; | |||||
| i++; | |||||
| } | } | ||||
| else | |||||
| { | |||||
| while(i < n) | |||||
| { | |||||
| temp = y[iy]; | |||||
| y[iy] = x[ix] ; | |||||
| x[ix] = temp; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } else { | |||||
| } | |||||
| while (i < n) { | |||||
| temp = y[iy]; | |||||
| y[iy] = x[ix]; | |||||
| x[ix] = temp; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| } | } | ||||
| return(0); | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,184 +28,165 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| #define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) | |||||
| static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT amax; | |||||
| __asm__ volatile ( | |||||
| "vleg %%v0,0(%2),0 \n\t" | |||||
| "vleg %%v16,8(%2),0 \n\t" | |||||
| "vleg %%v0,16(%2),1 \n\t" | |||||
| "vleg %%v16,24(%2),1 \n\t" | |||||
| "vflpdb %%v0,%%v0 \n\t" | |||||
| "vflpdb %%v16,%%v16 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v16 \n\t" | |||||
| "srlg %%r0,%1,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vleg %%v16,0(%%r1,%2),0 \n\t" | |||||
| "vleg %%v17,8(%%r1,%2),0 \n\t" | |||||
| "vleg %%v16,16(%%r1,%2),1 \n\t" | |||||
| "vleg %%v17,24(%%r1,%2),1 \n\t" | |||||
| "vleg %%v18,32(%%r1,%2),0 \n\t" | |||||
| "vleg %%v19,40(%%r1,%2),0 \n\t" | |||||
| "vleg %%v18,48(%%r1,%2),1 \n\t" | |||||
| "vleg %%v19,56(%%r1,%2),1 \n\t" | |||||
| "vleg %%v20,64(%%r1,%2),0 \n\t" | |||||
| "vleg %%v21,72(%%r1,%2),0 \n\t" | |||||
| "vleg %%v20,80(%%r1,%2),1 \n\t" | |||||
| "vleg %%v21,88(%%r1,%2),1 \n\t" | |||||
| "vleg %%v22,96(%%r1,%2),0 \n\t" | |||||
| "vleg %%v23,104(%%r1,%2),0 \n\t" | |||||
| "vleg %%v22,112(%%r1,%2),1 \n\t" | |||||
| "vleg %%v23,120(%%r1,%2),1 \n\t" | |||||
| "vleg %%v24,128(%%r1,%2),0 \n\t" | |||||
| "vleg %%v25,136(%%r1,%2),0 \n\t" | |||||
| "vleg %%v24,144(%%r1,%2),1 \n\t" | |||||
| "vleg %%v25,152(%%r1,%2),1 \n\t" | |||||
| "vleg %%v26,160(%%r1,%2),0 \n\t" | |||||
| "vleg %%v27,168(%%r1,%2),0 \n\t" | |||||
| "vleg %%v26,176(%%r1,%2),1 \n\t" | |||||
| "vleg %%v27,184(%%r1,%2),1 \n\t" | |||||
| "vleg %%v28,192(%%r1,%2),0 \n\t" | |||||
| "vleg %%v29,200(%%r1,%2),0 \n\t" | |||||
| "vleg %%v28,208(%%r1,%2),1 \n\t" | |||||
| "vleg %%v29,216(%%r1,%2),1 \n\t" | |||||
| "vleg %%v30,224(%%r1,%2),0 \n\t" | |||||
| "vleg %%v31,232(%%r1,%2),0 \n\t" | |||||
| "vleg %%v30,240(%%r1,%2),1 \n\t" | |||||
| "vleg %%v31,248(%%r1,%2),1 \n\t" | |||||
| "vflpdb %%v16,%%v16 \n\t" | |||||
| "vflpdb %%v17,%%v17 \n\t" | |||||
| "vflpdb %%v18,%%v18 \n\t" | |||||
| "vflpdb %%v19,%%v19 \n\t" | |||||
| "vflpdb %%v20,%%v20 \n\t" | |||||
| "vflpdb %%v21,%%v21 \n\t" | |||||
| "vflpdb %%v22,%%v22 \n\t" | |||||
| "vflpdb %%v23,%%v23 \n\t" | |||||
| "vflpdb %%v24,%%v24 \n\t" | |||||
| "vflpdb %%v25,%%v25 \n\t" | |||||
| "vflpdb %%v26,%%v26 \n\t" | |||||
| "vflpdb %%v27,%%v27 \n\t" | |||||
| "vflpdb %%v28,%%v28 \n\t" | |||||
| "vflpdb %%v29,%%v29 \n\t" | |||||
| "vflpdb %%v30,%%v30 \n\t" | |||||
| "vflpdb %%v31,%%v31 \n\t" | |||||
| "vfadb %%v16,%%v16,%%v17 \n\t" | |||||
| "vfadb %%v18,%%v18,%%v19 \n\t" | |||||
| "vfadb %%v20,%%v20,%%v21 \n\t" | |||||
| "vfadb %%v22,%%v22,%%v23 \n\t" | |||||
| "vfadb %%v24,%%v24,%%v25 \n\t" | |||||
| "vfadb %%v26,%%v26,%%v27 \n\t" | |||||
| "vfadb %%v28,%%v28,%%v29 \n\t" | |||||
| "vfadb %%v30,%%v30,%%v31 \n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v24,0 \n\t" | |||||
| "vfmaxdb %%v18,%%v18,%%v26,0 \n\t" | |||||
| "vfmaxdb %%v20,%%v20,%%v28,0 \n\t" | |||||
| "vfmaxdb %%v22,%%v22,%%v30,0 \n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v20,0 \n\t" | |||||
| "vfmaxdb %%v18,%%v18,%%v22,0 \n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v18,0 \n\t" | |||||
| "vfmaxdb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v16,%%v0,1 \n\t" | |||||
| "wfmaxdb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "ldr %0,%%f0 " | |||||
| :"=f"(amax) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return amax; | |||||
| #define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) | |||||
| /* Return the largest abs(re) + abs(im) among the complex values stored interleaved (re, im) in x; 16 values (256 bytes) are examined per loop pass and the pass count is n >> 4. */ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { | |||||
| FLOAT amax; | |||||
| __asm__("vleg %%v0,0(%[x]),0\n\t" /* seed: v0 gets the real parts, v16 the imaginary parts, of the first two values */ | |||||
| "vleg %%v16,8(%[x]),0\n\t" | |||||
| "vleg %%v0,16(%[x]),1\n\t" | |||||
| "vleg %%v16,24(%[x]),1\n\t" | |||||
| "vflpdb %%v0,%%v0\n\t" /* absolute values */ | |||||
| "vflpdb %%v16,%%v16\n\t" | |||||
| "vfadb %%v0,%%v0,%%v16\n\t" /* v0 = abs(re) + abs(im): initial running maximum */ | |||||
| "srlg %[n],%[n],4\n\t" /* n >>= 4: number of loop passes */ | |||||
| "xgr %%r1,%%r1\n\t" /* r1 = 0: running byte offset into x */ | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" /* prefetch hint for the data 1024 bytes ahead */ | |||||
| "vleg %%v16,0(%%r1,%[x]),0\n\t" /* even registers collect real parts, odd registers imaginary parts, two values per pair */ | |||||
| "vleg %%v17,8(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v16,16(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v17,24(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v18,32(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v19,40(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v18,48(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v19,56(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v20,64(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v21,72(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v20,80(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v21,88(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v22,96(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v23,104(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v22,112(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v23,120(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v24,128(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v25,136(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v24,144(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v25,152(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v26,160(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v27,168(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v26,176(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v27,184(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v28,192(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v29,200(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v28,208(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v29,216(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v30,224(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v31,232(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v30,240(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v31,248(%%r1,%[x]),1\n\t" | |||||
| "vflpdb %%v16,%%v16\n\t" /* absolute value of every loaded component */ | |||||
| "vflpdb %%v17,%%v17\n\t" | |||||
| "vflpdb %%v18,%%v18\n\t" | |||||
| "vflpdb %%v19,%%v19\n\t" | |||||
| "vflpdb %%v20,%%v20\n\t" | |||||
| "vflpdb %%v21,%%v21\n\t" | |||||
| "vflpdb %%v22,%%v22\n\t" | |||||
| "vflpdb %%v23,%%v23\n\t" | |||||
| "vflpdb %%v24,%%v24\n\t" | |||||
| "vflpdb %%v25,%%v25\n\t" | |||||
| "vflpdb %%v26,%%v26\n\t" | |||||
| "vflpdb %%v27,%%v27\n\t" | |||||
| "vflpdb %%v28,%%v28\n\t" | |||||
| "vflpdb %%v29,%%v29\n\t" | |||||
| "vflpdb %%v30,%%v30\n\t" | |||||
| "vflpdb %%v31,%%v31\n\t" | |||||
| "vfadb %%v16,%%v16,%%v17\n\t" /* even registers now hold abs(re) + abs(im) */ | |||||
| "vfadb %%v18,%%v18,%%v19\n\t" | |||||
| "vfadb %%v20,%%v20,%%v21\n\t" | |||||
| "vfadb %%v22,%%v22,%%v23\n\t" | |||||
| "vfadb %%v24,%%v24,%%v25\n\t" | |||||
| "vfadb %%v26,%%v26,%%v27\n\t" | |||||
| "vfadb %%v28,%%v28,%%v29\n\t" | |||||
| "vfadb %%v30,%%v30,%%v31\n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v24,0\n\t" /* pairwise maxima reduce the eight sums down to v16 */ | |||||
| "vfmaxdb %%v18,%%v18,%%v26,0\n\t" | |||||
| "vfmaxdb %%v20,%%v20,%%v28,0\n\t" | |||||
| "vfmaxdb %%v22,%%v22,%%v30,0\n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v20,0\n\t" | |||||
| "vfmaxdb %%v18,%%v18,%%v22,0\n\t" | |||||
| "vfmaxdb %%v16,%%v16,%%v18,0\n\t" | |||||
| "vfmaxdb %%v0,%%v0,%%v16,0\n\t" /* fold this pass into the running maximum v0 */ | |||||
| "agfi %%r1, 256\n\t" /* advance the byte offset by one 256-byte chunk */ | |||||
| "brctg %[n], 0b\n\t" /* decrement the pass counter and loop while it is non-zero */ | |||||
| "vrepg %%v16,%%v0,1\n\t" /* bring lane 1 of v0 alongside lane 0 */ | |||||
| "wfmaxdb %%v0,%%v0,%%v16,0\n\t" /* scalar maximum of the two lanes */ | |||||
| "ldr %[amax],%%f0" /* copy the result out of f0 */ | |||||
| : [amax] "=f"(amax),[n] "+&r"(n) /* n doubles as the loop counter */ | |||||
| : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) /* n complex values occupy n * 2 FLOATs */ | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return amax; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | ||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) return (maxf); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| maxf = zamax_kernel_16(n1, x); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| maxf=CABS1(x,0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x,ix) > maxf) { | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (maxf); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| maxf = zamax_kernel_16(n1, x); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| maxf=CABS1(x,0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| if (CABS1(x,ix) > maxf) { | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2) > maxf) { | |||||
| maxf = CABS1(x,ix+inc_x2); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2*2) > maxf) { | |||||
| maxf = CABS1(x,ix+inc_x2*2); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2*3) > maxf) { | |||||
| maxf = CABS1(x,ix+inc_x2*3); | |||||
| } | |||||
| ix += inc_x2 * 4; | |||||
| i += 4; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x,ix) > maxf) { | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| maxf = CABS1(x, 0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) > maxf) { | |||||
| maxf = CABS1(x, ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| } else { | |||||
| maxf = CABS1(x, 0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| if (CABS1(x, ix) > maxf) { | |||||
| maxf = CABS1(x, ix); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2) > maxf) { | |||||
| maxf = CABS1(x, ix + inc_x2); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2 * 2) > maxf) { | |||||
| maxf = CABS1(x, ix + inc_x2 * 2); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2 * 3) > maxf) { | |||||
| maxf = CABS1(x, ix + inc_x2 * 3); | |||||
| } | |||||
| ix += inc_x2 * 4; | |||||
| i += 4; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) > maxf) { | |||||
| maxf = CABS1(x, ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | } | ||||
| return (maxf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,194 +28,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| #define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) | |||||
| static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT amax; | |||||
| __asm__ volatile ( | |||||
| "vleg %%v0,0(%2),0 \n\t" | |||||
| "vleg %%v16,8(%2),0 \n\t" | |||||
| "vleg %%v0,16(%2),1 \n\t" | |||||
| "vleg %%v16,24(%2),1 \n\t" | |||||
| "vflpdb %%v0,%%v0 \n\t" | |||||
| "vflpdb %%v16,%%v16 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v16 \n\t" | |||||
| "srlg %%r0,%1,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vleg %%v16,0(%%r1,%2),0 \n\t" | |||||
| "vleg %%v17,8(%%r1,%2),0 \n\t" | |||||
| "vleg %%v16,16(%%r1,%2),1 \n\t" | |||||
| "vleg %%v17,24(%%r1,%2),1 \n\t" | |||||
| "vleg %%v18,32(%%r1,%2),0 \n\t" | |||||
| "vleg %%v19,40(%%r1,%2),0 \n\t" | |||||
| "vleg %%v18,48(%%r1,%2),1 \n\t" | |||||
| "vleg %%v19,56(%%r1,%2),1 \n\t" | |||||
| "vleg %%v20,64(%%r1,%2),0 \n\t" | |||||
| "vleg %%v21,72(%%r1,%2),0 \n\t" | |||||
| "vleg %%v20,80(%%r1,%2),1 \n\t" | |||||
| "vleg %%v21,88(%%r1,%2),1 \n\t" | |||||
| "vleg %%v22,96(%%r1,%2),0 \n\t" | |||||
| "vleg %%v23,104(%%r1,%2),0 \n\t" | |||||
| "vleg %%v22,112(%%r1,%2),1 \n\t" | |||||
| "vleg %%v23,120(%%r1,%2),1 \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfadb %%v16,%%v16,%%v17 \n\t" | |||||
| "vfadb %%v17,%%v18,%%v19 \n\t" | |||||
| "vfadb %%v18,%%v20,%%v21 \n\t" | |||||
| "vfadb %%v19,%%v22,%%v23 \n\t" | |||||
| "vfchdb %%v24,%%v16,%%v17 \n\t" | |||||
| "vfchdb %%v25,%%v18,%%v19 \n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24 \n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25 \n\t" | |||||
| "vfchdb %%v26,%%v24,%%v25 \n\t" | |||||
| "vsel %%v26,%%v24,%%v25,%%v26 \n\t" | |||||
| "vfchdb %%v27,%%v26,%%v0 \n\t" | |||||
| "vsel %%v0,%%v26,%%v0,%%v27 \n\t" | |||||
| "vleg %%v16,128(%%r1,%2),0 \n\t" | |||||
| "vleg %%v17,136(%%r1,%2),0 \n\t" | |||||
| "vleg %%v16,144(%%r1,%2),1 \n\t" | |||||
| "vleg %%v17,152(%%r1,%2),1 \n\t" | |||||
| "vleg %%v18,160(%%r1,%2),0 \n\t" | |||||
| "vleg %%v19,168(%%r1,%2),0 \n\t" | |||||
| "vleg %%v18,176(%%r1,%2),1 \n\t" | |||||
| "vleg %%v19,184(%%r1,%2),1 \n\t" | |||||
| "vleg %%v20,192(%%r1,%2),0 \n\t" | |||||
| "vleg %%v21,200(%%r1,%2),0 \n\t" | |||||
| "vleg %%v20,208(%%r1,%2),1 \n\t" | |||||
| "vleg %%v21,216(%%r1,%2),1 \n\t" | |||||
| "vleg %%v22,224(%%r1,%2),0 \n\t" | |||||
| "vleg %%v23,232(%%r1,%2),0 \n\t" | |||||
| "vleg %%v22,240(%%r1,%2),1 \n\t" | |||||
| "vleg %%v23,248(%%r1,%2),1 \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfadb %%v16,%%v16,%%v17 \n\t" | |||||
| "vfadb %%v17,%%v18,%%v19 \n\t" | |||||
| "vfadb %%v18,%%v20,%%v21 \n\t" | |||||
| "vfadb %%v19,%%v22,%%v23 \n\t" | |||||
| "vfchdb %%v24,%%v16,%%v17 \n\t" | |||||
| "vfchdb %%v25,%%v18,%%v19 \n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24 \n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25 \n\t" | |||||
| "vfchdb %%v26,%%v24,%%v25 \n\t" | |||||
| "vsel %%v26,%%v24,%%v25,%%v26 \n\t" | |||||
| "vfchdb %%v27,%%v26,%%v0 \n\t" | |||||
| "vsel %%v0,%%v26,%%v0,%%v27 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v16,%%v0,1 \n\t" | |||||
| "wfchdb %%v17,%%v0,%%v16 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v17 \n\t" | |||||
| "ldr %0,%%f0 " | |||||
| :"=f"(amax) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" | |||||
| ); | |||||
| return amax; | |||||
| #define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) | |||||
| /* Return the largest abs(re) + abs(im) among the complex values stored interleaved (re, im) in x; 16 values (256 bytes) are examined per loop pass, in two halves of 8, and the pass count is n >> 4. */ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { | |||||
| FLOAT amax; | |||||
| __asm__("vleg %%v0,0(%[x]),0\n\t" /* seed: v0 gets the real parts, v16 the imaginary parts, of the first two values */ | |||||
| "vleg %%v16,8(%[x]),0\n\t" | |||||
| "vleg %%v0,16(%[x]),1\n\t" | |||||
| "vleg %%v16,24(%[x]),1\n\t" | |||||
| "vflpdb %%v0,%%v0\n\t" /* absolute values */ | |||||
| "vflpdb %%v16,%%v16\n\t" | |||||
| "vfadb %%v0,%%v0,%%v16\n\t" /* v0 = abs(re) + abs(im): initial running maximum */ | |||||
| "srlg %[n],%[n],4\n\t" /* n >>= 4: number of loop passes */ | |||||
| "xgr %%r1,%%r1\n\t" /* r1 = 0: running byte offset into x */ | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" /* prefetch hint for the data 1024 bytes ahead */ | |||||
| "vleg %%v16,0(%%r1,%[x]),0\n\t" /* first half: even registers collect real parts, odd registers imaginary parts */ | |||||
| "vleg %%v17,8(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v16,16(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v17,24(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v18,32(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v19,40(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v18,48(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v19,56(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v20,64(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v21,72(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v20,80(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v21,88(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v22,96(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v23,104(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v22,112(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v23,120(%%r1,%[x]),1\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" /* absolute value of every loaded component */ | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfadb %%v16,%%v16,%%v17\n\t" /* v16..v19 = abs(re) + abs(im), two values each */ | |||||
| "vfadb %%v17,%%v18,%%v19\n\t" | |||||
| "vfadb %%v18,%%v20,%%v21\n\t" | |||||
| "vfadb %%v19,%%v22,%%v23\n\t" | |||||
| "vfchdb %%v24,%%v16,%%v17\n\t" /* compare-greater mask, then vsel keeps the larger lane: a lane-wise maximum */ | |||||
| "vfchdb %%v25,%%v18,%%v19\n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24\n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25\n\t" | |||||
| "vfchdb %%v26,%%v24,%%v25\n\t" | |||||
| "vsel %%v26,%%v24,%%v25,%%v26\n\t" | |||||
| "vfchdb %%v27,%%v26,%%v0\n\t" | |||||
| "vsel %%v0,%%v26,%%v0,%%v27\n\t" /* fold the first half into the running maximum v0 */ | |||||
| "vleg %%v16,128(%%r1,%[x]),0\n\t" /* second half: same sequence on the next 128 bytes */ | |||||
| "vleg %%v17,136(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v16,144(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v17,152(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v18,160(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v19,168(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v18,176(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v19,184(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v20,192(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v21,200(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v20,208(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v21,216(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v22,224(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v23,232(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v22,240(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v23,248(%%r1,%[x]),1\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfadb %%v16,%%v16,%%v17\n\t" | |||||
| "vfadb %%v17,%%v18,%%v19\n\t" | |||||
| "vfadb %%v18,%%v20,%%v21\n\t" | |||||
| "vfadb %%v19,%%v22,%%v23\n\t" | |||||
| "vfchdb %%v24,%%v16,%%v17\n\t" | |||||
| "vfchdb %%v25,%%v18,%%v19\n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24\n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25\n\t" | |||||
| "vfchdb %%v26,%%v24,%%v25\n\t" | |||||
| "vsel %%v26,%%v24,%%v25,%%v26\n\t" | |||||
| "vfchdb %%v27,%%v26,%%v0\n\t" | |||||
| "vsel %%v0,%%v26,%%v0,%%v27\n\t" /* fold the second half into the running maximum v0 */ | |||||
| "agfi %%r1, 256\n\t" /* advance the byte offset by one 256-byte chunk */ | |||||
| "brctg %[n], 0b\n\t" /* decrement the pass counter and loop while it is non-zero */ | |||||
| "vrepg %%v16,%%v0,1\n\t" /* bring lane 1 of v0 alongside lane 0 */ | |||||
| "wfchdb %%v17,%%v0,%%v16\n\t" /* scalar compare, then select the larger of the two lanes */ | |||||
| "vsel %%v0,%%v0,%%v16,%%v17\n\t" | |||||
| "ldr %[amax],%%f0" /* copy the result out of f0 */ | |||||
| : [amax] "=f"(amax),[n] "+&r"(n) /* n doubles as the loop counter */ | |||||
| : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) /* n complex values occupy n * 2 FLOATs; with no "memory" clobber this operand must cover every element the asm reads */ | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27"); | |||||
| return amax; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | ||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) return (maxf); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| maxf = zamax_kernel_16(n1, x); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| maxf=CABS1(x,0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x,ix) > maxf) { | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT maxf = 0.0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (maxf); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| maxf = zamax_kernel_16(n1, x); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| maxf=CABS1(x,0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| if (CABS1(x,ix) > maxf) { | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2) > maxf) { | |||||
| maxf = CABS1(x,ix+inc_x2); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2*2) > maxf) { | |||||
| maxf = CABS1(x,ix+inc_x2*2); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2*3) > maxf) { | |||||
| maxf = CABS1(x,ix+inc_x2*3); | |||||
| } | |||||
| ix += inc_x2 * 4; | |||||
| i += 4; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x,ix) > maxf) { | |||||
| maxf = CABS1(x,ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| maxf = CABS1(x, 0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) > maxf) { | |||||
| maxf = CABS1(x, ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (maxf); | |||||
| } else { | |||||
| maxf = CABS1(x, 0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| if (CABS1(x, ix) > maxf) { | |||||
| maxf = CABS1(x, ix); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2) > maxf) { | |||||
| maxf = CABS1(x, ix + inc_x2); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2 * 2) > maxf) { | |||||
| maxf = CABS1(x, ix + inc_x2 * 2); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2 * 3) > maxf) { | |||||
| maxf = CABS1(x, ix + inc_x2 * 3); | |||||
| } | |||||
| ix += inc_x2 * 4; | |||||
| i += 4; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) > maxf) { | |||||
| maxf = CABS1(x, ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | } | ||||
| return (maxf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,184 +28,165 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| #define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) | |||||
| static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT amin; | |||||
| __asm__ volatile ( | |||||
| "vleg %%v0,0(%2),0 \n\t" | |||||
| "vleg %%v16,8(%2),0 \n\t" | |||||
| "vleg %%v0,16(%2),1 \n\t" | |||||
| "vleg %%v16,24(%2),1 \n\t" | |||||
| "vflpdb %%v0,%%v0 \n\t" | |||||
| "vflpdb %%v16,%%v16 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v16 \n\t" | |||||
| "srlg %%r0,%1,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vleg %%v16,0(%%r1,%2),0 \n\t" | |||||
| "vleg %%v17,8(%%r1,%2),0 \n\t" | |||||
| "vleg %%v16,16(%%r1,%2),1 \n\t" | |||||
| "vleg %%v17,24(%%r1,%2),1 \n\t" | |||||
| "vleg %%v18,32(%%r1,%2),0 \n\t" | |||||
| "vleg %%v19,40(%%r1,%2),0 \n\t" | |||||
| "vleg %%v18,48(%%r1,%2),1 \n\t" | |||||
| "vleg %%v19,56(%%r1,%2),1 \n\t" | |||||
| "vleg %%v20,64(%%r1,%2),0 \n\t" | |||||
| "vleg %%v21,72(%%r1,%2),0 \n\t" | |||||
| "vleg %%v20,80(%%r1,%2),1 \n\t" | |||||
| "vleg %%v21,88(%%r1,%2),1 \n\t" | |||||
| "vleg %%v22,96(%%r1,%2),0 \n\t" | |||||
| "vleg %%v23,104(%%r1,%2),0 \n\t" | |||||
| "vleg %%v22,112(%%r1,%2),1 \n\t" | |||||
| "vleg %%v23,120(%%r1,%2),1 \n\t" | |||||
| "vleg %%v24,128(%%r1,%2),0 \n\t" | |||||
| "vleg %%v25,136(%%r1,%2),0 \n\t" | |||||
| "vleg %%v24,144(%%r1,%2),1 \n\t" | |||||
| "vleg %%v25,152(%%r1,%2),1 \n\t" | |||||
| "vleg %%v26,160(%%r1,%2),0 \n\t" | |||||
| "vleg %%v27,168(%%r1,%2),0 \n\t" | |||||
| "vleg %%v26,176(%%r1,%2),1 \n\t" | |||||
| "vleg %%v27,184(%%r1,%2),1 \n\t" | |||||
| "vleg %%v28,192(%%r1,%2),0 \n\t" | |||||
| "vleg %%v29,200(%%r1,%2),0 \n\t" | |||||
| "vleg %%v28,208(%%r1,%2),1 \n\t" | |||||
| "vleg %%v29,216(%%r1,%2),1 \n\t" | |||||
| "vleg %%v30,224(%%r1,%2),0 \n\t" | |||||
| "vleg %%v31,232(%%r1,%2),0 \n\t" | |||||
| "vleg %%v30,240(%%r1,%2),1 \n\t" | |||||
| "vleg %%v31,248(%%r1,%2),1 \n\t" | |||||
| "vflpdb %%v16,%%v16 \n\t" | |||||
| "vflpdb %%v17,%%v17 \n\t" | |||||
| "vflpdb %%v18,%%v18 \n\t" | |||||
| "vflpdb %%v19,%%v19 \n\t" | |||||
| "vflpdb %%v20,%%v20 \n\t" | |||||
| "vflpdb %%v21,%%v21 \n\t" | |||||
| "vflpdb %%v22,%%v22 \n\t" | |||||
| "vflpdb %%v23,%%v23 \n\t" | |||||
| "vflpdb %%v24,%%v24 \n\t" | |||||
| "vflpdb %%v25,%%v25 \n\t" | |||||
| "vflpdb %%v26,%%v26 \n\t" | |||||
| "vflpdb %%v27,%%v27 \n\t" | |||||
| "vflpdb %%v28,%%v28 \n\t" | |||||
| "vflpdb %%v29,%%v29 \n\t" | |||||
| "vflpdb %%v30,%%v30 \n\t" | |||||
| "vflpdb %%v31,%%v31 \n\t" | |||||
| "vfadb %%v16,%%v16,%%v17 \n\t" | |||||
| "vfadb %%v18,%%v18,%%v19 \n\t" | |||||
| "vfadb %%v20,%%v20,%%v21 \n\t" | |||||
| "vfadb %%v22,%%v22,%%v23 \n\t" | |||||
| "vfadb %%v24,%%v24,%%v25 \n\t" | |||||
| "vfadb %%v26,%%v26,%%v27 \n\t" | |||||
| "vfadb %%v28,%%v28,%%v29 \n\t" | |||||
| "vfadb %%v30,%%v30,%%v31 \n\t" | |||||
| "vfmindb %%v16,%%v16,%%v24,0 \n\t" | |||||
| "vfmindb %%v18,%%v18,%%v26,0 \n\t" | |||||
| "vfmindb %%v20,%%v20,%%v28,0 \n\t" | |||||
| "vfmindb %%v22,%%v22,%%v30,0 \n\t" | |||||
| "vfmindb %%v16,%%v16,%%v20,0 \n\t" | |||||
| "vfmindb %%v18,%%v18,%%v22,0 \n\t" | |||||
| "vfmindb %%v16,%%v16,%%v18,0 \n\t" | |||||
| "vfmindb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v16,%%v0,1 \n\t" | |||||
| "wfmindb %%v0,%%v0,%%v16,0 \n\t" | |||||
| "ldr %0,%%f0 " | |||||
| :"=f"(amin) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return amin; | |||||
| #define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) | |||||
| static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { | |||||
| FLOAT amin; | |||||
| __asm__("vleg %%v0,0(%[x]),0\n\t" | |||||
| "vleg %%v16,8(%[x]),0\n\t" | |||||
| "vleg %%v0,16(%[x]),1\n\t" | |||||
| "vleg %%v16,24(%[x]),1\n\t" | |||||
| "vflpdb %%v0,%%v0\n\t" | |||||
| "vflpdb %%v16,%%v16\n\t" | |||||
| "vfadb %%v0,%%v0,%%v16\n\t" | |||||
| "srlg %[n],%[n],4\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vleg %%v16,0(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v17,8(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v16,16(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v17,24(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v18,32(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v19,40(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v18,48(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v19,56(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v20,64(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v21,72(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v20,80(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v21,88(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v22,96(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v23,104(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v22,112(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v23,120(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v24,128(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v25,136(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v24,144(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v25,152(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v26,160(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v27,168(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v26,176(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v27,184(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v28,192(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v29,200(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v28,208(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v29,216(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v30,224(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v31,232(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v30,240(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v31,248(%%r1,%[x]),1\n\t" | |||||
| "vflpdb %%v16,%%v16\n\t" | |||||
| "vflpdb %%v17,%%v17\n\t" | |||||
| "vflpdb %%v18,%%v18\n\t" | |||||
| "vflpdb %%v19,%%v19\n\t" | |||||
| "vflpdb %%v20,%%v20\n\t" | |||||
| "vflpdb %%v21,%%v21\n\t" | |||||
| "vflpdb %%v22,%%v22\n\t" | |||||
| "vflpdb %%v23,%%v23\n\t" | |||||
| "vflpdb %%v24,%%v24\n\t" | |||||
| "vflpdb %%v25,%%v25\n\t" | |||||
| "vflpdb %%v26,%%v26\n\t" | |||||
| "vflpdb %%v27,%%v27\n\t" | |||||
| "vflpdb %%v28,%%v28\n\t" | |||||
| "vflpdb %%v29,%%v29\n\t" | |||||
| "vflpdb %%v30,%%v30\n\t" | |||||
| "vflpdb %%v31,%%v31\n\t" | |||||
| "vfadb %%v16,%%v16,%%v17\n\t" | |||||
| "vfadb %%v18,%%v18,%%v19\n\t" | |||||
| "vfadb %%v20,%%v20,%%v21\n\t" | |||||
| "vfadb %%v22,%%v22,%%v23\n\t" | |||||
| "vfadb %%v24,%%v24,%%v25\n\t" | |||||
| "vfadb %%v26,%%v26,%%v27\n\t" | |||||
| "vfadb %%v28,%%v28,%%v29\n\t" | |||||
| "vfadb %%v30,%%v30,%%v31\n\t" | |||||
| "vfmindb %%v16,%%v16,%%v24,0\n\t" | |||||
| "vfmindb %%v18,%%v18,%%v26,0\n\t" | |||||
| "vfmindb %%v20,%%v20,%%v28,0\n\t" | |||||
| "vfmindb %%v22,%%v22,%%v30,0\n\t" | |||||
| "vfmindb %%v16,%%v16,%%v20,0\n\t" | |||||
| "vfmindb %%v18,%%v18,%%v22,0\n\t" | |||||
| "vfmindb %%v16,%%v16,%%v18,0\n\t" | |||||
| "vfmindb %%v0,%%v0,%%v16,0\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "vrepg %%v16,%%v0,1\n\t" | |||||
| "wfmindb %%v0,%%v0,%%v16,0\n\t" | |||||
| "ldr %[amin],%%f0" | |||||
| : [amin] "=f"(amin),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return amin; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | ||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) return (minf); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| minf = zamin_kernel_16(n1, x); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| minf=CABS1(x,0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x,ix) < minf) { | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (minf); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| minf = zamin_kernel_16(n1, x); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| minf = CABS1(x, 0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) < minf) { | |||||
| minf = CABS1(x, ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| minf=CABS1(x,0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| minf = CABS1(x, 0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| if (CABS1(x,ix) < minf) { | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2) < minf) { | |||||
| minf = CABS1(x,ix+inc_x2); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2*2) < minf) { | |||||
| minf = CABS1(x,ix+inc_x2*2); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2*3) < minf) { | |||||
| minf = CABS1(x,ix+inc_x2*3); | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| ix += inc_x2 * 4; | |||||
| if (CABS1(x, ix) < minf) { | |||||
| minf = CABS1(x, ix); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2) < minf) { | |||||
| minf = CABS1(x, ix + inc_x2); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2 * 2) < minf) { | |||||
| minf = CABS1(x, ix + inc_x2 * 2); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2 * 3) < minf) { | |||||
| minf = CABS1(x, ix + inc_x2 * 3); | |||||
| } | |||||
| i += 4; | |||||
| ix += inc_x2 * 4; | |||||
| } | |||||
| i += 4; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x,ix) < minf) { | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) < minf) { | |||||
| minf = CABS1(x, ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | } | ||||
| return (minf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,194 +28,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| #define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) | |||||
| static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT amin; | |||||
| __asm__ volatile ( | |||||
| "vleg %%v0,0(%2),0 \n\t" | |||||
| "vleg %%v16,8(%2),0 \n\t" | |||||
| "vleg %%v0,16(%2),1 \n\t" | |||||
| "vleg %%v16,24(%2),1 \n\t" | |||||
| "vflpdb %%v0,%%v0 \n\t" | |||||
| "vflpdb %%v16,%%v16 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v16 \n\t" | |||||
| "srlg %%r0,%1,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vleg %%v16,0(%%r1,%2),0 \n\t" | |||||
| "vleg %%v17,8(%%r1,%2),0 \n\t" | |||||
| "vleg %%v16,16(%%r1,%2),1 \n\t" | |||||
| "vleg %%v17,24(%%r1,%2),1 \n\t" | |||||
| "vleg %%v18,32(%%r1,%2),0 \n\t" | |||||
| "vleg %%v19,40(%%r1,%2),0 \n\t" | |||||
| "vleg %%v18,48(%%r1,%2),1 \n\t" | |||||
| "vleg %%v19,56(%%r1,%2),1 \n\t" | |||||
| "vleg %%v20,64(%%r1,%2),0 \n\t" | |||||
| "vleg %%v21,72(%%r1,%2),0 \n\t" | |||||
| "vleg %%v20,80(%%r1,%2),1 \n\t" | |||||
| "vleg %%v21,88(%%r1,%2),1 \n\t" | |||||
| "vleg %%v22,96(%%r1,%2),0 \n\t" | |||||
| "vleg %%v23,104(%%r1,%2),0 \n\t" | |||||
| "vleg %%v22,112(%%r1,%2),1 \n\t" | |||||
| "vleg %%v23,120(%%r1,%2),1 \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfadb %%v16,%%v16,%%v17 \n\t" | |||||
| "vfadb %%v17,%%v18,%%v19 \n\t" | |||||
| "vfadb %%v18,%%v20,%%v21 \n\t" | |||||
| "vfadb %%v19,%%v22,%%v23 \n\t" | |||||
| "vfchdb %%v24,%%v17,%%v16 \n\t" | |||||
| "vfchdb %%v25,%%v19,%%v18 \n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24 \n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25 \n\t" | |||||
| "vfchdb %%v26,%%v25,%%v24 \n\t" | |||||
| "vsel %%v26,%%v24,%%v25,%%v26 \n\t" | |||||
| "vfchdb %%v27,%%v0,%%v26 \n\t" | |||||
| "vsel %%v0,%%v26,%%v0,%%v27 \n\t" | |||||
| "vleg %%v16,128(%%r1,%2),0 \n\t" | |||||
| "vleg %%v17,136(%%r1,%2),0 \n\t" | |||||
| "vleg %%v16,144(%%r1,%2),1 \n\t" | |||||
| "vleg %%v17,152(%%r1,%2),1 \n\t" | |||||
| "vleg %%v18,160(%%r1,%2),0 \n\t" | |||||
| "vleg %%v19,168(%%r1,%2),0 \n\t" | |||||
| "vleg %%v18,176(%%r1,%2),1 \n\t" | |||||
| "vleg %%v19,184(%%r1,%2),1 \n\t" | |||||
| "vleg %%v20,192(%%r1,%2),0 \n\t" | |||||
| "vleg %%v21,200(%%r1,%2),0 \n\t" | |||||
| "vleg %%v20,208(%%r1,%2),1 \n\t" | |||||
| "vleg %%v21,216(%%r1,%2),1 \n\t" | |||||
| "vleg %%v22,224(%%r1,%2),0 \n\t" | |||||
| "vleg %%v23,232(%%r1,%2),0 \n\t" | |||||
| "vleg %%v22,240(%%r1,%2),1 \n\t" | |||||
| "vleg %%v23,248(%%r1,%2),1 \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfadb %%v16,%%v16,%%v17 \n\t" | |||||
| "vfadb %%v17,%%v18,%%v19 \n\t" | |||||
| "vfadb %%v18,%%v20,%%v21 \n\t" | |||||
| "vfadb %%v19,%%v22,%%v23 \n\t" | |||||
| "vfchdb %%v24,%%v17,%%v16 \n\t" | |||||
| "vfchdb %%v25,%%v19,%%v18 \n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24 \n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25 \n\t" | |||||
| "vfchdb %%v26,%%v25,%%v24 \n\t" | |||||
| "vsel %%v26,%%v24,%%v25,%%v26 \n\t" | |||||
| "vfchdb %%v27,%%v0,%%v26 \n\t" | |||||
| "vsel %%v0,%%v26,%%v0,%%v27 \n\t" | |||||
| "agfi %%r1, 256 \n\t" | |||||
| "brctg %%r0, 0b \n\t" | |||||
| "vrepg %%v16,%%v0,1 \n\t" | |||||
| "wfchdb %%v17,%%v16,%%v0 \n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v17 \n\t" | |||||
| "ldr %0,%%f0 " | |||||
| :"=f"(amin) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" | |||||
| ); | |||||
| return amin; | |||||
| #define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) | |||||
| static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { | |||||
| FLOAT amin; | |||||
| __asm__("vleg %%v0,0(%[x]),0\n\t" | |||||
| "vleg %%v16,8(%[x]),0\n\t" | |||||
| "vleg %%v0,16(%[x]),1\n\t" | |||||
| "vleg %%v16,24(%[x]),1\n\t" | |||||
| "vflpdb %%v0,%%v0\n\t" | |||||
| "vflpdb %%v16,%%v16\n\t" | |||||
| "vfadb %%v0,%%v0,%%v16\n\t" | |||||
| "srlg %[n],%[n],4\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vleg %%v16,0(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v17,8(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v16,16(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v17,24(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v18,32(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v19,40(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v18,48(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v19,56(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v20,64(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v21,72(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v20,80(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v21,88(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v22,96(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v23,104(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v22,112(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v23,120(%%r1,%[x]),1\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfadb %%v16,%%v16,%%v17\n\t" | |||||
| "vfadb %%v17,%%v18,%%v19\n\t" | |||||
| "vfadb %%v18,%%v20,%%v21\n\t" | |||||
| "vfadb %%v19,%%v22,%%v23\n\t" | |||||
| "vfchdb %%v24,%%v17,%%v16\n\t" | |||||
| "vfchdb %%v25,%%v19,%%v18\n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24\n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25\n\t" | |||||
| "vfchdb %%v26,%%v25,%%v24\n\t" | |||||
| "vsel %%v26,%%v24,%%v25,%%v26\n\t" | |||||
| "vfchdb %%v27,%%v0,%%v26\n\t" | |||||
| "vsel %%v0,%%v26,%%v0,%%v27\n\t" | |||||
| "vleg %%v16,128(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v17,136(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v16,144(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v17,152(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v18,160(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v19,168(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v18,176(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v19,184(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v20,192(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v21,200(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v20,208(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v21,216(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v22,224(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v23,232(%%r1,%[x]),0\n\t" | |||||
| "vleg %%v22,240(%%r1,%[x]),1\n\t" | |||||
| "vleg %%v23,248(%%r1,%[x]),1\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfadb %%v16,%%v16,%%v17\n\t" | |||||
| "vfadb %%v17,%%v18,%%v19\n\t" | |||||
| "vfadb %%v18,%%v20,%%v21\n\t" | |||||
| "vfadb %%v19,%%v22,%%v23\n\t" | |||||
| "vfchdb %%v24,%%v17,%%v16\n\t" | |||||
| "vfchdb %%v25,%%v19,%%v18\n\t" | |||||
| "vsel %%v24,%%v16,%%v17,%%v24\n\t" | |||||
| "vsel %%v25,%%v18,%%v19,%%v25\n\t" | |||||
| "vfchdb %%v26,%%v25,%%v24\n\t" | |||||
| "vsel %%v26,%%v24,%%v25,%%v26\n\t" | |||||
| "vfchdb %%v27,%%v0,%%v26\n\t" | |||||
| "vsel %%v0,%%v26,%%v0,%%v27\n\t" | |||||
| "agfi %%r1, 256\n\t" | |||||
| "brctg %[n], 0b\n\t" | |||||
| "vrepg %%v16,%%v0,1\n\t" | |||||
| "wfchdb %%v17,%%v16,%%v0\n\t" | |||||
| "vsel %%v0,%%v0,%%v16,%%v17\n\t" | |||||
| "ldr %[amin],%%f0" | |||||
| : [amin] "=f"(amin),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23", "v24", "v25", "v26", "v27"); | |||||
| return amin; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | ||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) return (minf); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| minf = zamin_kernel_16(n1, x); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| } | |||||
| else | |||||
| { | |||||
| minf=CABS1(x,0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x,ix) < minf) { | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0; | |||||
| FLOAT minf = 0.0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (minf); | |||||
| if (inc_x == 1) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| minf = zamin_kernel_16(n1, x); | |||||
| ix = n1 * 2; | |||||
| i = n1; | |||||
| } else { | } else { | ||||
| minf = CABS1(x, 0); | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) < minf) { | |||||
| minf = CABS1(x, ix); | |||||
| } | |||||
| ix += 2; | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| minf=CABS1(x,0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| minf = CABS1(x, 0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| if (CABS1(x,ix) < minf) { | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2) < minf) { | |||||
| minf = CABS1(x,ix+inc_x2); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2*2) < minf) { | |||||
| minf = CABS1(x,ix+inc_x2*2); | |||||
| } | |||||
| if (CABS1(x,ix+inc_x2*3) < minf) { | |||||
| minf = CABS1(x,ix+inc_x2*3); | |||||
| } | |||||
| BLASLONG n1 = n & -4; | |||||
| while (i < n1) { | |||||
| ix += inc_x2 * 4; | |||||
| if (CABS1(x, ix) < minf) { | |||||
| minf = CABS1(x, ix); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2) < minf) { | |||||
| minf = CABS1(x, ix + inc_x2); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2 * 2) < minf) { | |||||
| minf = CABS1(x, ix + inc_x2 * 2); | |||||
| } | |||||
| if (CABS1(x, ix + inc_x2 * 3) < minf) { | |||||
| minf = CABS1(x, ix + inc_x2 * 3); | |||||
| } | |||||
| i += 4; | |||||
| ix += inc_x2 * 4; | |||||
| } | |||||
| i += 4; | |||||
| } | |||||
| while (i < n) { | |||||
| if (CABS1(x,ix) < minf) { | |||||
| minf = CABS1(x,ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | |||||
| return (minf); | |||||
| while (i < n) { | |||||
| if (CABS1(x, ix) < minf) { | |||||
| minf = CABS1(x, ix); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | } | ||||
| return (minf); | |||||
| } | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -28,138 +28,126 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | #define ABS fabs | ||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| FLOAT asum; | |||||
| __asm__ ( | |||||
| "vzero %%v0 \n\t" | |||||
| "vzero %%v1 \n\t" | |||||
| "vzero %%v2 \n\t" | |||||
| "vzero %%v3 \n\t" | |||||
| "srlg %%r0,%1,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vl %%v20, 64(%%r1,%2) \n\t" | |||||
| "vl %%v21, 80(%%r1,%2) \n\t" | |||||
| "vl %%v22, 96(%%r1,%2) \n\t" | |||||
| "vl %%v23, 112(%%r1,%2) \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v16 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v17 \n\t" | |||||
| "vfadb %%v2,%%v2,%%v18 \n\t" | |||||
| "vfadb %%v3,%%v3,%%v19 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v20 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v21 \n\t" | |||||
| "vfadb %%v2,%%v2,%%v22 \n\t" | |||||
| "vfadb %%v3,%%v3,%%v23 \n\t" | |||||
| "vl %%v16, 128(%%r1,%2) \n\t" | |||||
| "vl %%v17, 144(%%r1,%2) \n\t" | |||||
| "vl %%v18, 160(%%r1,%2) \n\t" | |||||
| "vl %%v19, 176(%%r1,%2) \n\t" | |||||
| "vl %%v20, 192(%%r1,%2) \n\t" | |||||
| "vl %%v21, 208(%%r1,%2) \n\t" | |||||
| "vl %%v22, 224(%%r1,%2) \n\t" | |||||
| "vl %%v23, 240(%%r1,%2) \n\t" | |||||
| "vflpdb %%v16, %%v16 \n\t" | |||||
| "vflpdb %%v17, %%v17 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vflpdb %%v19, %%v19 \n\t" | |||||
| "vflpdb %%v20, %%v20 \n\t" | |||||
| "vflpdb %%v21, %%v21 \n\t" | |||||
| "vflpdb %%v22, %%v22 \n\t" | |||||
| "vflpdb %%v23, %%v23 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v16 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v17 \n\t" | |||||
| "vfadb %%v2,%%v2,%%v18 \n\t" | |||||
| "vfadb %%v3,%%v3,%%v19 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v20 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v21 \n\t" | |||||
| "vfadb %%v2,%%v2,%%v22 \n\t" | |||||
| "vfadb %%v3,%%v3,%%v23 \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "brctg %%r0,0b \n\t" | |||||
| "vfadb %%v0,%%v0,%%v1 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v2 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v3 \n\t" | |||||
| "vrepg %%v1,%%v0,1 \n\t" | |||||
| "adbr %%f0,%%f1 \n\t" | |||||
| "ldr %0,%%f0 " | |||||
| :"=f"(asum) | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" | |||||
| ); | |||||
| return asum; | |||||
| static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { | |||||
| FLOAT asum; | |||||
| __asm__("vzero %%v24\n\t" | |||||
| "vzero %%v25\n\t" | |||||
| "vzero %%v26\n\t" | |||||
| "vzero %%v27\n\t" | |||||
| "vzero %%v28\n\t" | |||||
| "vzero %%v29\n\t" | |||||
| "vzero %%v30\n\t" | |||||
| "vzero %%v31\n\t" | |||||
| "srlg %[n],%[n],4\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 0(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 64(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 112(%%r1,%[x])\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfadb %%v24,%%v24,%%v16\n\t" | |||||
| "vfadb %%v25,%%v25,%%v17\n\t" | |||||
| "vfadb %%v26,%%v26,%%v18\n\t" | |||||
| "vfadb %%v27,%%v27,%%v19\n\t" | |||||
| "vfadb %%v28,%%v28,%%v20\n\t" | |||||
| "vfadb %%v29,%%v29,%%v21\n\t" | |||||
| "vfadb %%v30,%%v30,%%v22\n\t" | |||||
| "vfadb %%v31,%%v31,%%v23\n\t" | |||||
| "vl %%v16, 128(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 144(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 160(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 176(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 192(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 208(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 224(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 240(%%r1,%[x])\n\t" | |||||
| "vflpdb %%v16, %%v16\n\t" | |||||
| "vflpdb %%v17, %%v17\n\t" | |||||
| "vflpdb %%v18, %%v18\n\t" | |||||
| "vflpdb %%v19, %%v19\n\t" | |||||
| "vflpdb %%v20, %%v20\n\t" | |||||
| "vflpdb %%v21, %%v21\n\t" | |||||
| "vflpdb %%v22, %%v22\n\t" | |||||
| "vflpdb %%v23, %%v23\n\t" | |||||
| "vfadb %%v24,%%v24,%%v16\n\t" | |||||
| "vfadb %%v25,%%v25,%%v17\n\t" | |||||
| "vfadb %%v26,%%v26,%%v18\n\t" | |||||
| "vfadb %%v27,%%v27,%%v19\n\t" | |||||
| "vfadb %%v28,%%v28,%%v20\n\t" | |||||
| "vfadb %%v29,%%v29,%%v21\n\t" | |||||
| "vfadb %%v30,%%v30,%%v22\n\t" | |||||
| "vfadb %%v31,%%v31,%%v23\n\t" | |||||
| "agfi %%r1,256\n\t" | |||||
| "brctg %[n],0b\n\t" | |||||
| "vfadb %%v24,%%v24,%%v25\n\t" | |||||
| "vfadb %%v24,%%v24,%%v26\n\t" | |||||
| "vfadb %%v24,%%v24,%%v27\n\t" | |||||
| "vfadb %%v24,%%v24,%%v28\n\t" | |||||
| "vfadb %%v24,%%v24,%%v29\n\t" | |||||
| "vfadb %%v24,%%v24,%%v30\n\t" | |||||
| "vfadb %%v24,%%v24,%%v31\n\t" | |||||
| "vrepg %%v25,%%v24,1\n\t" | |||||
| "vfadb %%v24,%%v24,%%v25\n\t" | |||||
| "vsteg %%v24,%[asum],0" | |||||
| : [asum] "=m"(asum),[n] "+&r"(n) | |||||
| : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) | |||||
| : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||||
| "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return asum; | |||||
| } | } | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ip=0; | |||||
| FLOAT sumf = 0.0; | |||||
| BLASLONG n1; | |||||
| BLASLONG inc_x2; | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ip = 0; | |||||
| FLOAT sumf = 0.0; | |||||
| BLASLONG n1; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (sumf); | |||||
| if ( inc_x == 1 ) | |||||
| { | |||||
| if (inc_x == 1) { | |||||
| n1 = n & -16; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| sumf = zasum_kernel_16(n1, x); | |||||
| i=n1; | |||||
| ip=2*n1; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| sumf += ABS(x[ip]) + ABS(x[ip+1]); | |||||
| i++; | |||||
| ip+=2; | |||||
| } | |||||
| sumf = zasum_kernel_16(n1, x); | |||||
| i = n1; | |||||
| ip = 2 * n1; | |||||
| } | |||||
| while (i < n) { | |||||
| sumf += ABS(x[ip]) + ABS(x[ip + 1]); | |||||
| i++; | |||||
| ip += 2; | |||||
| } | } | ||||
| else | |||||
| { | |||||
| inc_x2 = 2* inc_x; | |||||
| while(i < n) | |||||
| { | |||||
| sumf += ABS(x[ip]) + ABS(x[ip+1]); | |||||
| ip+=inc_x2; | |||||
| i++; | |||||
| } | |||||
| } else { | |||||
| inc_x2 = 2 * inc_x; | |||||
| while (i < n) { | |||||
| sumf += ABS(x[ip]) + ABS(x[ip + 1]); | |||||
| ip += inc_x2; | |||||
| i++; | |||||
| } | } | ||||
| return(sumf); | |||||
| } | |||||
| } | |||||
| return (sumf); | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,144 +27,136 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| __asm__ volatile( | |||||
| static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { | |||||
| __asm__( | |||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| "vlrepg %%v0,0(%3) \n\t" | |||||
| "vleg %%v1,8(%3),0 \n\t" | |||||
| "wflcdb %%v1,%%v1 \n\t" | |||||
| "vleg %%v1,8(%3),1 \n\t" | |||||
| #else | |||||
| "vleg %%v0,0(%3),1 \n\t" | |||||
| "vflcdb %%v0,%%v0 \n\t" | |||||
| "vleg %%v0,0(%3),0 \n\t" | |||||
| "vlrepg %%v1,8(%3) \n\t" | |||||
| "vlrepg %%v0,0(%[alpha])\n\t" | |||||
| "vleg %%v1,8(%[alpha]),0\n\t" | |||||
| "wflcdb %%v1,%%v1\n\t" | |||||
| "vleg %%v1,8(%[alpha]),1\n\t" | |||||
| #else | |||||
| "vleg %%v0,0(%[alpha]),1\n\t" | |||||
| "vflcdb %%v0,%%v0\n\t" | |||||
| "vleg %%v0,0(%[alpha]),0\n\t" | |||||
| "vlrepg %%v1,8(%[alpha])\n\t" | |||||
| #endif | #endif | ||||
| "srlg %%r0,%0,3 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%1) \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%1) \n\t" | |||||
| "vl %%v17,16(%%r1,%1) \n\t" | |||||
| "vl %%v18,32(%%r1,%1) \n\t" | |||||
| "vl %%v19,48(%%r1,%1) \n\t" | |||||
| "vl %%v20,0(%%r1,%2) \n\t" | |||||
| "vl %%v21,16(%%r1,%2) \n\t" | |||||
| "vl %%v22,32(%%r1,%2) \n\t" | |||||
| "vl %%v23,48(%%r1,%2) \n\t" | |||||
| "vpdi %%v24,%%v16,%%v16,4 \n\t" | |||||
| "vpdi %%v25,%%v17,%%v17,4 \n\t" | |||||
| "vpdi %%v26,%%v18,%%v18,4 \n\t" | |||||
| "vpdi %%v27,%%v19,%%v19,4 \n\t" | |||||
| "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" | |||||
| "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" | |||||
| "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" | |||||
| "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" | |||||
| "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" | |||||
| "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" | |||||
| "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" | |||||
| "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" | |||||
| "vst %%v28,0(%%r1,%2) \n\t" | |||||
| "vst %%v29,16(%%r1,%2) \n\t" | |||||
| "vst %%v30,32(%%r1,%2) \n\t" | |||||
| "vst %%v31,48(%%r1,%2) \n\t" | |||||
| "vl %%v16,64(%%r1,%1) \n\t" | |||||
| "vl %%v17,80(%%r1,%1) \n\t" | |||||
| "vl %%v18,96(%%r1,%1) \n\t" | |||||
| "vl %%v19,112(%%r1,%1) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vpdi %%v24,%%v16,%%v16,4 \n\t" | |||||
| "vpdi %%v25,%%v17,%%v17,4 \n\t" | |||||
| "vpdi %%v26,%%v18,%%v18,4 \n\t" | |||||
| "vpdi %%v27,%%v19,%%v19,4 \n\t" | |||||
| "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" | |||||
| "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" | |||||
| "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" | |||||
| "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" | |||||
| "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" | |||||
| "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" | |||||
| "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" | |||||
| "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" | |||||
| "vst %%v28,64(%%r1,%2) \n\t" | |||||
| "vst %%v29,80(%%r1,%2) \n\t" | |||||
| "vst %%v30,96(%%r1,%2) \n\t" | |||||
| "vst %%v31,112(%%r1,%2) \n\t" | |||||
| "agfi %%r1,128 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) | |||||
| :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| "srlg %[n],%[n],3\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "pfd 2, 1024(%%r1,%[y])\n\t" | |||||
| "vl %%v8,0(%%r1,%[x])\n\t" | |||||
| "vl %%v9,16(%%r1,%[x])\n\t" | |||||
| "vl %%v10,32(%%r1,%[x])\n\t" | |||||
| "vl %%v11,48(%%r1,%[x])\n\t" | |||||
| "vl %%v12,0(%%r1,%[y])\n\t" | |||||
| "vl %%v13,16(%%r1,%[y])\n\t" | |||||
| "vl %%v14,32(%%r1,%[y])\n\t" | |||||
| "vl %%v15,48(%%r1,%[y])\n\t" | |||||
| "vl %%v16,64(%%r1,%[x])\n\t" | |||||
| "vl %%v17,80(%%r1,%[x])\n\t" | |||||
| "vl %%v18,96(%%r1,%[x])\n\t" | |||||
| "vl %%v19,112(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[y])\n\t" | |||||
| "vl %%v21,80(%%r1,%[y])\n\t" | |||||
| "vl %%v22,96(%%r1,%[y])\n\t" | |||||
| "vl %%v23,112(%%r1,%[y])\n\t" | |||||
| "vpdi %%v24,%%v8,%%v8,4\n\t" | |||||
| "vpdi %%v25,%%v9,%%v9,4\n\t" | |||||
| "vpdi %%v26,%%v10,%%v10,4\n\t" | |||||
| "vpdi %%v27,%%v11,%%v11,4\n\t" | |||||
| "vpdi %%v28,%%v16,%%v16,4\n\t" | |||||
| "vpdi %%v29,%%v17,%%v17,4\n\t" | |||||
| "vpdi %%v30,%%v18,%%v18,4\n\t" | |||||
| "vpdi %%v31,%%v19,%%v19,4\n\t" | |||||
| "vfmadb %%v8,%%v8,%%v0,%%v12\n\t" | |||||
| "vfmadb %%v9,%%v9,%%v0,%%v13\n\t" | |||||
| "vfmadb %%v10,%%v10,%%v0,%%v14\n\t" | |||||
| "vfmadb %%v11,%%v11,%%v0,%%v15\n\t" | |||||
| "vfmadb %%v16,%%v16,%%v0,%%v20\n\t" | |||||
| "vfmadb %%v17,%%v17,%%v0,%%v21\n\t" | |||||
| "vfmadb %%v18,%%v18,%%v0,%%v22\n\t" | |||||
| "vfmadb %%v19,%%v19,%%v0,%%v23\n\t" | |||||
| "vfmadb %%v8,%%v24,%%v1,%%v8\n\t" | |||||
| "vfmadb %%v9,%%v25,%%v1,%%v9\n\t" | |||||
| "vfmadb %%v10,%%v26,%%v1,%%v10\n\t" | |||||
| "vfmadb %%v11,%%v27,%%v1,%%v11\n\t" | |||||
| "vfmadb %%v16,%%v28,%%v1,%%v16\n\t" | |||||
| "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" | |||||
| "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" | |||||
| "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" | |||||
| "vst %%v8,0(%%r1,%[y])\n\t" | |||||
| "vst %%v9,16(%%r1,%[y])\n\t" | |||||
| "vst %%v10,32(%%r1,%[y])\n\t" | |||||
| "vst %%v11,48(%%r1,%[y])\n\t" | |||||
| "vst %%v16,64(%%r1,%[y])\n\t" | |||||
| "vst %%v17,80(%%r1,%[y])\n\t" | |||||
| "vst %%v18,96(%%r1,%[y])\n\t" | |||||
| "vst %%v19,112(%%r1,%[y])\n\t" | |||||
| "agfi %%r1,128\n\t" | |||||
| "brctg %[n],0b" | |||||
| : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) | |||||
| : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), | |||||
| "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) | |||||
| : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", | |||||
| "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||||
| "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| FLOAT da[2] __attribute__ ((aligned(16))); | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||||
| BLASLONG dummy2) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| FLOAT da[2] __attribute__ ((aligned(16))); | |||||
| if (n <= 0) return (0); | |||||
| if (n <= 0) | |||||
| return (0); | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -8; | |||||
| BLASLONG n1 = n & -8; | |||||
| if (n1) { | |||||
| da[0] = da_r; | |||||
| da[1] = da_i; | |||||
| zaxpy_kernel_8(n1, x, y, da); | |||||
| ix = 2 * n1; | |||||
| } | |||||
| i = n1; | |||||
| while (i < n) { | |||||
| if (n1) { | |||||
| da[0] = da_r; | |||||
| da[1] = da_i; | |||||
| zaxpy_kernel_8(n1, x, y, da); | |||||
| ix = 2 * n1; | |||||
| } | |||||
| i = n1; | |||||
| while (i < n) { | |||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); | |||||
| y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); | |||||
| y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); | |||||
| y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); | |||||
| #else | #else | ||||
| y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); | |||||
| y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); | |||||
| y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); | |||||
| y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); | |||||
| #endif | #endif | ||||
| i++; | |||||
| ix += 2; | |||||
| } | |||||
| return (0); | |||||
| i++; | |||||
| ix += 2; | |||||
| } | } | ||||
| return (0); | |||||
| inc_x *= 2; | |||||
| inc_y *= 2; | |||||
| } | |||||
| while (i < n) { | |||||
| inc_x *= 2; | |||||
| inc_y *= 2; | |||||
| while (i < n) { | |||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); | |||||
| y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); | |||||
| y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); | |||||
| y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); | |||||
| #else | #else | ||||
| y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); | |||||
| y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); | |||||
| y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); | |||||
| y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); | |||||
| #endif | #endif | ||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| return (0); | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,73 +27,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| __asm__ volatile ( | |||||
| "lgr %%r1,%1 \n\t" | |||||
| "lgr %%r2,%2 \n\t" | |||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1) \n\t" | |||||
| "pfd 2, 1024(%%r2) \n\t" | |||||
| "mvc 0(256,%%r2),0(%%r1) \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "agfi %%r2,256 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) | |||||
| :"memory","cc","r0","r1","r2" | |||||
| ); | |||||
| static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { /* Copy n complex elements (2*n FLOATs) from x to y, 256 bytes per iteration. n must be a non-zero multiple of 16: the loop count is n >> 4 and brctg decrements before testing, so a zero count would not terminate promptly. The caller below only passes n & -16 when it is > 0. */ | |||||
| __asm__("srlg %[n],%[n],4\n\t" /* n >>= 4: number of 256-byte chunks (16 elements each, presumably FLOAT is double) */ | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%[x])\n\t" /* prefetch 1024 bytes ahead of the source */ | |||||
| "pfd 2, 1024(%[y])\n\t" /* prefetch 1024 bytes ahead of the destination */ | |||||
| "mvc 0(256,%[y]),0(%[x])\n\t" /* copy one 256-byte chunk from x to y */ | |||||
| "la %[x],256(%[x])\n\t" /* advance both pointers past the chunk */ | |||||
| "la %[y],256(%[y])\n\t" | |||||
| "brctg %[n],0b" /* --n; loop while n != 0 */ | |||||
| : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y), | |||||
| [n] "+&r"(n) /* the template reads and writes %[n], so it has to be declared as a named read-write operand */ | |||||
| : "m"(*(const FLOAT (*)[n * 2]) x) | |||||
| : "cc"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| if ( n <= 0 ) return(0); | |||||
| if (n <= 0) | |||||
| return (0); | |||||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||||
| { | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| zcopy_kernel_16(n1, x, y); | |||||
| i=n1; | |||||
| ix=n1*2; | |||||
| iy=n1*2; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] = x[iy] ; | |||||
| y[iy+1] = x[ix+1] ; | |||||
| ix+=2; | |||||
| iy+=2; | |||||
| i++ ; | |||||
| } | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| zcopy_kernel_16(n1, x, y); | |||||
| i = n1; | |||||
| ix = n1 * 2; | |||||
| iy = n1 * 2; | |||||
| } | |||||
| while (i < n) { | |||||
| y[iy] = x[iy]; | |||||
| y[iy + 1] = x[ix + 1]; | |||||
| ix += 2; | |||||
| iy += 2; | |||||
| i++; | |||||
| } | } | ||||
| else | |||||
| { | |||||
| BLASLONG inc_x2 = 2 * inc_x; | |||||
| BLASLONG inc_y2 = 2 * inc_y; | |||||
| } else { | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] = x[ix] ; | |||||
| y[iy+1] = x[ix+1] ; | |||||
| ix += inc_x2 ; | |||||
| iy += inc_y2 ; | |||||
| i++ ; | |||||
| BLASLONG inc_x2 = 2 * inc_x; | |||||
| BLASLONG inc_y2 = 2 * inc_y; | |||||
| } | |||||
| while (i < n) { | |||||
| y[iy] = x[ix]; | |||||
| y[iy + 1] = x[ix + 1]; | |||||
| ix += inc_x2; | |||||
| iy += inc_y2; | |||||
| i++; | |||||
| } | } | ||||
| return(0); | |||||
| } | |||||
| return (0); | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,152 +27,146 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "vzero %%v24 \n\t" | |||||
| "vzero %%v25 \n\t" | |||||
| "vzero %%v26 \n\t" | |||||
| "vzero %%v27 \n\t" | |||||
| "vzero %%v28 \n\t" | |||||
| "vzero %%v29 \n\t" | |||||
| "vzero %%v30 \n\t" | |||||
| "vzero %%v31 \n\t" | |||||
| "srlg %%r0,%0,3 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 1, 1024(%%r1,%1) \n\t" | |||||
| "pfd 1, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%1) \n\t" | |||||
| "vl %%v17, 16(%%r1,%1) \n\t" | |||||
| "vl %%v18, 32(%%r1,%1) \n\t" | |||||
| "vl %%v19, 48(%%r1,%1) \n\t" | |||||
| "vl %%v0, 0(%%r1,%2) \n\t" | |||||
| "vl %%v1, 16(%%r1,%2) \n\t" | |||||
| "vl %%v2, 32(%%r1,%2) \n\t" | |||||
| "vl %%v3, 48(%%r1,%2) \n\t" | |||||
| "vpdi %%v20,%%v16,%%v16,4 \n\t" | |||||
| "vpdi %%v21,%%v17,%%v17,4 \n\t" | |||||
| "vpdi %%v22,%%v18,%%v18,4 \n\t" | |||||
| "vpdi %%v23,%%v19,%%v19,4 \n\t" | |||||
| "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" | |||||
| "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" | |||||
| "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" | |||||
| "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" | |||||
| "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" | |||||
| "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" | |||||
| "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" | |||||
| "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" | |||||
| "vl %%v16, 64(%%r1,%1) \n\t" | |||||
| "vl %%v17, 80(%%r1,%1) \n\t" | |||||
| "vl %%v18, 96(%%r1,%1) \n\t" | |||||
| "vl %%v19, 112(%%r1,%1) \n\t" | |||||
| "vl %%v0, 64(%%r1,%2) \n\t" | |||||
| "vl %%v1, 80(%%r1,%2) \n\t" | |||||
| "vl %%v2, 96(%%r1,%2) \n\t" | |||||
| "vl %%v3, 112(%%r1,%2) \n\t" | |||||
| "vpdi %%v20,%%v16,%%v16,4 \n\t" | |||||
| "vpdi %%v21,%%v17,%%v17,4 \n\t" | |||||
| "vpdi %%v22,%%v18,%%v18,4 \n\t" | |||||
| "vpdi %%v23,%%v19,%%v19,4 \n\t" | |||||
| "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" | |||||
| "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" | |||||
| "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" | |||||
| "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" | |||||
| "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" | |||||
| "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" | |||||
| "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" | |||||
| "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" | |||||
| "agfi %%r1,128 \n\t" | |||||
| "brctg %%r0,0b \n\t" | |||||
| "vfadb %%v24,%%v24,%%v26 \n\t" | |||||
| "vfadb %%v24,%%v24,%%v28 \n\t" | |||||
| "vfadb %%v24,%%v24,%%v30 \n\t" | |||||
| "vfadb %%v25,%%v25,%%v27 \n\t" | |||||
| "vfadb %%v25,%%v25,%%v29 \n\t" | |||||
| "vfadb %%v25,%%v25,%%v31 \n\t" | |||||
| "vsteg %%v24,0(%3),0 \n\t" | |||||
| "vsteg %%v24,8(%3),1 \n\t" | |||||
| "vsteg %%v25,16(%3),1 \n\t" | |||||
| "vsteg %%v25,24(%3),0 " | |||||
| : | |||||
| :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { /* Accumulates four partial products over n complex elements of x and y and overwrites d[0..3] with them; n is consumed 8 elements per iteration (the caller below passes a non-zero n & -8). */ | |||||
| __asm__("vzero %%v24\n\t" /* v24-v31: eight vector accumulators, cleared before the loop */ | |||||
| "vzero %%v25\n\t" | |||||
| "vzero %%v26\n\t" | |||||
| "vzero %%v27\n\t" | |||||
| "vzero %%v28\n\t" | |||||
| "vzero %%v29\n\t" | |||||
| "vzero %%v30\n\t" | |||||
| "vzero %%v31\n\t" | |||||
| "srlg %[n],%[n],3\n\t" /* n >>= 3: loop count, 8 elements (128 bytes of x and of y) per iteration */ | |||||
| "xgr %%r1,%%r1\n\t" /* r1 = 0: running byte offset into x and y */ | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead in both inputs */ | |||||
| "pfd 1, 1024(%%r1,%[y])\n\t" | |||||
| "vl %%v16, 0(%%r1,%[x])\n\t" /* first half of the iteration: four 16-byte vectors of x ... */ | |||||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v0, 0(%%r1,%[y])\n\t" /* ... and the matching four vectors of y */ | |||||
| "vl %%v1, 16(%%r1,%[y])\n\t" | |||||
| "vl %%v2, 32(%%r1,%[y])\n\t" | |||||
| "vl %%v3, 48(%%r1,%[y])\n\t" | |||||
| "vpdi %%v20,%%v16,%%v16,4\n\t" /* v20-v23: x vectors with their two doublewords swapped */ | |||||
| "vpdi %%v21,%%v17,%%v17,4\n\t" | |||||
| "vpdi %%v22,%%v18,%%v18,4\n\t" | |||||
| "vpdi %%v23,%%v19,%%v19,4\n\t" | |||||
| "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" /* even accumulators += x * y (lane-wise) */ | |||||
| "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" /* odd accumulators += swapped(x) * y (cross terms) */ | |||||
| "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" | |||||
| "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" | |||||
| "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" | |||||
| "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" | |||||
| "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" | |||||
| "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" | |||||
| "vl %%v16, 64(%%r1,%[x])\n\t" /* second half: same sequence on the next 64 bytes */ | |||||
| "vl %%v17, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 112(%%r1,%[x])\n\t" | |||||
| "vl %%v0, 64(%%r1,%[y])\n\t" | |||||
| "vl %%v1, 80(%%r1,%[y])\n\t" | |||||
| "vl %%v2, 96(%%r1,%[y])\n\t" | |||||
| "vl %%v3, 112(%%r1,%[y])\n\t" | |||||
| "vpdi %%v20,%%v16,%%v16,4\n\t" | |||||
| "vpdi %%v21,%%v17,%%v17,4\n\t" | |||||
| "vpdi %%v22,%%v18,%%v18,4\n\t" | |||||
| "vpdi %%v23,%%v19,%%v19,4\n\t" | |||||
| "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" | |||||
| "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" | |||||
| "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" | |||||
| "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" | |||||
| "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" | |||||
| "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" | |||||
| "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" | |||||
| "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" | |||||
| "agfi %%r1,128\n\t" /* advance the byte offset by one iteration's worth */ | |||||
| "brctg %[n],0b\n\t" /* --n; loop while n != 0 */ | |||||
| "vfadb %%v24,%%v24,%%v26\n\t" /* reduce: v24 = v24 + v26 + v28 + v30 */ | |||||
| "vfadb %%v24,%%v24,%%v28\n\t" | |||||
| "vfadb %%v24,%%v24,%%v30\n\t" | |||||
| "vfadb %%v25,%%v25,%%v27\n\t" /* reduce: v25 = v25 + v27 + v29 + v31 */ | |||||
| "vfadb %%v25,%%v25,%%v29\n\t" | |||||
| "vfadb %%v25,%%v25,%%v31\n\t" | |||||
| "vsteg %%v24,0(%[d]),0\n\t" /* byte offset 0: sum of x[j]*y[j] (offsets assume 8-byte FLOAT) */ | |||||
| "vsteg %%v24,8(%[d]),1\n\t" /* byte offset 8: sum of x[j+1]*y[j+1] */ | |||||
| "vsteg %%v25,16(%[d]),1\n\t" /* byte offset 16: sum of x[j]*y[j+1] (lane 1 of the swapped product) */ | |||||
| "vsteg %%v25,24(%[d]),0" /* byte offset 24: sum of x[j+1]*y[j] (lane 0 of the swapped product) */ | |||||
| : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) /* d is write-only: prior contents are not accumulated into */ | |||||
| : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), | |||||
| "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", | |||||
| "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", | |||||
| "v31"); | |||||
| } | } | ||||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | |||||
| BLASLONG i; | |||||
| BLASLONG ix, iy; | |||||
| OPENBLAS_COMPLEX_FLOAT result; | |||||
| FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; | |||||
| if (n <= 0) { | |||||
| CREAL(result) = 0.0; | |||||
| CIMAG(result) = 0.0; | |||||
| return (result); | |||||
| } | |||||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, | |||||
| BLASLONG inc_y) { | |||||
| BLASLONG i; | |||||
| BLASLONG ix, iy; | |||||
| OPENBLAS_COMPLEX_FLOAT result; | |||||
| FLOAT dot[4] __attribute__ ((aligned(16))) = { | |||||
| 0.0, 0.0, 0.0, 0.0}; | |||||
| if (n <= 0) { | |||||
| CREAL(result) = 0.0; | |||||
| CIMAG(result) = 0.0; | |||||
| return (result); | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| } | |||||
| BLASLONG n1 = n & -8; | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| if (n1) | |||||
| zdot_kernel_8(n1, x, y, dot); | |||||
| BLASLONG n1 = n & -8; | |||||
| i = n1; | |||||
| BLASLONG j = i * 2; | |||||
| if (n1) | |||||
| zdot_kernel_8(n1, x, y, dot); | |||||
| while (i < n) { | |||||
| i = n1; | |||||
| BLASLONG j = i * 2; | |||||
| dot[0] += x[j] * y[j]; | |||||
| dot[1] += x[j + 1] * y[j + 1]; | |||||
| dot[2] += x[j] * y[j + 1]; | |||||
| dot[3] += x[j + 1] * y[j]; | |||||
| while (i < n) { | |||||
| j += 2; | |||||
| i++; | |||||
| dot[0] += x[j] * y[j]; | |||||
| dot[1] += x[j + 1] * y[j + 1]; | |||||
| dot[2] += x[j] * y[j + 1]; | |||||
| dot[3] += x[j + 1] * y[j]; | |||||
| } | |||||
| j += 2; | |||||
| i++; | |||||
| } | |||||
| } else { | |||||
| i = 0; | |||||
| ix = 0; | |||||
| iy = 0; | |||||
| inc_x <<= 1; | |||||
| inc_y <<= 1; | |||||
| while (i < n) { | |||||
| } else { | |||||
| i = 0; | |||||
| ix = 0; | |||||
| iy = 0; | |||||
| inc_x <<= 1; | |||||
| inc_y <<= 1; | |||||
| while (i < n) { | |||||
| dot[0] += x[ix] * y[iy]; | |||||
| dot[1] += x[ix + 1] * y[iy + 1]; | |||||
| dot[2] += x[ix] * y[iy + 1]; | |||||
| dot[3] += x[ix + 1] * y[iy]; | |||||
| dot[0] += x[ix] * y[iy]; | |||||
| dot[1] += x[ix + 1] * y[iy + 1]; | |||||
| dot[2] += x[ix] * y[iy + 1]; | |||||
| dot[3] += x[ix + 1] * y[iy]; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| ix += inc_x; | |||||
| iy += inc_y; | |||||
| i++; | |||||
| } | |||||
| } | } | ||||
| } | |||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| CREAL(result) = dot[0] - dot[1]; | |||||
| CIMAG(result) = dot[2] + dot[3]; | |||||
| CREAL(result) = dot[0] - dot[1]; | |||||
| CIMAG(result) = dot[2] + dot[3]; | |||||
| #else | #else | ||||
| CREAL(result) = dot[0] + dot[1]; | |||||
| CIMAG(result) = dot[2] - dot[3]; | |||||
| CREAL(result) = dot[0] + dot[1]; | |||||
| CIMAG(result) = dot[2] - dot[3]; | |||||
| #endif | #endif | ||||
| return (result); | |||||
| return (result); | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,230 +27,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) | |||||
| { | |||||
| __asm__ ( | |||||
| "vlrepg %%v0,%3 \n\t" | |||||
| "vlrepg %%v1,%4 \n\t" | |||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%1) \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 0(%%r1,%1) \n\t" | |||||
| "vst %%v29, 16(%%r1,%1) \n\t" | |||||
| "vst %%v30, 32(%%r1,%1) \n\t" | |||||
| "vst %%v31, 48(%%r1,%1) \n\t" | |||||
| "vst %%v20, 0(%%r1,%2) \n\t" | |||||
| "vst %%v21, 16(%%r1,%2) \n\t" | |||||
| "vst %%v22, 32(%%r1,%2) \n\t" | |||||
| "vst %%v23, 48(%%r1,%2) \n\t" | |||||
| "vl %%v24, 64(%%r1,%1) \n\t" | |||||
| "vl %%v25, 80(%%r1,%1) \n\t" | |||||
| "vl %%v26, 96(%%r1,%1) \n\t" | |||||
| "vl %%v27, 112(%%r1,%1) \n\t" | |||||
| "vl %%v16, 64(%%r1,%2) \n\t" | |||||
| "vl %%v17, 80(%%r1,%2) \n\t" | |||||
| "vl %%v18, 96(%%r1,%2) \n\t" | |||||
| "vl %%v19, 112(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 64(%%r1,%1) \n\t" | |||||
| "vst %%v29, 80(%%r1,%1) \n\t" | |||||
| "vst %%v30, 96(%%r1,%1) \n\t" | |||||
| "vst %%v31, 112(%%r1,%1) \n\t" | |||||
| "vst %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%2) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v16, 128(%%r1,%2) \n\t" | |||||
| "vl %%v17, 144(%%r1,%2) \n\t" | |||||
| "vl %%v18, 160(%%r1,%2) \n\t" | |||||
| "vl %%v19, 176(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 128(%%r1,%1) \n\t" | |||||
| "vst %%v29, 144(%%r1,%1) \n\t" | |||||
| "vst %%v30, 160(%%r1,%1) \n\t" | |||||
| "vst %%v31, 176(%%r1,%1) \n\t" | |||||
| "vst %%v20, 128(%%r1,%2) \n\t" | |||||
| "vst %%v21, 144(%%r1,%2) \n\t" | |||||
| "vst %%v22, 160(%%r1,%2) \n\t" | |||||
| "vst %%v23, 176(%%r1,%2) \n\t" | |||||
| "vl %%v24, 192(%%r1,%1) \n\t" | |||||
| "vl %%v25, 208(%%r1,%1) \n\t" | |||||
| "vl %%v26, 224(%%r1,%1) \n\t" | |||||
| "vl %%v27, 240(%%r1,%1) \n\t" | |||||
| "vl %%v16, 192(%%r1,%2) \n\t" | |||||
| "vl %%v17, 208(%%r1,%2) \n\t" | |||||
| "vl %%v18, 224(%%r1,%2) \n\t" | |||||
| "vl %%v19, 240(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 192(%%r1,%1) \n\t" | |||||
| "vst %%v29, 208(%%r1,%1) \n\t" | |||||
| "vst %%v30, 224(%%r1,%1) \n\t" | |||||
| "vst %%v31, 240(%%r1,%1) \n\t" | |||||
| "vst %%v20, 192(%%r1,%2) \n\t" | |||||
| "vst %%v21, 208(%%r1,%2) \n\t" | |||||
| "vst %%v22, 224(%%r1,%2) \n\t" | |||||
| "vst %%v23, 240(%%r1,%2) \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) | |||||
| :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
/*
 * zrot_kernel_16: vectorised in-place rotation of the flat FLOAT arrays x and y.
 *
 * For every 16-byte vector loaded from x and y the kernel computes
 *     x_new = c * x + s * y
 *     y_new = c * y - s * x
 * and stores the results back to the same offsets. c and s are each
 * replicated into both doubleword lanes, so every FLOAT in the arrays
 * (real and imaginary parts alike) is scaled by the same two factors.
 *
 * n  - element count; it is shifted right by 4 to obtain the loop count.
 *      The loop body runs before brctg tests the counter, so n >> 4 is
 *      expected to be at least 1. NOTE(review): confirm every caller
 *      guarantees this.
 * x, y - base addresses; %r1 is a byte offset added to both and advanced
 *        by 256 per iteration (four unrolled groups of 64 bytes each).
 * c, s - pointers to the two scalar factors, read once before the loop.
 *
 * The "+m" operands describe n * 2 FLOATs of x and y as read and written;
 * n is "+&r" because srlg and brctg modify it in place.
 */
| static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { | |||||
| __asm__("vlrepg %%v0,%[c]\n\t" /* v0 = {c, c} */ | |||||
| "vlrepg %%v1,%[s]\n\t" /* v1 = {s, s} */ | |||||
| "srlg %[n],%[n],4\n\t" /* loop count = n >> 4 */ | |||||
| "xgr %%r1,%%r1\n\t" /* r1 = byte offset = 0 */ | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead in x */ | |||||
| "pfd 2, 1024(%%r1,%[y])\n\t" /* prefetch 1024 bytes ahead in y */ | |||||
| "vl %%v24, 0(%%r1,%[x])\n\t" /* group 1: bytes 0..63 of x -> v24..v27 */ | |||||
| "vl %%v25, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 0(%%r1,%[y])\n\t" /* group 1: bytes 0..63 of y -> v16..v19 */ | |||||
| "vl %%v17, 16(%%r1,%[y])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[y])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[y])\n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0\n\t" /* xn=x*c */ | |||||
| "vfmdb %%v29,%%v25,%%v0\n\t" /* xn=x*c */ | |||||
| "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0\n\t" /* xn=x*c */ | |||||
| "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0\n\t" /* xn=x*c */ | |||||
| "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ | |||||
| /* 2nd parts */ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" /* xn=y*s+xn */ | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" /* xn=y*s+xn */ | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" /* xn=y*s+xn */ | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" /* xn=y*s+xn */ | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 0(%%r1,%[x])\n\t" /* write xn back over x */ | |||||
| "vst %%v29, 16(%%r1,%[x])\n\t" | |||||
| "vst %%v30, 32(%%r1,%[x])\n\t" | |||||
| "vst %%v31, 48(%%r1,%[x])\n\t" | |||||
| "vst %%v20, 0(%%r1,%[y])\n\t" /* write yn back over y */ | |||||
| "vst %%v21, 16(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 32(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 48(%%r1,%[y])\n\t" | |||||
| "vl %%v24, 64(%%r1,%[x])\n\t" /* group 2: bytes 64..127, same sequence as group 1 */ | |||||
| "vl %%v25, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 112(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 64(%%r1,%[y])\n\t" | |||||
| "vl %%v17, 80(%%r1,%[y])\n\t" | |||||
| "vl %%v18, 96(%%r1,%[y])\n\t" | |||||
| "vl %%v19, 112(%%r1,%[y])\n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0\n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0\n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0\n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0\n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ | |||||
| /* 2nd parts */ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 64(%%r1,%[x])\n\t" | |||||
| "vst %%v29, 80(%%r1,%[x])\n\t" | |||||
| "vst %%v30, 96(%%r1,%[x])\n\t" | |||||
| "vst %%v31, 112(%%r1,%[x])\n\t" | |||||
| "vst %%v20, 64(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 80(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 96(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 112(%%r1,%[y])\n\t" | |||||
| "vl %%v24, 128(%%r1,%[x])\n\t" /* group 3: bytes 128..191, same sequence as group 1 */ | |||||
| "vl %%v25, 144(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 160(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 176(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 128(%%r1,%[y])\n\t" | |||||
| "vl %%v17, 144(%%r1,%[y])\n\t" | |||||
| "vl %%v18, 160(%%r1,%[y])\n\t" | |||||
| "vl %%v19, 176(%%r1,%[y])\n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0\n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0\n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0\n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0\n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ | |||||
| /* 2nd parts */ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 128(%%r1,%[x])\n\t" | |||||
| "vst %%v29, 144(%%r1,%[x])\n\t" | |||||
| "vst %%v30, 160(%%r1,%[x])\n\t" | |||||
| "vst %%v31, 176(%%r1,%[x])\n\t" | |||||
| "vst %%v20, 128(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 144(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 160(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 176(%%r1,%[y])\n\t" | |||||
| "vl %%v24, 192(%%r1,%[x])\n\t" /* group 4: bytes 192..255, same sequence as group 1 */ | |||||
| "vl %%v25, 208(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 224(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 240(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 192(%%r1,%[y])\n\t" | |||||
| "vl %%v17, 208(%%r1,%[y])\n\t" | |||||
| "vl %%v18, 224(%%r1,%[y])\n\t" | |||||
| "vl %%v19, 240(%%r1,%[y])\n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0\n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0\n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0\n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0\n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ | |||||
| /* 2nd parts */ | |||||
| "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" | |||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" | |||||
| "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ | |||||
| "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" | |||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ | |||||
| "vst %%v28, 192(%%r1,%[x])\n\t" | |||||
| "vst %%v29, 208(%%r1,%[x])\n\t" | |||||
| "vst %%v30, 224(%%r1,%[x])\n\t" | |||||
| "vst %%v31, 240(%%r1,%[x])\n\t" | |||||
| "vst %%v20, 192(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 208(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 224(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 240(%%r1,%[y])\n\t" | |||||
| "agfi %%r1,256\n\t" /* advance byte offset past the 256 bytes just processed */ | |||||
| "brctg %[n],0b" /* decrement loop count, repeat while non-zero */ | |||||
| : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) | |||||
| : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) | |||||
| : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", | |||||
| "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", | |||||
| "v31"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT temp[2]; | |||||
| BLASLONG inc_x2; | |||||
| BLASLONG inc_y2; | |||||
| if ( n <= 0 ) return(0); | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| BLASLONG n1 = n & -16; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| FLOAT cosa,sina; | |||||
| cosa=c; | |||||
| sina=s; | |||||
| zrot_kernel_16(n1, x, y, &cosa, &sina); | |||||
| i=n1; | |||||
| ix=2*n1; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| temp[0] = c*x[ix] + s*y[ix] ; | |||||
| temp[1] = c*x[ix+1] + s*y[ix+1] ; | |||||
| y[ix] = c*y[ix] - s*x[ix] ; | |||||
| y[ix+1] = c*y[ix+1] - s*x[ix+1] ; | |||||
| x[ix] = temp[0] ; | |||||
| x[ix+1] = temp[1] ; | |||||
| ix += 2 ; | |||||
| i++ ; | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||||
| FLOAT c, FLOAT s) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| FLOAT temp[2]; | |||||
| BLASLONG inc_x2; | |||||
| BLASLONG inc_y2; | |||||
| if (n <= 0) | |||||
| return (0); | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| FLOAT cosa, sina; | |||||
| cosa = c; | |||||
| sina = s; | |||||
| zrot_kernel_16(n1, x, y, &cosa, &sina); | |||||
| i = n1; | |||||
| ix = 2 * n1; | |||||
| } | |||||
| } | |||||
| while (i < n) { | |||||
| temp[0] = c * x[ix] + s * y[ix]; | |||||
| temp[1] = c * x[ix + 1] + s * y[ix + 1]; | |||||
| y[ix] = c * y[ix] - s * x[ix]; | |||||
| y[ix + 1] = c * y[ix + 1] - s * x[ix + 1]; | |||||
| x[ix] = temp[0]; | |||||
| x[ix + 1] = temp[1]; | |||||
| ix += 2; | |||||
| i++; | |||||
| } | } | ||||
| else | |||||
| { | |||||
| inc_x2 = 2 * inc_x ; | |||||
| inc_y2 = 2 * inc_y ; | |||||
| while(i < n) | |||||
| { | |||||
| temp[0] = c*x[ix] + s*y[iy] ; | |||||
| temp[1] = c*x[ix+1] + s*y[iy+1] ; | |||||
| y[iy] = c*y[iy] - s*x[ix] ; | |||||
| y[iy+1] = c*y[iy+1] - s*x[ix+1] ; | |||||
| x[ix] = temp[0] ; | |||||
| x[ix+1] = temp[1] ; | |||||
| ix += inc_x2 ; | |||||
| iy += inc_y2 ; | |||||
| i++ ; | |||||
| } | |||||
| } else { | |||||
| inc_x2 = 2 * inc_x; | |||||
| inc_y2 = 2 * inc_y; | |||||
| while (i < n) { | |||||
| temp[0] = c * x[ix] + s * y[iy]; | |||||
| temp[1] = c * x[ix + 1] + s * y[iy + 1]; | |||||
| y[iy] = c * y[iy] - s * x[ix]; | |||||
| y[iy + 1] = c * y[iy + 1] - s * x[ix + 1]; | |||||
| x[ix] = temp[0]; | |||||
| x[ix + 1] = temp[1]; | |||||
| ix += inc_x2; | |||||
| iy += inc_y2; | |||||
| i++; | |||||
| } | } | ||||
| return(0); | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013 - 2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,426 +27,396 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
/*
 * zscal_kernel_8 (removed side of the diff, positional-operand form):
 * in-place complex scaling x = alpha * x, where alpha[0] is the real part
 * and alpha[1] (byte offset 8) the imaginary part.
 *
 * Operands: %0 = n, %1 = alpha, %2 = x. The loop count n >> 3 is kept in
 * %r0 and %r1 is the byte offset into x, advanced by 128 per iteration.
 * Each 16-byte vector holds one {re, im} pair; the result per pair is
 *     {re*da_r - im*da_i, im*da_r + re*da_i}.
 * The loop body runs before brctg tests %r0, so n >> 3 is expected to be
 * at least 1. NOTE(review): confirm callers guarantee this.
 */
| static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "vlrepg %%v0,0(%1) \n\t" /* v0 = {da_r, da_r} */ | |||||
| "vleg %%v1,8(%1),0 \n\t" /* v1[0] = da_i */ | |||||
| "wflcdb %%v1,%%v1 \n\t" /* v1[0] = -da_i */ | |||||
| "vleg %%v1,8(%1),1 \n\t" /* v1 = {-da_i, da_i} */ | |||||
| "srlg %%r0,%0,3 \n\t" /* r0 = loop count = n >> 3 */ | |||||
| "xgr %%r1,%%r1 \n\t" /* r1 = byte offset = 0 */ | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" /* v16..v23 = eight {re, im} pairs */ | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vpdi %%v24,%%v16,%%v16,4 \n\t" /* v24..v31 = {im, re} (halves swapped) */ | |||||
| "vpdi %%v25,%%v17,%%v17,4 \n\t" | |||||
| "vpdi %%v26,%%v18,%%v18,4 \n\t" | |||||
| "vpdi %%v27,%%v19,%%v19,4 \n\t" | |||||
| "vpdi %%v28,%%v20,%%v20,4 \n\t" | |||||
| "vpdi %%v29,%%v21,%%v21,4 \n\t" | |||||
| "vpdi %%v30,%%v22,%%v22,4 \n\t" | |||||
| "vpdi %%v31,%%v23,%%v23,4 \n\t" | |||||
| "vfmdb %%v16,%%v16,%%v0 \n\t" /* {re*da_r, im*da_r} */ | |||||
| "vfmdb %%v17,%%v17,%%v0 \n\t" | |||||
| "vfmdb %%v18,%%v18,%%v0 \n\t" | |||||
| "vfmdb %%v19,%%v19,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v20,%%v0 \n\t" | |||||
| "vfmdb %%v21,%%v21,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v22,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v23,%%v0 \n\t" | |||||
| "vfmadb %%v16,%%v24,%%v1,%%v16 \n\t" /* += {-im*da_i, re*da_i} */ | |||||
| "vfmadb %%v17,%%v25,%%v1,%%v17 \n\t" | |||||
| "vfmadb %%v18,%%v26,%%v1,%%v18 \n\t" | |||||
| "vfmadb %%v19,%%v27,%%v1,%%v19 \n\t" | |||||
| "vfmadb %%v20,%%v28,%%v1,%%v20 \n\t" | |||||
| "vfmadb %%v21,%%v29,%%v1,%%v21 \n\t" | |||||
| "vfmadb %%v22,%%v30,%%v1,%%v22 \n\t" | |||||
| "vfmadb %%v23,%%v31,%%v1,%%v23 \n\t" | |||||
| "vst %%v16,0(%%r1,%2) \n\t" /* store results back over x */ | |||||
| "vst %%v17,16(%%r1,%2) \n\t" | |||||
| "vst %%v18,32(%%r1,%2) \n\t" | |||||
| "vst %%v19,48(%%r1,%2) \n\t" | |||||
| "vst %%v20,64(%%r1,%2) \n\t" | |||||
| "vst %%v21,80(%%r1,%2) \n\t" | |||||
| "vst %%v22,96(%%r1,%2) \n\t" | |||||
| "vst %%v23,112(%%r1,%2) \n\t" | |||||
| "agfi %%r1,128 \n\t" /* advance byte offset by 128 */ | |||||
| "brctg %%r0,0b " /* decrement loop count, repeat while non-zero */ | |||||
| : | |||||
| :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) | |||||
| :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| } | |||||
| static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "vleg %%v0,8(%1),0 \n\t" | |||||
| "wflcdb %%v0,%%v0 \n\t" | |||||
| "vleg %%v0,8(%1),1 \n\t" | |||||
| "srlg %%r0,%0,3 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vpdi %%v16,%%v16,%%v16,4 \n\t" | |||||
| "vpdi %%v17,%%v17,%%v17,4 \n\t" | |||||
| "vpdi %%v18,%%v18,%%v18,4 \n\t" | |||||
| "vpdi %%v19,%%v19,%%v19,4 \n\t" | |||||
| "vpdi %%v20,%%v20,%%v20,4 \n\t" | |||||
| "vpdi %%v21,%%v21,%%v21,4 \n\t" | |||||
| "vpdi %%v22,%%v22,%%v22,4 \n\t" | |||||
| "vpdi %%v23,%%v23,%%v23,4 \n\t" | |||||
| "vfmdb %%v16,%%v16,%%v0 \n\t" | |||||
| "vfmdb %%v17,%%v17,%%v0 \n\t" | |||||
| "vfmdb %%v18,%%v18,%%v0 \n\t" | |||||
| "vfmdb %%v19,%%v19,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v20,%%v0 \n\t" | |||||
| "vfmdb %%v21,%%v21,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v22,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v23,%%v0 \n\t" | |||||
| "vst %%v16,0(%%r1,%2) \n\t" | |||||
| "vst %%v17,16(%%r1,%2) \n\t" | |||||
| "vst %%v18,32(%%r1,%2) \n\t" | |||||
| "vst %%v19,48(%%r1,%2) \n\t" | |||||
| "vst %%v20,64(%%r1,%2) \n\t" | |||||
| "vst %%v21,80(%%r1,%2) \n\t" | |||||
| "vst %%v22,96(%%r1,%2) \n\t" | |||||
| "vst %%v23,112(%%r1,%2) \n\t" | |||||
| "agfi %%r1,128 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" | |||||
| ); | |||||
/*
 * zscal_kernel_8: in-place complex scaling x = alpha * x, where alpha[0]
 * is the real part (da_r) and alpha[1], at byte offset 8, the imaginary
 * part (da_i).
 *
 * Each 16-byte vector of x holds one {re, im} pair. Per pair the kernel
 * produces {re*da_r - im*da_i, im*da_r + re*da_i} by multiplying the pair
 * with {da_r, da_r} and then adding the half-swapped pair {im, re} times
 * {-da_i, da_i}.
 *
 * n is shifted right by 3 to obtain the loop count; each iteration covers
 * 128 bytes of x (%r1 is the byte offset). The loop body runs before brctg
 * tests the counter, so n >> 3 is expected to be at least 1.
 * NOTE(review): confirm callers guarantee this.
 *
 * The "+m" operand describes n * 2 FLOATs of x as read and written, the
 * "m" input describes the two FLOATs of alpha, and n is "+&r" because the
 * asm modifies it.
 */
| static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { | |||||
| __asm__("vlrepg %%v0,0(%[alpha])\n\t" /* v0 = {da_r, da_r} */ | |||||
| "vleg %%v1,8(%[alpha]),0\n\t" /* v1[0] = da_i */ | |||||
| "wflcdb %%v1,%%v1\n\t" /* v1[0] = -da_i */ | |||||
| "vleg %%v1,8(%[alpha]),1\n\t" /* v1 = {-da_i, da_i} */ | |||||
| "srlg %[n],%[n],3\n\t" /* loop count = n >> 3 */ | |||||
| "xgr %%r1,%%r1\n\t" /* r1 = byte offset = 0 */ | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead in x */ | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" /* v16..v23 = eight {re, im} pairs */ | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vpdi %%v24,%%v16,%%v16,4\n\t" /* v24..v31 = {im, re} (halves swapped) */ | |||||
| "vpdi %%v25,%%v17,%%v17,4\n\t" | |||||
| "vpdi %%v26,%%v18,%%v18,4\n\t" | |||||
| "vpdi %%v27,%%v19,%%v19,4\n\t" | |||||
| "vpdi %%v28,%%v20,%%v20,4\n\t" | |||||
| "vpdi %%v29,%%v21,%%v21,4\n\t" | |||||
| "vpdi %%v30,%%v22,%%v22,4\n\t" | |||||
| "vpdi %%v31,%%v23,%%v23,4\n\t" | |||||
| "vfmdb %%v16,%%v16,%%v0\n\t" /* {re*da_r, im*da_r} */ | |||||
| "vfmdb %%v17,%%v17,%%v0\n\t" | |||||
| "vfmdb %%v18,%%v18,%%v0\n\t" | |||||
| "vfmdb %%v19,%%v19,%%v0\n\t" | |||||
| "vfmdb %%v20,%%v20,%%v0\n\t" | |||||
| "vfmdb %%v21,%%v21,%%v0\n\t" | |||||
| "vfmdb %%v22,%%v22,%%v0\n\t" | |||||
| "vfmdb %%v23,%%v23,%%v0\n\t" | |||||
| "vfmadb %%v16,%%v24,%%v1,%%v16\n\t" /* += {-im*da_i, re*da_i} */ | |||||
| "vfmadb %%v17,%%v25,%%v1,%%v17\n\t" | |||||
| "vfmadb %%v18,%%v26,%%v1,%%v18\n\t" | |||||
| "vfmadb %%v19,%%v27,%%v1,%%v19\n\t" | |||||
| "vfmadb %%v20,%%v28,%%v1,%%v20\n\t" | |||||
| "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" | |||||
| "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" | |||||
| "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" | |||||
| "vst %%v16,0(%%r1,%[x])\n\t" /* store results back over x */ | |||||
| "vst %%v17,16(%%r1,%[x])\n\t" | |||||
| "vst %%v18,32(%%r1,%[x])\n\t" | |||||
| "vst %%v19,48(%%r1,%[x])\n\t" | |||||
| "vst %%v20,64(%%r1,%[x])\n\t" | |||||
| "vst %%v21,80(%%r1,%[x])\n\t" | |||||
| "vst %%v22,96(%%r1,%[x])\n\t" | |||||
| "vst %%v23,112(%%r1,%[x])\n\t" | |||||
| "agfi %%r1,128\n\t" /* advance byte offset by 128 */ | |||||
| "brctg %[n],0b" /* decrement loop count, repeat while non-zero */ | |||||
| : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) | |||||
| : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) | |||||
| : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", | |||||
| "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", | |||||
| "v31"); | |||||
| } | } | ||||
| static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "vlrepg %%v0,0(%1) \n\t" | |||||
| "srlg %%r0,%0,3 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16,0(%%r1,%2) \n\t" | |||||
| "vl %%v17,16(%%r1,%2) \n\t" | |||||
| "vl %%v18,32(%%r1,%2) \n\t" | |||||
| "vl %%v19,48(%%r1,%2) \n\t" | |||||
| "vl %%v20,64(%%r1,%2) \n\t" | |||||
| "vl %%v21,80(%%r1,%2) \n\t" | |||||
| "vl %%v22,96(%%r1,%2) \n\t" | |||||
| "vl %%v23,112(%%r1,%2) \n\t" | |||||
| "vfmdb %%v16,%%v16,%%v0 \n\t" | |||||
| "vfmdb %%v17,%%v17,%%v0 \n\t" | |||||
| "vfmdb %%v18,%%v18,%%v0 \n\t" | |||||
| "vfmdb %%v19,%%v19,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v20,%%v0 \n\t" | |||||
| "vfmdb %%v21,%%v21,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v22,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v23,%%v0 \n\t" | |||||
| "vst %%v16,0(%%r1,%2) \n\t" | |||||
| "vst %%v17,16(%%r1,%2) \n\t" | |||||
| "vst %%v18,32(%%r1,%2) \n\t" | |||||
| "vst %%v19,48(%%r1,%2) \n\t" | |||||
| "vst %%v20,64(%%r1,%2) \n\t" | |||||
| "vst %%v21,80(%%r1,%2) \n\t" | |||||
| "vst %%v22,96(%%r1,%2) \n\t" | |||||
| "vst %%v23,112(%%r1,%2) \n\t" | |||||
| "agfi %%r1,128 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) | |||||
| :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" | |||||
| ); | |||||
/*
 * zscal_kernel_8_zero_r: in-place scaling of x by a purely imaginary
 * factor. Only alpha[1] (byte offset 8, da_i) is read; alpha[0] is never
 * loaded by the asm.
 *
 * Each 16-byte vector of x holds one {re, im} pair. The pair is
 * half-swapped to {im, re} and multiplied by {-da_i, da_i}, giving
 * {-im*da_i, re*da_i}.
 *
 * n is shifted right by 3 to obtain the loop count; each iteration covers
 * 128 bytes of x (%r1 is the byte offset). The loop body runs before brctg
 * tests the counter, so n >> 3 is expected to be at least 1.
 * NOTE(review): confirm callers guarantee this.
 */
| static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { | |||||
| __asm__("vleg %%v0,8(%[alpha]),0\n\t" /* v0[0] = da_i */ | |||||
| "wflcdb %%v0,%%v0\n\t" /* v0[0] = -da_i */ | |||||
| "vleg %%v0,8(%[alpha]),1\n\t" /* v0 = {-da_i, da_i} */ | |||||
| "srlg %[n],%[n],3\n\t" /* loop count = n >> 3 */ | |||||
| "xgr %%r1,%%r1\n\t" /* r1 = byte offset = 0 */ | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead in x */ | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" /* v16..v23 = eight {re, im} pairs */ | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vpdi %%v16,%%v16,%%v16,4\n\t" /* swap halves in place: {im, re} */ | |||||
| "vpdi %%v17,%%v17,%%v17,4\n\t" | |||||
| "vpdi %%v18,%%v18,%%v18,4\n\t" | |||||
| "vpdi %%v19,%%v19,%%v19,4\n\t" | |||||
| "vpdi %%v20,%%v20,%%v20,4\n\t" | |||||
| "vpdi %%v21,%%v21,%%v21,4\n\t" | |||||
| "vpdi %%v22,%%v22,%%v22,4\n\t" | |||||
| "vpdi %%v23,%%v23,%%v23,4\n\t" | |||||
| "vfmdb %%v16,%%v16,%%v0\n\t" /* {-im*da_i, re*da_i} */ | |||||
| "vfmdb %%v17,%%v17,%%v0\n\t" | |||||
| "vfmdb %%v18,%%v18,%%v0\n\t" | |||||
| "vfmdb %%v19,%%v19,%%v0\n\t" | |||||
| "vfmdb %%v20,%%v20,%%v0\n\t" | |||||
| "vfmdb %%v21,%%v21,%%v0\n\t" | |||||
| "vfmdb %%v22,%%v22,%%v0\n\t" | |||||
| "vfmdb %%v23,%%v23,%%v0\n\t" | |||||
| "vst %%v16,0(%%r1,%[x])\n\t" /* store results back over x */ | |||||
| "vst %%v17,16(%%r1,%[x])\n\t" | |||||
| "vst %%v18,32(%%r1,%[x])\n\t" | |||||
| "vst %%v19,48(%%r1,%[x])\n\t" | |||||
| "vst %%v20,64(%%r1,%[x])\n\t" | |||||
| "vst %%v21,80(%%r1,%[x])\n\t" | |||||
| "vst %%v22,96(%%r1,%[x])\n\t" | |||||
| "vst %%v23,112(%%r1,%[x])\n\t" | |||||
| "agfi %%r1,128\n\t" /* advance byte offset by 128 */ | |||||
| "brctg %[n],0b" /* decrement loop count, repeat while non-zero */ | |||||
| : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) | |||||
| : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23"); | |||||
| } | } | ||||
| static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "vzero %%v24 \n\t" | |||||
| "vzero %%v25 \n\t" | |||||
| "vzero %%v26 \n\t" | |||||
| "vzero %%v27 \n\t" | |||||
| "srlg %%r0,%0,3 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%1) \n\t" | |||||
| "vst %%v24,0(%%r1,%1) \n\t" | |||||
| "vst %%v25,16(%%r1,%1) \n\t" | |||||
| "vst %%v26,32(%%r1,%1) \n\t" | |||||
| "vst %%v27,48(%%r1,%1) \n\t" | |||||
| "vst %%v24,64(%%r1,%1) \n\t" | |||||
| "vst %%v25,80(%%r1,%1) \n\t" | |||||
| "vst %%v26,96(%%r1,%1) \n\t" | |||||
| "vst %%v27,112(%%r1,%1) \n\t" | |||||
| "agfi %%r1,128 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZR"((FLOAT (*)[n * 2])x) | |||||
| :"memory","cc","r0","r1","v24","v25","v26","v27" | |||||
| ); | |||||
/*
 * zscal_kernel_8_zero_i: in-place scaling of x by a purely real factor.
 * Only alpha[0] (da_r) is loaded by the asm; it is replicated into both
 * lanes so every FLOAT of x, real and imaginary alike, is multiplied by
 * da_r.
 *
 * n is shifted right by 3 to obtain the loop count; each iteration covers
 * 128 bytes of x (%r1 is the byte offset). The loop body runs before brctg
 * tests the counter, so n >> 3 is expected to be at least 1.
 * NOTE(review): confirm callers guarantee this.
 */
| static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { | |||||
| __asm__("vlrepg %%v0,0(%[alpha])\n\t" /* v0 = {da_r, da_r} */ | |||||
| "srlg %[n],%[n],3\n\t" /* loop count = n >> 3 */ | |||||
| "xgr %%r1,%%r1\n\t" /* r1 = byte offset = 0 */ | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead in x */ | |||||
| "vl %%v16,0(%%r1,%[x])\n\t" /* load 128 bytes of x into v16..v23 */ | |||||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||||
| "vfmdb %%v16,%%v16,%%v0\n\t" /* {re*da_r, im*da_r} */ | |||||
| "vfmdb %%v17,%%v17,%%v0\n\t" | |||||
| "vfmdb %%v18,%%v18,%%v0\n\t" | |||||
| "vfmdb %%v19,%%v19,%%v0\n\t" | |||||
| "vfmdb %%v20,%%v20,%%v0\n\t" | |||||
| "vfmdb %%v21,%%v21,%%v0\n\t" | |||||
| "vfmdb %%v22,%%v22,%%v0\n\t" | |||||
| "vfmdb %%v23,%%v23,%%v0\n\t" | |||||
| "vst %%v16,0(%%r1,%[x])\n\t" /* store results back over x */ | |||||
| "vst %%v17,16(%%r1,%[x])\n\t" | |||||
| "vst %%v18,32(%%r1,%[x])\n\t" | |||||
| "vst %%v19,48(%%r1,%[x])\n\t" | |||||
| "vst %%v20,64(%%r1,%[x])\n\t" | |||||
| "vst %%v21,80(%%r1,%[x])\n\t" | |||||
| "vst %%v22,96(%%r1,%[x])\n\t" | |||||
| "vst %%v23,112(%%r1,%[x])\n\t" | |||||
| "agfi %%r1,128\n\t" /* advance byte offset by 128 */ | |||||
| "brctg %[n],0b" /* decrement loop count, repeat while non-zero */ | |||||
| : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) | |||||
| : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) | |||||
| : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", | |||||
| "v23"); | |||||
| } | } | ||||
| static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i; | |||||
| BLASLONG inc_x2 = 2 * inc_x; | |||||
| BLASLONG inc_x3 = inc_x2 + inc_x; | |||||
| FLOAT t0, t1, t2, t3; | |||||
| FLOAT da_r = alpha[0]; | |||||
| FLOAT da_i = alpha[1]; | |||||
| for (i = 0; i < n; i += 4) | |||||
| { | |||||
| t0 = da_r * x[0] - da_i * x[1]; | |||||
| t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; | |||||
| t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; | |||||
| t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; | |||||
| x[1] = da_i * x[0] + da_r * x[1]; | |||||
| x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; | |||||
| x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; | |||||
| x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; | |||||
| x[0] = t0; | |||||
| x[inc_x] = t1; | |||||
| x[inc_x2] = t2; | |||||
| x[inc_x3] = t3; | |||||
| x += 4 * inc_x; | |||||
| } | |||||
/*
 * zscal_kernel_8_zero: overwrites x with zeros using vector stores.
 *
 * A single zeroed vector register (v0) is stored eight times per
 * iteration, covering 128 bytes of x; x is never read, which is why its
 * memory operand is output-only ("=m"). n is shifted right by 3 to obtain
 * the loop count and %r1 is the byte offset into x. The loop body runs
 * before brctg tests the counter, so n >> 3 is expected to be at least 1.
 * NOTE(review): confirm callers guarantee this.
 */
| static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { | |||||
| __asm__("vzero %%v0\n\t" /* v0 = all zero bits */ | |||||
| "srlg %[n],%[n],3\n\t" /* loop count = n >> 3 */ | |||||
| "xgr %%r1,%%r1\n\t" /* r1 = byte offset = 0 */ | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead in x */ | |||||
| "vst %%v0,0(%%r1,%[x])\n\t" /* zero bytes 0..127 of this chunk */ | |||||
| "vst %%v0,16(%%r1,%[x])\n\t" | |||||
| "vst %%v0,32(%%r1,%[x])\n\t" | |||||
| "vst %%v0,48(%%r1,%[x])\n\t" | |||||
| "vst %%v0,64(%%r1,%[x])\n\t" | |||||
| "vst %%v0,80(%%r1,%[x])\n\t" | |||||
| "vst %%v0,96(%%r1,%[x])\n\t" | |||||
| "vst %%v0,112(%%r1,%[x])\n\t" | |||||
| "agfi %%r1,128\n\t" /* advance byte offset by 128 */ | |||||
| "brctg %[n],0b" /* decrement loop count, repeat while non-zero */ | |||||
| : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) | |||||
| : [x] "a"(x) | |||||
| : "cc", "r1", "v0"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { | |||||
| BLASLONG i = 0, j = 0; | |||||
| FLOAT temp0; | |||||
| FLOAT temp1; | |||||
| FLOAT alpha[2] __attribute__ ((aligned(16))); | |||||
| if (inc_x != 1) { | |||||
| inc_x <<= 1; | |||||
| if (da_r == 0.0) { | |||||
| BLASLONG n1 = n & -2; | |||||
| if (da_i == 0.0) { | |||||
/*
 * zscal_kernel_inc_8: strided in-place complex scaling x = alpha * x,
 * written in plain C and unrolled four elements per loop iteration.
 *
 * n      - number of complex elements to process. The loop advances i by 4
 *          and always touches four elements per pass, so n is expected to
 *          be a multiple of 4 (NOTE(review): confirm callers guarantee it).
 * alpha  - alpha[0] is the real factor (da_r), alpha[1] the imaginary
 *          factor (da_i).
 * x      - base of the data; each element is read as x[k] (real) and
 *          x[k + 1] (imaginary).
 * inc_x  - distance, in FLOATs, between the real parts of consecutive
 *          elements (elements are addressed at 0, inc_x, 2*inc_x, 3*inc_x).
 */
| static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, | |||||
| BLASLONG inc_x) { | |||||
| BLASLONG i; | |||||
| BLASLONG inc_x2 = 2 * inc_x; /* offset of the third element */ | |||||
| BLASLONG inc_x3 = inc_x2 + inc_x; /* offset of the fourth element */ | |||||
| FLOAT t0, t1, t2, t3; | |||||
| FLOAT da_r = alpha[0]; | |||||
| FLOAT da_i = alpha[1]; | |||||
| for (i = 0; i < n; i += 4) { | |||||
| t0 = da_r * x[0] - da_i * x[1]; /* new real parts, held in temporaries ... */ | |||||
| t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; | |||||
| t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; | |||||
| t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; | |||||
| x[1] = da_i * x[0] + da_r * x[1]; /* ... because the imaginary update still needs the old real part */ | |||||
| x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; | |||||
| x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; | |||||
| x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; | |||||
| x[0] = t0; /* commit the new real parts */ | |||||
| x[inc_x] = t1; | |||||
| x[inc_x2] = t2; | |||||
| x[inc_x3] = t3; | |||||
| x += 4 * inc_x; /* step past the four elements just scaled */ | |||||
| } | |||||
| } | |||||
| while (j < n1) { | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||||
| BLASLONG dummy2) { | |||||
| BLASLONG i = 0, j = 0; | |||||
| FLOAT temp0; | |||||
| FLOAT temp1; | |||||
| FLOAT alpha[2] __attribute__ ((aligned(16))); | |||||
| x[i] = 0.0; | |||||
| x[i + 1] = 0.0; | |||||
| x[i + inc_x] = 0.0; | |||||
| x[i + 1 + inc_x] = 0.0; | |||||
| i += 2 * inc_x; | |||||
| j += 2; | |||||
| if (inc_x != 1) { | |||||
| inc_x <<= 1; | |||||
| } | |||||
| if (da_r == 0.0) { | |||||
| while (j < n) { | |||||
| BLASLONG n1 = n & -2; | |||||
| x[i] = 0.0; | |||||
| x[i + 1] = 0.0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| if (da_i == 0.0) { | |||||
| } | |||||
| while (j < n1) { | |||||
| } else { | |||||
| x[i] = 0.0; | |||||
| x[i + 1] = 0.0; | |||||
| x[i + inc_x] = 0.0; | |||||
| x[i + 1 + inc_x] = 0.0; | |||||
| i += 2 * inc_x; | |||||
| j += 2; | |||||
| while (j < n1) { | |||||
| } | |||||
| temp0 = -da_i * x[i + 1]; | |||||
| x[i + 1] = da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| temp1 = -da_i * x[i + 1 + inc_x]; | |||||
| x[i + 1 + inc_x] = da_i * x[i + inc_x]; | |||||
| x[i + inc_x] = temp1; | |||||
| i += 2 * inc_x; | |||||
| j += 2; | |||||
| while (j < n) { | |||||
| } | |||||
| x[i] = 0.0; | |||||
| x[i + 1] = 0.0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| while (j < n) { | |||||
| } | |||||
| temp0 = -da_i * x[i + 1]; | |||||
| x[i + 1] = da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } else { | |||||
| } | |||||
| while (j < n1) { | |||||
| temp0 = -da_i * x[i + 1]; | |||||
| x[i + 1] = da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| temp1 = -da_i * x[i + 1 + inc_x]; | |||||
| x[i + 1 + inc_x] = da_i * x[i + inc_x]; | |||||
| x[i + inc_x] = temp1; | |||||
| i += 2 * inc_x; | |||||
| j += 2; | |||||
| } | |||||
| } | |||||
| while (j < n) { | |||||
| } else { | |||||
| temp0 = -da_i * x[i + 1]; | |||||
| x[i + 1] = da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| if (da_i == 0.0) { | |||||
| BLASLONG n1 = n & -2; | |||||
| } | |||||
| while (j < n1) { | |||||
| } else { | |||||
| temp0 = da_r * x[i]; | |||||
| x[i + 1] = da_r * x[i + 1]; | |||||
| x[i] = temp0; | |||||
| temp1 = da_r * x[i + inc_x]; | |||||
| x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; | |||||
| x[i + inc_x] = temp1; | |||||
| i += 2 * inc_x; | |||||
| j += 2; | |||||
| if (da_i == 0.0) { | |||||
| BLASLONG n1 = n & -2; | |||||
| } | |||||
| while (j < n1) { | |||||
| while (j < n) { | |||||
| temp0 = da_r * x[i]; | |||||
| x[i + 1] = da_r * x[i + 1]; | |||||
| x[i] = temp0; | |||||
| temp1 = da_r * x[i + inc_x]; | |||||
| x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; | |||||
| x[i + inc_x] = temp1; | |||||
| i += 2 * inc_x; | |||||
| j += 2; | |||||
| temp0 = da_r * x[i]; | |||||
| x[i + 1] = da_r * x[i + 1]; | |||||
| x[i] = temp0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| while (j < n) { | |||||
| } else { | |||||
| temp0 = da_r * x[i]; | |||||
| x[i + 1] = da_r * x[i + 1]; | |||||
| x[i] = temp0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| BLASLONG n1 = n & -8; | |||||
| if (n1 > 0) { | |||||
| alpha[0] = da_r; | |||||
| alpha[1] = da_i; | |||||
| zscal_kernel_inc_8(n1, alpha, x, inc_x); | |||||
| j = n1; | |||||
| i = n1 * inc_x; | |||||
| } | |||||
| } | |||||
| while (j < n) { | |||||
| } else { | |||||
| temp0 = da_r * x[i] - da_i * x[i + 1]; | |||||
| x[i + 1] = da_r * x[i + 1] + da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| BLASLONG n1 = n & -8; | |||||
| if (n1 > 0) { | |||||
| alpha[0] = da_r; | |||||
| alpha[1] = da_i; | |||||
| zscal_kernel_inc_8(n1, alpha, x, inc_x); | |||||
| j = n1; | |||||
| i = n1 * inc_x; | |||||
| } | |||||
| } | |||||
| while (j < n) { | |||||
| } | |||||
| temp0 = da_r * x[i] - da_i * x[i + 1]; | |||||
| x[i + 1] = da_r * x[i + 1] + da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | } | ||||
| return (0); | |||||
| } | |||||
| } | |||||
| } | |||||
| BLASLONG n1 = n & -8; | |||||
| if (n1 > 0) { | |||||
| return (0); | |||||
| } | |||||
| alpha[0] = da_r; | |||||
| alpha[1] = da_i; | |||||
| BLASLONG n1 = n & -8; | |||||
| if (n1 > 0) { | |||||
| if (da_r == 0.0) | |||||
| if (da_i == 0) | |||||
| zscal_kernel_8_zero(n1, x); | |||||
| else | |||||
| zscal_kernel_8_zero_r(n1, alpha, x); | |||||
| else | |||||
| if (da_i == 0) | |||||
| zscal_kernel_8_zero_i(n1, alpha, x); | |||||
| else | |||||
| zscal_kernel_8(n1, alpha, x); | |||||
| alpha[0] = da_r; | |||||
| alpha[1] = da_i; | |||||
| i = n1 << 1; | |||||
| j = n1; | |||||
| } | |||||
| if (da_r == 0.0) | |||||
| if (da_i == 0) | |||||
| zscal_kernel_8_zero(n1, x); | |||||
| else | |||||
| zscal_kernel_8_zero_r(n1, alpha, x); | |||||
| else if (da_i == 0) | |||||
| zscal_kernel_8_zero_i(n1, alpha, x); | |||||
| else | |||||
| zscal_kernel_8(n1, alpha, x); | |||||
| i = n1 << 1; | |||||
| j = n1; | |||||
| } | |||||
| if (da_r == 0.0) { | |||||
| if (da_r == 0.0) { | |||||
| if (da_i == 0.0) { | |||||
| if (da_i == 0.0) { | |||||
| while (j < n) { | |||||
| while (j < n) { | |||||
| x[i] = 0.0; | |||||
| x[i + 1] = 0.0; | |||||
| i += 2; | |||||
| j++; | |||||
| x[i] = 0.0; | |||||
| x[i + 1] = 0.0; | |||||
| i += 2; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| } else { | |||||
| while (j < n) { | |||||
| while (j < n) { | |||||
| temp0 = -da_i * x[i + 1]; | |||||
| x[i + 1] = da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += 2; | |||||
| j++; | |||||
| temp0 = -da_i * x[i + 1]; | |||||
| x[i + 1] = da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += 2; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } else { | |||||
| } else { | |||||
| if (da_i == 0.0) { | |||||
| if (da_i == 0.0) { | |||||
| while (j < n) { | |||||
| while (j < n) { | |||||
| temp0 = da_r * x[i]; | |||||
| x[i + 1] = da_r * x[i + 1]; | |||||
| x[i] = temp0; | |||||
| i += 2; | |||||
| j++; | |||||
| temp0 = da_r * x[i]; | |||||
| x[i + 1] = da_r * x[i + 1]; | |||||
| x[i] = temp0; | |||||
| i += 2; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| } else { | |||||
| while (j < n) { | |||||
| while (j < n) { | |||||
| temp0 = da_r * x[i] - da_i * x[i + 1]; | |||||
| x[i + 1] = da_r * x[i + 1] + da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += 2; | |||||
| j++; | |||||
| temp0 = da_r * x[i] - da_i * x[i + 1]; | |||||
| x[i + 1] = da_r * x[i + 1] + da_i * x[i]; | |||||
| x[i] = temp0; | |||||
| i += 2; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | } | ||||
| return (0); | |||||
| } | |||||
| return (0); | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013-2017, The OpenBLAS Project | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,157 +27,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "0: \n\t" | |||||
| "pfd 2, 1024(%%r1,%1) \n\t" | |||||
| "pfd 2, 1024(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%1) \n\t" | |||||
| "vl %%v17, 16(%%r1,%1) \n\t" | |||||
| "vl %%v18, 32(%%r1,%1) \n\t" | |||||
| "vl %%v19, 48(%%r1,%1) \n\t" | |||||
| "vl %%v20, 64(%%r1,%1) \n\t" | |||||
| "vl %%v21, 80(%%r1,%1) \n\t" | |||||
| "vl %%v22, 96(%%r1,%1) \n\t" | |||||
| "vl %%v23, 112(%%r1,%1) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v28, 192(%%r1,%1) \n\t" | |||||
| "vl %%v29, 208(%%r1,%1) \n\t" | |||||
| "vl %%v30, 224(%%r1,%1) \n\t" | |||||
| "vl %%v31, 240(%%r1,%1) \n\t" | |||||
| "vl %%v0, 0(%%r1,%2) \n\t" | |||||
| "vl %%v1, 16(%%r1,%2) \n\t" | |||||
| "vl %%v2, 32(%%r1,%2) \n\t" | |||||
| "vl %%v3, 48(%%r1,%2) \n\t" | |||||
| "vl %%v4, 64(%%r1,%2) \n\t" | |||||
| "vl %%v5, 80(%%r1,%2) \n\t" | |||||
| "vl %%v6, 96(%%r1,%2) \n\t" | |||||
| "vl %%v7, 112(%%r1,%2) \n\t" | |||||
| "vst %%v0, 0(%%r1,%1) \n\t" | |||||
| "vst %%v1, 16(%%r1,%1) \n\t" | |||||
| "vst %%v2, 32(%%r1,%1) \n\t" | |||||
| "vst %%v3, 48(%%r1,%1) \n\t" | |||||
| "vst %%v4, 64(%%r1,%1) \n\t" | |||||
| "vst %%v5, 80(%%r1,%1) \n\t" | |||||
| "vst %%v6, 96(%%r1,%1) \n\t" | |||||
| "vst %%v7, 112(%%r1,%1) \n\t" | |||||
| "vl %%v0, 128(%%r1,%2) \n\t" | |||||
| "vl %%v1, 144(%%r1,%2) \n\t" | |||||
| "vl %%v2, 160(%%r1,%2) \n\t" | |||||
| "vl %%v3, 176(%%r1,%2) \n\t" | |||||
| "vl %%v4, 192(%%r1,%2) \n\t" | |||||
| "vl %%v5, 208(%%r1,%2) \n\t" | |||||
| "vl %%v6, 224(%%r1,%2) \n\t" | |||||
| "vl %%v7, 240(%%r1,%2) \n\t" | |||||
| "vst %%v0, 128(%%r1,%1) \n\t" | |||||
| "vst %%v1, 144(%%r1,%1) \n\t" | |||||
| "vst %%v2, 160(%%r1,%1) \n\t" | |||||
| "vst %%v3, 176(%%r1,%1) \n\t" | |||||
| "vst %%v4, 192(%%r1,%1) \n\t" | |||||
| "vst %%v5, 208(%%r1,%1) \n\t" | |||||
| "vst %%v6, 224(%%r1,%1) \n\t" | |||||
| "vst %%v7, 240(%%r1,%1) \n\t" | |||||
| "vst %%v16, 0(%%r1,%2) \n\t" | |||||
| "vst %%v17, 16(%%r1,%2) \n\t" | |||||
| "vst %%v18, 32(%%r1,%2) \n\t" | |||||
| "vst %%v19, 48(%%r1,%2) \n\t" | |||||
| "vst %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%2) \n\t" | |||||
| "vst %%v24, 128(%%r1,%2) \n\t" | |||||
| "vst %%v25, 144(%%r1,%2) \n\t" | |||||
| "vst %%v26, 160(%%r1,%2) \n\t" | |||||
| "vst %%v27, 176(%%r1,%2) \n\t" | |||||
| "vst %%v28, 192(%%r1,%2) \n\t" | |||||
| "vst %%v29, 208(%%r1,%2) \n\t" | |||||
| "vst %%v30, 224(%%r1,%2) \n\t" | |||||
| "vst %%v31, 240(%%r1,%2) \n\t" | |||||
| "agfi %%r1,256 \n\t" | |||||
| "brctg %%r0,0b " | |||||
| : | |||||
| :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) | |||||
| :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| __asm__("srlg %[n],%[n],4\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 2, 1024(%%r1,%[x])\n\t" | |||||
| "pfd 2, 1024(%%r1,%[y])\n\t" | |||||
| "vl %%v16, 0(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 64(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 112(%%r1,%[x])\n\t" | |||||
| "vl %%v24, 128(%%r1,%[x])\n\t" | |||||
| "vl %%v25, 144(%%r1,%[x])\n\t" | |||||
| "vl %%v26, 160(%%r1,%[x])\n\t" | |||||
| "vl %%v27, 176(%%r1,%[x])\n\t" | |||||
| "vl %%v28, 192(%%r1,%[x])\n\t" | |||||
| "vl %%v29, 208(%%r1,%[x])\n\t" | |||||
| "vl %%v30, 224(%%r1,%[x])\n\t" | |||||
| "vl %%v31, 240(%%r1,%[x])\n\t" | |||||
| "vl %%v0, 0(%%r1,%[y])\n\t" | |||||
| "vl %%v1, 16(%%r1,%[y])\n\t" | |||||
| "vl %%v2, 32(%%r1,%[y])\n\t" | |||||
| "vl %%v3, 48(%%r1,%[y])\n\t" | |||||
| "vl %%v4, 64(%%r1,%[y])\n\t" | |||||
| "vl %%v5, 80(%%r1,%[y])\n\t" | |||||
| "vl %%v6, 96(%%r1,%[y])\n\t" | |||||
| "vl %%v7, 112(%%r1,%[y])\n\t" | |||||
| "vst %%v0, 0(%%r1,%[x])\n\t" | |||||
| "vst %%v1, 16(%%r1,%[x])\n\t" | |||||
| "vst %%v2, 32(%%r1,%[x])\n\t" | |||||
| "vst %%v3, 48(%%r1,%[x])\n\t" | |||||
| "vst %%v4, 64(%%r1,%[x])\n\t" | |||||
| "vst %%v5, 80(%%r1,%[x])\n\t" | |||||
| "vst %%v6, 96(%%r1,%[x])\n\t" | |||||
| "vst %%v7, 112(%%r1,%[x])\n\t" | |||||
| "vl %%v0, 128(%%r1,%[y])\n\t" | |||||
| "vl %%v1, 144(%%r1,%[y])\n\t" | |||||
| "vl %%v2, 160(%%r1,%[y])\n\t" | |||||
| "vl %%v3, 176(%%r1,%[y])\n\t" | |||||
| "vl %%v4, 192(%%r1,%[y])\n\t" | |||||
| "vl %%v5, 208(%%r1,%[y])\n\t" | |||||
| "vl %%v6, 224(%%r1,%[y])\n\t" | |||||
| "vl %%v7, 240(%%r1,%[y])\n\t" | |||||
| "vst %%v0, 128(%%r1,%[x])\n\t" | |||||
| "vst %%v1, 144(%%r1,%[x])\n\t" | |||||
| "vst %%v2, 160(%%r1,%[x])\n\t" | |||||
| "vst %%v3, 176(%%r1,%[x])\n\t" | |||||
| "vst %%v4, 192(%%r1,%[x])\n\t" | |||||
| "vst %%v5, 208(%%r1,%[x])\n\t" | |||||
| "vst %%v6, 224(%%r1,%[x])\n\t" | |||||
| "vst %%v7, 240(%%r1,%[x])\n\t" | |||||
| "vst %%v16, 0(%%r1,%[y])\n\t" | |||||
| "vst %%v17, 16(%%r1,%[y])\n\t" | |||||
| "vst %%v18, 32(%%r1,%[y])\n\t" | |||||
| "vst %%v19, 48(%%r1,%[y])\n\t" | |||||
| "vst %%v20, 64(%%r1,%[y])\n\t" | |||||
| "vst %%v21, 80(%%r1,%[y])\n\t" | |||||
| "vst %%v22, 96(%%r1,%[y])\n\t" | |||||
| "vst %%v23, 112(%%r1,%[y])\n\t" | |||||
| "vst %%v24, 128(%%r1,%[y])\n\t" | |||||
| "vst %%v25, 144(%%r1,%[y])\n\t" | |||||
| "vst %%v26, 160(%%r1,%[y])\n\t" | |||||
| "vst %%v27, 176(%%r1,%[y])\n\t" | |||||
| "vst %%v28, 192(%%r1,%[y])\n\t" | |||||
| "vst %%v29, 208(%%r1,%[y])\n\t" | |||||
| "vst %%v30, 224(%%r1,%[y])\n\t" | |||||
| "vst %%v31, 240(%%r1,%[y])\n\t" | |||||
| "agfi %%r1,256\n\t" | |||||
| "brctg %[n],0b" | |||||
| : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) | |||||
| : [x] "a"(x),[y] "a"(y) | |||||
| : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", | |||||
| "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |||||
| "v27", "v28", "v29", "v30", "v31"); | |||||
| } | } | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT temp[2]; | |||||
| BLASLONG inc_x2, inc_y2; | |||||
| if ( n <= 0 ) return(0); | |||||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||||
| { | |||||
| BLASLONG n1 = n & -16; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| zswap_kernel_16(n1, x, y); | |||||
| i=n1; | |||||
| ix = 2* n1; | |||||
| iy = 2* n1; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| temp[0] = x[ix] ; | |||||
| temp[1] = x[ix+1] ; | |||||
| x[ix] = y[iy] ; | |||||
| x[ix+1] = y[iy+1] ; | |||||
| y[iy] = temp[0] ; | |||||
| y[iy+1] = temp[1] ; | |||||
| ix += 2 ; | |||||
| iy += 2 ; | |||||
| i++ ; | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, | |||||
| FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||||
| FLOAT *dummy, BLASLONG dummy2) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| FLOAT temp[2]; | |||||
| BLASLONG inc_x2, inc_y2; | |||||
| if (n <= 0) | |||||
| return (0); | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| zswap_kernel_16(n1, x, y); | |||||
| i = n1; | |||||
| ix = 2 * n1; | |||||
| iy = 2 * n1; | |||||
| } | |||||
| while (i < n) { | |||||
| } | |||||
| temp[0] = x[ix]; | |||||
| temp[1] = x[ix + 1]; | |||||
| x[ix] = y[iy]; | |||||
| x[ix + 1] = y[iy + 1]; | |||||
| y[iy] = temp[0]; | |||||
| y[iy + 1] = temp[1]; | |||||
| ix += 2; | |||||
| iy += 2; | |||||
| i++; | |||||
| } | } | ||||
| else | |||||
| { | |||||
| inc_x2 = 2 * inc_x; | |||||
| inc_y2 = 2 * inc_y; | |||||
| } else { | |||||
| while(i < n) | |||||
| { | |||||
| inc_x2 = 2 * inc_x; | |||||
| inc_y2 = 2 * inc_y; | |||||
| temp[0] = x[ix] ; | |||||
| temp[1] = x[ix+1] ; | |||||
| x[ix] = y[iy] ; | |||||
| x[ix+1] = y[iy+1] ; | |||||
| y[iy] = temp[0] ; | |||||
| y[iy+1] = temp[1] ; | |||||
| while (i < n) { | |||||
| ix += inc_x2 ; | |||||
| iy += inc_y2 ; | |||||
| i++ ; | |||||
| temp[0] = x[ix]; | |||||
| temp[1] = x[ix + 1]; | |||||
| x[ix] = y[iy]; | |||||
| x[ix + 1] = y[iy + 1]; | |||||
| y[iy] = temp[0]; | |||||
| y[iy + 1] = temp[1]; | |||||
| } | |||||
| ix += inc_x2; | |||||
| iy += inc_y2; | |||||
| i++; | |||||
| } | } | ||||
| return(0); | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||