Browse Source

[ZARCH] Format source code, Fix constraints

tags/v0.3.6^2
maamountki GitHub 6 years ago
parent
commit
81daf6bc38
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
67 changed files with 13393 additions and 14601 deletions
  1. +172
    -198
      kernel/zarch/camax.c
  2. +172
    -198
      kernel/zarch/camin.c
  3. +112
    -124
      kernel/zarch/casum.c
  4. +108
    -124
      kernel/zarch/caxpy.c
  5. +45
    -57
      kernel/zarch/ccopy.c
  6. +124
    -130
      kernel/zarch/cdot.c
  7. +632
    -631
      kernel/zarch/cgemv_n_4.c
  8. +612
    -567
      kernel/zarch/cgemv_t_4.c
  9. +196
    -217
      kernel/zarch/crot.c
  10. +327
    -357
      kernel/zarch/cscal.c
  11. +124
    -139
      kernel/zarch/cswap.c
  12. +102
    -118
      kernel/zarch/damax.c
  13. +136
    -156
      kernel/zarch/damax_z13.c
  14. +102
    -118
      kernel/zarch/damin.c
  15. +136
    -156
      kernel/zarch/damin_z13.c
  16. +121
    -127
      kernel/zarch/dasum.c
  17. +118
    -135
      kernel/zarch/daxpy.c
  18. +35
    -41
      kernel/zarch/dcopy.c
  19. +100
    -96
      kernel/zarch/ddot.c
  20. +558
    -642
      kernel/zarch/dgemv_n_4.c
  21. +663
    -734
      kernel/zarch/dgemv_t_4.c
  22. +101
    -113
      kernel/zarch/dmax.c
  23. +118
    -134
      kernel/zarch/dmax_z13.c
  24. +101
    -113
      kernel/zarch/dmin.c
  25. +118
    -134
      kernel/zarch/dmin_z13.c
  26. +180
    -201
      kernel/zarch/drot.c
  27. +125
    -153
      kernel/zarch/dscal.c
  28. +124
    -122
      kernel/zarch/dsdot.c
  29. +108
    -120
      kernel/zarch/dswap.c
  30. +253
    -262
      kernel/zarch/icamax.c
  31. +253
    -262
      kernel/zarch/icamin.c
  32. +196
    -215
      kernel/zarch/idamax.c
  33. +196
    -215
      kernel/zarch/idamin.c
  34. +185
    -200
      kernel/zarch/idmax.c
  35. +185
    -200
      kernel/zarch/idmin.c
  36. +238
    -258
      kernel/zarch/isamax.c
  37. +238
    -258
      kernel/zarch/isamin.c
  38. +221
    -237
      kernel/zarch/ismax.c
  39. +221
    -237
      kernel/zarch/ismin.c
  40. +204
    -205
      kernel/zarch/izamax.c
  41. +204
    -205
      kernel/zarch/izamin.c
  42. +104
    -121
      kernel/zarch/samax.c
  43. +104
    -121
      kernel/zarch/samin.c
  44. +123
    -129
      kernel/zarch/sasum.c
  45. +118
    -135
      kernel/zarch/saxpy.c
  46. +35
    -41
      kernel/zarch/scopy.c
  47. +96
    -92
      kernel/zarch/sdot.c
  48. +538
    -619
      kernel/zarch/sgemv_n_4.c
  49. +657
    -723
      kernel/zarch/sgemv_t_4.c
  50. +103
    -116
      kernel/zarch/smax.c
  51. +103
    -116
      kernel/zarch/smin.c
  52. +180
    -201
      kernel/zarch/srot.c
  53. +120
    -148
      kernel/zarch/sscal.c
  54. +108
    -122
      kernel/zarch/sswap.c
  55. +157
    -176
      kernel/zarch/zamax.c
  56. +166
    -186
      kernel/zarch/zamax_z13.c
  57. +149
    -168
      kernel/zarch/zamin.c
  58. +158
    -178
      kernel/zarch/zamin_z13.c
  59. +110
    -122
      kernel/zarch/zasum.c
  60. +112
    -120
      kernel/zarch/zaxpy.c
  61. +45
    -57
      kernel/zarch/zcopy.c
  62. +120
    -126
      kernel/zarch/zdot.c
  63. +544
    -603
      kernel/zarch/zgemv_n_4.c
  64. +536
    -563
      kernel/zarch/zgemv_t_4.c
  65. +196
    -217
      kernel/zarch/zrot.c
  66. +323
    -353
      kernel/zarch/zscal.c
  67. +124
    -139
      kernel/zarch/zswap.c

+ 172
- 198
kernel/zarch/camax.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,214 +28,188 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


/*
 * camax_kernel_32 -- vector kernel used by the unit-stride path of camax.
 *
 * NOTE(review): this span is a rendered diff, not one compilable definition.
 * It appears to hold the removed version first (ABS/CABS1 macros, asm with
 * positional operands %1/%2 and r0 as loop counter) followed by the added
 * version (CABS1 via fabsf, asm with named operands %[n]/%[x]); the final
 * "} }" is a context line printed twice. Confirm against the repository.
 *
 * Both copies run the same instruction sequence apart from operand naming
 * and the loop-counter register. The caller passes n as a non-zero multiple
 * of 32 and uses the result as the running maximum of |re| + |im|.
 */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))

/* First copy: positional operands (%0 = amax, %1 = n, %2 = x). */
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT amax;

__asm__ volatile (
"vlef %%v0,0(%2),0 \n\t"
"vlef %%v16,4(%2),0 \n\t"
"vlef %%v0,8(%2),1 \n\t"
"vlef %%v16,12(%2),1 \n\t"
"vlef %%v0,16(%2),2 \n\t"
"vlef %%v16,20(%2),2 \n\t"
"vlef %%v0,24(%2),3 \n\t"
"vlef %%v16,28(%2),3 \n\t"
"vflpsb %%v0,%%v0 \n\t"
"vflpsb %%v16,%%v16 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vleib %%v1,0,0 \n\t"
"vleib %%v1,1,1 \n\t"
"vleib %%v1,2,2 \n\t"
"vleib %%v1,3,3 \n\t"
"vleib %%v1,8,4 \n\t"
"vleib %%v1,9,5 \n\t"
"vleib %%v1,10,6 \n\t"
"vleib %%v1,11,7 \n\t"
"vleib %%v1,16,8 \n\t"
"vleib %%v1,17,9 \n\t"
"vleib %%v1,18,10 \n\t"
"vleib %%v1,19,11 \n\t"
"vleib %%v1,24,12 \n\t"
"vleib %%v1,25,13 \n\t"
"vleib %%v1,26,14 \n\t"
"vleib %%v1,27,15 \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v2,16(%%r1,%2) \n\t"
"vpkg %%v17,%%v16,%%v2 \n\t"
"vperm %%v16,%%v16,%%v2,%%v1 \n\t"

"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v2,48(%%r1,%2) \n\t"
"vpkg %%v19,%%v18,%%v2 \n\t"
"vperm %%v18,%%v18,%%v2,%%v1 \n\t"

"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v2,80(%%r1,%2) \n\t"
"vpkg %%v21,%%v20,%%v2 \n\t"
"vperm %%v20,%%v20,%%v2,%%v1 \n\t"

"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v2,112(%%r1,%2) \n\t"
"vpkg %%v23,%%v22,%%v2 \n\t"
"vperm %%v22,%%v22,%%v2,%%v1 \n\t"

"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v2,144(%%r1,%2) \n\t"
"vpkg %%v25,%%v24,%%v2 \n\t"
"vperm %%v24,%%v24,%%v2,%%v1 \n\t"

"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v2,176(%%r1,%2) \n\t"
"vpkg %%v27,%%v26,%%v2 \n\t"
"vperm %%v26,%%v26,%%v2,%%v1 \n\t"

"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v2,208(%%r1,%2) \n\t"
"vpkg %%v29,%%v28,%%v2 \n\t"
"vperm %%v28,%%v28,%%v2,%%v1 \n\t"

"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v2,240(%%r1,%2) \n\t"
"vpkg %%v31,%%v30,%%v2 \n\t"
"vperm %%v30,%%v30,%%v2,%%v1 \n\t"

"vflpsb %%v16,%%v16 \n\t"
"vflpsb %%v17,%%v17 \n\t"
"vflpsb %%v18,%%v18 \n\t"
"vflpsb %%v19,%%v19 \n\t"
"vflpsb %%v20,%%v20 \n\t"
"vflpsb %%v21,%%v21 \n\t"
"vflpsb %%v22,%%v22 \n\t"
"vflpsb %%v23,%%v23 \n\t"
"vflpsb %%v24,%%v24 \n\t"
"vflpsb %%v25,%%v25 \n\t"
"vflpsb %%v26,%%v26 \n\t"
"vflpsb %%v27,%%v27 \n\t"
"vflpsb %%v28,%%v28 \n\t"
"vflpsb %%v29,%%v29 \n\t"
"vflpsb %%v30,%%v30 \n\t"
"vflpsb %%v31,%%v31 \n\t"

"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v18,%%v18,%%v19 \n\t"
"vfasb %%v20,%%v20,%%v21 \n\t"
"vfasb %%v22,%%v22,%%v23 \n\t"
"vfasb %%v24,%%v24,%%v25 \n\t"
"vfasb %%v26,%%v26,%%v27 \n\t"
"vfasb %%v28,%%v28,%%v29 \n\t"
"vfasb %%v30,%%v30,%%v31 \n\t"
"vfmaxsb %%v16,%%v16,%%v24,0 \n\t"
"vfmaxsb %%v18,%%v18,%%v26,0 \n\t"
"vfmaxsb %%v20,%%v20,%%v28,0 \n\t"
"vfmaxsb %%v22,%%v22,%%v30,0 \n\t"

"vfmaxsb %%v16,%%v16,%%v20,0 \n\t"
"vfmaxsb %%v18,%%v18,%%v22,0 \n\t"

"vfmaxsb %%v16,%%v16,%%v18,0 \n\t"

"vfmaxsb %%v0,%%v0,%%v16,0 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"veslg %%v16,%%v0,32 \n\t"
"vfmaxsb %%v0,%%v0,%%v16,0 \n\t"

"vrepf %%v16,%%v0,2 \n\t"
"wfmaxsb %%v0,%%v0,%%v16,0 \n\t"
"ler %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return amax;
/* Second copy starts here: CABS1 is defined directly in terms of fabsf. */
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))

/* Second copy: named operands; n is an in/out operand used as loop counter. */
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amax;

/* Seed v0 with |re| + |im| of the first four complex elements: even float
   offsets (0,8,16,24) go to v0, odd offsets (4,12,20,28) to v16. */
__asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v16,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v16,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v16,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v16,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v16,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16\n\t"
/* v1 = byte-permute mask picking bytes 0-3, 8-11, 16-19, 24-27 of a
   register pair, i.e. the first 32-bit word of every 64-bit pair. */
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,8,4\n\t"
"vleib %%v1,9,5\n\t"
"vleib %%v1,10,6\n\t"
"vleib %%v1,11,7\n\t"
"vleib %%v1,16,8\n\t"
"vleib %%v1,17,9\n\t"
"vleib %%v1,18,10\n\t"
"vleib %%v1,19,11\n\t"
"vleib %%v1,24,12\n\t"
"vleib %%v1,25,13\n\t"
"vleib %%v1,26,14\n\t"
"vleib %%v1,27,15\n\t"
/* n >>= 5: one loop iteration consumes 256 bytes (32 complex floats);
   r1 is the running byte offset into x. */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* Each vl/vl/vpkg/vperm group de-interleaves 32 bytes: vperm keeps the
   first word of each pair; vpkg presumably packs the second word
   (imaginary parts) -- confirm against the z/Architecture manual. */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v2,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v2\n\t"
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v2,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v2\n\t"
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v2,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v2\n\t"
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v2,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v2\n\t"
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v2,144(%%r1,%[x])\n\t"
"vpkg %%v25,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v2,176(%%r1,%[x])\n\t"
"vpkg %%v27,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v2,208(%%r1,%[x])\n\t"
"vpkg %%v29,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v2,240(%%r1,%[x])\n\t"
"vpkg %%v31,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
/* Absolute value of every lane. */
"vflpsb %%v16,%%v16\n\t"
"vflpsb %%v17,%%v17\n\t"
"vflpsb %%v18,%%v18\n\t"
"vflpsb %%v19,%%v19\n\t"
"vflpsb %%v20,%%v20\n\t"
"vflpsb %%v21,%%v21\n\t"
"vflpsb %%v22,%%v22\n\t"
"vflpsb %%v23,%%v23\n\t"
"vflpsb %%v24,%%v24\n\t"
"vflpsb %%v25,%%v25\n\t"
"vflpsb %%v26,%%v26\n\t"
"vflpsb %%v27,%%v27\n\t"
"vflpsb %%v28,%%v28\n\t"
"vflpsb %%v29,%%v29\n\t"
"vflpsb %%v30,%%v30\n\t"
"vflpsb %%v31,%%v31\n\t"
/* Pairwise add the two halves of each group, then reduce the eight
   results with a max tree into v16 and fold into the running max v0. */
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v18,%%v18,%%v19\n\t"
"vfasb %%v20,%%v20,%%v21\n\t"
"vfasb %%v22,%%v22,%%v23\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v26,%%v26,%%v27\n\t"
"vfasb %%v28,%%v28,%%v29\n\t"
"vfasb %%v30,%%v30,%%v31\n\t"
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
"vfmaxsb %%v20,%%v20,%%v28,0\n\t"
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
/* Advance 256 bytes; decrement n and loop while it is non-zero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Horizontal reduction of the four lanes of v0, result moved to amax. */
"veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
"ler %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");

return amax;
} }

/*
 * CNAME (camax entry point): returns the largest |re| + |im| over n complex
 * elements of x read with stride inc_x (in complex elements). Returns 0.0
 * when n <= 0 or inc_x <= 0.
 *
 * NOTE(review): this span is a rendered diff. Lines duplicated on one row
 * (the signature, "} else {", "} }") are unchanged context; the remaining
 * lines appear to alternate between the removed and the added formatting of
 * the same function, in long runs at the top and short runs in the strided
 * path. It is not compilable as it stands.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
/* First run: presumably the removed formatting of the unit-stride path. */
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT maxf = 0.0;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0) return (maxf);

if (inc_x == 1) {

/* n1 = n rounded down to a multiple of 32, the kernel's block size. */
BLASLONG n1 = n & -32;
if (n1 > 0) {

maxf = camax_kernel_32(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
maxf=CABS1(x,0);
ix += 2;
i++;
}

while (i < n) {
if (CABS1(x,ix) > maxf) {
maxf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (maxf);
/* Second run: presumably the added formatting of the same statements. */
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT maxf = 0.0;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0)
return (maxf);

if (inc_x == 1) {

BLASLONG n1 = n & -32;
if (n1 > 0) {


maxf = camax_kernel_32(n1, x);
ix = n1 * 2;
i = n1;
} else { } else {
/* No full block of 32: seed the maximum from element 0. */
maxf = CABS1(x, 0);
ix += 2;
i++;
}

/* Scalar tail over the elements the kernel did not cover. */
while (i < n) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (maxf);


/* Strided path (inc_x != 1); old and new lines alternate from here on. */
maxf=CABS1(x,0);
inc_x2 = 2 * inc_x;
} else {


BLASLONG n1 = n & -4;
while (i < n1) {
maxf = CABS1(x, 0);
inc_x2 = 2 * inc_x;


if (CABS1(x,ix) > maxf) {
maxf = CABS1(x,ix);
}
if (CABS1(x,ix+inc_x2) > maxf) {
maxf = CABS1(x,ix+inc_x2);
}
if (CABS1(x,ix+inc_x2*2) > maxf) {
maxf = CABS1(x,ix+inc_x2*2);
}
if (CABS1(x,ix+inc_x2*3) > maxf) {
maxf = CABS1(x,ix+inc_x2*3);
}
/* Loop unrolled by four; n1 = n rounded down to a multiple of 4. */
BLASLONG n1 = n & -4;
while (i < n1) {


ix += inc_x2 * 4;
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) > maxf) {
maxf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + inc_x2 * 2) > maxf) {
maxf = CABS1(x, ix + inc_x2 * 2);
}
if (CABS1(x, ix + inc_x2 * 3) > maxf) {
maxf = CABS1(x, ix + inc_x2 * 3);
}


i += 4;
ix += inc_x2 * 4;


}
i += 4;


}


/* Remaining (n mod 4) elements, one at a time. */
while (i < n) {
if (CABS1(x,ix) > maxf) {
maxf = CABS1(x,ix);
}
ix += inc_x2;
i++;
}
return (maxf);
while (i < n) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
ix += inc_x2;
i++;
} }
return (maxf);
}
} }

+ 172
- 198
kernel/zarch/camin.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,214 +28,188 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


/*
 * camin_kernel_32 -- vector kernel used by the unit-stride path of camin.
 *
 * NOTE(review): this span is a rendered diff, not one compilable definition.
 * It appears to hold the removed version first (ABS/CABS1 macros, asm with
 * positional operands %1/%2 and r0 as loop counter) followed by the added
 * version (CABS1 via fabsf, asm with named operands %[n]/%[x]); the final
 * "} }" is a context line printed twice. Confirm against the repository.
 *
 * Both copies run the same instruction sequence apart from operand naming
 * and the loop-counter register. The caller passes n as a non-zero multiple
 * of 32 and uses the result as the running minimum of |re| + |im|.
 */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))

/* First copy: positional operands (%0 = amin, %1 = n, %2 = x). */
static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT amin;

__asm__ volatile (
"vlef %%v0,0(%2),0 \n\t"
"vlef %%v16,4(%2),0 \n\t"
"vlef %%v0,8(%2),1 \n\t"
"vlef %%v16,12(%2),1 \n\t"
"vlef %%v0,16(%2),2 \n\t"
"vlef %%v16,20(%2),2 \n\t"
"vlef %%v0,24(%2),3 \n\t"
"vlef %%v16,28(%2),3 \n\t"
"vflpsb %%v0,%%v0 \n\t"
"vflpsb %%v16,%%v16 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vleib %%v1,0,0 \n\t"
"vleib %%v1,1,1 \n\t"
"vleib %%v1,2,2 \n\t"
"vleib %%v1,3,3 \n\t"
"vleib %%v1,8,4 \n\t"
"vleib %%v1,9,5 \n\t"
"vleib %%v1,10,6 \n\t"
"vleib %%v1,11,7 \n\t"
"vleib %%v1,16,8 \n\t"
"vleib %%v1,17,9 \n\t"
"vleib %%v1,18,10 \n\t"
"vleib %%v1,19,11 \n\t"
"vleib %%v1,24,12 \n\t"
"vleib %%v1,25,13 \n\t"
"vleib %%v1,26,14 \n\t"
"vleib %%v1,27,15 \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v2,16(%%r1,%2) \n\t"
"vpkg %%v17,%%v16,%%v2 \n\t"
"vperm %%v16,%%v16,%%v2,%%v1 \n\t"

"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v2,48(%%r1,%2) \n\t"
"vpkg %%v19,%%v18,%%v2 \n\t"
"vperm %%v18,%%v18,%%v2,%%v1 \n\t"

"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v2,80(%%r1,%2) \n\t"
"vpkg %%v21,%%v20,%%v2 \n\t"
"vperm %%v20,%%v20,%%v2,%%v1 \n\t"

"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v2,112(%%r1,%2) \n\t"
"vpkg %%v23,%%v22,%%v2 \n\t"
"vperm %%v22,%%v22,%%v2,%%v1 \n\t"

"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v2,144(%%r1,%2) \n\t"
"vpkg %%v25,%%v24,%%v2 \n\t"
"vperm %%v24,%%v24,%%v2,%%v1 \n\t"

"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v2,176(%%r1,%2) \n\t"
"vpkg %%v27,%%v26,%%v2 \n\t"
"vperm %%v26,%%v26,%%v2,%%v1 \n\t"

"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v2,208(%%r1,%2) \n\t"
"vpkg %%v29,%%v28,%%v2 \n\t"
"vperm %%v28,%%v28,%%v2,%%v1 \n\t"

"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v2,240(%%r1,%2) \n\t"
"vpkg %%v31,%%v30,%%v2 \n\t"
"vperm %%v30,%%v30,%%v2,%%v1 \n\t"

"vflpsb %%v16,%%v16 \n\t"
"vflpsb %%v17,%%v17 \n\t"
"vflpsb %%v18,%%v18 \n\t"
"vflpsb %%v19,%%v19 \n\t"
"vflpsb %%v20,%%v20 \n\t"
"vflpsb %%v21,%%v21 \n\t"
"vflpsb %%v22,%%v22 \n\t"
"vflpsb %%v23,%%v23 \n\t"
"vflpsb %%v24,%%v24 \n\t"
"vflpsb %%v25,%%v25 \n\t"
"vflpsb %%v26,%%v26 \n\t"
"vflpsb %%v27,%%v27 \n\t"
"vflpsb %%v28,%%v28 \n\t"
"vflpsb %%v29,%%v29 \n\t"
"vflpsb %%v30,%%v30 \n\t"
"vflpsb %%v31,%%v31 \n\t"

"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v18,%%v18,%%v19 \n\t"
"vfasb %%v20,%%v20,%%v21 \n\t"
"vfasb %%v22,%%v22,%%v23 \n\t"
"vfasb %%v24,%%v24,%%v25 \n\t"
"vfasb %%v26,%%v26,%%v27 \n\t"
"vfasb %%v28,%%v28,%%v29 \n\t"
"vfasb %%v30,%%v30,%%v31 \n\t"
"vfminsb %%v16,%%v16,%%v24,0 \n\t"
"vfminsb %%v18,%%v18,%%v26,0 \n\t"
"vfminsb %%v20,%%v20,%%v28,0 \n\t"
"vfminsb %%v22,%%v22,%%v30,0 \n\t"

"vfminsb %%v16,%%v16,%%v20,0 \n\t"
"vfminsb %%v18,%%v18,%%v22,0 \n\t"

"vfminsb %%v16,%%v16,%%v18,0 \n\t"

"vfminsb %%v0,%%v0,%%v16,0 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"veslg %%v16,%%v0,32 \n\t"
"vfminsb %%v0,%%v0,%%v16,0 \n\t"

"vrepf %%v16,%%v0,2 \n\t"
"wfminsb %%v0,%%v0,%%v16,0 \n\t"
"ler %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return amin;
/* Second copy starts here: CABS1 is defined directly in terms of fabsf. */
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))

/* Second copy: named operands; n is an in/out operand used as loop counter. */
static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amin;

/* Seed v0 with |re| + |im| of the first four complex elements: even float
   offsets (0,8,16,24) go to v0, odd offsets (4,12,20,28) to v16. */
__asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v16,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v16,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v16,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v16,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v16,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16\n\t"
/* v1 = byte-permute mask picking bytes 0-3, 8-11, 16-19, 24-27 of a
   register pair, i.e. the first 32-bit word of every 64-bit pair. */
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,8,4\n\t"
"vleib %%v1,9,5\n\t"
"vleib %%v1,10,6\n\t"
"vleib %%v1,11,7\n\t"
"vleib %%v1,16,8\n\t"
"vleib %%v1,17,9\n\t"
"vleib %%v1,18,10\n\t"
"vleib %%v1,19,11\n\t"
"vleib %%v1,24,12\n\t"
"vleib %%v1,25,13\n\t"
"vleib %%v1,26,14\n\t"
"vleib %%v1,27,15\n\t"
/* n >>= 5: one loop iteration consumes 256 bytes (32 complex floats);
   r1 is the running byte offset into x. */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* Each vl/vl/vpkg/vperm group de-interleaves 32 bytes: vperm keeps the
   first word of each pair; vpkg presumably packs the second word
   (imaginary parts) -- confirm against the z/Architecture manual. */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v2,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v2\n\t"
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v2,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v2\n\t"
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v2,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v2\n\t"
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v2,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v2\n\t"
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v2,144(%%r1,%[x])\n\t"
"vpkg %%v25,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v2,176(%%r1,%[x])\n\t"
"vpkg %%v27,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v2,208(%%r1,%[x])\n\t"
"vpkg %%v29,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v2,240(%%r1,%[x])\n\t"
"vpkg %%v31,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
/* Absolute value of every lane. */
"vflpsb %%v16,%%v16\n\t"
"vflpsb %%v17,%%v17\n\t"
"vflpsb %%v18,%%v18\n\t"
"vflpsb %%v19,%%v19\n\t"
"vflpsb %%v20,%%v20\n\t"
"vflpsb %%v21,%%v21\n\t"
"vflpsb %%v22,%%v22\n\t"
"vflpsb %%v23,%%v23\n\t"
"vflpsb %%v24,%%v24\n\t"
"vflpsb %%v25,%%v25\n\t"
"vflpsb %%v26,%%v26\n\t"
"vflpsb %%v27,%%v27\n\t"
"vflpsb %%v28,%%v28\n\t"
"vflpsb %%v29,%%v29\n\t"
"vflpsb %%v30,%%v30\n\t"
"vflpsb %%v31,%%v31\n\t"
/* Pairwise add the two halves of each group, then reduce the eight
   results with a min tree into v16 and fold into the running min v0. */
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v18,%%v18,%%v19\n\t"
"vfasb %%v20,%%v20,%%v21\n\t"
"vfasb %%v22,%%v22,%%v23\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v26,%%v26,%%v27\n\t"
"vfasb %%v28,%%v28,%%v29\n\t"
"vfasb %%v30,%%v30,%%v31\n\t"
"vfminsb %%v16,%%v16,%%v24,0\n\t"
"vfminsb %%v18,%%v18,%%v26,0\n\t"
"vfminsb %%v20,%%v20,%%v28,0\n\t"
"vfminsb %%v22,%%v22,%%v30,0\n\t"
"vfminsb %%v16,%%v16,%%v20,0\n\t"
"vfminsb %%v18,%%v18,%%v22,0\n\t"
"vfminsb %%v16,%%v16,%%v18,0\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
/* Advance 256 bytes; decrement n and loop while it is non-zero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Horizontal reduction of the four lanes of v0, result moved to amin. */
"veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,0\n\t"
"ler %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");

return amin;
} }

/*
 * CNAME (camin entry point): returns the smallest |re| + |im| over n complex
 * elements of x read with stride inc_x (in complex elements). Returns 0.0
 * when n <= 0 or inc_x <= 0.
 *
 * NOTE(review): this span is a rendered diff. Lines duplicated on one row
 * (the signature, "} else {", "} }") are unchanged context; the remaining
 * lines appear to alternate between the removed and the added formatting of
 * the same function, in long runs at the top and short runs in the strided
 * path. It is not compilable as it stands.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
/* First run: presumably the removed formatting of the unit-stride path. */
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT minf = 0.0;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0) return (minf);

if (inc_x == 1) {

/* n1 = n rounded down to a multiple of 32, the kernel's block size. */
BLASLONG n1 = n & -32;
if (n1 > 0) {

minf = camin_kernel_32(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
minf=CABS1(x,0);
ix += 2;
i++;
}

while (i < n) {
if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (minf);
/* Second run: presumably the added formatting of the same statements. */
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT minf = 0.0;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0)
return (minf);

if (inc_x == 1) {

BLASLONG n1 = n & -32;
if (n1 > 0) {


minf = camin_kernel_32(n1, x);
ix = n1 * 2;
i = n1;
} else { } else {
/* No full block of 32: seed the minimum from element 0. */
minf = CABS1(x, 0);
ix += 2;
i++;
}

/* Scalar tail over the elements the kernel did not cover. */
while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (minf);


/* Strided path (inc_x != 1); old and new lines alternate from here on. */
minf=CABS1(x,0);
inc_x2 = 2 * inc_x;
} else {


BLASLONG n1 = n & -4;
while (i < n1) {
minf = CABS1(x, 0);
inc_x2 = 2 * inc_x;


if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
if (CABS1(x,ix+inc_x2) < minf) {
minf = CABS1(x,ix+inc_x2);
}
if (CABS1(x,ix+inc_x2*2) < minf) {
minf = CABS1(x,ix+inc_x2*2);
}
if (CABS1(x,ix+inc_x2*3) < minf) {
minf = CABS1(x,ix+inc_x2*3);
}
/* Loop unrolled by four; n1 = n rounded down to a multiple of 4. */
BLASLONG n1 = n & -4;
while (i < n1) {


ix += inc_x2 * 4;
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) < minf) {
minf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + inc_x2 * 2) < minf) {
minf = CABS1(x, ix + inc_x2 * 2);
}
if (CABS1(x, ix + inc_x2 * 3) < minf) {
minf = CABS1(x, ix + inc_x2 * 3);
}


i += 4;
ix += inc_x2 * 4;


}
i += 4;


}


/* Remaining (n mod 4) elements, one at a time. */
while (i < n) {
if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
ix += inc_x2;
i++;
}
return (minf);
while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
ix += inc_x2;
i++;
} }
return (minf);
}
} }

+ 112
- 124
kernel/zarch/casum.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,140 +28,128 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


/*
 * casum_kernel_32 -- vector kernel used by the unit-stride path of casum.
 *
 * NOTE(review): this span is a rendered diff, not one compilable definition.
 * "#define ABS fabsf" and the final "}" are context lines printed twice. The
 * first asm copy (positional operands, four accumulators v0-v3, result via
 * f0) appears to be the removed version; the second (named operands, eight
 * accumulators v24-v31, result stored with vstef) the added one. Confirm
 * against the repository.
 *
 * The caller passes n as a non-zero multiple of 32 and uses the result as
 * the partial sum of |re| + |im| over the first n complex elements.
 */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf #define ABS fabsf
#endif


/* First copy: %0 = asum, %1 = n, %2 = x; r0 is the loop counter. */
static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT asum;
__asm__ (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v2 \n\t"
"vfasb %%v0,%%v0,%%v3 \n\t"
"veslg %%v1,%%v0,32 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vrepf %%v1,%%v0,2 \n\t"
"aebr %%f0,%%f1 \n\t"
"ler %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
return asum;
/* Second copy: named operands; n is an in/out operand used as loop counter. */
static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT asum;
/* Clear the eight vector accumulators v24-v31. */
__asm__("vzero %%v24\n\t"
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
/* n >>= 5: one loop iteration consumes 256 bytes (32 complex floats);
   r1 is the running byte offset into x. */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First 128 bytes: load, take absolute values, accumulate. */
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
/* Second 128 bytes: same steps on offsets 128..240. */
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
/* Advance 256 bytes; decrement n and loop while it is non-zero. */
"agfi %%r1,256\n\t"
"brctg %[n],0b\n\t"
/* Fold the eight accumulators into v24, then add its four lanes
   together and store lane 0 into asum. */
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v24,%%v24,%%v27\n\t"
"vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v24,%%v24,%%v29\n\t"
"vfasb %%v24,%%v24,%%v30\n\t"
"vfasb %%v24,%%v24,%%v31\n\t"
"veslg %%v25,%%v24,32\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vrepf %%v25,%%v24,2\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vstef %%v24,%[asum],0"
: [asum] "=m"(asum),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return asum;
} }


/*
 * CNAME (casum entry point): returns the sum of |re| + |im| over n complex
 * elements of x read with stride inc_x (in complex elements). Returns 0.0
 * when n <= 0 or inc_x <= 0.
 *
 * NOTE(review): this span is a rendered diff; short runs of the removed
 * formatting alternate with the added formatting of the same statements,
 * and "} }" rows are context lines printed twice. It is not compilable as
 * it stands.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0;
BLASLONG ip=0;
FLOAT sumf = 0.0;
BLASLONG n1;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0) return(sumf);
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ip = 0;
FLOAT sumf = 0.0;
BLASLONG n1;
BLASLONG inc_x2;


if ( inc_x == 1 )
{
if (n <= 0 || inc_x <= 0)
return (sumf);


/* n1 = n rounded down to a multiple of 32, the kernel's block size. */
n1 = n & -32;
if ( n1 > 0 )
{
if (inc_x == 1) {


sumf = casum_kernel_32(n1, x);
i=n1;
ip=2*n1;
}
n1 = n & -32;
if (n1 > 0) {


/* Scalar tail over the elements the kernel did not cover. */
while(i < n)
{
sumf += ABS(x[ip]) + ABS(x[ip+1]);
i++;
ip+=2;
}
sumf = casum_kernel_32(n1, x);
i = n1;
ip = 2 * n1;
}


while (i < n) {
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
i++;
ip += 2;
} }
else
{
/* Strided path: step by 2 * inc_x floats per complex element. */
inc_x2 = 2* inc_x;


while(i < n)
{
sumf += ABS(x[ip]) + ABS(x[ip+1]);
ip+=inc_x2;
i++;
}
} else {
inc_x2 = 2 * inc_x;


while (i < n) {
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
ip += inc_x2;
i++;
} }
return(sumf);
}



}
return (sumf);
}

+ 108
- 124
kernel/zarch/caxpy.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,148 +27,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


/*
 * caxpy_kernel_16 -- vector kernel for y += alpha * x on complex floats
 * (with CONJ defined, the conjugated variant), for unit strides. The caller
 * passes n as a non-zero multiple of 16; alpha points at {real, imag}.
 *
 * NOTE(review): this span is a rendered diff, not one compilable definition.
 * The first copy (positional operands %0-%3, r0 as loop counter) appears to
 * be the removed version; the second (named operands) the added one. Rows
 * showing "#if"/"#endif"/"}" twice are context lines printed twice.
 *
 * FIX: the second copy multiplied v24-v31 by v1 without ever writing those
 * registers. They must hold each x vector with the two 32-bit halves of
 * every 64-bit pair swapped ("verllg ...,32"), exactly as the first copy
 * does, so that the imaginary cross terms are formed. The eight missing
 * verllg instructions are restored after the loads and before the first
 * vfmasb group (which overwrites v8-v11 / v16-v19).
 */
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
__asm__(
#if !defined(CONJ) #if !defined(CONJ)
"vlrepf %%v0,0(%3) \n\t"
"vlef %%v1,4(%3),0 \n\t"
"vlef %%v1,4(%3),2 \n\t"
"vflcsb %%v1,%%v1 \n\t"
"vlef %%v1,4(%3),1 \n\t"
"vlef %%v1,4(%3),3 \n\t"
#else
"vlef %%v0,0(%3),1 \n\t"
"vlef %%v0,0(%3),3 \n\t"
"vflcsb %%v0,%%v0 \n\t"
"vlef %%v0,0(%3),0 \n\t"
"vlef %%v0,0(%3),2 \n\t"
"vlrepf %%v1,4(%3) \n\t"
/* Second copy of the alpha set-up. Non-CONJ: v0 = alpha_r in every lane,
   v1 = (-alpha_i, +alpha_i, -alpha_i, +alpha_i). */
"vlrepf %%v0,0(%[alpha])\n\t"
"vlef %%v1,4(%[alpha]),0\n\t"
"vlef %%v1,4(%[alpha]),2\n\t"
"vflcsb %%v1,%%v1\n\t"
"vlef %%v1,4(%[alpha]),1\n\t"
"vlef %%v1,4(%[alpha]),3\n\t"
#else
/* CONJ: v0 = (+alpha_r, -alpha_r, ...), v1 = alpha_i in every lane. */
"vlef %%v0,0(%[alpha]),1\n\t"
"vlef %%v0,0(%[alpha]),3\n\t"
"vflcsb %%v0,%%v0\n\t"
"vlef %%v0,0(%[alpha]),0\n\t"
"vlef %%v0,0(%[alpha]),2\n\t"
"vlrepf %%v1,4(%[alpha])\n\t"
#endif #endif
/* First copy of the loop (positional operands). */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"verllg %%v24,%%v16,32 \n\t"
"verllg %%v25,%%v17,32 \n\t"
"verllg %%v26,%%v18,32 \n\t"
"verllg %%v27,%%v19,32 \n\t"

"vfmasb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmasb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmasb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmasb %%v31,%%v19,%%v0,%%v23 \n\t"

"vfmasb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"

"vst %%v28,0(%%r1,%2) \n\t"
"vst %%v29,16(%%r1,%2) \n\t"
"vst %%v30,32(%%r1,%2) \n\t"
"vst %%v31,48(%%r1,%2) \n\t"

"vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,80(%%r1,%1) \n\t"
"vl %%v18,96(%%r1,%1) \n\t"
"vl %%v19,112(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"verllg %%v24,%%v16,32 \n\t"
"verllg %%v25,%%v17,32 \n\t"
"verllg %%v26,%%v18,32 \n\t"
"verllg %%v27,%%v19,32 \n\t"

"vfmasb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmasb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmasb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmasb %%v31,%%v19,%%v0,%%v23 \n\t"

"vfmasb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"

"vst %%v28,64(%%r1,%2) \n\t"
"vst %%v29,80(%%r1,%2) \n\t"
"vst %%v30,96(%%r1,%2) \n\t"
"vst %%v31,112(%%r1,%2) \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
/* Second copy of the loop (named operands). n >>= 4: one iteration
   consumes 128 bytes (16 complex floats) of x and y; r1 = byte offset. */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
/* x -> v8-v11 and v16-v19; y -> v12-v15 and v20-v23. */
"vl %%v8,0(%%r1,%[x])\n\t"
"vl %%v9,16(%%r1,%[x])\n\t"
"vl %%v10,32(%%r1,%[x])\n\t"
"vl %%v11,48(%%r1,%[x])\n\t"
"vl %%v12,0(%%r1,%[y])\n\t"
"vl %%v13,16(%%r1,%[y])\n\t"
"vl %%v14,32(%%r1,%[y])\n\t"
"vl %%v15,48(%%r1,%[y])\n\t"
"vl %%v16,64(%%r1,%[x])\n\t"
"vl %%v17,80(%%r1,%[x])\n\t"
"vl %%v18,96(%%r1,%[x])\n\t"
"vl %%v19,112(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[y])\n\t"
"vl %%v21,80(%%r1,%[y])\n\t"
"vl %%v22,96(%%r1,%[y])\n\t"
"vl %%v23,112(%%r1,%[y])\n\t"
/* FIX: build the re/im-swapped copies of x in v24-v31 while v8-v11 and
   v16-v19 still hold the unmodified x values. */
"verllg %%v24,%%v8,32\n\t"
"verllg %%v25,%%v9,32\n\t"
"verllg %%v26,%%v10,32\n\t"
"verllg %%v27,%%v11,32\n\t"
"verllg %%v28,%%v16,32\n\t"
"verllg %%v29,%%v17,32\n\t"
"verllg %%v30,%%v18,32\n\t"
"verllg %%v31,%%v19,32\n\t"
/* acc = x * v0 + y */
"vfmasb %%v8,%%v8,%%v0,%%v12\n\t"
"vfmasb %%v9,%%v9,%%v0,%%v13\n\t"
"vfmasb %%v10,%%v10,%%v0,%%v14\n\t"
"vfmasb %%v11,%%v11,%%v0,%%v15\n\t"
"vfmasb %%v16,%%v16,%%v0,%%v20\n\t"
"vfmasb %%v17,%%v17,%%v0,%%v21\n\t"
"vfmasb %%v18,%%v18,%%v0,%%v22\n\t"
"vfmasb %%v19,%%v19,%%v0,%%v23\n\t"
/* acc += swapped(x) * v1 */
"vfmasb %%v8,%%v24,%%v1,%%v8\n\t"
"vfmasb %%v9,%%v25,%%v1,%%v9\n\t"
"vfmasb %%v10,%%v26,%%v1,%%v10\n\t"
"vfmasb %%v11,%%v27,%%v1,%%v11\n\t"
"vfmasb %%v16,%%v28,%%v1,%%v16\n\t"
"vfmasb %%v17,%%v29,%%v1,%%v17\n\t"
"vfmasb %%v18,%%v30,%%v1,%%v18\n\t"
"vfmasb %%v19,%%v31,%%v1,%%v19\n\t"
/* Write the updated y values back. */
"vst %%v8,0(%%r1,%[y])\n\t"
"vst %%v9,16(%%r1,%[y])\n\t"
"vst %%v10,32(%%r1,%[y])\n\t"
"vst %%v11,48(%%r1,%[y])\n\t"
"vst %%v16,64(%%r1,%[y])\n\t"
"vst %%v17,80(%%r1,%[y])\n\t"
"vst %%v18,96(%%r1,%[y])\n\t"
"vst %%v19,112(%%r1,%[y])\n\t"
/* Advance 128 bytes; decrement n and loop while it is non-zero. */
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x),
"m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
"v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
} }


int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT da[2] __attribute__ ((aligned(16)));
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT da[2] __attribute__ ((aligned(16)));


if (n <= 0) return (0);
if (n <= 0)
return (0);


if ((inc_x == 1) && (inc_y == 1)) {
if ((inc_x == 1) && (inc_y == 1)) {


BLASLONG n1 = n & -16;
BLASLONG n1 = n & -16;


if (n1) {
da[0] = da_r;
da[1] = da_i;
caxpy_kernel_16(n1, x, y, da);
ix = 2 * n1;
}
i = n1;
while (i < n) {
if (n1) {
da[0] = da_r;
da[1] = da_i;
caxpy_kernel_16(n1, x, y, da);
ix = 2 * n1;
}
i = n1;
while (i < n) {
#if !defined(CONJ) #if !defined(CONJ)
y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
#else #else
y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
#endif #endif
i++;
ix += 2;

}
return (0);

i++;
ix += 2;


} }
return (0);


inc_x *= 2;
inc_y *= 2;
}


while (i < n) {
inc_x *= 2;
inc_y *= 2;

while (i < n) {


#if !defined(CONJ) #if !defined(CONJ)
y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
#else #else
y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
#endif #endif
ix += inc_x;
iy += inc_y;
i++;
ix += inc_x;
iy += inc_y;
i++;


}
return (0);
}
return (0);


} }



+ 45
- 57
kernel/zarch/ccopy.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,73 +27,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
"srlg %%r0,%0,5 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
"mvc 0(256,%%r2),0(%%r1) \n\t"
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","r2"
);
/*
 * Copies a contiguous run of data from x to y in 256-byte chunks.
 *
 * n is shifted right by 5 to obtain the chunk count; each chunk is moved
 * with one 256-byte MVC, after which both pointers advance by 256 bytes.
 * The count is only tested at the bottom of the loop (brctg), so the body
 * always runs at least once: n must be at least 32. The caller in this
 * file passes n & -32 and skips the call when that value is not positive.
 */
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__("srlg %[n],%[n],5\n\t"
"0:\n\t"
"pfd 1, 1024(%[x])\n\t"
"pfd 2, 1024(%[y])\n\t"
"mvc 0(256,%[y]),0(%[x])\n\t"
"la %[x],256(%[x])\n\t"
"la %[y],256(%[y])\n\t"
"brctg %[n],0b"
/* n is rewritten by srlg and brctg, so it has to be bound as a read-write,
   early-clobber register operand; without the [n] binding the %[n]
   references in the template do not resolve. */
: "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x)
: "cc");
} }


int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;


if ( n <= 0 ) return(0);
if (n <= 0)
return (0);


if ( (inc_x == 1) && (inc_y == 1 ))
{
if ((inc_x == 1) && (inc_y == 1)) {


BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
ccopy_kernel_32(n1, x, y);
i=n1;
ix=n1*2;
iy=n1*2;
}

while(i < n)
{
y[iy] = x[iy] ;
y[iy+1] = x[ix+1] ;
ix+=2;
iy+=2;
i++ ;

}
BLASLONG n1 = n & -32;
if (n1 > 0) {
ccopy_kernel_32(n1, x, y);
i = n1;
ix = n1 * 2;
iy = n1 * 2;
}


while (i < n) {
y[iy] = x[iy];
y[iy + 1] = x[ix + 1];
ix += 2;
iy += 2;
i++;


} }
else
{


BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_y2 = 2 * inc_y;
} else {


while(i < n)
{
y[iy] = x[ix] ;
y[iy+1] = x[ix+1] ;
ix += inc_x2 ;
iy += inc_y2 ;
i++ ;
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_y2 = 2 * inc_y;


}
while (i < n) {
y[iy] = x[ix];
y[iy + 1] = x[ix + 1];
ix += inc_x2;
iy += inc_y2;
i++;


} }
return(0);

}

return (0);
} }

+ 124
- 130
kernel/zarch/cdot.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,156 +27,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
__asm__ volatile(
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"vzero %%v28 \n\t"
"vzero %%v29 \n\t"
"vzero %%v30 \n\t"
"vzero %%v31 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"verllg %%v20,%%v16,32 \n\t"
"verllg %%v21,%%v17,32 \n\t"
"verllg %%v22,%%v18,32 \n\t"
"verllg %%v23,%%v19,32 \n\t"

"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25 \n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26 \n\t"
"vfmasb %%v27,%%v21,%%v1,%%v27 \n\t"
"vfmasb %%v28,%%v18,%%v2,%%v28 \n\t"
"vfmasb %%v29,%%v22,%%v2,%%v29 \n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30 \n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31 \n\t"

"vl %%v16, 64(%%r1,%1) \n\t"
"vl %%v17, 80(%%r1,%1) \n\t"
"vl %%v18, 96(%%r1,%1) \n\t"
"vl %%v19, 112(%%r1,%1) \n\t"
"vl %%v0, 64(%%r1,%2) \n\t"
"vl %%v1, 80(%%r1,%2) \n\t"
"vl %%v2, 96(%%r1,%2) \n\t"
"vl %%v3, 112(%%r1,%2) \n\t"
"verllg %%v20,%%v16,32 \n\t"
"verllg %%v21,%%v17,32 \n\t"
"verllg %%v22,%%v18,32 \n\t"
"verllg %%v23,%%v19,32 \n\t"

"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25 \n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26 \n\t"
"vfmasb %%v27,%%v21,%%v1,%%v27 \n\t"
"vfmasb %%v28,%%v18,%%v2,%%v28 \n\t"
"vfmasb %%v29,%%v22,%%v2,%%v29 \n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30 \n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31 \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"vfasb %%v24,%%v24,%%v26 \n\t"
"vfasb %%v24,%%v24,%%v28 \n\t"
"vfasb %%v24,%%v24,%%v30 \n\t"
"vrepg %%v26,%%v24,1 \n\t"
"vfasb %%v24,%%v24,%%v26 \n\t"
"vfasb %%v25,%%v25,%%v27 \n\t"
"vfasb %%v25,%%v25,%%v29 \n\t"
"vfasb %%v25,%%v25,%%v31 \n\t"
"vrepg %%v27,%%v25,1 \n\t"
"vfasb %%v25,%%v25,%%v27 \n\t"
"vstef %%v24,0(%3),0 \n\t"
"vstef %%v24,4(%3),1 \n\t"
"vstef %%v25,8(%3),1 \n\t"
"vstef %%v25,12(%3),0 "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
/*
 * Vector kernel producing the four partial sums of a unit-stride complex
 * dot product.
 *
 * The loop runs n >> 4 times and consumes 128 bytes of x and of y per
 * iteration. The count is tested only at the bottom of the loop (brctg),
 * so n must be at least 16; the caller in this file passes n & -16 and
 * skips the call when that is zero.
 *
 * On return d[0..3] are overwritten (not accumulated into) with
 *   d[0] = sum of x[2k]     * y[2k]
 *   d[1] = sum of x[2k + 1] * y[2k + 1]
 *   d[2] = sum of x[2k]     * y[2k + 1]
 *   d[3] = sum of x[2k + 1] * y[2k]
 * which is the same layout the scalar tail loop of the caller adds to.
 */
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
/* v24..v31 are the accumulators: even registers take x*y, odd registers
   take (x with each real/imag pair swapped)*y. */
__asm__("vzero %%v24\n\t"
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
/* n becomes the iteration count; r1 is the running byte offset */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%[y])\n\t"
/* first 64 bytes of this iteration */
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
/* rotating each 64-bit element by 32 bits swaps the two 32-bit halves */
"verllg %%v20,%%v16,32\n\t"
"verllg %%v21,%%v17,32\n\t"
"verllg %%v22,%%v18,32\n\t"
"verllg %%v23,%%v19,32\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
"vfmasb %%v27,%%v21,%%v1,%%v27\n\t"
"vfmasb %%v28,%%v18,%%v2,%%v28\n\t"
"vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
/* second 64 bytes of this iteration, same pattern */
"vl %%v16, 64(%%r1,%[x])\n\t"
"vl %%v17, 80(%%r1,%[x])\n\t"
"vl %%v18, 96(%%r1,%[x])\n\t"
"vl %%v19, 112(%%r1,%[x])\n\t"
"vl %%v0, 64(%%r1,%[y])\n\t"
"vl %%v1, 80(%%r1,%[y])\n\t"
"vl %%v2, 96(%%r1,%[y])\n\t"
"vl %%v3, 112(%%r1,%[y])\n\t"
"verllg %%v20,%%v16,32\n\t"
"verllg %%v21,%%v17,32\n\t"
"verllg %%v22,%%v18,32\n\t"
"verllg %%v23,%%v19,32\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
"vfmasb %%v27,%%v21,%%v1,%%v27\n\t"
"vfmasb %%v28,%%v18,%%v2,%%v28\n\t"
"vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b\n\t"
/* reduce the even accumulators into v24, then fold its upper doubleword
   onto the lower one */
"vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v24,%%v24,%%v30\n\t"
"vrepg %%v26,%%v24,1\n\t"
"vfasb %%v24,%%v24,%%v26\n\t"
/* same reduction for the odd (swapped) accumulators into v25 */
"vfasb %%v25,%%v25,%%v27\n\t"
"vfasb %%v25,%%v25,%%v29\n\t"
"vfasb %%v25,%%v25,%%v31\n\t"
"vrepg %%v27,%%v25,1\n\t"
"vfasb %%v25,%%v25,%%v27\n\t"
/* note the element order for v25: element 1 goes to d[2], element 0 to d[3] */
"vstef %%v24,0(%[d]),0\n\t"
"vstef %%v24,4(%[d]),1\n\t"
"vstef %%v25,8(%[d]),1\n\t"
"vstef %%v25,12(%[d]),0"
: "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n)
: [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x),
"m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }


OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i;
BLASLONG ix, iy;
OPENBLAS_COMPLEX_FLOAT result;
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
if (n <= 0) {
CREAL(result) = 0.0;
CIMAG(result) = 0.0;
return (result);
}
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y) {
BLASLONG i;
BLASLONG ix, iy;
OPENBLAS_COMPLEX_FLOAT result;
FLOAT dot[4] __attribute__ ((aligned(16))) = {
0.0, 0.0, 0.0, 0.0};
if (n <= 0) {
CREAL(result) = 0.0;
CIMAG(result) = 0.0;
return (result);


if ((inc_x == 1) && (inc_y == 1)) {
}


BLASLONG n1 = n & -16;
if ((inc_x == 1) && (inc_y == 1)) {


if (n1)
cdot_kernel_16(n1, x, y, dot);
BLASLONG n1 = n & -16;


i = n1;
BLASLONG j = i * 2;
if (n1)
cdot_kernel_16(n1, x, y, dot);


while (i < n) {
i = n1;
BLASLONG j = i * 2;


dot[0] += x[j] * y[j];
dot[1] += x[j + 1] * y[j + 1];
dot[2] += x[j] * y[j + 1];
dot[3] += x[j + 1] * y[j];
while (i < n) {


j += 2;
i++;
dot[0] += x[j] * y[j];
dot[1] += x[j + 1] * y[j + 1];
dot[2] += x[j] * y[j + 1];
dot[3] += x[j + 1] * y[j];


}
j += 2;
i++;


}


} else {
i = 0;
ix = 0;
iy = 0;
inc_x <<= 1;
inc_y <<= 1;
while (i < n) {
} else {
i = 0;
ix = 0;
iy = 0;
inc_x <<= 1;
inc_y <<= 1;
while (i < n) {


dot[0] += x[ix] * y[iy];
dot[1] += x[ix + 1] * y[iy + 1];
dot[2] += x[ix] * y[iy + 1];
dot[3] += x[ix + 1] * y[iy];
dot[0] += x[ix] * y[iy];
dot[1] += x[ix + 1] * y[iy + 1];
dot[2] += x[ix] * y[iy + 1];
dot[3] += x[ix + 1] * y[iy];


ix += inc_x;
iy += inc_y;
i++;
ix += inc_x;
iy += inc_y;
i++;


}
} }
}


#if !defined(CONJ) #if !defined(CONJ)
CREAL(result) = dot[0] - dot[1];
CIMAG(result) = dot[2] + dot[3];
CREAL(result) = dot[0] - dot[1];
CIMAG(result) = dot[2] + dot[3];
#else #else
CREAL(result) = dot[0] + dot[1];
CIMAG(result) = dot[2] - dot[3];
CREAL(result) = dot[0] + dot[1];
CIMAG(result) = dot[2] - dot[3];


#endif #endif


return (result);
return (result);


} }



+ 632
- 631
kernel/zarch/cgemv_n_4.c
File diff suppressed because it is too large
View File


+ 612
- 567
kernel/zarch/cgemv_t_4.c
File diff suppressed because it is too large
View File


+ 196
- 217
kernel/zarch/crot.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,230 +27,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
"vlrepf %%v0,%3 \n\t"
"vlrepf %%v1,%4 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
/*
 * Vector kernel applying a plane rotation in place to unit-stride x and y.
 *
 * *c and *s are each replicated across a vector register; every loaded
 * element is then updated as
 *   x_new = x * c + y * s
 *   y_new = y * c - x * s
 * The loop runs n >> 5 times and processes 256 bytes of x and of y per
 * iteration, in four 64-byte groups. The count is tested only at the
 * bottom of the loop (brctg), so n must be at least 32; the caller in this
 * file passes n & -32 and skips the call when that value is not positive.
 */
static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
/* v0 = c in every lane, v1 = s in every lane */
__asm__("vlrepf %%v0,%[c]\n\t"
"vlrepf %%v1,%[s]\n\t"
/* n becomes the iteration count; r1 is the running byte offset */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
/* group 1: byte offsets 0..63 (v24..v27 = x, v16..v19 = y) */
"vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%[y])\n\t"
/* group 2: byte offsets 64..127 */
"vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v19, 112(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
/* group 3: byte offsets 128..191 */
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%[y])\n\t"
/* group 4: byte offsets 192..255 */
"vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v19, 240(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b"
/* x and y are both read and written, hence the "+m" memory operands */
: "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }


int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT temp[2];
BLASLONG inc_x2;
BLASLONG inc_y2;

if ( n <= 0 ) return(0);

if ( (inc_x == 1) && (inc_y == 1) )
{

BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
FLOAT cosa,sina;
cosa=c;
sina=s;
crot_kernel_32(n1, x, y, &cosa, &sina);
i=n1;
ix=2*n1;
}

while(i < n)
{
temp[0] = c*x[ix] + s*y[ix] ;
temp[1] = c*x[ix+1] + s*y[ix+1] ;
y[ix] = c*y[ix] - s*x[ix] ;
y[ix+1] = c*y[ix+1] - s*x[ix+1] ;
x[ix] = temp[0] ;
x[ix+1] = temp[1] ;

ix += 2 ;
i++ ;
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT c, FLOAT s) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT temp[2];
BLASLONG inc_x2;
BLASLONG inc_y2;

if (n <= 0)
return (0);

if ((inc_x == 1) && (inc_y == 1)) {

BLASLONG n1 = n & -32;
if (n1 > 0) {
FLOAT cosa, sina;
cosa = c;
sina = s;
crot_kernel_32(n1, x, y, &cosa, &sina);
i = n1;
ix = 2 * n1;
}


}
while (i < n) {
temp[0] = c * x[ix] + s * y[ix];
temp[1] = c * x[ix + 1] + s * y[ix + 1];
y[ix] = c * y[ix] - s * x[ix];
y[ix + 1] = c * y[ix + 1] - s * x[ix + 1];
x[ix] = temp[0];
x[ix + 1] = temp[1];


ix += 2;
i++;


} }
else
{
inc_x2 = 2 * inc_x ;
inc_y2 = 2 * inc_y ;
while(i < n)
{
temp[0] = c*x[ix] + s*y[iy] ;
temp[1] = c*x[ix+1] + s*y[iy+1] ;
y[iy] = c*y[iy] - s*x[ix] ;
y[iy+1] = c*y[iy+1] - s*x[ix+1] ;
x[ix] = temp[0] ;
x[ix+1] = temp[1] ;

ix += inc_x2 ;
iy += inc_y2 ;
i++ ;


}
} else {
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
while (i < n) {
temp[0] = c * x[ix] + s * y[iy];
temp[1] = c * x[ix + 1] + s * y[iy + 1];
y[iy] = c * y[iy] - s * x[ix];
y[iy + 1] = c * y[iy + 1] - s * x[ix + 1];
x[ix] = temp[0];
x[ix + 1] = temp[1];

ix += inc_x2;
iy += inc_y2;
i++;


} }
return(0);
}


}
return (0);


}

+ 327
- 357
kernel/zarch/cscal.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013 - 2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,430 +27,400 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vlrepf %%v0,0(%1) \n\t"
"vlef %%v1,4(%1),0 \n\t"
"vlef %%v1,4(%1),2 \n\t"
"vflcsb %%v1,%%v1 \n\t"
"vlef %%v1,4(%1),1 \n\t"
"vlef %%v1,4(%1),3 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"verllg %%v24,%%v16,32 \n\t"
"verllg %%v25,%%v17,32 \n\t"
"verllg %%v26,%%v18,32 \n\t"
"verllg %%v27,%%v19,32 \n\t"
"verllg %%v28,%%v20,32 \n\t"
"verllg %%v29,%%v21,32 \n\t"
"verllg %%v30,%%v22,32 \n\t"
"verllg %%v31,%%v23,32 \n\t"

"vfmsb %%v16,%%v16,%%v0 \n\t"
"vfmsb %%v17,%%v17,%%v0 \n\t"
"vfmsb %%v18,%%v18,%%v0 \n\t"
"vfmsb %%v19,%%v19,%%v0 \n\t"
"vfmsb %%v20,%%v20,%%v0 \n\t"
"vfmsb %%v21,%%v21,%%v0 \n\t"
"vfmsb %%v22,%%v22,%%v0 \n\t"
"vfmsb %%v23,%%v23,%%v0 \n\t"
"vfmasb %%v16,%%v24,%%v1,%%v16 \n\t"
"vfmasb %%v17,%%v25,%%v1,%%v17 \n\t"
"vfmasb %%v18,%%v26,%%v1,%%v18 \n\t"
"vfmasb %%v19,%%v27,%%v1,%%v19 \n\t"
"vfmasb %%v20,%%v28,%%v1,%%v20 \n\t"
"vfmasb %%v21,%%v29,%%v1,%%v21 \n\t"
"vfmasb %%v22,%%v30,%%v1,%%v22 \n\t"
"vfmasb %%v23,%%v31,%%v1,%%v23 \n\t"

"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vlef %%v0,4(%1),0 \n\t"
"vlef %%v0,4(%1),2 \n\t"
"vflcsb %%v0,%%v0 \n\t"
"vlef %%v0,4(%1),1 \n\t"
"vlef %%v0,4(%1),3 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"verllg %%v16,%%v16,32 \n\t"
"verllg %%v17,%%v17,32 \n\t"
"verllg %%v18,%%v18,32 \n\t"
"verllg %%v19,%%v19,32 \n\t"
"verllg %%v20,%%v20,32 \n\t"
"verllg %%v21,%%v21,32 \n\t"
"verllg %%v22,%%v22,32 \n\t"
"verllg %%v23,%%v23,32 \n\t"

"vfmsb %%v16,%%v16,%%v0 \n\t"
"vfmsb %%v17,%%v17,%%v0 \n\t"
"vfmsb %%v18,%%v18,%%v0 \n\t"
"vfmsb %%v19,%%v19,%%v0 \n\t"
"vfmsb %%v20,%%v20,%%v0 \n\t"
"vfmsb %%v21,%%v21,%%v0 \n\t"
"vfmsb %%v22,%%v22,%%v0 \n\t"
"vfmsb %%v23,%%v23,%%v0 \n\t"

"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
/*
 * Vector kernel scaling unit-stride complex x in place by the complex
 * value held in alpha[0] (real part) and alpha[1] (imaginary part).
 *
 * For each pair (x[2k], x[2k + 1]) the result is
 *   x[2k]     = x[2k]     * alpha[0] - x[2k + 1] * alpha[1]
 *   x[2k + 1] = x[2k + 1] * alpha[0] + x[2k]     * alpha[1]
 * The loop runs n >> 4 times and processes 128 bytes of x per iteration.
 * The count is tested only at the bottom of the loop (brctg), so the body
 * always runs at least once and n must be at least 16.
 */
static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) {
/* v0 = alpha[0] in every lane */
__asm__("vlrepf %%v0,0(%[alpha])\n\t"
/* build v1 = { -alpha[1], alpha[1], -alpha[1], alpha[1] }: load lanes 0 and
   2, negate the whole register, then load lanes 1 and 3 un-negated */
"vlef %%v1,4(%[alpha]),0\n\t"
"vlef %%v1,4(%[alpha]),2\n\t"
"vflcsb %%v1,%%v1\n\t"
"vlef %%v1,4(%[alpha]),1\n\t"
"vlef %%v1,4(%[alpha]),3\n\t"
/* n becomes the iteration count; r1 is the running byte offset */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/* v24..v31 = x with the two 32-bit halves of every 64-bit element swapped */
"verllg %%v24,%%v16,32\n\t"
"verllg %%v25,%%v17,32\n\t"
"verllg %%v26,%%v18,32\n\t"
"verllg %%v27,%%v19,32\n\t"
"verllg %%v28,%%v20,32\n\t"
"verllg %%v29,%%v21,32\n\t"
"verllg %%v30,%%v22,32\n\t"
"verllg %%v31,%%v23,32\n\t"
/* x * alpha[0] */
"vfmsb %%v16,%%v16,%%v0\n\t"
"vfmsb %%v17,%%v17,%%v0\n\t"
"vfmsb %%v18,%%v18,%%v0\n\t"
"vfmsb %%v19,%%v19,%%v0\n\t"
"vfmsb %%v20,%%v20,%%v0\n\t"
"vfmsb %%v21,%%v21,%%v0\n\t"
"vfmsb %%v22,%%v22,%%v0\n\t"
"vfmsb %%v23,%%v23,%%v0\n\t"
/* + swapped(x) * (-/+ alpha[1]) */
"vfmasb %%v16,%%v24,%%v1,%%v16\n\t"
"vfmasb %%v17,%%v25,%%v1,%%v17\n\t"
"vfmasb %%v18,%%v26,%%v1,%%v18\n\t"
"vfmasb %%v19,%%v27,%%v1,%%v19\n\t"
"vfmasb %%v20,%%v28,%%v1,%%v20\n\t"
"vfmasb %%v21,%%v29,%%v1,%%v21\n\t"
"vfmasb %%v22,%%v30,%%v1,%%v22\n\t"
"vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }


/*
 * NOTE(review): two kernels are fused in this span (the text looks like a
 * rendered side-by-side diff). The first function has no closing brace of
 * its own; the final "} }" line carries it.
 *
 * cscal_kernel_16_zero_i, positional-operand form (%0 = n, %1 = alpha,
 * %2 = x): multiplies the floats of x in place by alpha[0] only.
 * The loop runs n >> 4 times and each pass covers 128 bytes of x.
 * NOTE(review): assumes FLOAT is a 32-bit float and n is a positive
 * multiple of 16 (the visible caller passes n & -16 only when it is > 0).
 */
static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
/* v0 <- alpha[0] replicated into every 32-bit lane */
"vlrepf %%v0,0(%1) \n\t"
/* r0 <- n >> 4 (loop counter), r1 <- 0 (byte offset into x) */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch hint for x, 1024 bytes ahead of the current offset */
"pfd 2, 1024(%%r1,%2) \n\t"

/* load eight 16-byte vectors of x */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"

/* lane-wise multiply by v0 */
"vfmsb %%v16,%%v16,%%v0 \n\t"
"vfmsb %%v17,%%v17,%%v0 \n\t"
"vfmsb %%v18,%%v18,%%v0 \n\t"
"vfmsb %%v19,%%v19,%%v0 \n\t"
"vfmsb %%v20,%%v20,%%v0 \n\t"
"vfmsb %%v21,%%v21,%%v0 \n\t"
"vfmsb %%v22,%%v22,%%v0 \n\t"
"vfmsb %%v23,%%v23,%%v0 \n\t"

/* store the products back to the same locations */
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"

/* advance the offset by 128 bytes; decrement r0 and loop while non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
/*
 * cscal_kernel_16_zero_r, named-operand form: in-place update of x that
 * uses only alpha[1] (byte offset 4 from alpha). For every adjacent float
 * pair (a, b) in x it stores (-alpha[1] * b, alpha[1] * a).
 * The loop runs n >> 4 times and each pass covers 128 bytes of x; n itself
 * is used as the counter, hence the "+&r" constraint.
 * NOTE(review): assumes FLOAT is a 32-bit float and n is a positive
 * multiple of 16.
 */
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
/* build v0 = (-alpha[1], alpha[1], -alpha[1], alpha[1]): lanes 0 and 2 are
   loaded first, the whole vector is negated, then lanes 1 and 3 are loaded */
__asm__("vlef %%v0,4(%[alpha]),0\n\t"
"vlef %%v0,4(%[alpha]),2\n\t"
"vflcsb %%v0,%%v0\n\t"
"vlef %%v0,4(%[alpha]),1\n\t"
"vlef %%v0,4(%[alpha]),3\n\t"
/* n <- n >> 4 (loop counter), r1 <- 0 (byte offset into x) */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/* rotate each 64-bit element by 32 bits: swaps the two floats of a pair */
"verllg %%v16,%%v16,32\n\t"
"verllg %%v17,%%v17,32\n\t"
"verllg %%v18,%%v18,32\n\t"
"verllg %%v19,%%v19,32\n\t"
"verllg %%v20,%%v20,32\n\t"
"verllg %%v21,%%v21,32\n\t"
"verllg %%v22,%%v22,32\n\t"
"verllg %%v23,%%v23,32\n\t"
/* lane-wise multiply by the sign-alternating alpha[1] vector */
"vfmsb %%v16,%%v16,%%v0\n\t"
"vfmsb %%v17,%%v17,%%v0\n\t"
"vfmsb %%v18,%%v18,%%v0\n\t"
"vfmsb %%v19,%%v19,%%v0\n\t"
"vfmsb %%v20,%%v20,%%v0\n\t"
"vfmsb %%v21,%%v21,%%v0\n\t"
"vfmsb %%v22,%%v22,%%v0\n\t"
"vfmsb %%v23,%%v23,%%v0\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
/* outputs: the 2*n floats of x are read and written; n is modified */
: "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
/* inputs: base address of x, the two floats of alpha, base address of alpha */
: [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
} }


/*
 * NOTE(review): two kernels are fused in this span (the text looks like a
 * rendered side-by-side diff). The first function has no closing brace of
 * its own; the final "} }" line carries it.
 *
 * cscal_kernel_16_zero, positional-operand form (%0 = n, %1 = x): writes
 * zeros over x. The loop runs n >> 4 times and each pass stores 128 bytes.
 * NOTE(review): assumes n is a positive multiple of 16 (the visible caller
 * passes n & -16 only when it is > 0).
 */
static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x)
{
__asm__ volatile(
/* four all-zero vectors used as the store source */
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
/* r0 <- n >> 4 (loop counter), r1 <- 0 (byte offset into x) */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"

/* eight 16-byte stores of zero */
"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
/*
 * cscal_kernel_16_zero_i, named-operand form: multiplies the floats of x in
 * place by alpha[0] only. The loop runs n >> 4 times and each pass covers
 * 128 bytes of x; n itself is the counter, hence the "+&r" constraint.
 * NOTE(review): assumes FLOAT is a 32-bit float and n is a positive
 * multiple of 16.
 */
static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
/* v0 <- alpha[0] replicated into every 32-bit lane */
__asm__("vlrepf %%v0,0(%[alpha])\n\t"
/* n <- n >> 4 (loop counter), r1 <- 0 (byte offset into x) */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/* lane-wise multiply by v0 */
"vfmsb %%v16,%%v16,%%v0\n\t"
"vfmsb %%v17,%%v17,%%v0\n\t"
"vfmsb %%v18,%%v18,%%v0\n\t"
"vfmsb %%v19,%%v19,%%v0\n\t"
"vfmsb %%v20,%%v20,%%v0\n\t"
"vfmsb %%v21,%%v21,%%v0\n\t"
"vfmsb %%v22,%%v22,%%v0\n\t"
"vfmsb %%v23,%%v23,%%v0\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
/* outputs: the 2*n floats of x are read and written; n is modified */
: "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
/* inputs: base address of x, the two floats of alpha, base address of alpha */
: [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
} }


/*
 * NOTE(review): two functions are fused in this span (the text looks like a
 * rendered side-by-side diff). The first function has no closing brace of
 * its own; the final "} }" line carries it.
 *
 * cscal_kernel_inc_8: strided complex scaling in plain C. Each loop pass
 * handles four (real, imaginary) pairs located at x, x + inc_x,
 * x + 2 * inc_x and x + 3 * inc_x, multiplying each by the complex value
 * (alpha[0], alpha[1]). inc_x is used directly as an index into x, i.e. it
 * is a stride in FLOAT units.
 * NOTE(review): i advances by 4 until i >= n, so n is presumably a
 * multiple of 4 (the visible caller passes n & -8).
 */
static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i;
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_x3 = inc_x2 + inc_x;
FLOAT t0, t1, t2, t3;
FLOAT da_r = alpha[0];
FLOAT da_i = alpha[1];

for (i = 0; i < n; i += 4)
{
/* new real parts go to temporaries so the original real values are still
   available for the imaginary-part updates below */
t0 = da_r * x[0] - da_i * x[1];
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1];

/* new imaginary parts, computed from the not-yet-overwritten real parts */
x[1] = da_i * x[0] + da_r * x[1];
x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1];
x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1];
x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1];

/* commit the real parts */
x[0] = t0;
x[inc_x] = t1;
x[inc_x2] = t2;
x[inc_x3] = t3;

x += 4 * inc_x;
}
/*
 * cscal_kernel_16_zero, named-operand form: writes zeros over x. The loop
 * runs n >> 4 times and each pass stores 128 bytes; n itself is the counter,
 * hence the "+&r" constraint. x is declared as a write-only ("=m") region
 * of 2*n FLOATs.
 * NOTE(review): assumes n is a positive multiple of 16.
 */
static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
/* v0 <- all zeros, used as the source of every store */
__asm__("vzero %%v0\n\t"
/* n <- n >> 4 (loop counter), r1 <- 0 (byte offset into x) */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vst %%v0,0(%%r1,%[x])\n\t"
"vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v0,32(%%r1,%[x])\n\t"
"vst %%v0,48(%%r1,%[x])\n\t"
"vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v0,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
: [x] "a"(x)
: "cc", "r1", "v0");
} }


int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0, j = 0;
FLOAT temp0;
FLOAT temp1;
FLOAT alpha[2] __attribute__ ((aligned(16)));

if (inc_x != 1) {
inc_x <<= 1;

if (da_r == 0.0) {

BLASLONG n1 = n & -2;

if (da_i == 0.0) {
static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
BLASLONG inc_x) {
BLASLONG i;
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_x3 = inc_x2 + inc_x;
FLOAT t0, t1, t2, t3;
FLOAT da_r = alpha[0];
FLOAT da_i = alpha[1];

for (i = 0; i < n; i += 4) {
t0 = da_r * x[0] - da_i * x[1];
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1];

x[1] = da_i * x[0] + da_r * x[1];
x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1];
x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1];
x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1];

x[0] = t0;
x[inc_x] = t1;
x[inc_x2] = t2;
x[inc_x3] = t3;

x += 4 * inc_x;
}
}


while (j < n1) {
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0, j = 0;
FLOAT temp0;
FLOAT temp1;
FLOAT alpha[2] __attribute__ ((aligned(16)));


x[i] = 0.0;
x[i + 1] = 0.0;
x[i + inc_x] = 0.0;
x[i + 1 + inc_x] = 0.0;
i += 2 * inc_x;
j += 2;
if (inc_x != 1) {
inc_x <<= 1;


}
if (da_r == 0.0) {


while (j < n) {
BLASLONG n1 = n & -2;


x[i] = 0.0;
x[i + 1] = 0.0;
i += inc_x;
j++;
if (da_i == 0.0) {


}
while (j < n1) {


} else {
x[i] = 0.0;
x[i + 1] = 0.0;
x[i + inc_x] = 0.0;
x[i + 1 + inc_x] = 0.0;
i += 2 * inc_x;
j += 2;


while (j < n1) {
}


temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
temp1 = -da_i * x[i + 1 + inc_x];
x[i + 1 + inc_x] = da_i * x[i + inc_x];
x[i + inc_x] = temp1;
i += 2 * inc_x;
j += 2;
while (j < n) {


}
x[i] = 0.0;
x[i + 1] = 0.0;
i += inc_x;
j++;


while (j < n) {
}


temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
i += inc_x;
j++;
} else {


}
while (j < n1) {


temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
temp1 = -da_i * x[i + 1 + inc_x];
x[i + 1 + inc_x] = da_i * x[i + inc_x];
x[i + inc_x] = temp1;
i += 2 * inc_x;
j += 2;


}


}
while (j < n) {


} else {
temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
i += inc_x;
j++;


}


if (da_i == 0.0) {
BLASLONG n1 = n & -2;
}


while (j < n1) {
} else {


temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
temp1 = da_r * x[i + inc_x];
x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x];
x[i + inc_x] = temp1;
i += 2 * inc_x;
j += 2;
if (da_i == 0.0) {
BLASLONG n1 = n & -2;


}
while (j < n1) {


while (j < n) {
temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
temp1 = da_r * x[i + inc_x];
x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x];
x[i + inc_x] = temp1;
i += 2 * inc_x;
j += 2;


temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
i += inc_x;
j++;
}


}
while (j < n) {


} else {
temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
i += inc_x;
j++;


BLASLONG n1 = n & -8;
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
cscal_kernel_inc_8(n1, alpha, x, inc_x);
j = n1;
i = n1 * inc_x;
}
}


while (j < n) {
} else {


temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
i += inc_x;
j++;
BLASLONG n1 = n & -8;
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
cscal_kernel_inc_8(n1, alpha, x, inc_x);
j = n1;
i = n1 * inc_x;
}


}
while (j < n) {


}
temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
i += inc_x;
j++;


} }


return (0);
}
}


}


BLASLONG n1 = n & -16;
if (n1 > 0) {
return (0);
}


alpha[0] = da_r;
alpha[1] = da_i;
BLASLONG n1 = n & -16;
if (n1 > 0) {


if (da_r == 0.0)
if (da_i == 0)
cscal_kernel_16_zero(n1, x);
else
cscal_kernel_16_zero_r(n1, alpha, x);
else
if (da_i == 0)
cscal_kernel_16_zero_i(n1, alpha, x);
else
cscal_kernel_16(n1, alpha, x);
alpha[0] = da_r;
alpha[1] = da_i;


i = n1 << 1;
j = n1;
}
if (da_r == 0.0)
if (da_i == 0)
cscal_kernel_16_zero(n1, x);
else
cscal_kernel_16_zero_r(n1, alpha, x);
else if (da_i == 0)
cscal_kernel_16_zero_i(n1, alpha, x);
else
cscal_kernel_16(n1, alpha, x);


i = n1 << 1;
j = n1;
}


if (da_r == 0.0) {
if (da_r == 0.0) {


if (da_i == 0.0) {
if (da_i == 0.0) {


while (j < n) {
while (j < n) {


x[i] = 0.0;
x[i + 1] = 0.0;
i += 2;
j++;
x[i] = 0.0;
x[i + 1] = 0.0;
i += 2;
j++;


}
}


} else {
} else {


while (j < n) {
while (j < n) {


temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
i += 2;
j++;
temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
i += 2;
j++;


}
}


}
}


} else {
} else {


if (da_i == 0.0) {
if (da_i == 0.0) {


while (j < n) {
while (j < n) {


temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
i += 2;
j++;
temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
i += 2;
j++;


}
}


} else {
} else {


while (j < n) {
while (j < n) {


temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
i += 2;
j++;
temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
i += 2;
j++;


}

}
}


} }


return (0);
}

return (0);
} }

+ 124
- 139
kernel/zarch/cswap.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,157 +27,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


/*
 * NOTE(review): two versions of the same kernel are fused in this span (the
 * text looks like a rendered side-by-side diff). The first one has no
 * closing brace of its own; the final "} }" line carries it.
 *
 * cswap_kernel_32, positional-operand form (%0 = n, %1 = x, %2 = y):
 * exchanges the contents of x and y. The loop runs n >> 5 times and each
 * pass swaps 256 bytes of each array.
 * NOTE(review): assumes n is a positive multiple of 32 (the visible caller
 * passes n & -32 only when it is > 0).
 */
static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
/* r0 <- n >> 5 (loop counter), r1 <- 0 (byte offset) */
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* hold 256 bytes of x in v16..v31 */
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"

/* copy the first 128 bytes of y into x through v0..v7 */
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"

/* copy the second 128 bytes of y into x through v0..v7 */
"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"

/* write the saved x data (v16..v31) into y */
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
/*
 * cswap_kernel_32, named-operand form: exchanges the contents of x and y.
 * The loop runs n >> 5 times and each pass swaps 256 bytes of each array;
 * n itself is the counter, hence the "+&r" constraint. Both arrays are
 * declared as read-write ("+m") regions of 2*n FLOATs.
 * NOTE(review): assumes n is a positive multiple of 32.
 */
static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
/* n <- n >> 5 (loop counter), r1 <- 0 (byte offset) */
__asm__("srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
/* hold 256 bytes of x in v16..v31 */
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%[x])\n\t"
/* copy the first 128 bytes of y into x through v0..v7 */
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v7, 112(%%r1,%[y])\n\t"
"vst %%v0, 0(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v7, 112(%%r1,%[x])\n\t"
/* copy the second 128 bytes of y into x through v0..v7 */
"vl %%v0, 128(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v7, 240(%%r1,%[y])\n\t"
"vst %%v0, 128(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v7, 240(%%r1,%[x])\n\t"
/* write the saved x data (v16..v31) into y */
"vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v31, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b"
: "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
} }


int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT temp[2];
BLASLONG inc_x2, inc_y2;

if ( n <= 0 ) return(0);

if ( (inc_x == 1) && (inc_y == 1 ))
{

BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
cswap_kernel_32(n1, x, y);
i=n1;
ix = 2* n1;
iy = 2* n1;
}

while(i < n)
{

temp[0] = x[ix] ;
temp[1] = x[ix+1] ;
x[ix] = y[iy] ;
x[ix+1] = y[iy+1] ;
y[iy] = temp[0] ;
y[iy+1] = temp[1] ;

ix += 2 ;
iy += 2 ;
i++ ;
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT temp[2];
BLASLONG inc_x2, inc_y2;

if (n <= 0)
return (0);

if ((inc_x == 1) && (inc_y == 1)) {

BLASLONG n1 = n & -32;
if (n1 > 0) {
cswap_kernel_32(n1, x, y);
i = n1;
ix = 2 * n1;
iy = 2 * n1;
}


while (i < n) {


}
temp[0] = x[ix];
temp[1] = x[ix + 1];
x[ix] = y[iy];
x[ix + 1] = y[iy + 1];
y[iy] = temp[0];
y[iy + 1] = temp[1];


ix += 2;
iy += 2;
i++;


} }
else
{


inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
} else {


while(i < n)
{
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;


temp[0] = x[ix] ;
temp[1] = x[ix+1] ;
x[ix] = y[iy] ;
x[ix+1] = y[iy+1] ;
y[iy] = temp[0] ;
y[iy+1] = temp[1] ;
while (i < n) {


ix += inc_x2 ;
iy += inc_y2 ;
i++ ;
temp[0] = x[ix];
temp[1] = x[ix + 1];
x[ix] = y[iy];
x[ix + 1] = y[iy + 1];
y[iy] = temp[0];
y[iy + 1] = temp[1];


}
ix += inc_x2;
iy += inc_y2;
i++;


} }
return(0);

}


}
return (0);


}

+ 102
- 118
kernel/zarch/damax.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,139 +28,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * damax_kernel_32, positional-operand form (%0 = amax, %1 = n, %2 = x):
 * reduces x to a single value with the vfmaxdb/wfmaxdb maximum instructions
 * and returns its absolute value (lpdr). The loop runs n >> 5 times and
 * each pass reads 256 bytes of x.
 * NOTE(review): the trailing ",8" operand presumably selects a
 * magnitude-based maximum - confirm against the z/Architecture manual.
 * NOTE(review): assumes FLOAT is a 64-bit double and n is a positive
 * multiple of 32 (the visible caller passes n & -32 only when it is > 0).
 */
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT amax;

__asm__ volatile (
/* seed the running result v0 with the first 16 bytes of x */
"vl %%v0,0(%2) \n\t"
/* r0 <- n >> 5 (loop counter), r1 <- 0 (byte offset into x) */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

/* load sixteen 16-byte vectors of x */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
/* pairwise reduction tree: 16 -> 8 vectors */
"vfmaxdb %%v16,%%v16,%%v24,8 \n\t"
"vfmaxdb %%v17,%%v17,%%v25,8 \n\t"
"vfmaxdb %%v18,%%v18,%%v26,8 \n\t"
"vfmaxdb %%v19,%%v19,%%v27,8 \n\t"
"vfmaxdb %%v20,%%v20,%%v28,8 \n\t"
"vfmaxdb %%v21,%%v21,%%v29,8 \n\t"
"vfmaxdb %%v22,%%v22,%%v30,8 \n\t"
"vfmaxdb %%v23,%%v23,%%v31,8 \n\t"

/* 8 -> 4 */
"vfmaxdb %%v16,%%v16,%%v20,8 \n\t"
"vfmaxdb %%v17,%%v17,%%v21,8 \n\t"
"vfmaxdb %%v18,%%v18,%%v22,8 \n\t"
"vfmaxdb %%v19,%%v19,%%v23,8 \n\t"

/* 4 -> 2 */
"vfmaxdb %%v16,%%v16,%%v18,8 \n\t"
"vfmaxdb %%v17,%%v17,%%v19,8 \n\t"

/* 2 -> 1 */
"vfmaxdb %%v16,%%v16,%%v17,8 \n\t"

/* fold this pass into the running result */
"vfmaxdb %%v0,%%v0,%%v16,8 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

/* combine the two 64-bit lanes of v0, then take the absolute value */
"vrepg %%v16,%%v0,1 \n\t"
"wfmaxdb %%v0,%%v0,%%v16,8 \n\t"
"lpdr %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return amax;
}
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;


if (n <= 0 || inc_x <= 0) return (maxf);
/*
 * damax_kernel_32, named-operand form: reduces x to a single value with the
 * vfmaxdb/wfmaxdb maximum instructions and returns its absolute value
 * (lpdr). The loop runs n >> 5 times and each pass reads 256 bytes of x;
 * n itself is the counter, hence the "+&r" constraint.
 * NOTE(review): the trailing ",8" operand presumably selects a
 * magnitude-based maximum - confirm against the z/Architecture manual.
 * NOTE(review): assumes FLOAT is a 64-bit double and n is a positive
 * multiple of 32.
 */
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amax;

/* seed the running result v0 with the first 16 bytes of x */
__asm__("vl %%v0,0(%[x])\n\t"
/* n <- n >> 5 (loop counter), r1 <- 0 (byte offset into x) */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* load sixteen 16-byte vectors of x */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
/* pairwise reduction tree: 16 -> 8 -> 4 -> 2 -> 1 vectors */
"vfmaxdb %%v16,%%v16,%%v24,8\n\t"
"vfmaxdb %%v17,%%v17,%%v25,8\n\t"
"vfmaxdb %%v18,%%v18,%%v26,8\n\t"
"vfmaxdb %%v19,%%v19,%%v27,8\n\t"
"vfmaxdb %%v20,%%v20,%%v28,8\n\t"
"vfmaxdb %%v21,%%v21,%%v29,8\n\t"
"vfmaxdb %%v22,%%v22,%%v30,8\n\t"
"vfmaxdb %%v23,%%v23,%%v31,8\n\t"
"vfmaxdb %%v16,%%v16,%%v20,8\n\t"
"vfmaxdb %%v17,%%v17,%%v21,8\n\t"
"vfmaxdb %%v18,%%v18,%%v22,8\n\t"
"vfmaxdb %%v19,%%v19,%%v23,8\n\t"
"vfmaxdb %%v16,%%v16,%%v18,8\n\t"
"vfmaxdb %%v17,%%v17,%%v19,8\n\t"
"vfmaxdb %%v16,%%v16,%%v17,8\n\t"
/* fold this pass into the running result */
"vfmaxdb %%v0,%%v0,%%v16,8\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* combine the two 64-bit lanes of v0, then take the absolute value */
"vrepg %%v16,%%v0,1\n\t"
"wfmaxdb %%v0,%%v0,%%v16,8\n\t"
"lpdr %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
/* inputs: the n FLOATs of x are read; [x] is the base address */
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

return amax;
}


if (inc_x == 1) {
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;


BLASLONG n1 = n & -32;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (maxf);


maxf = damax_kernel_32(n1, x);
if (inc_x == 1) {


i = n1;
}
else
{
maxf=ABS(x[0]);
i++;
}
BLASLONG n1 = n & -32;
if (n1 > 0) {


while (i < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i++;
}
return (maxf);
maxf = damax_kernel_32(n1, x);


i = n1;
} else { } else {
maxf = ABS(x[0]);
i++;
}

while (i < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i++;
}
return (maxf);


maxf=ABS(x[0]);
} else {


BLASLONG n1 = n & -4;
while (j < n1) {
maxf = ABS(x[0]);


if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
maxf = ABS(x[i + 3 * inc_x]);
}
BLASLONG n1 = n & -4;
while (j < n1) {


i += inc_x * 4;
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
maxf = ABS(x[i + 3 * inc_x]);
}


j += 4;
i += inc_x * 4;


}
j += 4;


}


while (j < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (maxf);
while (j < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i += inc_x;
j++;
} }
return (maxf);
}
} }

+ 136
- 156
kernel/zarch/damax_z13.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,177 +28,157 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * damax_kernel_32 (z13 variant), positional-operand form (%0 = amax,
 * %1 = n, %2 = x): returns the largest absolute value found in x, using
 * explicit absolute value (vflpdb), compare (vfchdb) and select (vsel)
 * steps instead of a dedicated maximum instruction. The loop runs n >> 5
 * times; each pass reads 256 bytes of x in two 128-byte halves.
 * NOTE(review): assumes FLOAT is a 64-bit double and n is a positive
 * multiple of 32 (the visible caller passes n & -32 only when it is > 0).
 */
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT amax;

__asm__ volatile (
/* seed the running maximum v0 with |first 16 bytes of x| */
"vl %%v0,0(%2) \n\t"
"vflpdb %%v0,%%v0 \n\t"
/* r0 <- n >> 5 (loop counter), r1 <- 0 (byte offset into x) */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

/* first half: load 128 bytes and take absolute values */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
/* compare-high builds a lane mask (a > b); vsel then keeps the larger lane */
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

/* fold the first half into the running maximum */
"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

/* second half: same sequence on the next 128 bytes */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

/* pick the larger of the two 64-bit lanes of v0 and copy it to the result */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return amax;
}
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;


if (n <= 0 || inc_x <= 0) return (maxf);
/*
 * damax_kernel_32 (z13 variant), named-operand form: returns the largest
 * absolute value found in x, using explicit absolute value (vflpdb),
 * compare (vfchdb) and select (vsel) steps. The loop runs n >> 5 times;
 * each pass reads 256 bytes of x in two 128-byte halves. n itself is the
 * counter, hence the "+&r" constraint.
 * NOTE(review): assumes FLOAT is a 64-bit double and n is a positive
 * multiple of 32.
 */
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amax;

/* seed the running maximum v0 with |first 16 bytes of x| */
__asm__("vl %%v0,0(%[x])\n\t"
"vflpdb %%v0,%%v0\n\t"
/* n <- n >> 5 (loop counter), r1 <- 0 (byte offset into x) */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* first half: load 128 bytes and take absolute values */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
/* compare-high builds a lane mask (a > b); vsel then keeps the larger lane */
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t"
"vfchdb %%v27,%%v22,%%v23\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
/* fold the first half into the running maximum */
"vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
/* second half: same sequence on the next 128 bytes */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t"
"vfchdb %%v27,%%v22,%%v23\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* pick the larger of the two 64-bit lanes of v0 and copy it to the result */
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
/* inputs: the n FLOATs of x are read; [x] is the base address */
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

return amax;
}


if (inc_x == 1) {
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;


BLASLONG n1 = n & -32;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (maxf);


maxf = damax_kernel_32(n1, x);
if (inc_x == 1) {


i = n1;
}
else
{
maxf=ABS(x[0]);
i++;
}
BLASLONG n1 = n & -32;
if (n1 > 0) {


while (i < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i++;
}
return (maxf);
maxf = damax_kernel_32(n1, x);


i = n1;
} else { } else {
maxf = ABS(x[0]);
i++;
}

while (i < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i++;
}
return (maxf);


maxf=ABS(x[0]);
} else {


BLASLONG n1 = n & -4;
while (j < n1) {
maxf = ABS(x[0]);


if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
maxf = ABS(x[i + 3 * inc_x]);
}
BLASLONG n1 = n & -4;
while (j < n1) {


i += inc_x * 4;
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
maxf = ABS(x[i + 3 * inc_x]);
}


j += 4;
i += inc_x * 4;


}
j += 4;


}


while (j < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (maxf);
while (j < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i += inc_x;
j++;
} }
return (maxf);
}
} }

+ 102
- 118
kernel/zarch/damin.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,139 +28,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * damin_kernel_32, positional-operand form (%0 = amin, %1 = n, %2 = x):
 * reduces x to a single value with the vfmindb/wfmindb minimum instructions
 * and returns its absolute value (lpdr). The loop runs n >> 5 times and
 * each pass reads 256 bytes of x.
 * NOTE(review): the trailing ",8" operand presumably selects a
 * magnitude-based minimum - confirm against the z/Architecture manual.
 * NOTE(review): assumes FLOAT is a 64-bit double and n is a positive
 * multiple of 32 (the visible caller passes n & -32 only when it is > 0).
 */
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT amin;

__asm__ volatile (
/* seed the running result v0 with the first 16 bytes of x */
"vl %%v0,0(%2) \n\t"
/* r0 <- n >> 5 (loop counter), r1 <- 0 (byte offset into x) */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

/* load sixteen 16-byte vectors of x */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
/* pairwise reduction tree: 16 -> 8 vectors */
"vfmindb %%v16,%%v16,%%v24,8 \n\t"
"vfmindb %%v17,%%v17,%%v25,8 \n\t"
"vfmindb %%v18,%%v18,%%v26,8 \n\t"
"vfmindb %%v19,%%v19,%%v27,8 \n\t"
"vfmindb %%v20,%%v20,%%v28,8 \n\t"
"vfmindb %%v21,%%v21,%%v29,8 \n\t"
"vfmindb %%v22,%%v22,%%v30,8 \n\t"
"vfmindb %%v23,%%v23,%%v31,8 \n\t"

/* 8 -> 4 */
"vfmindb %%v16,%%v16,%%v20,8 \n\t"
"vfmindb %%v17,%%v17,%%v21,8 \n\t"
"vfmindb %%v18,%%v18,%%v22,8 \n\t"
"vfmindb %%v19,%%v19,%%v23,8 \n\t"

/* 4 -> 2 */
"vfmindb %%v16,%%v16,%%v18,8 \n\t"
"vfmindb %%v17,%%v17,%%v19,8 \n\t"

/* 2 -> 1 */
"vfmindb %%v16,%%v16,%%v17,8 \n\t"

/* fold this pass into the running result */
"vfmindb %%v0,%%v0,%%v16,8 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

/* combine the two 64-bit lanes of v0, then take the absolute value */
"vrepg %%v16,%%v0,1 \n\t"
"wfmindb %%v0,%%v0,%%v16,8 \n\t"
"lpdr %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return amin;
}
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;


if (n <= 0 || inc_x <= 0) return (minf);
/*
 * Vector kernel: reduces the n contiguous elements at x to one value with
 * the vector floating-point minimum instruction (function code 8) and
 * returns the magnitude of that value.
 *
 * n is consumed in blocks of 32 elements (n >> 5 loop iterations, 256 bytes
 * each).  The loop body executes before the counter is tested, so n must be
 * at least 32; the caller in this file passes n & -32 and only calls when
 * that is > 0.
 *
 * NOTE(review): function code 8 of vfmindb/wfmindb is understood to compare
 * the absolute values of the operands - confirm against the z/Architecture
 * Principles of Operation.
 */
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amin;

/* Seed the running result in v0 with the first 16 bytes of x. */
__asm__("vl %%v0,0(%[x])\n\t"
/* n >>= 5: loop counter = number of 32-element blocks. */
"srlg %[n],%[n],5\n\t"
/* r1 = 0: byte offset into x, advanced by 256 per iteration. */
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* Prefetch (for reading) 1024 bytes ahead of the current position. */
"pfd 1, 1024(%%r1,%[x])\n\t"
/* Load 16 vectors = 256 bytes of x into v16..v31. */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
/* Tree reduction, step 1: 16 vectors -> 8 (v16..v23). */
"vfmindb %%v16,%%v16,%%v24,8\n\t"
"vfmindb %%v17,%%v17,%%v25,8\n\t"
"vfmindb %%v18,%%v18,%%v26,8\n\t"
"vfmindb %%v19,%%v19,%%v27,8\n\t"
"vfmindb %%v20,%%v20,%%v28,8\n\t"
"vfmindb %%v21,%%v21,%%v29,8\n\t"
"vfmindb %%v22,%%v22,%%v30,8\n\t"
"vfmindb %%v23,%%v23,%%v31,8\n\t"
/* Step 2: 8 -> 4 (v16..v19). */
"vfmindb %%v16,%%v16,%%v20,8\n\t"
"vfmindb %%v17,%%v17,%%v21,8\n\t"
"vfmindb %%v18,%%v18,%%v22,8\n\t"
"vfmindb %%v19,%%v19,%%v23,8\n\t"
/* Step 3: 4 -> 2 (v16, v17). */
"vfmindb %%v16,%%v16,%%v18,8\n\t"
"vfmindb %%v17,%%v17,%%v19,8\n\t"
/* Step 4: 2 -> 1 (v16). */
"vfmindb %%v16,%%v16,%%v17,8\n\t"
/* Fold this block's result into the running result v0. */
"vfmindb %%v0,%%v0,%%v16,8\n\t"
/* Advance the byte offset and loop while the block counter is non-zero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Horizontal step: copy element 1 of v0 into v16 and combine it with
   element 0 using the scalar form of the instruction. */
"vrepg %%v16,%%v0,1\n\t"
"wfmindb %%v0,%%v0,%%v16,8\n\t"
/* Load-positive: the returned value is the magnitude of element 0. */
"lpdr %[amin],%%f0"
/* Outputs: the result register, and n, which the asm overwrites with the
   loop counter (early-clobber so it cannot share a register with x). */
: [amin] "=f"(amin),[n] "+&r"(n)
/* Inputs: the n-element array as a memory operand (tells the compiler what
   is read, so no "memory" clobber is needed) and its base address. */
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

return amin;
}


if (inc_x == 1) {
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;


BLASLONG n1 = n & -32;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (minf);


minf = damin_kernel_32(n1, x);
if (inc_x == 1) {


i = n1;
}
else
{
minf=ABS(x[0]);
i++;
}
BLASLONG n1 = n & -32;
if (n1 > 0) {


while (i < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
i++;
}
return (minf);
minf = damin_kernel_32(n1, x);


i = n1;
} else { } else {
minf = ABS(x[0]);
i++;
}

while (i < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
i++;
}
return (minf);


minf=ABS(x[0]);
} else {


BLASLONG n1 = n & -4;
while (j < n1) {
minf = ABS(x[0]);


if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
minf = ABS(x[i + 3 * inc_x]);
}
BLASLONG n1 = n & -4;
while (j < n1) {


i += inc_x * 4;
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
minf = ABS(x[i + 3 * inc_x]);
}


j += 4;
i += inc_x * 4;


}
j += 4;


}


while (j < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (minf);
while (j < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
i += inc_x;
j++;
} }
return (minf);
}
} }

+ 136
- 156
kernel/zarch/damin_z13.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,177 +28,157 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif

static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT amin;

__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"vflpdb %%v0,%%v0 \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return amin;
}
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;


if (n <= 0 || inc_x <= 0) return (minf);
/*
 * Vector kernel: returns the smallest absolute value among the n contiguous
 * elements at x, using only load-positive / compare-high / select (no vector
 * min instruction).
 *
 * n is consumed in blocks of 32 elements (n >> 5 loop iterations, 256 bytes
 * each, handled as two 128-byte halves).  The loop body executes before the
 * counter is tested, so n must be at least 32; the caller in this file
 * passes n & -32 and only calls when that is > 0.
 */
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amin;

/* Seed the running minimum v0 with |first 16 bytes of x|. */
__asm__("vl %%v0,0(%[x])\n\t"
"vflpdb %%v0,%%v0\n\t"
/* n >>= 5: loop counter = number of 32-element blocks. */
"srlg %[n],%[n],5\n\t"
/* r1 = 0: byte offset into x, advanced by 256 per iteration. */
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* Prefetch (for reading) 1024 bytes ahead of the current position. */
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First half: load bytes 0..127 of the block into v16..v23. */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/* Take absolute values in place. */
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
/* Pairwise minimum, 8 -> 4: the mask is set where the odd vector is
   greater, and vsel then keeps the even vector there, the odd one
   elsewhere. */
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t"
"vfchdb %%v27,%%v23,%%v22\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
/* 4 -> 2 (v28, v29). */
"vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
/* 2 -> 1 (v30). */
"vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
/* Fold into the running minimum: keep v30 where v0 > v30, else v0. */
"vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
/* Second half: same sequence for bytes 128..255 of the block. */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t"
"vfchdb %%v27,%%v23,%%v22\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
/* Advance the byte offset and loop while the block counter is non-zero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Horizontal step: replicate element 1 of v0 into v16, then keep the
   smaller of the two in element 0 of v0. */
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
/* Values are already non-negative, so a plain register copy suffices. */
"ldr %[amin],%%f0"
/* Outputs: the result register, and n, which the asm overwrites with the
   loop counter (early-clobber so it cannot share a register with x). */
: [amin] "=f"(amin),[n] "+&r"(n)
/* Inputs: the n-element array as a memory operand (tells the compiler what
   is read, so no "memory" clobber is needed) and its base address. */
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

return amin;
}


if (inc_x == 1) {
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;


BLASLONG n1 = n & -32;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (minf);


minf = damin_kernel_32(n1, x);
if (inc_x == 1) {


i = n1;
}
else
{
minf=ABS(x[0]);
i++;
}
BLASLONG n1 = n & -32;
if (n1 > 0) {


while (i < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
i++;
}
return (minf);
minf = damin_kernel_32(n1, x);


i = n1;
} else { } else {
minf = ABS(x[0]);
i++;
}

while (i < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
i++;
}
return (minf);


minf=ABS(x[0]);
} else {


BLASLONG n1 = n & -4;
while (j < n1) {
minf = ABS(x[0]);


if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
minf = ABS(x[i + 3 * inc_x]);
}
BLASLONG n1 = n & -4;
while (j < n1) {


i += inc_x * 4;
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
minf = ABS(x[i + 3 * inc_x]);
}


j += 4;
i += inc_x * 4;


}
j += 4;


}


while (j < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (minf);
while (j < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
i += inc_x;
j++;
} }
return (minf);
}
} }

+ 121
- 127
kernel/zarch/dasum.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,145 +28,139 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT asum;

__asm__ (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"

"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"

"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"

"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"

"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"

"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v2 \n\t"
"vfadb %%v0,%%v0,%%v3 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);

return asum;
#define ABS fabs

/*
 * Vector kernel: returns the sum of the absolute values of the n contiguous
 * elements at x.
 *
 * n is consumed in blocks of 32 elements (n >> 5 loop iterations, 256 bytes
 * each, handled as two 128-byte halves).  The loop body executes before the
 * counter is tested, so n must be a non-zero multiple of 32.
 *
 * Eight independent accumulators (v24..v31) are used so that consecutive
 * vector adds do not depend on each other; they are combined after the loop.
 */
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
  FLOAT asum;

  __asm__(/* Clear the eight partial-sum accumulators. */
          "vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          /* n >>= 5: loop counter = number of 32-element blocks. */
          "srlg %[n],%[n],5\n\t"
          /* r1 = 0: byte offset into x. */
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* First half: bytes 0..127 of the block. */
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          /* Absolute values in place. */
          "vflpdb %%v16, %%v16\n\t"
          "vflpdb %%v17, %%v17\n\t"
          "vflpdb %%v18, %%v18\n\t"
          "vflpdb %%v19, %%v19\n\t"
          "vflpdb %%v20, %%v20\n\t"
          "vflpdb %%v21, %%v21\n\t"
          "vflpdb %%v22, %%v22\n\t"
          "vflpdb %%v23, %%v23\n\t"
          /* Accumulate, one accumulator per loaded vector. */
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          /* Second half: bytes 128..255 of the block. */
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vflpdb %%v16, %%v16\n\t"
          "vflpdb %%v17, %%v17\n\t"
          "vflpdb %%v18, %%v18\n\t"
          "vflpdb %%v19, %%v19\n\t"
          "vflpdb %%v20, %%v20\n\t"
          "vflpdb %%v21, %%v21\n\t"
          "vflpdb %%v22, %%v22\n\t"
          "vflpdb %%v23, %%v23\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          /* Next block; loop while the block counter is non-zero. */
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* Combine the eight accumulators into v24. */
          "vfadb %%v24,%%v24,%%v25\n\t"
          "vfadb %%v24,%%v24,%%v26\n\t"
          "vfadb %%v24,%%v24,%%v27\n\t"
          "vfadb %%v24,%%v24,%%v28\n\t"
          "vfadb %%v24,%%v24,%%v29\n\t"
          "vfadb %%v24,%%v24,%%v30\n\t"
          "vfadb %%v24,%%v24,%%v31\n\t"
          /* Horizontal add: element 0 += element 1. */
          "vrepg %%v25,%%v24,1\n\t"
          "vfadb %%v24,%%v24,%%v25\n\t"
          /* Store element 0 of v24 straight into the result variable. */
          "vsteg %%v24,%[asum],0"
          /* vsteg only encodes a 12-bit unsigned displacement, so the result
             must use a short-displacement memory constraint ("Q"); a plain
             "m" operand may be given a long-displacement address that the
             assembler rejects.  n is early-clobbered because the asm
             overwrites it with the loop counter. */
          : [asum] "=Q"(asum),[n] "+&r"(n)
          /* The array memory operand tells the compiler exactly what is
             read, so no "memory" clobber is needed. */
          : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return asum;
}


FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT sumf = 0.0;
BLASLONG n1;
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT sumf = 0.0;
BLASLONG n1;


if (n <= 0 || inc_x <= 0) return sumf;

if (inc_x == 1) {

n1 = n & -32;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return sumf;


sumf = dasum_kernel_32(n1, x);
i = n1;
}
if (inc_x == 1) {


while (i < n) {
sumf += ABS(x[i]);
i++;
}
n1 = n & -32;


} else {
BLASLONG n1 = n & -4;
register FLOAT sum1, sum2;
sum1 = 0.0;
sum2 = 0.0;
while (j < n1) {
if (n1 > 0) {


sum1 += ABS(x[i]);
sum2 += ABS(x[i + inc_x]);
sum1 += ABS(x[i + 2 * inc_x]);
sum2 += ABS(x[i + 3 * inc_x]);
sumf = dasum_kernel_32(n1, x);
i = n1;
}


i += inc_x * 4;
j += 4;
while (i < n) {
sumf += ABS(x[i]);
i++;
}


}
sumf = sum1 + sum2;
while (j < n) {
} else {
BLASLONG n1 = n & -4;
register FLOAT sum1, sum2;
sum1 = 0.0;
sum2 = 0.0;
while (j < n1) {


sumf += ABS(x[i]);
i += inc_x;
j++;
}
sum1 += ABS(x[i]);
sum2 += ABS(x[i + inc_x]);
sum1 += ABS(x[i + 2 * inc_x]);
sum2 += ABS(x[i + 3 * inc_x]);


i += inc_x * 4;
j += 4;


} }
return sumf;
}
sumf = sum1 + sum2;
while (j < n) {


sumf += ABS(x[i]);
i += inc_x;
j++;
}


}
return sumf;
}

+ 118
- 135
kernel/zarch/daxpy.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,158 +27,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
"vlrepg %%v0,%3 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vfmadb %%v16,%%v0,%%v16,%%v20 \n\t"
"vfmadb %%v17,%%v0,%%v17,%%v21 \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v22 \n\t"
"vfmadb %%v19,%%v0,%%v19,%%v23 \n\t"

"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,80(%%r1,%1) \n\t"
"vl %%v26,96(%%r1,%1) \n\t"
"vl %%v27,112(%%r1,%1) \n\t"
"vl %%v28,64(%%r1,%2) \n\t"
"vl %%v29,80(%%r1,%2) \n\t"
"vl %%v30,96(%%r1,%2) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"

"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"

"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"

"vl %%v16,128(%%r1,%1) \n\t"
"vl %%v17,144(%%r1,%1) \n\t"
"vl %%v18,160(%%r1,%1) \n\t"
"vl %%v19,176(%%r1,%1) \n\t"
"vl %%v20,128(%%r1,%2) \n\t"
"vl %%v21,144(%%r1,%2) \n\t"
"vl %%v22,160(%%r1,%2) \n\t"
"vl %%v23,176(%%r1,%2) \n\t"
"vfmadb %%v16,%%v0,%%v16,%%v20 \n\t"
"vfmadb %%v17,%%v0,%%v17,%%v21 \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v22 \n\t"
"vfmadb %%v19,%%v0,%%v19,%%v23 \n\t"

"vl %%v24,192(%%r1,%1) \n\t"
"vl %%v25,208(%%r1,%1) \n\t"
"vl %%v26,224(%%r1,%1) \n\t"
"vl %%v27,240(%%r1,%1) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"

"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"
"vst %%v16,128(%%r1,%2) \n\t"
"vst %%v17,144(%%r1,%2) \n\t"
"vst %%v18,160(%%r1,%2) \n\t"
"vst %%v19,176(%%r1,%2) \n\t"
"vst %%v20,192(%%r1,%2) \n\t"
"vst %%v21,208(%%r1,%2) \n\t"
"vst %%v22,224(%%r1,%2) \n\t"
"vst %%v23,240(%%r1,%2) \n\t"

"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
/*
 * Vector kernel: y[i] += (*alpha) * x[i] for the n contiguous elements at
 * x and y.
 *
 * n is consumed in blocks of 32 elements (n >> 5 loop iterations, 256 bytes
 * of each array, handled as two 128-byte halves).  The loop body executes
 * before the counter is tested, so n must be a non-zero multiple of 32.
 */
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
  __asm__(/* Broadcast *alpha into both elements of v0. */
          "vlrepg %%v0,%[alpha]\n\t"
          /* n >>= 5: loop counter = number of 32-element blocks. */
          "srlg %[n],%[n],5\n\t"
          /* r1 = 0: byte offset shared by x and y. */
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          /* Prefetch x for reading and y for writing. */
          "pfd 1, 1024(%%r1,%[x])\n\t"
          "pfd 2, 1024(%%r1,%[y])\n\t"
          /* First half: x into v16..v19 / v24..v27, y into v20..v23 /
             v28..v31. */
          "vl %%v16,0(%%r1,%[x])\n\t"
          "vl %%v17,16(%%r1,%[x])\n\t"
          "vl %%v18,32(%%r1,%[x])\n\t"
          "vl %%v19,48(%%r1,%[x])\n\t"
          "vl %%v20,0(%%r1,%[y])\n\t"
          "vl %%v21,16(%%r1,%[y])\n\t"
          "vl %%v22,32(%%r1,%[y])\n\t"
          "vl %%v23,48(%%r1,%[y])\n\t"
          "vl %%v24,64(%%r1,%[x])\n\t"
          "vl %%v25,80(%%r1,%[x])\n\t"
          "vl %%v26,96(%%r1,%[x])\n\t"
          "vl %%v27,112(%%r1,%[x])\n\t"
          "vl %%v28,64(%%r1,%[y])\n\t"
          "vl %%v29,80(%%r1,%[y])\n\t"
          "vl %%v30,96(%%r1,%[y])\n\t"
          "vl %%v31,112(%%r1,%[y])\n\t"
          /* Fused multiply-add: result = alpha * x + y. */
          "vfmadb %%v16,%%v0,%%v16,%%v20\n\t"
          "vfmadb %%v17,%%v0,%%v17,%%v21\n\t"
          "vfmadb %%v18,%%v0,%%v18,%%v22\n\t"
          "vfmadb %%v19,%%v0,%%v19,%%v23\n\t"
          "vfmadb %%v24,%%v0,%%v24,%%v28\n\t"
          "vfmadb %%v25,%%v0,%%v25,%%v29\n\t"
          "vfmadb %%v26,%%v0,%%v26,%%v30\n\t"
          "vfmadb %%v27,%%v0,%%v27,%%v31\n\t"
          /* Write the results back to y. */
          "vst %%v16,0(%%r1,%[y])\n\t"
          "vst %%v17,16(%%r1,%[y])\n\t"
          "vst %%v18,32(%%r1,%[y])\n\t"
          "vst %%v19,48(%%r1,%[y])\n\t"
          "vst %%v24,64(%%r1,%[y])\n\t"
          "vst %%v25,80(%%r1,%[y])\n\t"
          "vst %%v26,96(%%r1,%[y])\n\t"
          "vst %%v27,112(%%r1,%[y])\n\t"
          /* Second half: same sequence for bytes 128..255. */
          "vl %%v16,128(%%r1,%[x])\n\t"
          "vl %%v17,144(%%r1,%[x])\n\t"
          "vl %%v18,160(%%r1,%[x])\n\t"
          "vl %%v19,176(%%r1,%[x])\n\t"
          "vl %%v20,128(%%r1,%[y])\n\t"
          "vl %%v21,144(%%r1,%[y])\n\t"
          "vl %%v22,160(%%r1,%[y])\n\t"
          "vl %%v23,176(%%r1,%[y])\n\t"
          "vl %%v24,192(%%r1,%[x])\n\t"
          "vl %%v25,208(%%r1,%[x])\n\t"
          "vl %%v26,224(%%r1,%[x])\n\t"
          "vl %%v27,240(%%r1,%[x])\n\t"
          "vl %%v28,192(%%r1,%[y])\n\t"
          "vl %%v29,208(%%r1,%[y])\n\t"
          "vl %%v30,224(%%r1,%[y])\n\t"
          "vl %%v31,240(%%r1,%[y])\n\t"
          "vfmadb %%v16,%%v0,%%v16,%%v20\n\t"
          "vfmadb %%v17,%%v0,%%v17,%%v21\n\t"
          "vfmadb %%v18,%%v0,%%v18,%%v22\n\t"
          "vfmadb %%v19,%%v0,%%v19,%%v23\n\t"
          "vfmadb %%v24,%%v0,%%v24,%%v28\n\t"
          "vfmadb %%v25,%%v0,%%v25,%%v29\n\t"
          "vfmadb %%v26,%%v0,%%v26,%%v30\n\t"
          "vfmadb %%v27,%%v0,%%v27,%%v31\n\t"
          "vst %%v16,128(%%r1,%[y])\n\t"
          "vst %%v17,144(%%r1,%[y])\n\t"
          "vst %%v18,160(%%r1,%[y])\n\t"
          "vst %%v19,176(%%r1,%[y])\n\t"
          "vst %%v24,192(%%r1,%[y])\n\t"
          "vst %%v25,208(%%r1,%[y])\n\t"
          "vst %%v26,224(%%r1,%[y])\n\t"
          "vst %%v27,240(%%r1,%[y])\n\t"
          /* Next block; loop while the block counter is non-zero. */
          "agfi %%r1,256\n\t"
          "brctg %[n],0b"
          /* y is read and written through its array memory operand; n is
             early-clobbered because the asm overwrites it with the loop
             counter. */
          : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n)
          /* vlrepg only encodes a 12-bit unsigned displacement, so *alpha
             must use a short-displacement memory constraint ("Q"); a plain
             "m" operand may be given a long-displacement address that the
             assembler rejects. */
          : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),
            [alpha] "Q"(*alpha)
          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
            "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}


int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;


if ( n <= 0 ) return 0 ;
if (n <= 0)
return 0;


if ( (inc_x == 1) && (inc_y == 1) )
{
if ((inc_x == 1) && (inc_y == 1)) {


BLASLONG n1 = n & -32;
BLASLONG n1 = n & -32;


if ( n1 )
daxpy_kernel_32(n1, x, y , &da);
if (n1)
daxpy_kernel_32(n1, x, y, &da);


i = n1;
while(i < n)
{

y[i] += da * x[i] ;
i++ ;

}
return 0 ;
i = n1;
while (i < n) {


y[i] += da * x[i];
i++;


} }
return 0;


BLASLONG n1 = n & -4;
}


while(i < n1)
{
BLASLONG n1 = n & -4;


FLOAT m1 = da * x[ix] ;
FLOAT m2 = da * x[ix+inc_x] ;
FLOAT m3 = da * x[ix+2*inc_x] ;
FLOAT m4 = da * x[ix+3*inc_x] ;
while (i < n1) {


y[iy] += m1 ;
y[iy+inc_y] += m2 ;
y[iy+2*inc_y] += m3 ;
y[iy+3*inc_y] += m4 ;
FLOAT m1 = da * x[ix];
FLOAT m2 = da * x[ix + inc_x];
FLOAT m3 = da * x[ix + 2 * inc_x];
FLOAT m4 = da * x[ix + 3 * inc_x];


ix += inc_x*4 ;
iy += inc_y*4 ;
i+=4 ;
y[iy] += m1;
y[iy + inc_y] += m2;
y[iy + 2 * inc_y] += m3;
y[iy + 3 * inc_y] += m4;


}
ix += inc_x * 4;
iy += inc_y * 4;
i += 4;


while(i < n)
{
}


y[iy] += da * x[ix] ;
ix += inc_x ;
iy += inc_y ;
i++ ;
while (i < n) {


}
return 0 ;
}
y[iy] += da * x[ix];
ix += inc_x;
iy += inc_y;
i++;


}
return 0;


}

+ 35
- 41
kernel/zarch/dcopy.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,59 +27,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
"srlg %%r0,%0,5 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
"mvc 0(256,%%r2),0(%%r1) \n\t"
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","r2"
);
/*
 * Block-copy kernel: copies the n contiguous elements at x to y, 256 bytes
 * (32 elements) per iteration, using the storage-to-storage move
 * instruction.
 *
 * The loop body executes before the counter is tested, so n must be a
 * non-zero multiple of 32.
 */
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
  __asm__(/* n >>= 5: loop counter = number of 256-byte blocks. */
          "srlg %[n],%[n],5\n\t"
          "0:\n\t"
          /* Prefetch the source for reading and the target for writing. */
          "pfd 1, 1024(%[x])\n\t"
          "pfd 2, 1024(%[y])\n\t"
          /* Move 256 bytes from x to y. */
          "mvc 0(256,%[y]),0(%[x])\n\t"
          /* Advance both pointers by one block. */
          "la %[x],256(%[x])\n\t"
          "la %[y],256(%[y])\n\t"
          /* Loop while the block counter is non-zero. */
          "brctg %[n],0b"
          /* The template names %[n], so n has to be declared as an operand;
             it is read-write and early-clobbered because the asm overwrites
             it with the loop counter.  x and y are likewise modified in
             place.  The array memory operands describe exactly what is
             written and read, so no "memory" clobber is needed. */
          : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n)
          : "m"(*(const FLOAT (*)[n]) x)
          : "cc");
}


int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;


if (n <= 0) return 0;

if ((inc_x == 1) && (inc_y == 1)) {

BLASLONG n1 = n & -32;
if (n1 > 0) {
dcopy_kernel_32(n1, x, y);
i = n1;
}
if (n <= 0)
return 0;


while (i < n) {
y[i] = x[i];
i++;
if ((inc_x == 1) && (inc_y == 1)) {


}
BLASLONG n1 = n & -32;
if (n1 > 0) {
dcopy_kernel_32(n1, x, y);
i = n1;
}


while (i < n) {
y[i] = x[i];
i++;


} else {
}


while (i < n) {
} else {


y[iy] = x[ix];
ix += inc_x;
iy += inc_y;
i++;
while (i < n) {


}
y[iy] = x[ix];
ix += inc_x;
iy += inc_y;
i++;


} }
return 0;


}
return 0;


} }

+ 100
- 96
kernel/zarch/ddot.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,123 +27,127 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
FLOAT dot;

__asm__ volatile (
"vzero %%v0 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"

"vl %%v24,0(%%r1,%3) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,16(%%r1,%3) \n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
"vl %%v27,48(%%r1,%3) \n\t"
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
"vl %%v28,64(%%r1,%3) \n\t"
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%3) \n\t"
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
"vl %%v30,96(%%r1,%3) \n\t"
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return dot;
/*
 * Vector kernel: returns the dot product of the n contiguous elements at x
 * and y.
 *
 * n is consumed in blocks of 16 elements (n >> 4 loop iterations, 128 bytes
 * of each array).  The loop body executes before the counter is tested, so
 * n must be at least 16; the caller in this file passes n & -16 and only
 * calls when that is non-zero.
 *
 * Eight independent accumulators (v0..v7) are used so that consecutive
 * multiply-adds do not depend on each other; they are summed after the loop.
 */
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
FLOAT dot;

/* Clear the eight partial-sum accumulators. */
__asm__("vzero %%v0\n\t"
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
/* n >>= 4: loop counter = number of 16-element blocks. */
"srlg %[n],%[n],4\n\t"
/* r1 = 0: byte offset shared by x and y. */
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* Prefetch both inputs for reading. */
"pfd 1,1024(%%r1,%[x])\n\t"
"pfd 1,1024(%%r1,%[y])\n\t"
/* Load 128 bytes of x into v16..v23. */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/* Load the matching 128 bytes of y into v24..v31. */
"vl %%v24,0(%%r1,%[y])\n\t"
"vl %%v25,16(%%r1,%[y])\n\t"
"vl %%v26,32(%%r1,%[y])\n\t"
"vl %%v27,48(%%r1,%[y])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
/* Fused multiply-add: acc_k += x_k * y_k, one accumulator per pair. */
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
"vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
"vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
"vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
"vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
"vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
"vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
/* Next block; loop while the block counter is non-zero. */
"agfi %%r1,128\n\t"
"brctg %[n],0b\n\t"
/* Combine the eight accumulators into v0. */
"vfadb %%v0,%%v0,%%v1\n\t"
"vfadb %%v0,%%v0,%%v2\n\t"
"vfadb %%v0,%%v0,%%v3\n\t"
"vfadb %%v0,%%v0,%%v4\n\t"
"vfadb %%v0,%%v0,%%v5\n\t"
"vfadb %%v0,%%v0,%%v6\n\t"
"vfadb %%v0,%%v0,%%v7\n\t"
/* Horizontal add: replicate element 1 into v1, then add it to element 0
   with the scalar add. */
"vrepg %%v1,%%v0,1\n\t"
"adbr %%f0,%%f1\n\t"
/* Copy the scalar result into the output register. */
"ldr %[dot],%%f0"
/* Outputs: the result register, and n, which the asm overwrites with the
   loop counter (early-clobber so it cannot share a register with x or y). */
: [dot] "=f"(dot),[n] "+&r"(n)
/* Inputs: both n-element arrays as memory operands (tells the compiler what
   is read, so no "memory" clobber is needed) plus their base addresses. */
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y),
[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");

return dot;
} }


FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;


FLOAT dot = 0.0 ;
FLOAT dot = 0.0;


if ( n <= 0 ) return(dot);
if (n <= 0)
return (dot);


if ( (inc_x == 1) && (inc_y == 1) )
{
if ((inc_x == 1) && (inc_y == 1)) {


BLASLONG n1 = n & -16;
BLASLONG n1 = n & -16;


if ( n1 )
dot = ddot_kernel_16(n1, x, y);
if (n1)
dot = ddot_kernel_16(n1, x, y);


i = n1;
while(i < n)
{

dot += y[i] * x[i] ;
i++ ;

}
return(dot);
i = n1;
while (i < n) {


dot += y[i] * x[i];
i++;


} }
return (dot);


FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
}


BLASLONG n1 = n & -4;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;


while(i < n1)
{
BLASLONG n1 = n & -4;


FLOAT m1 = y[iy] * x[ix] ;
FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ;
while (i < n1) {


FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ;
FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ;
FLOAT m1 = y[iy] * x[ix];
FLOAT m2 = y[iy + inc_y] * x[ix + inc_x];


ix += inc_x*4 ;
iy += inc_y*4 ;
FLOAT m3 = y[iy + 2 * inc_y] * x[ix + 2 * inc_x];
FLOAT m4 = y[iy + 3 * inc_y] * x[ix + 3 * inc_x];


temp1 += m1+m3;
temp2 += m2+m4;
ix += inc_x * 4;
iy += inc_y * 4;


i+=4 ;
temp1 += m1 + m3;
temp2 += m2 + m4;


}
i += 4;


while(i < n)
{
}


temp1 += y[iy] * x[ix] ;
ix += inc_x ;
iy += inc_y ;
i++ ;
while (i < n) {


}
dot = temp1 + temp2;
return(dot);

}
temp1 += y[iy] * x[ix];
ix += inc_x;
iy += inc_y;
i++;


}
dot = temp1 + temp2;
return (dot);


}

+ 558
- 642
kernel/zarch/dgemv_n_4.c
File diff suppressed because it is too large
View File


+ 663
- 734
kernel/zarch/dgemv_t_4.c
File diff suppressed because it is too large
View File


+ 101
- 113
kernel/zarch/dmax.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,133 +27,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT max;

__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmaxdb %%v16,%%v16,%%v24,0 \n\t"
"vfmaxdb %%v17,%%v17,%%v25,0 \n\t"
"vfmaxdb %%v18,%%v18,%%v26,0 \n\t"
"vfmaxdb %%v19,%%v19,%%v27,0 \n\t"
"vfmaxdb %%v20,%%v20,%%v28,0 \n\t"
"vfmaxdb %%v21,%%v21,%%v29,0 \n\t"
"vfmaxdb %%v22,%%v22,%%v30,0 \n\t"
"vfmaxdb %%v23,%%v23,%%v31,0 \n\t"

"vfmaxdb %%v16,%%v16,%%v20,0 \n\t"
"vfmaxdb %%v17,%%v17,%%v21,0 \n\t"
"vfmaxdb %%v18,%%v18,%%v22,0 \n\t"
"vfmaxdb %%v19,%%v19,%%v23,0 \n\t"

"vfmaxdb %%v16,%%v16,%%v18,0 \n\t"
"vfmaxdb %%v17,%%v17,%%v19,0 \n\t"

"vfmaxdb %%v16,%%v16,%%v17,0 \n\t"

"vfmaxdb %%v0,%%v0,%%v16,0 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v16,%%v0,1 \n\t"
"wfmaxdb %%v0,%%v0,%%v16,0 \n\t"
"ldr %0,%%f0 "
:"=f"(max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return max;
/*
 * Vectorised maximum of x[0..n-1] using the vector FP maximum instruction
 * (vfmaxdb / wfmaxdb).
 *
 * n  element count; it is shifted right by 5 and used directly as the brctg
 *    loop counter, so it is expected to be a non-zero multiple of 32 (the
 *    loop body always runs at least once).
 * x  input vector, read as n contiguous elements.
 *
 * Returns the largest element found.
 *
 * The *db instruction forms work on 64-bit lanes and every loop iteration
 * consumes 256 bytes, i.e. 32 elements -- this assumes FLOAT is double here
 * (TODO confirm against the build configuration).
 *
 * Operands: n is a read/write early-clobber register because the asm
 * overwrites it; the "m" operand tells the compiler that n elements starting
 * at x are read; x itself is passed in an address register. r1 serves as the
 * running byte offset and is listed as clobbered together with the vector
 * registers used.
 */
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT max;

__asm__("vl %%v0,0(%[x])\n\t" /* seed the running maximum with x[0..1] */
"srlg %[n],%[n],5\n\t" /* n = n / 32 = number of loop iterations */
"xgr %%r1,%%r1\n\t" /* r1 = byte offset into x, starts at 0 */
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* load 16 vectors (256 bytes) starting at x + r1 */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
/* pairwise reduction tree: 16 -> 8 -> 4 -> 2 -> 1 vectors */
"vfmaxdb %%v16,%%v16,%%v24,0\n\t"
"vfmaxdb %%v17,%%v17,%%v25,0\n\t"
"vfmaxdb %%v18,%%v18,%%v26,0\n\t"
"vfmaxdb %%v19,%%v19,%%v27,0\n\t"
"vfmaxdb %%v20,%%v20,%%v28,0\n\t"
"vfmaxdb %%v21,%%v21,%%v29,0\n\t"
"vfmaxdb %%v22,%%v22,%%v30,0\n\t"
"vfmaxdb %%v23,%%v23,%%v31,0\n\t"
"vfmaxdb %%v16,%%v16,%%v20,0\n\t"
"vfmaxdb %%v17,%%v17,%%v21,0\n\t"
"vfmaxdb %%v18,%%v18,%%v22,0\n\t"
"vfmaxdb %%v19,%%v19,%%v23,0\n\t"
"vfmaxdb %%v16,%%v16,%%v18,0\n\t"
"vfmaxdb %%v17,%%v17,%%v19,0\n\t"
"vfmaxdb %%v16,%%v16,%%v17,0\n\t"
/* fold this iteration's result into the running maximum */
"vfmaxdb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t" /* advance the offset by the 256 bytes just consumed */
"brctg %[n], 0b\n\t" /* decrement the counter, loop while non-zero */
/* horizontal step: combine lane 1 of v0 with lane 0 */
"vrepg %%v16,%%v0,1\n\t"
"wfmaxdb %%v0,%%v0,%%v16,0\n\t"
"ldr %[max],%%f0" /* copy the scalar result out of f0 */
: [max] "=f"(max),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

return max;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;

if (n <= 0 || inc_x <= 0) return (maxf);


if (inc_x == 1) {
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;


BLASLONG n1 = n & -32;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (maxf);


maxf = dmax_kernel_32(n1, x);
if (inc_x == 1) {


i = n1;
}
else
{
maxf=x[0];
i++;
}
BLASLONG n1 = n & -32;
if (n1 > 0) {


while (i < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i++;
}
return (maxf);
maxf = dmax_kernel_32(n1, x);


i = n1;
} else { } else {
maxf = x[0];
i++;
}


maxf=x[0];
while (i < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i++;
}
return (maxf);

} else {


BLASLONG n1 = n & -4;
while (j < n1) {
maxf = x[0];


if (x[i] > maxf) {
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
maxf = x[i + 3 * inc_x];
}
BLASLONG n1 = n & -4;
while (j < n1) {


i += inc_x * 4;
if (x[i] > maxf) {
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
maxf = x[i + 3 * inc_x];
}


j += 4;
i += inc_x * 4;


}
j += 4;


}


while (j < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i += inc_x;
j++;
}
return (maxf);
while (j < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i += inc_x;
j++;
} }
return (maxf);
}
} }

+ 118
- 134
kernel/zarch/dmax_z13.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,154 +27,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT max;

__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return max;
/*
 * Vectorised maximum of x[0..n-1] built from compare-high (vfchdb) plus
 * select (vsel) instead of a dedicated vector maximum instruction.
 * NOTE(review): presumably this variant exists because the target machine
 * level lacks vfmaxdb -- confirm against the kernel selection logic.
 *
 * n  element count; it is shifted right by 5 and used directly as the brctg
 *    loop counter, so it is expected to be a non-zero multiple of 32 (the
 *    loop body always runs at least once).
 * x  input vector, read as n contiguous elements.
 *
 * Returns the largest element found.
 *
 * Each iteration handles 256 bytes in two halves of 128 bytes. Within a half,
 * "vfchdb m,a,b" builds a per-lane mask where a > b and "vsel r,a,b,m" takes
 * a where the mask is set and b elsewhere, i.e. the lane-wise maximum. The
 * *db forms work on 64-bit lanes -- this assumes FLOAT is double here
 * (TODO confirm against the build configuration).
 *
 * Operands: n is a read/write early-clobber register because the asm
 * overwrites it; the "m" operand tells the compiler that n elements starting
 * at x are read; x itself is passed in an address register. r1 serves as the
 * running byte offset and is listed as clobbered together with the vector
 * registers used.
 */
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT max;

__asm__("vl %%v0,0(%[x])\n\t" /* seed the running maximum with x[0..1] */
"srlg %[n],%[n],5\n\t" /* n = n / 32 = number of loop iterations */
"xgr %%r1,%%r1\n\t" /* r1 = byte offset into x, starts at 0 */
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* first half: bytes 0..127 of the current 256-byte chunk */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t"
"vfchdb %%v27,%%v22,%%v23\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
/* fold the half's result into the running maximum in v0 */
"vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
/* second half: bytes 128..255 of the current chunk, same reduction */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t"
"vfchdb %%v27,%%v22,%%v23\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t" /* advance the offset by the 256 bytes just consumed */
"brctg %[n], 0b\n\t" /* decrement the counter, loop while non-zero */
/* horizontal step: pick the larger of lane 0 and lane 1 of v0 */
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[max],%%f0" /* copy the scalar result out of f0 */
: [max] "=f"(max),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

return max;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;

if (n <= 0 || inc_x <= 0) return (maxf);


if (inc_x == 1) {
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;


BLASLONG n1 = n & -32;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (maxf);


maxf = dmax_kernel_32(n1, x);
if (inc_x == 1) {


i = n1;
}
else
{
maxf=x[0];
i++;
}
BLASLONG n1 = n & -32;
if (n1 > 0) {


while (i < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i++;
}
return (maxf);
maxf = dmax_kernel_32(n1, x);


i = n1;
} else { } else {
maxf = x[0];
i++;
}


maxf=x[0];
while (i < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i++;
}
return (maxf);

} else {


BLASLONG n1 = n & -4;
while (j < n1) {
maxf = x[0];


if (x[i] > maxf) {
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
maxf = x[i + 3 * inc_x];
}
BLASLONG n1 = n & -4;
while (j < n1) {


i += inc_x * 4;
if (x[i] > maxf) {
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
maxf = x[i + 3 * inc_x];
}


j += 4;
i += inc_x * 4;


}
j += 4;


}


while (j < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i += inc_x;
j++;
}
return (maxf);
while (j < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i += inc_x;
j++;
} }
return (maxf);
}
} }

+ 101
- 113
kernel/zarch/dmin.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,133 +27,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT min;

__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmindb %%v16,%%v16,%%v24,0 \n\t"
"vfmindb %%v17,%%v17,%%v25,0 \n\t"
"vfmindb %%v18,%%v18,%%v26,0 \n\t"
"vfmindb %%v19,%%v19,%%v27,0 \n\t"
"vfmindb %%v20,%%v20,%%v28,0 \n\t"
"vfmindb %%v21,%%v21,%%v29,0 \n\t"
"vfmindb %%v22,%%v22,%%v30,0 \n\t"
"vfmindb %%v23,%%v23,%%v31,0 \n\t"

"vfmindb %%v16,%%v16,%%v20,0 \n\t"
"vfmindb %%v17,%%v17,%%v21,0 \n\t"
"vfmindb %%v18,%%v18,%%v22,0 \n\t"
"vfmindb %%v19,%%v19,%%v23,0 \n\t"

"vfmindb %%v16,%%v16,%%v18,0 \n\t"
"vfmindb %%v17,%%v17,%%v19,0 \n\t"

"vfmindb %%v16,%%v16,%%v17,0 \n\t"

"vfmindb %%v0,%%v0,%%v16,0 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v16,%%v0,1 \n\t"
"wfmindb %%v0,%%v0,%%v16,0 \n\t"
"ldr %0,%%f0 "
:"=f"(min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return min;
/*
 * Vectorised minimum of x[0..n-1] using the vector FP minimum instruction
 * (vfmindb / wfmindb).
 *
 * n  element count; it is shifted right by 5 and used directly as the brctg
 *    loop counter, so it is expected to be a non-zero multiple of 32 (the
 *    loop body always runs at least once).
 * x  input vector, read as n contiguous elements.
 *
 * Returns the smallest element found.
 *
 * The *db instruction forms work on 64-bit lanes and every loop iteration
 * consumes 256 bytes, i.e. 32 elements -- this assumes FLOAT is double here
 * (TODO confirm against the build configuration).
 *
 * Operands: n is a read/write early-clobber register because the asm
 * overwrites it; the "m" operand tells the compiler that n elements starting
 * at x are read; x itself is passed in an address register. r1 serves as the
 * running byte offset and is listed as clobbered together with the vector
 * registers used.
 */
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT min;

__asm__("vl %%v0,0(%[x])\n\t" /* seed the running minimum with x[0..1] */
"srlg %[n],%[n],5\n\t" /* n = n / 32 = number of loop iterations */
"xgr %%r1,%%r1\n\t" /* r1 = byte offset into x, starts at 0 */
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* load 16 vectors (256 bytes) starting at x + r1 */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
/* pairwise reduction tree: 16 -> 8 -> 4 -> 2 -> 1 vectors */
"vfmindb %%v16,%%v16,%%v24,0\n\t"
"vfmindb %%v17,%%v17,%%v25,0\n\t"
"vfmindb %%v18,%%v18,%%v26,0\n\t"
"vfmindb %%v19,%%v19,%%v27,0\n\t"
"vfmindb %%v20,%%v20,%%v28,0\n\t"
"vfmindb %%v21,%%v21,%%v29,0\n\t"
"vfmindb %%v22,%%v22,%%v30,0\n\t"
"vfmindb %%v23,%%v23,%%v31,0\n\t"
"vfmindb %%v16,%%v16,%%v20,0\n\t"
"vfmindb %%v17,%%v17,%%v21,0\n\t"
"vfmindb %%v18,%%v18,%%v22,0\n\t"
"vfmindb %%v19,%%v19,%%v23,0\n\t"
"vfmindb %%v16,%%v16,%%v18,0\n\t"
"vfmindb %%v17,%%v17,%%v19,0\n\t"
"vfmindb %%v16,%%v16,%%v17,0\n\t"
/* fold this iteration's result into the running minimum */
"vfmindb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t" /* advance the offset by the 256 bytes just consumed */
"brctg %[n], 0b\n\t" /* decrement the counter, loop while non-zero */
/* horizontal step: combine lane 1 of v0 with lane 0 */
"vrepg %%v16,%%v0,1\n\t"
"wfmindb %%v0,%%v0,%%v16,0\n\t"
"ldr %[min],%%f0" /* copy the scalar result out of f0 */
: [min] "=f"(min),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

return min;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;

if (n <= 0 || inc_x <= 0) return (minf);


if (inc_x == 1) {
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;


BLASLONG n1 = n & -32;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (minf);


minf = dmin_kernel_32(n1, x);
if (inc_x == 1) {


i = n1;
}
else
{
minf=x[0];
i++;
}
BLASLONG n1 = n & -32;
if (n1 > 0) {


while (i < n) {
if (x[i] < minf) {
minf = x[i];
}
i++;
}
return (minf);
minf = dmin_kernel_32(n1, x);


i = n1;
} else { } else {
minf = x[0];
i++;
}


minf=x[0];
while (i < n) {
if (x[i] < minf) {
minf = x[i];
}
i++;
}
return (minf);

} else {


BLASLONG n1 = n & -4;
while (j < n1) {
minf = x[0];


if (x[i] < minf) {
minf = x[i];
}
if (x[i + inc_x] < minf) {
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
minf = x[i + 3 * inc_x];
}
BLASLONG n1 = n & -4;
while (j < n1) {


i += inc_x * 4;
if (x[i] < minf) {
minf = x[i];
}
if (x[i + inc_x] < minf) {
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
minf = x[i + 3 * inc_x];
}


j += 4;
i += inc_x * 4;


}
j += 4;


}


while (j < n) {
if (x[i] < minf) {
minf = x[i];
}
i += inc_x;
j++;
}
return (minf);
while (j < n) {
if (x[i] < minf) {
minf = x[i];
}
i += inc_x;
j++;
} }
return (minf);
}
} }

+ 118
- 134
kernel/zarch/dmin_z13.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,154 +27,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT min;

__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return min;
/*
 * Vectorised minimum of x[0..n-1] built from compare-high (vfchdb) plus
 * select (vsel) instead of a dedicated vector minimum instruction.
 * NOTE(review): presumably this variant exists because the target machine
 * level lacks vfmindb -- confirm against the kernel selection logic.
 *
 * n  element count; it is shifted right by 5 and used directly as the brctg
 *    loop counter, so it is expected to be a non-zero multiple of 32 (the
 *    loop body always runs at least once).
 * x  input vector, read as n contiguous elements.
 *
 * Returns the smallest element found.
 *
 * Each iteration handles 256 bytes in two halves of 128 bytes. Note the
 * swapped compare operands relative to the select: "vfchdb m,b,a" builds a
 * per-lane mask where b > a and "vsel r,a,b,m" then takes a where the mask is
 * set and b elsewhere, i.e. the lane-wise minimum. The *db forms work on
 * 64-bit lanes -- this assumes FLOAT is double here (TODO confirm against the
 * build configuration).
 *
 * Operands: n is a read/write early-clobber register because the asm
 * overwrites it; the "m" operand tells the compiler that n elements starting
 * at x are read; x itself is passed in an address register. r1 serves as the
 * running byte offset and is listed as clobbered together with the vector
 * registers used.
 */
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT min;

__asm__("vl %%v0,0(%[x])\n\t" /* seed the running minimum with x[0..1] */
"srlg %[n],%[n],5\n\t" /* n = n / 32 = number of loop iterations */
"xgr %%r1,%%r1\n\t" /* r1 = byte offset into x, starts at 0 */
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* first half: bytes 0..127 of the current 256-byte chunk */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t"
"vfchdb %%v27,%%v23,%%v22\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
/* fold the half's result into the running minimum in v0 */
"vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
/* second half: bytes 128..255 of the current chunk, same reduction */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t"
"vfchdb %%v27,%%v23,%%v22\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t" /* advance the offset by the 256 bytes just consumed */
"brctg %[n], 0b\n\t" /* decrement the counter, loop while non-zero */
/* horizontal step: pick the smaller of lane 0 and lane 1 of v0 */
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[min],%%f0" /* copy the scalar result out of f0 */
: [min] "=f"(min),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

return min;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;

if (n <= 0 || inc_x <= 0) return (minf);


if (inc_x == 1) {
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;


BLASLONG n1 = n & -32;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (minf);


minf = dmin_kernel_32(n1, x);
if (inc_x == 1) {


i = n1;
}
else
{
minf=x[0];
i++;
}
BLASLONG n1 = n & -32;
if (n1 > 0) {


while (i < n) {
if (x[i] < minf) {
minf = x[i];
}
i++;
}
return (minf);
minf = dmin_kernel_32(n1, x);


i = n1;
} else { } else {
minf = x[0];
i++;
}


minf=x[0];
while (i < n) {
if (x[i] < minf) {
minf = x[i];
}
i++;
}
return (minf);

} else {


BLASLONG n1 = n & -4;
while (j < n1) {
minf = x[0];


if (x[i] < minf) {
minf = x[i];
}
if (x[i + inc_x] < minf) {
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
minf = x[i + 3 * inc_x];
}
BLASLONG n1 = n & -4;
while (j < n1) {


i += inc_x * 4;
if (x[i] < minf) {
minf = x[i];
}
if (x[i + inc_x] < minf) {
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
minf = x[i + 3 * inc_x];
}


j += 4;
i += inc_x * 4;


}
j += 4;


}


while (j < n) {
if (x[i] < minf) {
minf = x[i];
}
i += inc_x;
j++;
}
return (minf);
while (j < n) {
if (x[i] < minf) {
minf = x[i];
}
i += inc_x;
j++;
} }
return (minf);
}
} }

+ 180
- 201
kernel/zarch/drot.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,220 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
"vlrepg %%v0,%3 \n\t"
"vlrepg %%v1,%4 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
/*
 * Applies a plane rotation in place to the first n elements of x and y:
 *
 *     x_new = c * x + s * y
 *     y_new = c * y - s * x
 *
 * n     element count; it is shifted right by 5 and used directly as the
 *       brctg loop counter, so it is expected to be a non-zero multiple of 32
 *       (the loop body always runs at least once).
 * x, y  vectors that are both read and overwritten.
 * c, s  pointers to the rotation coefficients; each is loaded once and
 *       replicated across a vector register (v0 = c, v1 = s).
 *
 * One loop iteration covers 256 bytes of each vector as four unrolled groups
 * of 64 bytes. Within a group the products x*c and x*s are formed first, then
 * the fused multiply-add/subtract instructions complete x_new and y_new, and
 * both results are stored back over the inputs. The *db instruction forms
 * work on 64-bit lanes -- this assumes FLOAT is double here (TODO confirm
 * against the build configuration).
 *
 * Operands: x and y are described to the compiler as read/write memory
 * ("+m") of n elements each; n is a read/write early-clobber register because
 * the asm overwrites it; r1 is the running byte offset and is listed as
 * clobbered together with the vector registers used.
 */
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
__asm__("vlrepg %%v0,%[c]\n\t" /* v0 = c in every lane */
"vlrepg %%v1,%[s]\n\t" /* v1 = s in every lane */
"srlg %[n],%[n],5\n\t" /* n = n / 32 = number of loop iterations */
"xgr %%r1,%%r1\n\t" /* r1 = byte offset into x and y, starts at 0 */
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
/* group 1: bytes 0..63 */
"vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%[y])\n\t"
/* group 2: bytes 64..127 */
"vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v19, 112(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
/* group 3: bytes 128..191 */
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%[y])\n\t"
/* group 4: bytes 192..255 */
"vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v19, 240(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t" /* advance the offset by the 256 bytes just processed */
"brctg %[n],0b" /* decrement the counter, loop while non-zero */
: "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }


int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT temp;
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT c, FLOAT s) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;


if ( n <= 0 ) return(0);
FLOAT temp;


if ( (inc_x == 1) && (inc_y == 1) )
{
if (n <= 0)
return (0);


BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
FLOAT cosa,sina;
cosa=c;
sina=s;
drot_kernel_32(n1, x, y, &cosa, &sina);
i=n1;
}
if ((inc_x == 1) && (inc_y == 1)) {


while(i < n)
{
temp = c*x[i] + s*y[i] ;
y[i] = c*y[i] - s*x[i] ;
x[i] = temp ;

i++ ;
BLASLONG n1 = n & -32;
if (n1 > 0) {
FLOAT cosa, sina;
cosa = c;
sina = s;
drot_kernel_32(n1, x, y, &cosa, &sina);
i = n1;
}


}
while (i < n) {
temp = c * x[i] + s * y[i];
y[i] = c * y[i] - s * x[i];
x[i] = temp;


i++;


} }
else
{


while(i < n)
{
temp = c*x[ix] + s*y[iy] ;
y[iy] = c*y[iy] - s*x[ix] ;
x[ix] = temp ;
} else {


ix += inc_x ;
iy += inc_y ;
i++ ;
while (i < n) {
temp = c * x[ix] + s * y[iy];
y[iy] = c * y[iy] - s * x[ix];
x[ix] = temp;


}
ix += inc_x;
iy += inc_y;
i++;


} }
return(0);

}


}
return (0);


}

+ 125
- 153
kernel/zarch/dscal.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,179 +27,151 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x)
{
__asm__ volatile (
"vlrepg %%v0,%1 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%2) \n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%2) \n\t"
"vfmdb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%2) \n\t"
"vfmdb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%2) \n\t"
"vfmdb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%2) \n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 64(%%r1,%2) \n\t"
"vl %%v25, 80(%%r1,%2) \n\t"
"vfmdb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 80(%%r1,%2) \n\t"
"vl %%v26, 96(%%r1,%2) \n\t"
"vfmdb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 96(%%r1,%2) \n\t"
"vl %%v27, 112(%%r1,%2) \n\t"
"vfmdb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 112(%%r1,%2) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v24","v25","v26","v27"
);
/*
 * Scale x in place by da using s390x vector instructions: x[i] = da * x[i].
 *
 * n  - element count; shifted right by 4 inside the asm to obtain the loop
 *      trip count, so each iteration handles 16 elements (128 bytes, eight
 *      16-byte vector loads/stores). A zero trip count is not guarded here;
 *      the visible caller passes n & -16 and only when that value is > 0.
 * da - scalar multiplier, read from memory and replicated into v0.
 * x  - vector to scale; read and written through the "+m" operand.
 */
static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) {
/* v0 = da replicated across the vector register */
__asm__("vlrepg %%v0,%[da]\n\t"
/* n = n / 16 (loop trip count); r1 = 0 (byte offset into x) */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* prefetch 1024 bytes ahead of the current offset */
"pfd 2, 1024(%%r1,%[x])\n\t"
/* eight load / multiply-by-v0 / store groups, 16 bytes each */
"vl %%v24,0(%%r1,%[x])\n\t"
"vfmdb %%v24,%%v24,%%v0\n\t"
"vst %%v24,0(%%r1,%[x])\n\t"
"vl %%v25,16(%%r1,%[x])\n\t"
"vfmdb %%v25,%%v25,%%v0\n\t"
"vst %%v25,16(%%r1,%[x])\n\t"
"vl %%v26,32(%%r1,%[x])\n\t"
"vfmdb %%v26,%%v26,%%v0\n\t"
"vst %%v26,32(%%r1,%[x])\n\t"
"vl %%v27,48(%%r1,%[x])\n\t"
"vfmdb %%v27,%%v27,%%v0\n\t"
"vst %%v27,48(%%r1,%[x])\n\t"
"vl %%v28,64(%%r1,%[x])\n\t"
"vfmdb %%v28,%%v28,%%v0\n\t"
"vst %%v28,64(%%r1,%[x])\n\t"
"vl %%v29,80(%%r1,%[x])\n\t"
"vfmdb %%v29,%%v29,%%v0\n\t"
"vst %%v29,80(%%r1,%[x])\n\t"
"vl %%v30,96(%%r1,%[x])\n\t"
"vfmdb %%v30,%%v30,%%v0\n\t"
"vst %%v30,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vfmdb %%v31,%%v31,%%v0\n\t"
"vst %%v31,112(%%r1,%[x])\n\t"
/* advance the offset by 128 bytes; decrement n and loop while non-zero */
"agfi %%r1,128\n\t"
"brctg %[n],0b"
/* outputs: x[0..n) is read and written; n is modified (early-clobber) */
: "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n)
/* inputs: x in an address register, da as a memory operand */
: [x] "a"(x),[da] "m"(da)
/* clobbers: condition code, the offset register and every vector used */
: "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }


static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x)
{
__asm__ volatile(
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"

"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
/*
 * Fill x with zeros using s390x vector stores.
 *
 * n - element count; shifted right by 4 inside the asm to obtain the loop
 *     trip count, so each iteration writes 128 bytes (eight 16-byte stores).
 *     A zero trip count is not guarded here; the visible caller passes
 *     n & -16 and only when that value is > 0.
 * x - destination; write-only for the compiler ("=m"), its old contents are
 *     never loaded.
 */
static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
/* v0 = all zeros; it is the only vector register needed */
__asm__("vzero %%v0\n\t"
/* n = n / 16 (loop trip count); r1 = 0 (byte offset into x) */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* prefetch 1024 bytes ahead of the current offset */
"pfd 2, 1024(%%r1,%[x])\n\t"
"vst %%v0,0(%%r1,%[x])\n\t"
"vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v0,32(%%r1,%[x])\n\t"
"vst %%v0,48(%%r1,%[x])\n\t"
"vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v0,112(%%r1,%[x])\n\t"
/* advance the offset by 128 bytes; decrement n and loop while non-zero */
"agfi %%r1,128\n\t"
"brctg %[n],0b"
/* outputs: x[0..n) is written; n is modified (early-clobber) */
: "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n)
: [x] "a"(x)
: "cc", "r1", "v0");
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0,j=0;
if ( n <= 0 || inc_x <=0 )
return(0);

if ( inc_x == 1 )
{

if ( da == 0.0 )
{

BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
dscal_kernel_16_zero(n1, x);
j=n1;
}

while(j < n)
{

x[j]=0.0;
j++;
}

}
else
{

BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
dscal_kernel_16(n1, da, x);
j=n1;
}
while(j < n)
{

x[j] = da * x[j] ;
j++;
}
}


int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0, j = 0;
if (n <= 0 || inc_x <= 0)
return (0);


if (inc_x == 1) {

if (da == 0.0) {

BLASLONG n1 = n & -16;
if (n1 > 0) {

dscal_kernel_16_zero(n1, x);
j = n1;
}

while (j < n) {

x[j] = 0.0;
j++;
}

} else {

BLASLONG n1 = n & -16;
if (n1 > 0) {
dscal_kernel_16(n1, da, x);
j = n1;
}
while (j < n) {

x[j] = da * x[j];
j++;
}
} }
else
{


if ( da == 0.0 )
{
} else {


BLASLONG n1 = n & -4;
if (da == 0.0) {


while (j < n1) {
BLASLONG n1 = n & -4;


x[i]=0.0;
x[i + inc_x]=0.0;
x[i + 2 * inc_x]=0.0;
x[i + 3 * inc_x]=0.0;
while (j < n1) {


i += inc_x * 4;
j += 4;
x[i] = 0.0;
x[i + inc_x] = 0.0;
x[i + 2 * inc_x] = 0.0;
x[i + 3 * inc_x] = 0.0;


}
while(j < n)
{
i += inc_x * 4;
j += 4;


x[i]=0.0;
i += inc_x ;
j++;
}
}
while (j < n) {


}
else
{
BLASLONG n1 = n & -4;
x[i] = 0.0;
i += inc_x;
j++;
}


while (j < n1) {
} else {
BLASLONG n1 = n & -4;


x[i] = da * x[i] ;
x[i + inc_x] = da * x[i + inc_x];
x[i + 2 * inc_x] = da * x[i + 2 * inc_x];
x[i + 3 * inc_x] = da * x[i + 3 * inc_x];
while (j < n1) {


i += inc_x * 4;
j += 4;
x[i] = da * x[i];
x[i + inc_x] = da * x[i + inc_x];
x[i + 2 * inc_x] = da * x[i + 2 * inc_x];
x[i + 3 * inc_x] = da * x[i + 3 * inc_x];


}
i += inc_x * 4;
j += 4;


while(j < n)
{
}


x[i] = da * x[i] ;
i += inc_x ;
j++;
}
}
while (j < n) {


x[i] = da * x[i];
i += inc_x;
j++;
}
} }
return 0;

}


}
return 0;


}

+ 124
- 122
kernel/zarch/dsdot.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018,The OpenBLAS Project
Copyright (c) 2013-2019,The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms,with or without Redistribution and use in source and binary forms,with or without
modification,are permitted provided that the following conditions are modification,are permitted provided that the following conditions are
@@ -27,144 +27,146 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
double dot;

__asm__ volatile (
"vzero %%v0 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"

"vlef %%v16,0(%%r1,%2),0 \n\t"
"vlef %%v16,4(%%r1,%2),2 \n\t"
"vlef %%v17,8(%%r1,%2),0 \n\t"
"vlef %%v17,12(%%r1,%2),2 \n\t"
"vlef %%v18,16(%%r1,%2),0 \n\t"
"vlef %%v18,20(%%r1,%2),2 \n\t"
"vlef %%v19,24(%%r1,%2),0 \n\t"
"vlef %%v19,28(%%r1,%2),2 \n\t"
"vlef %%v20,32(%%r1,%2),0 \n\t"
"vlef %%v20,36(%%r1,%2),2 \n\t"
"vlef %%v21,40(%%r1,%2),0 \n\t"
"vlef %%v21,44(%%r1,%2),2 \n\t"
"vlef %%v22,48(%%r1,%2),0 \n\t"
"vlef %%v22,52(%%r1,%2),2 \n\t"
"vlef %%v23,56(%%r1,%2),0 \n\t"
"vlef %%v23,60(%%r1,%2),2 \n\t"

"vflls %%v16,%%v16 \n\t"
"vflls %%v17,%%v17 \n\t"
"vflls %%v18,%%v18 \n\t"
"vflls %%v19,%%v19 \n\t"
"vflls %%v20,%%v20 \n\t"
"vflls %%v21,%%v21 \n\t"
"vflls %%v22,%%v22 \n\t"
"vflls %%v23,%%v23 \n\t"

"vlef %%v24,0(%%r1,%3),0 \n\t"
"vlef %%v24,4(%%r1,%3),2 \n\t"
"vflls %%v24,%%v24 \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vlef %%v25,8(%%r1,%3),0 \n\t"
"vlef %%v25,12(%%r1,%3),2 \n\t"
"vflls %%v25,%%v25 \n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
"vlef %%v26,16(%%r1,%3),0 \n\t"
"vlef %%v26,20(%%r1,%3),2 \n\t"
"vflls %%v26,%%v26 \n\t"
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
"vlef %%v27,24(%%r1,%3),0 \n\t"
"vlef %%v27,28(%%r1,%3),2 \n\t"
"vflls %%v27,%%v27 \n\t"
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
"vlef %%v28,32(%%r1,%3),0 \n\t"
"vlef %%v28,36(%%r1,%3),2 \n\t"
"vflls %%v28,%%v28 \n\t"
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
"vlef %%v29,40(%%r1,%3),0 \n\t"
"vlef %%v29,44(%%r1,%3),2 \n\t"
"vflls %%v29,%%v29 \n\t"
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
"vlef %%v30,48(%%r1,%3),0 \n\t"
"vlef %%v30,52(%%r1,%3),2 \n\t"
"vflls %%v30,%%v30 \n\t"
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
"vlef %%v31,56(%%r1,%3),0 \n\t"
"vlef %%v31,60(%%r1,%3),2 \n\t"
"vflls %%v31,%%v31 \n\t"
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"

"agfi %%r1,64 \n\t"
"brctg %%r0,0b \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return dot;
/*
 * Dot product of x and y accumulated in double precision, using s390x
 * vector instructions.
 *
 * Each loop iteration consumes 64 bytes from each input (sixteen 4-byte
 * elements): pairs of 32-bit elements are loaded, lengthened to double, and
 * multiply-added into eight independent accumulators (v0..v7), which are
 * summed after the loop.
 *
 * n    - element count; shifted right by 4 inside the asm to obtain the loop
 *        trip count. A zero trip count is not guarded here; the visible
 *        caller passes n & -16 and only when that value is non-zero.
 * x, y - input vectors, only read (declared as const "m" inputs).
 * Returns the accumulated sum as a double.
 */
static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
double dot;

/* clear the eight accumulators */
__asm__("vzero %%v0\n\t"
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
/* n = n / 16 (loop trip count); r1 = 0 (byte offset into x and y) */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"pfd 1,1024(%%r1,%[y])\n\t"
/* load two 32-bit x elements into word slots 0 and 2 of each of v16..v23 */
"vlef %%v16,0(%%r1,%[x]),0\n\t"
"vlef %%v16,4(%%r1,%[x]),2\n\t"
"vlef %%v17,8(%%r1,%[x]),0\n\t"
"vlef %%v17,12(%%r1,%[x]),2\n\t"
"vlef %%v18,16(%%r1,%[x]),0\n\t"
"vlef %%v18,20(%%r1,%[x]),2\n\t"
"vlef %%v19,24(%%r1,%[x]),0\n\t"
"vlef %%v19,28(%%r1,%[x]),2\n\t"
"vlef %%v20,32(%%r1,%[x]),0\n\t"
"vlef %%v20,36(%%r1,%[x]),2\n\t"
"vlef %%v21,40(%%r1,%[x]),0\n\t"
"vlef %%v21,44(%%r1,%[x]),2\n\t"
"vlef %%v22,48(%%r1,%[x]),0\n\t"
"vlef %%v22,52(%%r1,%[x]),2\n\t"
"vlef %%v23,56(%%r1,%[x]),0\n\t"
"vlef %%v23,60(%%r1,%[x]),2\n\t"
/* lengthen the loaded x elements from single to double precision */
"vflls %%v16,%%v16\n\t"
"vflls %%v17,%%v17\n\t"
"vflls %%v18,%%v18\n\t"
"vflls %%v19,%%v19\n\t"
"vflls %%v20,%%v20\n\t"
"vflls %%v21,%%v21\n\t"
"vflls %%v22,%%v22\n\t"
"vflls %%v23,%%v23\n\t"
/* same load + lengthen for y, then acc_k += x_k * y_k for k = 0..7 */
"vlef %%v24,0(%%r1,%[y]),0\n\t"
"vlef %%v24,4(%%r1,%[y]),2\n\t"
"vflls %%v24,%%v24\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vlef %%v25,8(%%r1,%[y]),0\n\t"
"vlef %%v25,12(%%r1,%[y]),2\n\t"
"vflls %%v25,%%v25\n\t"
"vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
"vlef %%v26,16(%%r1,%[y]),0\n\t"
"vlef %%v26,20(%%r1,%[y]),2\n\t"
"vflls %%v26,%%v26\n\t"
"vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
"vlef %%v27,24(%%r1,%[y]),0\n\t"
"vlef %%v27,28(%%r1,%[y]),2\n\t"
"vflls %%v27,%%v27\n\t"
"vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
"vlef %%v28,32(%%r1,%[y]),0\n\t"
"vlef %%v28,36(%%r1,%[y]),2\n\t"
"vflls %%v28,%%v28\n\t"
"vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
"vlef %%v29,40(%%r1,%[y]),0\n\t"
"vlef %%v29,44(%%r1,%[y]),2\n\t"
"vflls %%v29,%%v29\n\t"
"vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
"vlef %%v30,48(%%r1,%[y]),0\n\t"
"vlef %%v30,52(%%r1,%[y]),2\n\t"
"vflls %%v30,%%v30\n\t"
"vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
"vlef %%v31,56(%%r1,%[y]),0\n\t"
"vlef %%v31,60(%%r1,%[y]),2\n\t"
"vflls %%v31,%%v31\n\t"
"vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
/* advance the offset by 64 bytes; decrement n and loop while non-zero */
"agfi %%r1,64\n\t"
"brctg %[n],0b\n\t"
/* sum the eight accumulators into v0 */
"vfadb %%v0,%%v0,%%v1\n\t"
"vfadb %%v0,%%v0,%%v2\n\t"
"vfadb %%v0,%%v0,%%v3\n\t"
"vfadb %%v0,%%v0,%%v4\n\t"
"vfadb %%v0,%%v0,%%v5\n\t"
"vfadb %%v0,%%v0,%%v6\n\t"
"vfadb %%v0,%%v0,%%v7\n\t"
/* fold the two doubleword lanes of v0 into one scalar and copy it to dot */
"vrepg %%v1,%%v0,1\n\t"
"adbr %%f0,%%f1\n\t"
"ldr %[dot],%%f0"
/* outputs: the result in an FP register; n is modified (early-clobber) */
: [dot] "=f"(dot),[n] "+&r"(n)
/* inputs: x[0..n) and y[0..n) are read; their addresses in address registers */
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");

return dot;
} }


double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;


double dot = 0.0 ;
double dot = 0.0;


if ( n <= 0 ) return(dot);
if (n <= 0)
return (dot);


if ( (inc_x == 1) && (inc_y == 1) )
{
if ((inc_x == 1) && (inc_y == 1)) {


BLASLONG n1 = n & -16;
BLASLONG n1 = n & -16;


if ( n1 )
dot = dsdot_kernel_16(n1,x,y);
if (n1)
dot = dsdot_kernel_16(n1, x, y);


i = n1;
while(i < n)
{
i = n1;
while (i < n) {


dot += (double) y[i] * (double) x[i] ;
i++ ;
dot += (double) y[i] * (double) x[i];
i++;


}
return(dot);
}
return (dot);


}


}
BLASLONG n1 = n & -2;


BLASLONG n1 = n & -2;
while (i < n1) {


while(i < n1)
{
dot += (double) y[iy] * (double) x[ix];
dot += (double) y[iy + inc_y] * (double) x[ix + inc_x];
ix += inc_x * 2;
iy += inc_y * 2;
i += 2;


dot += (double) y[iy] * (double) x[ix];
dot += (double) y[iy+inc_y] * (double) x[ix+inc_x];
ix += inc_x*2 ;
iy += inc_y*2 ;
i+=2 ;
}


}
while (i < n) {


while(i < n)
{
dot += (double) y[iy] * (double) x[ix];
ix += inc_x;
iy += inc_y;
i++;


dot += (double) y[iy] * (double) x[ix] ;
ix += inc_x ;
iy += inc_y ;
i++ ;

}
return(dot);
}
return (dot);


} }



+ 108
- 120
kernel/zarch/dswap.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,136 +27,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"

"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"

"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"

"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
/*
 * Exchange the contents of x and y using s390x vector loads and stores.
 *
 * Each loop iteration swaps 256 bytes: the whole x chunk is held in v16..v31
 * while the y chunk is copied over it in two halves through v0..v7, and the
 * saved x data is then stored to y.
 *
 * n    - element count; shifted right by 5 inside the asm to obtain the loop
 *        trip count (32 elements per iteration). A zero trip count is not
 *        guarded here; the visible caller passes n & -32 and only when that
 *        value is > 0.
 * x, y - vectors to exchange; both are read and written ("+m" operands).
 */
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
/* n = n / 32 (loop trip count); r1 = 0 (byte offset into x and y) */
__asm__("srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
/* save 256 bytes of x in v16..v31 */
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%[x])\n\t"
/* copy the first 128 bytes of y to x through v0..v7 */
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v7, 112(%%r1,%[y])\n\t"
"vst %%v0, 0(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v7, 112(%%r1,%[x])\n\t"
/* copy the second 128 bytes of y to x, reusing v0..v7 */
"vl %%v0, 128(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v7, 240(%%r1,%[y])\n\t"
"vst %%v0, 128(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v7, 240(%%r1,%[x])\n\t"
/* write the saved x data (v16..v31) to y */
"vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v31, 240(%%r1,%[y])\n\t"
/* advance the offset by 256 bytes; decrement n and loop while non-zero */
"agfi %%r1,256\n\t"
"brctg %[n],0b"
/* outputs: x[0..n) and y[0..n) are read and written; n is modified */
: "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
} }


int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT temp;

if ( n <= 0 ) return(0);
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT temp;


if ( (inc_x == 1) && (inc_y == 1 ))
{
if (n <= 0)
return (0);


BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
dswap_kernel_32(n1, x, y);
i=n1;
}
if ((inc_x == 1) && (inc_y == 1)) {


while(i < n)
{
temp = y[i];
y[i] = x[i] ;
x[i] = temp;
i++ ;

}
BLASLONG n1 = n & -32;
if (n1 > 0) {
dswap_kernel_32(n1, x, y);
i = n1;
}


while (i < n) {
temp = y[i];
y[i] = x[i];
x[i] = temp;
i++;


} }
else
{


while(i < n)
{
temp = y[iy];
y[iy] = x[ix] ;
x[ix] = temp;
ix += inc_x ;
iy += inc_y ;
i++ ;
} else {


}
while (i < n) {
temp = y[iy];
y[iy] = x[ix];
x[ix] = temp;
ix += inc_x;
iy += inc_y;
i++;


} }
return(0);

}
return (0);


} }

+ 253
- 262
kernel/zarch/icamax.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,285 +27,276 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))


static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
{
BLASLONG iamax;
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))


__asm__ volatile (
"vlef %%v0,0(%3),0 \n\t"
"vlef %%v1,4(%3),0 \n\t"
"vlef %%v0,8(%3),1 \n\t"
"vlef %%v1,12(%3),1 \n\t"
"vlef %%v0,16(%3),2 \n\t"
"vlef %%v1,20(%3),2 \n\t"
"vlef %%v0,24(%3),3 \n\t"
"vlef %%v1,28(%3),3 \n\t"
"vflpsb %%v0,%%v0 \n\t"
"vflpsb %%v1,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,2,1 \n\t"
"vleig %%v2,1,0 \n\t"
"vleig %%v2,3,1 \n\t"
"vrepig %%v3,16 \n\t"
"vzero %%v4 \n\t"
"vleib %%v9,0,0 \n\t"
"vleib %%v9,1,1 \n\t"
"vleib %%v9,2,2 \n\t"
"vleib %%v9,3,3 \n\t"
"vleib %%v9,8,4 \n\t"
"vleib %%v9,9,5 \n\t"
"vleib %%v9,10,6 \n\t"
"vleib %%v9,11,7 \n\t"
"vleib %%v9,16,8 \n\t"
"vleib %%v9,17,9 \n\t"
"vleib %%v9,18,10 \n\t"
"vleib %%v9,19,11 \n\t"
"vleib %%v9,24,12 \n\t"
"vleib %%v9,25,13 \n\t"
"vleib %%v9,26,14 \n\t"
"vleib %%v9,27,15 \n\t"
"vleif %%v24,0,0 \n\t"
"vleif %%v24,1,1 \n\t"
"vleif %%v24,2,2 \n\t"
"vleif %%v24,3,3 \n\t"
"vleif %%v25,4,0 \n\t"
"vleif %%v25,5,1 \n\t"
"vleif %%v25,6,2 \n\t"
"vleif %%v25,7,3 \n\t"
"vleif %%v26,8,0 \n\t"
"vleif %%v26,9,1 \n\t"
"vleif %%v26,10,2 \n\t"
"vleif %%v26,11,3 \n\t"
"vleif %%v27,12,0 \n\t"
"vleif %%v27,13,1 \n\t"
"vleif %%v27,14,2 \n\t"
"vleif %%v27,15,3 \n\t"
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) {
BLASLONG iamax;


"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v28,16(%%r1,%3) \n\t"
"vpkg %%v17,%%v16,%%v28 \n\t"
"vperm %%v16,%%v16,%%v28,%%v9 \n\t"
__asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v1,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v1,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v1,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v1,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v1,%%v1\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
"vrepig %%v3,16\n\t"
"vzero %%v4\n\t"
"vleib %%v9,0,0\n\t"
"vleib %%v9,1,1\n\t"
"vleib %%v9,2,2\n\t"
"vleib %%v9,3,3\n\t"
"vleib %%v9,8,4\n\t"
"vleib %%v9,9,5\n\t"
"vleib %%v9,10,6\n\t"
"vleib %%v9,11,7\n\t"
"vleib %%v9,16,8\n\t"
"vleib %%v9,17,9\n\t"
"vleib %%v9,18,10\n\t"
"vleib %%v9,19,11\n\t"
"vleib %%v9,24,12\n\t"
"vleib %%v9,25,13\n\t"
"vleib %%v9,26,14\n\t"
"vleib %%v9,27,15\n\t"
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v28,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v29,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v30,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v28,144(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v29,176(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v30,208(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v2,%%v0\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
: [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
"v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
"v25", "v26", "v27", "v28", "v29", "v30", "v31");


"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v29,48(%%r1,%3) \n\t"
"vpkg %%v19,%%v18,%%v29 \n\t"
"vperm %%v18,%%v18,%%v29,%%v9 \n\t"

"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v30,80(%%r1,%3) \n\t"
"vpkg %%v21,%%v20,%%v30 \n\t"
"vperm %%v20,%%v20,%%v30,%%v9 \n\t"

"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vpkg %%v23,%%v22,%%v31 \n\t"
"vperm %%v22,%%v22,%%v31,%%v9 \n\t"

"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v17,%%v18,%%v19 \n\t"
"vfasb %%v18,%%v20,%%v21 \n\t"
"vfasb %%v19,%%v22,%%v23 \n\t"
"vfchesb %%v5,%%v16,%%v17 \n\t"
"vfchesb %%v6,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"

"vfchesb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"

"vfchesb %%v7,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v28,144(%%r1,%3) \n\t"
"vpkg %%v17,%%v16,%%v28 \n\t"
"vperm %%v16,%%v16,%%v28,%%v9 \n\t"

"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v29,176(%%r1,%3) \n\t"
"vpkg %%v19,%%v18,%%v29 \n\t"
"vperm %%v18,%%v18,%%v29,%%v9 \n\t"

"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v30,208(%%r1,%3) \n\t"
"vpkg %%v21,%%v20,%%v30 \n\t"
"vperm %%v20,%%v20,%%v30,%%v9 \n\t"

"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v31,240(%%r1,%3) \n\t"
"vpkg %%v23,%%v22,%%v31 \n\t"
"vperm %%v22,%%v22,%%v31,%%v9 \n\t"

"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v17,%%v18,%%v19 \n\t"
"vfasb %%v18,%%v20,%%v21 \n\t"
"vfasb %%v19,%%v22,%%v23 \n\t"
"vfchesb %%v5,%%v16,%%v17 \n\t"
"vfchesb %%v6,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
return iamax;
}


"vfchesb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT maxf = 0;
BLASLONG max = 0;
BLASLONG inc_x2;


"vfchesb %%v7,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
if (n <= 0 || inc_x <= 0)
return (max);


"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
if (inc_x == 1) {


"veslg %%v3,%%v0,32 \n\t"
"vfchsb %%v4,%%v0,%%v3 \n\t"
"vchlg %%v5,%%v2,%%v1 \n\t"
"vfcesb %%v6,%%v0,%%v3 \n\t"
"vn %%v5,%%v5,%%v6 \n\t"
"vo %%v4,%%v4,%%v5 \n\t"
"vsel %%v0,%%v0,%%v3,%%v4 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v1,%%v2,%%v4 \n\t"
BLASLONG n1 = n & -32;
if (n1 > 0) {


"vrepf %%v2,%%v0,2 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcsb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vstef %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchsb %%v4,%%v2,%%v0 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
max = icamax_kernel_32(n1, x, &maxf);
ix = n1 * 2;
i = n1;
} else {
maxf = CABS1(x, 0);
ix += 2;
i++;
}


return iamax;
}
while (i < n) {
if (CABS1(x, ix) > maxf) {
max = i;
maxf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (max + 1);


BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT maxf = 0;
BLASLONG max = 0;
BLASLONG inc_x2;
} else {


if (n <= 0 || inc_x <= 0) return(max);
if (inc_x == 1) {
max = 0;
maxf = CABS1(x, 0);
inc_x2 = 2 * inc_x;


BLASLONG n1 = n & -32;
if (n1 > 0) {
BLASLONG n1 = n & -4;
while (i < n1) {


max = icamax_kernel_32(n1, x, &maxf);
ix = n1 * 2;
i = n1;
if (CABS1(x, ix) > maxf) {
max = i;
maxf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) > maxf) {
max = i + 1;
maxf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + 2 * inc_x2) > maxf) {
max = i + 2;
maxf = CABS1(x, ix + 2 * inc_x2);
}
if (CABS1(x, ix + 3 * inc_x2) > maxf) {
max = i + 3;
maxf = CABS1(x, ix + 3 * inc_x2);
} }
else
{
maxf = CABS1(x,0);
ix += 2;
i++;
}


while(i < n)
{
if( CABS1(x,ix) > maxf )
{
max = i;
maxf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (max + 1);
ix += inc_x2 * 4;


} else {
max = 0;
maxf = CABS1(x,0);
inc_x2 = 2 * inc_x;
ix += inc_x2;
i++;
i += 4;


while(i < n)
{
if( CABS1(x,ix) > maxf )
{
max = i;
maxf = CABS1(x,ix);
}
ix += inc_x2;
i++;
} }
return (max + 1);

while (i < n) {
if (CABS1(x, ix) > maxf) {
max = i;
maxf = CABS1(x, ix);
}
ix += inc_x2;
i++;
} }
return (max + 1);
}
} }



+ 253
- 262
kernel/zarch/icamin.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,285 +27,276 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))


static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
{
BLASLONG iamin;
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))


__asm__ volatile (
"vlef %%v0,0(%3),0 \n\t"
"vlef %%v1,4(%3),0 \n\t"
"vlef %%v0,8(%3),1 \n\t"
"vlef %%v1,12(%3),1 \n\t"
"vlef %%v0,16(%3),2 \n\t"
"vlef %%v1,20(%3),2 \n\t"
"vlef %%v0,24(%3),3 \n\t"
"vlef %%v1,28(%3),3 \n\t"
"vflpsb %%v0,%%v0 \n\t"
"vflpsb %%v1,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,2,1 \n\t"
"vleig %%v2,1,0 \n\t"
"vleig %%v2,3,1 \n\t"
"vrepig %%v3,16 \n\t"
"vzero %%v4 \n\t"
"vleib %%v9,0,0 \n\t"
"vleib %%v9,1,1 \n\t"
"vleib %%v9,2,2 \n\t"
"vleib %%v9,3,3 \n\t"
"vleib %%v9,8,4 \n\t"
"vleib %%v9,9,5 \n\t"
"vleib %%v9,10,6 \n\t"
"vleib %%v9,11,7 \n\t"
"vleib %%v9,16,8 \n\t"
"vleib %%v9,17,9 \n\t"
"vleib %%v9,18,10 \n\t"
"vleib %%v9,19,11 \n\t"
"vleib %%v9,24,12 \n\t"
"vleib %%v9,25,13 \n\t"
"vleib %%v9,26,14 \n\t"
"vleib %%v9,27,15 \n\t"
"vleif %%v24,0,0 \n\t"
"vleif %%v24,1,1 \n\t"
"vleif %%v24,2,2 \n\t"
"vleif %%v24,3,3 \n\t"
"vleif %%v25,4,0 \n\t"
"vleif %%v25,5,1 \n\t"
"vleif %%v25,6,2 \n\t"
"vleif %%v25,7,3 \n\t"
"vleif %%v26,8,0 \n\t"
"vleif %%v26,9,1 \n\t"
"vleif %%v26,10,2 \n\t"
"vleif %%v26,11,3 \n\t"
"vleif %%v27,12,0 \n\t"
"vleif %%v27,13,1 \n\t"
"vleif %%v27,14,2 \n\t"
"vleif %%v27,15,3 \n\t"
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) {
BLASLONG iamin;


"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v28,16(%%r1,%3) \n\t"
"vpkg %%v17,%%v16,%%v28 \n\t"
"vperm %%v16,%%v16,%%v28,%%v9 \n\t"
__asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v1,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v1,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v1,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v1,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v1,%%v1\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
"vrepig %%v3,16\n\t"
"vzero %%v4\n\t"
"vleib %%v9,0,0\n\t"
"vleib %%v9,1,1\n\t"
"vleib %%v9,2,2\n\t"
"vleib %%v9,3,3\n\t"
"vleib %%v9,8,4\n\t"
"vleib %%v9,9,5\n\t"
"vleib %%v9,10,6\n\t"
"vleib %%v9,11,7\n\t"
"vleib %%v9,16,8\n\t"
"vleib %%v9,17,9\n\t"
"vleib %%v9,18,10\n\t"
"vleib %%v9,19,11\n\t"
"vleib %%v9,24,12\n\t"
"vleib %%v9,25,13\n\t"
"vleib %%v9,26,14\n\t"
"vleib %%v9,27,15\n\t"
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v28,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v29,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v30,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v28,144(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v29,176(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v30,208(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v0,%%v2\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
: [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
"v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
"v25", "v26", "v27", "v28", "v29", "v30", "v31");


"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v29,48(%%r1,%3) \n\t"
"vpkg %%v19,%%v18,%%v29 \n\t"
"vperm %%v18,%%v18,%%v29,%%v9 \n\t"

"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v30,80(%%r1,%3) \n\t"
"vpkg %%v21,%%v20,%%v30 \n\t"
"vperm %%v20,%%v20,%%v30,%%v9 \n\t"

"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vpkg %%v23,%%v22,%%v31 \n\t"
"vperm %%v22,%%v22,%%v31,%%v9 \n\t"

"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v17,%%v18,%%v19 \n\t"
"vfasb %%v18,%%v20,%%v21 \n\t"
"vfasb %%v19,%%v22,%%v23 \n\t"
"vfchesb %%v5,%%v17,%%v16 \n\t"
"vfchesb %%v6,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"

"vfchesb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"

"vfchesb %%v7,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v28,144(%%r1,%3) \n\t"
"vpkg %%v17,%%v16,%%v28 \n\t"
"vperm %%v16,%%v16,%%v28,%%v9 \n\t"

"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v29,176(%%r1,%3) \n\t"
"vpkg %%v19,%%v18,%%v29 \n\t"
"vperm %%v18,%%v18,%%v29,%%v9 \n\t"

"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v30,208(%%r1,%3) \n\t"
"vpkg %%v21,%%v20,%%v30 \n\t"
"vperm %%v20,%%v20,%%v30,%%v9 \n\t"

"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v31,240(%%r1,%3) \n\t"
"vpkg %%v23,%%v22,%%v31 \n\t"
"vperm %%v22,%%v22,%%v31,%%v9 \n\t"

"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v17,%%v18,%%v19 \n\t"
"vfasb %%v18,%%v20,%%v21 \n\t"
"vfasb %%v19,%%v22,%%v23 \n\t"
"vfchesb %%v5,%%v17,%%v16 \n\t"
"vfchesb %%v6,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
return iamin;
}


"vfchesb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT minf = 0;
BLASLONG min = 0;
BLASLONG inc_x2;


"vfchesb %%v7,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
if (n <= 0 || inc_x <= 0)
return (min);


"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
if (inc_x == 1) {


"veslg %%v3,%%v0,32 \n\t"
"vfchsb %%v4,%%v3,%%v0 \n\t"
"vchlg %%v5,%%v2,%%v1 \n\t"
"vfcesb %%v6,%%v0,%%v3 \n\t"
"vn %%v5,%%v5,%%v6 \n\t"
"vo %%v4,%%v4,%%v5 \n\t"
"vsel %%v0,%%v0,%%v3,%%v4 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v1,%%v2,%%v4 \n\t"
BLASLONG n1 = n & -32;
if (n1 > 0) {


"vrepf %%v2,%%v0,2 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcsb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vstef %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchsb %%v4,%%v0,%%v2 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(iamin),"=m"(*amin)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
min = icamin_kernel_32(n1, x, &minf);
ix = n1 * 2;
i = n1;
} else {
minf = CABS1(x, 0);
ix += 2;
i++;
}


return iamin;
}
while (i < n) {
if (CABS1(x, ix) < minf) {
min = i;
minf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (min + 1);


BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT minf = 0;
BLASLONG min = 0;
BLASLONG inc_x2;
} else {


if (n <= 0 || inc_x <= 0) return(min);
if (inc_x == 1) {
min = 0;
minf = CABS1(x, 0);
inc_x2 = 2 * inc_x;


BLASLONG n1 = n & -32;
if (n1 > 0) {
BLASLONG n1 = n & -4;
while (i < n1) {


min = icamin_kernel_32(n1, x, &minf);
ix = n1 * 2;
i = n1;
if (CABS1(x, ix) < minf) {
min = i;
minf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) < minf) {
min = i + 1;
minf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + 2 * inc_x2) < minf) {
min = i + 2;
minf = CABS1(x, ix + 2 * inc_x2);
}
if (CABS1(x, ix + 3 * inc_x2) < minf) {
min = i + 3;
minf = CABS1(x, ix + 3 * inc_x2);
} }
else
{
minf = CABS1(x,0);
ix += 2;
i++;
}


while(i < n)
{
if( CABS1(x,ix) < minf )
{
min = i;
minf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (min + 1);
ix += inc_x2 * 4;


} else {
min = 0;
minf = CABS1(x,0);
inc_x2 = 2 * inc_x;
ix += inc_x2;
i++;
i += 4;


while(i < n)
{
if( CABS1(x,ix) < minf )
{
min = i;
minf = CABS1(x,ix);
}
ix += inc_x2;
i++;
} }
return (min + 1);

while (i < n) {
if (CABS1(x, ix) < minf) {
min = i;
minf = CABS1(x, ix);
}
ix += inc_x2;
i++;
} }
return (min + 1);
}
} }



+ 196
- 215
kernel/zarch/idamax.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,237 +28,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif


static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
{
BLASLONG iamax;

__asm__ volatile (
"vl %%v0,0(%3) \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
"vrepig %%v2,16 \n\t"
"vzero %%v3 \n\t"
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"vleig %%v28,8,0 \n\t"
"vleig %%v28,9,1 \n\t"
"vleig %%v29,10,0 \n\t"
"vleig %%v29,11,1 \n\t"
"vleig %%v30,12,0 \n\t"
"vleig %%v30,13,1 \n\t"
"vleig %%v31,14,0 \n\t"
"vleig %%v31,15,1 \n\t"
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"

"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchedb %%v4,%%v16,%%v17 \n\t"
"vfchedb %%v5,%%v18,%%v19 \n\t"
"vfchedb %%v6,%%v20,%%v21 \n\t"
"vfchedb %%v7,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"

"vfchedb %%v20,%%v16,%%v17 \n\t"
"vfchedb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"

"vfchedb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchedb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"

"vfchedb %%v4,%%v16,%%v17 \n\t"
"vfchedb %%v5,%%v18,%%v19 \n\t"
"vfchedb %%v6,%%v20,%%v21 \n\t"
"vfchedb %%v7,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"

"vfchedb %%v20,%%v16,%%v17 \n\t"
"vfchedb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"

"vfchedb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchedb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v2,%%v0 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return iamax;
/*
 * idamax_kernel_32: vectorised search for the element of x[0..n-1] with the
 * largest absolute value.
 *
 * n     number of elements to scan. The loop body executes before its
 *       counter is tested and consumes 32 doubles (256 bytes) per pass, so
 *       n must be a positive multiple of 32; the caller in this file passes
 *       n & -32 and only when that value is > 0.
 * x     contiguous input vector (unit stride).
 * amax  receives the largest absolute value found.
 *
 * Returns the zero-based index of that element (the caller adds 1).
 * When several elements share the largest absolute value the lowest index
 * is reported: every "high or equal" compare below keeps the earlier
 * candidate on equality, and the final lane merge takes the smaller index.
 */
static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) {
BLASLONG iamax;

/* v0 = |x[0]|,|x[1]| : running per-lane maximum; v1 = {0,1} : its indices */
__asm__("vl %%v0,0(%[x])\n\t"
"vflpdb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
/* v2 = {16,16} : index step per 16-element half; v3 = {0,0} : index base */
"vrepig %%v2,16\n\t"
"vzero %%v3\n\t"
/* v24..v31 = {0,1},{2,3},...,{14,15} : element indices inside one half */
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"vleig %%v28,8,0\n\t"
"vleig %%v28,9,1\n\t"
"vleig %%v29,10,0\n\t"
"vleig %%v29,11,1\n\t"
"vleig %%v30,12,0\n\t"
"vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t"
/* n becomes the pass count (n / 32); r1 is the byte offset into x */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* first half of the pass: load 16 doubles and take absolute values */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
/* reduction tree 8 -> 4 vectors: each mask selects both the larger value */
/* and, with the same mask, the index vector that belongs to it */
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
"vfchedb %%v7,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
/* 4 -> 2 vectors */
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
/* 2 -> 1 vector, then turn the local index into a global one (+ base v3) */
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* merge into the running maximum (kept on equality), advance the base */
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* second half of the pass: same sequence on the next 16 doubles */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
"vfchedb %%v7,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* step 256 bytes forward and loop while the decremented count is non-zero */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* final merge of the two lanes: v2/v3 = lane 1 value/index replicated */
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
/* lanes compare equal: store the value, report the smaller of the indices */
"vsteg %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
/* lanes differ: take lane 1 only where it is strictly higher than lane 0; */
/* the std of f0 stores element 0 of v0, the selected value */
"wfchdb %%v4,%%v2,%%v0\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
/* n is read-write and early-clobber because srlg/brctg overwrite it */
: [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n)
/* the unnamed "m" input tells the compiler that n elements of x are read; */
/* x itself uses "a" because it serves as an address register in D(X,B) */
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");

return iamax;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;
BLASLONG max = 0;

if (n <= 0 || inc_x <= 0) return (max);


if (inc_x == 1) {
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;
BLASLONG max = 0;


BLASLONG n1 = n & -32;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (max);


max = idamax_kernel_32(n1, x, &maxf);
if (inc_x == 1) {


i = n1;
}
else
{
maxf = ABS(x[0]);
i++;
}
BLASLONG n1 = n & -32;
if (n1 > 0) {


while (i < n) {
if (ABS(x[i]) > maxf) {
max = i;
maxf = ABS(x[i]);
}
i++;
}
return (max + 1);
max = idamax_kernel_32(n1, x, &maxf);


i = n1;
} else { } else {
maxf = ABS(x[0]);
i++;
}


max = 0;
maxf = ABS(x[0]);
while (i < n) {
if (ABS(x[i]) > maxf) {
max = i;
maxf = ABS(x[i]);
}
i++;
}
return (max + 1);


BLASLONG n1 = n & -4;
while (j < n1) {
} else {


if (ABS(x[i]) > maxf) {
max = j;
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
max = j + 1;
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
max = j + 2;
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
max = j + 3;
maxf = ABS(x[i + 3 * inc_x]);
}
max = 0;
maxf = ABS(x[0]);


i += inc_x * 4;
BLASLONG n1 = n & -4;
while (j < n1) {


j += 4;
if (ABS(x[i]) > maxf) {
max = j;
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
max = j + 1;
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
max = j + 2;
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
max = j + 3;
maxf = ABS(x[i + 3 * inc_x]);
}


}
i += inc_x * 4;


j += 4;

}


while (j < n) {
if (ABS(x[i]) > maxf) {
max = j;
maxf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (max + 1);
while (j < n) {
if (ABS(x[i]) > maxf) {
max = j;
maxf = ABS(x[i]);
}
i += inc_x;
j++;
} }
return (max + 1);
}
} }

+ 196
- 215
kernel/zarch/idamin.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,237 +28,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif


static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
{
BLASLONG iamin;

__asm__ volatile (
"vl %%v0,0(%3) \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
"vrepig %%v2,16 \n\t"
"vzero %%v3 \n\t"
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"vleig %%v28,8,0 \n\t"
"vleig %%v28,9,1 \n\t"
"vleig %%v29,10,0 \n\t"
"vleig %%v29,11,1 \n\t"
"vleig %%v30,12,0 \n\t"
"vleig %%v30,13,1 \n\t"
"vleig %%v31,14,0 \n\t"
"vleig %%v31,15,1 \n\t"
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"

"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchedb %%v4,%%v17,%%v16 \n\t"
"vfchedb %%v5,%%v19,%%v18 \n\t"
"vfchedb %%v6,%%v21,%%v20 \n\t"
"vfchedb %%v7,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"

"vfchedb %%v20,%%v17,%%v16 \n\t"
"vfchedb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"

"vfchedb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchedb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"

"vfchedb %%v4,%%v17,%%v16 \n\t"
"vfchedb %%v5,%%v19,%%v18 \n\t"
"vfchedb %%v6,%%v21,%%v20 \n\t"
"vfchedb %%v7,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"

"vfchedb %%v20,%%v17,%%v16 \n\t"
"vfchedb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"

"vfchedb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchedb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v0,%%v2 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(iamin),"=m"(*amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return iamin;
/*
 * idamin_kernel_32: vector-register search for the element of x with the
 * smallest absolute value.
 *
 * n    - element count. It is shifted right by 5 inside the asm, so only
 *        whole groups of 32 elements are processed. The loop is tested at
 *        the bottom (brctg), so the body always runs at least once; the
 *        visible caller only calls this with n & -32 when that is > 0.
 * x    - contiguous input vector, read 16 bytes per vector load
 *        (double-precision forms vflpdb/vfchedb are used throughout).
 * amin - receives the smallest absolute value that was found.
 *
 * Returns the 0-based index of that element (the caller returns min + 1).
 *
 * Register roles inside the asm:
 *   v0        running best values (one candidate per 64-bit lane)
 *   v1        indices belonging to v0
 *   v2        index step, 16 (elements handled per half iteration)
 *   v3        running index base for the current group of 16 elements
 *   v24..v31  constant lane offsets 0..15, two per register
 *   v16..v23  loaded data, v4..v7 compare masks / selected offsets
 *   r1        byte offset into x
 *
 * NOTE(review): [amin] is an "=m" operand but is used by vsteg and std,
 * which presumably only accept a short displacement - confirm whether a
 * short-displacement memory constraint is required here.
 */
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) {
BLASLONG iamin;

__asm__("vl %%v0,0(%[x])\n\t"
"vflpdb %%v0,%%v0\n\t"
/* v0 = |x[0..1]| as the initial best, v1 = their indices {0,1} */
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
/* v2 = index step of 16, v3 = index base starting at 0 */
"vrepig %%v2,16\n\t"
"vzero %%v3\n\t"
/* v24..v31 = lane offsets 0..15 used to label the 8 loaded registers */
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"vleig %%v28,8,0\n\t"
"vleig %%v28,9,1\n\t"
"vleig %%v29,10,0\n\t"
"vleig %%v29,11,1\n\t"
"vleig %%v30,12,0\n\t"
"vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t"
/* n becomes the iteration count n/32; r1 = 0 is the byte offset */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* first half: load 16 elements and take their absolute values */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
/* tournament 8 -> 4 -> 2 -> 1: each compare/select pair keeps the
   smaller value and, in v4..v7, its lane offset; on equality the
   lower-numbered register (lower offset) is kept */
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
"vfchedb %%v7,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
/* offset + base = absolute index; merge into the running best (the
   existing best is kept on equality), then advance the base by 16 */
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* second half: the same steps for the next 16 elements */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
"vfchedb %%v7,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* step 256 bytes (32 elements) and loop while --n is non-zero */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* reduce the two lanes: v2/v3 = value/index of lane 1 replicated */
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
/* lanes hold equal values: store it and return the smaller index */
"vsteg %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
/* lanes differ: take lane 1 where lane 0 is greater, else lane 0 */
"wfchdb %%v4,%%v0,%%v2\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
: [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");

return iamin;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;
BLASLONG min = 0;

if (n <= 0 || inc_x <= 0) return (min);


if (inc_x == 1) {
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;
BLASLONG min = 0;


BLASLONG n1 = n & -32;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (min);


min = idamin_kernel_32(n1, x, &minf);
if (inc_x == 1) {


i = n1;
}
else
{
minf = ABS(x[0]);
i++;
}
BLASLONG n1 = n & -32;
if (n1 > 0) {


while (i < n) {
if (ABS(x[i]) < minf) {
min = i;
minf = ABS(x[i]);
}
i++;
}
return (min + 1);
min = idamin_kernel_32(n1, x, &minf);


i = n1;
} else { } else {
minf = ABS(x[0]);
i++;
}


min = 0;
minf = ABS(x[0]);
while (i < n) {
if (ABS(x[i]) < minf) {
min = i;
minf = ABS(x[i]);
}
i++;
}
return (min + 1);


BLASLONG n1 = n & -4;
while (j < n1) {
} else {


if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
min = j + 1;
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
min = j + 2;
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
min = j + 3;
minf = ABS(x[i + 3 * inc_x]);
}
min = 0;
minf = ABS(x[0]);


i += inc_x * 4;
BLASLONG n1 = n & -4;
while (j < n1) {


j += 4;
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
min = j + 1;
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
min = j + 2;
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
min = j + 3;
minf = ABS(x[i + 3 * inc_x]);
}


}
i += inc_x * 4;


j += 4;

}


while (j < n) {
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (min + 1);
while (j < n) {
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
i += inc_x;
j++;
} }
return (min + 1);
}
} }

+ 185
- 200
kernel/zarch/idmax.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,214 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
{
BLASLONG imax;

__asm__ volatile (
"vl %%v0,0(%3) \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
"vrepig %%v2,16 \n\t"
"vzero %%v3 \n\t"
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"vleig %%v28,8,0 \n\t"
"vleig %%v28,9,1 \n\t"
"vleig %%v29,10,0 \n\t"
"vleig %%v29,11,1 \n\t"
"vleig %%v30,12,0 \n\t"
"vleig %%v30,13,1 \n\t"
"vleig %%v31,14,0 \n\t"
"vleig %%v31,15,1 \n\t"
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"

"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vfchedb %%v4,%%v16,%%v17 \n\t"
"vfchedb %%v5,%%v18,%%v19 \n\t"
"vfchedb %%v6,%%v20,%%v21 \n\t"
"vfchedb %%v7,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"

"vfchedb %%v20,%%v16,%%v17 \n\t"
"vfchedb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"

"vfchedb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchedb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"

"vfchedb %%v4,%%v16,%%v17 \n\t"
"vfchedb %%v5,%%v18,%%v19 \n\t"
"vfchedb %%v6,%%v20,%%v21 \n\t"
"vfchedb %%v7,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"

"vfchedb %%v20,%%v16,%%v17 \n\t"
"vfchedb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"

"vfchedb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchedb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v2,%%v0 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(imax),"=m"(*max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return imax;
/*
 * idmax_kernel_32: vector-register search for the largest element of x
 * (signed comparison, no absolute value is taken).
 *
 * n   - element count. It is shifted right by 5 inside the asm, so only
 *       whole groups of 32 elements are processed. The loop is tested at
 *       the bottom (brctg), so the body always runs at least once; the
 *       visible caller only calls this with n & -32 when that is > 0.
 * x   - contiguous input vector, read 16 bytes per vector load
 *       (the double-precision compare form vfchedb is used throughout).
 * max - receives the largest value that was found.
 *
 * Returns the 0-based index of that element (the caller returns max + 1).
 *
 * Register roles inside the asm:
 *   v0        running best values (one candidate per 64-bit lane)
 *   v1        indices belonging to v0
 *   v2        index step, 16 (elements handled per half iteration)
 *   v3        running index base for the current group of 16 elements
 *   v24..v31  constant lane offsets 0..15, two per register
 *   v16..v23  loaded data, v4..v7 compare masks / selected offsets
 *   r1        byte offset into x
 *
 * NOTE(review): [max] is an "=m" operand but is used by vsteg and std,
 * which presumably only accept a short displacement - confirm whether a
 * short-displacement memory constraint is required here.
 */
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) {
BLASLONG imax;

__asm__("vl %%v0,0(%[x])\n\t"
/* v0 = x[0..1] as the initial best, v1 = their indices {0,1} */
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
/* v2 = index step of 16, v3 = index base starting at 0 */
"vrepig %%v2,16\n\t"
"vzero %%v3\n\t"
/* v24..v31 = lane offsets 0..15 used to label the 8 loaded registers */
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"vleig %%v28,8,0\n\t"
"vleig %%v28,9,1\n\t"
"vleig %%v29,10,0\n\t"
"vleig %%v29,11,1\n\t"
"vleig %%v30,12,0\n\t"
"vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t"
/* n becomes the iteration count n/32; r1 = 0 is the byte offset */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* first half: load 16 elements */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/* tournament 8 -> 4 -> 2 -> 1: each compare/select pair keeps the
   larger value and, in v4..v7, its lane offset; on equality the
   lower-numbered register (lower offset) is kept */
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
"vfchedb %%v7,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
/* offset + base = absolute index; merge into the running best (the
   existing best is kept on equality), then advance the base by 16 */
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* second half: the same steps for the next 16 elements */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
"vfchedb %%v7,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* step 256 bytes (32 elements) and loop while --n is non-zero */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* reduce the two lanes: v2/v3 = value/index of lane 1 replicated */
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
/* lanes hold equal values: store it and return the smaller index */
"vsteg %%v0,%[max],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[imax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
/* lanes differ: take lane 1 where it is greater, else lane 0 */
"wfchdb %%v4,%%v2,%%v0\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[max]\n\t"
"vlgvg %[imax],%%v1,0\n\t"
"2:\n\t"
"nop"
: [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");

return imax;
} }

BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;
BLASLONG max = 0;
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;
BLASLONG max = 0;

if (n <= 0 || inc_x <= 0)
return (max);


if (n <= 0 || inc_x <= 0) return (max);
if (inc_x == 1) {


if (inc_x == 1) {
BLASLONG n1 = n & -32;
if (n1 > 0) {


BLASLONG n1 = n & -32;
if (n1 > 0) {
max = idmax_kernel_32(n1, x, &maxf);


max = idmax_kernel_32(n1, x, &maxf);
i = n1;
} else {
maxf = x[0];
i++;
}


i = n1;
}
else
{
maxf = x[0];
i++;
}
while (i < n) {
if (x[i] > maxf) {
max = i;
maxf = x[i];
}
i++;
}
return (max + 1);


while (i < n) {
if (x[i] > maxf) {
max = i;
maxf = x[i];
}
i++;
}
return (max + 1);
} else {


} else {
max = 0;
maxf = x[0];

BLASLONG n1 = n & -4;
while (j < n1) {

if (x[i] > maxf) {
max = j;
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
max = j + 1;
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
max = j + 2;
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
max = j + 3;
maxf = x[i + 3 * inc_x];
}

i += inc_x * 4;

j += 4;

}


max = 0;
maxf = x[0];

BLASLONG n1 = n & -4;
while (j < n1) {

if (x[i] > maxf) {
max = j;
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
max = j + 1;
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
max = j + 2;
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
max = j + 3;
maxf = x[i + 3 * inc_x];
}

i += inc_x * 4;

j += 4;

}


while (j < n) {
if (x[i] > maxf) {
max = j;
maxf = x[i];
}
i += inc_x;
j++;
}
return (max + 1);
while (j < n) {
if (x[i] > maxf) {
max = j;
maxf = x[i];
}
i += inc_x;
j++;
} }
return (max + 1);
}
} }

+ 185
- 200
kernel/zarch/idmin.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,214 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
{
BLASLONG imin;

__asm__ volatile (
"vl %%v0,0(%3) \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
"vrepig %%v2,16 \n\t"
"vzero %%v3 \n\t"
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"vleig %%v28,8,0 \n\t"
"vleig %%v28,9,1 \n\t"
"vleig %%v29,10,0 \n\t"
"vleig %%v29,11,1 \n\t"
"vleig %%v30,12,0 \n\t"
"vleig %%v30,13,1 \n\t"
"vleig %%v31,14,0 \n\t"
"vleig %%v31,15,1 \n\t"
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"

"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vfchedb %%v4,%%v17,%%v16 \n\t"
"vfchedb %%v5,%%v19,%%v18 \n\t"
"vfchedb %%v6,%%v21,%%v20 \n\t"
"vfchedb %%v7,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"

"vfchedb %%v20,%%v17,%%v16 \n\t"
"vfchedb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"

"vfchedb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchedb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"

"vfchedb %%v4,%%v17,%%v16 \n\t"
"vfchedb %%v5,%%v19,%%v18 \n\t"
"vfchedb %%v6,%%v21,%%v20 \n\t"
"vfchedb %%v7,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"

"vfchedb %%v20,%%v17,%%v16 \n\t"
"vfchedb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"

"vfchedb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchedb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v0,%%v2 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(imin),"=m"(*min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return imin;
/*
 * idmin_kernel_32: vector-register search for the smallest element of x
 * (signed comparison, no absolute value is taken).
 *
 * n   - element count. It is shifted right by 5 inside the asm, so only
 *       whole groups of 32 elements are processed. The loop is tested at
 *       the bottom (brctg), so the body always runs at least once; the
 *       visible caller only calls this with n & -32 when that is > 0.
 * x   - contiguous input vector, read 16 bytes per vector load
 *       (the double-precision compare form vfchedb is used throughout).
 * min - receives the smallest value that was found.
 *
 * Returns the 0-based index of that element (the caller returns min + 1).
 *
 * Register roles inside the asm:
 *   v0        running best values (one candidate per 64-bit lane)
 *   v1        indices belonging to v0
 *   v2        index step, 16 (elements handled per half iteration)
 *   v3        running index base for the current group of 16 elements
 *   v24..v31  constant lane offsets 0..15, two per register
 *   v16..v23  loaded data, v4..v7 compare masks / selected offsets
 *   r1        byte offset into x
 *
 * NOTE(review): [min] is an "=m" operand but is used by vsteg and std,
 * which presumably only accept a short displacement - confirm whether a
 * short-displacement memory constraint is required here.
 */
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) {
BLASLONG imin;

__asm__("vl %%v0,0(%[x])\n\t"
/* v0 = x[0..1] as the initial best, v1 = their indices {0,1} */
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
/* v2 = index step of 16, v3 = index base starting at 0 */
"vrepig %%v2,16\n\t"
"vzero %%v3\n\t"
/* v24..v31 = lane offsets 0..15 used to label the 8 loaded registers */
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"vleig %%v28,8,0\n\t"
"vleig %%v28,9,1\n\t"
"vleig %%v29,10,0\n\t"
"vleig %%v29,11,1\n\t"
"vleig %%v30,12,0\n\t"
"vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t"
/* n becomes the iteration count n/32; r1 = 0 is the byte offset */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* first half: load 16 elements */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/* tournament 8 -> 4 -> 2 -> 1: each compare/select pair keeps the
   smaller value and, in v4..v7, its lane offset; on equality the
   lower-numbered register (lower offset) is kept */
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
"vfchedb %%v7,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
/* offset + base = absolute index; merge into the running best (the
   existing best is kept on equality), then advance the base by 16 */
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* second half: the same steps for the next 16 elements */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
"vfchedb %%v7,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* step 256 bytes (32 elements) and loop while --n is non-zero */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* reduce the two lanes: v2/v3 = value/index of lane 1 replicated */
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
/* lanes hold equal values: store it and return the smaller index */
"vsteg %%v0,%[min],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[imin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
/* lanes differ: take lane 1 where lane 0 is greater, else lane 0 */
"wfchdb %%v4,%%v0,%%v2\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[min]\n\t"
"vlgvg %[imin],%%v1,0\n\t"
"2:\n\t"
"nop"
: [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");

return imin;
} }

BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;
BLASLONG min = 0;
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;
BLASLONG min = 0;

if (n <= 0 || inc_x <= 0)
return (min);


if (n <= 0 || inc_x <= 0) return (min);
if (inc_x == 1) {


if (inc_x == 1) {
BLASLONG n1 = n & -32;
if (n1 > 0) {


BLASLONG n1 = n & -32;
if (n1 > 0) {
min = idmin_kernel_32(n1, x, &minf);


min = idmin_kernel_32(n1, x, &minf);
i = n1;
} else {
minf = x[0];
i++;
}


i = n1;
}
else
{
minf = x[0];
i++;
}
while (i < n) {
if (x[i] < minf) {
min = i;
minf = x[i];
}
i++;
}
return (min + 1);


while (i < n) {
if (x[i] < minf) {
min = i;
minf = x[i];
}
i++;
}
return (min + 1);
} else {


} else {
min = 0;
minf = x[0];

BLASLONG n1 = n & -4;
while (j < n1) {

if (x[i] < minf) {
min = j;
minf = x[i];
}
if (x[i + inc_x] < minf) {
min = j + 1;
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
min = j + 2;
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
min = j + 3;
minf = x[i + 3 * inc_x];
}

i += inc_x * 4;

j += 4;

}


min = 0;
minf = x[0];

BLASLONG n1 = n & -4;
while (j < n1) {

if (x[i] < minf) {
min = j;
minf = x[i];
}
if (x[i + inc_x] < minf) {
min = j + 1;
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
min = j + 2;
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
min = j + 3;
minf = x[i + 3 * inc_x];
}

i += inc_x * 4;

j += 4;

}


while (j < n) {
if (x[i] < minf) {
min = j;
minf = x[i];
}
i += inc_x;
j++;
}
return (min + 1);
while (j < n) {
if (x[i] < minf) {
min = j;
minf = x[i];
}
i += inc_x;
j++;
} }
return (min + 1);
}
} }

+ 238
- 258
kernel/zarch/isamax.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,282 +28,262 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf #define ABS fabsf
#endif


static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
{
BLASLONG iamax;
static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) {
BLASLONG iamax;


__asm__ volatile (
"vl %%v0,0(%3) \n\t"
"vflpsb %%v0,%%v0 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,2,1 \n\t"
"vleig %%v2,1,0 \n\t"
"vleig %%v2,3,1 \n\t"
"vrepig %%v3,32 \n\t"
"vzero %%v4 \n\t"
"vleif %%v24,0,0 \n\t"
"vleif %%v24,1,1 \n\t"
"vleif %%v24,2,2 \n\t"
"vleif %%v24,3,3 \n\t"
"vleif %%v25,4,0 \n\t"
"vleif %%v25,5,1 \n\t"
"vleif %%v25,6,2 \n\t"
"vleif %%v25,7,3 \n\t"
"vleif %%v26,8,0 \n\t"
"vleif %%v26,9,1 \n\t"
"vleif %%v26,10,2 \n\t"
"vleif %%v26,11,3 \n\t"
"vleif %%v27,12,0 \n\t"
"vleif %%v27,13,1 \n\t"
"vleif %%v27,14,2 \n\t"
"vleif %%v27,15,3 \n\t"
"vleif %%v28,16,0 \n\t"
"vleif %%v28,17,1 \n\t"
"vleif %%v28,18,2 \n\t"
"vleif %%v28,19,3 \n\t"
"vleif %%v29,20,0 \n\t"
"vleif %%v29,21,1 \n\t"
"vleif %%v29,22,2 \n\t"
"vleif %%v29,23,3 \n\t"
"vleif %%v30,24,0 \n\t"
"vleif %%v30,25,1 \n\t"
"vleif %%v30,26,2 \n\t"
"vleif %%v30,27,3 \n\t"
"vleif %%v31,28,0 \n\t"
"vleif %%v31,29,1 \n\t"
"vleif %%v31,30,2 \n\t"
"vleif %%v31,31,3 \n\t"
"srlg %%r0,%2,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"vflpsb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
"vrepig %%v3,32\n\t"
"vzero %%v4\n\t"
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"vleif %%v28,16,0\n\t"
"vleif %%v28,17,1\n\t"
"vleif %%v28,18,2\n\t"
"vleif %%v28,19,3\n\t"
"vleif %%v29,20,0\n\t"
"vleif %%v29,21,1\n\t"
"vleif %%v29,22,2\n\t"
"vleif %%v29,23,3\n\t"
"vleif %%v30,24,0\n\t"
"vleif %%v30,25,1\n\t"
"vleif %%v30,26,2\n\t"
"vleif %%v30,27,3\n\t"
"vleif %%v31,28,0\n\t"
"vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
"vfchesb %%v8,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
"vfchesb %%v8,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v2,%%v0\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
: [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");


"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfchesb %%v5,%%v16,%%v17 \n\t"
"vfchesb %%v6,%%v18,%%v19 \n\t"
"vfchesb %%v7,%%v20,%%v21 \n\t"
"vfchesb %%v8,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vsel %%v18,%%v20,%%v21,%%v7 \n\t"
"vsel %%v7,%%v28,%%v29,%%v7 \n\t"
"vsel %%v19,%%v22,%%v23,%%v8 \n\t"
"vsel %%v8,%%v30,%%v31,%%v8 \n\t"

"vfchesb %%v20,%%v16,%%v17 \n\t"
"vfchesb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v5,%%v5,%%v6,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v6,%%v7,%%v8,%%v21 \n\t"

"vfchesb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"

"vfchesb %%v7,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfchesb %%v5,%%v16,%%v17 \n\t"
"vfchesb %%v6,%%v18,%%v19 \n\t"
"vfchesb %%v7,%%v20,%%v21 \n\t"
"vfchesb %%v8,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vsel %%v18,%%v20,%%v21,%%v7 \n\t"
"vsel %%v7,%%v28,%%v29,%%v7 \n\t"
"vsel %%v19,%%v22,%%v23,%%v8 \n\t"
"vsel %%v8,%%v30,%%v31,%%v8 \n\t"

"vfchesb %%v20,%%v16,%%v17 \n\t"
"vfchesb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v5,%%v5,%%v6,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v6,%%v7,%%v8,%%v21 \n\t"

"vfchesb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"

"vfchesb %%v7,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"veslg %%v3,%%v0,32 \n\t"
"vfchsb %%v4,%%v0,%%v3 \n\t"
"vchlg %%v5,%%v2,%%v1 \n\t"
"vfcesb %%v6,%%v0,%%v3 \n\t"
"vn %%v5,%%v5,%%v6 \n\t"
"vo %%v4,%%v4,%%v5 \n\t"
"vsel %%v0,%%v0,%%v3,%%v4 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v1,%%v2,%%v4 \n\t"

"vrepf %%v2,%%v0,2 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcsb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vstef %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchsb %%v4,%%v2,%%v0 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return iamax;
return iamax;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;
BLASLONG max = 0;


if (n <= 0 || inc_x <= 0) return (max);

if (inc_x == 1) {
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;
BLASLONG max = 0;


BLASLONG n1 = n & -64;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (max);


max = isamax_kernel_64(n1, x, &maxf);
if (inc_x == 1) {


i = n1;
}
else
{
maxf = ABS(x[0]);
i++;
}
BLASLONG n1 = n & -64;
if (n1 > 0) {


while (i < n) {
if (ABS(x[i]) > maxf) {
max = i;
maxf = ABS(x[i]);
}
i++;
}
return (max + 1);
max = isamax_kernel_64(n1, x, &maxf);


i = n1;
} else { } else {
maxf = ABS(x[0]);
i++;
}

while (i < n) {
if (ABS(x[i]) > maxf) {
max = i;
maxf = ABS(x[i]);
}
i++;
}
return (max + 1);


max = 0;
maxf = ABS(x[0]);
} else {


BLASLONG n1 = n & -4;
while (j < n1) {
max = 0;
maxf = ABS(x[0]);


if (ABS(x[i]) > maxf) {
max = j;
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
max = j + 1;
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
max = j + 2;
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
max = j + 3;
maxf = ABS(x[i + 3 * inc_x]);
}
BLASLONG n1 = n & -4;
while (j < n1) {


i += inc_x * 4;
if (ABS(x[i]) > maxf) {
max = j;
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
max = j + 1;
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
max = j + 2;
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
max = j + 3;
maxf = ABS(x[i + 3 * inc_x]);
}


j += 4;
i += inc_x * 4;


}
j += 4;


}


while (j < n) {
if (ABS(x[i]) > maxf) {
max = j;
maxf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (max + 1);
while (j < n) {
if (ABS(x[i]) > maxf) {
max = j;
maxf = ABS(x[i]);
}
i += inc_x;
j++;
} }
return (max + 1);
}
} }

+ 238
- 258
kernel/zarch/isamin.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,282 +28,262 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf #define ABS fabsf
#endif


static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
{
BLASLONG iamin;
static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) {
BLASLONG iamin;


__asm__ volatile (
"vl %%v0,0(%3) \n\t"
"vflpsb %%v0,%%v0 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,2,1 \n\t"
"vleig %%v2,1,0 \n\t"
"vleig %%v2,3,1 \n\t"
"vrepig %%v3,32 \n\t"
"vzero %%v4 \n\t"
"vleif %%v24,0,0 \n\t"
"vleif %%v24,1,1 \n\t"
"vleif %%v24,2,2 \n\t"
"vleif %%v24,3,3 \n\t"
"vleif %%v25,4,0 \n\t"
"vleif %%v25,5,1 \n\t"
"vleif %%v25,6,2 \n\t"
"vleif %%v25,7,3 \n\t"
"vleif %%v26,8,0 \n\t"
"vleif %%v26,9,1 \n\t"
"vleif %%v26,10,2 \n\t"
"vleif %%v26,11,3 \n\t"
"vleif %%v27,12,0 \n\t"
"vleif %%v27,13,1 \n\t"
"vleif %%v27,14,2 \n\t"
"vleif %%v27,15,3 \n\t"
"vleif %%v28,16,0 \n\t"
"vleif %%v28,17,1 \n\t"
"vleif %%v28,18,2 \n\t"
"vleif %%v28,19,3 \n\t"
"vleif %%v29,20,0 \n\t"
"vleif %%v29,21,1 \n\t"
"vleif %%v29,22,2 \n\t"
"vleif %%v29,23,3 \n\t"
"vleif %%v30,24,0 \n\t"
"vleif %%v30,25,1 \n\t"
"vleif %%v30,26,2 \n\t"
"vleif %%v30,27,3 \n\t"
"vleif %%v31,28,0 \n\t"
"vleif %%v31,29,1 \n\t"
"vleif %%v31,30,2 \n\t"
"vleif %%v31,31,3 \n\t"
"srlg %%r0,%2,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"vflpsb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
"vrepig %%v3,32\n\t"
"vzero %%v4\n\t"
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"vleif %%v28,16,0\n\t"
"vleif %%v28,17,1\n\t"
"vleif %%v28,18,2\n\t"
"vleif %%v28,19,3\n\t"
"vleif %%v29,20,0\n\t"
"vleif %%v29,21,1\n\t"
"vleif %%v29,22,2\n\t"
"vleif %%v29,23,3\n\t"
"vleif %%v30,24,0\n\t"
"vleif %%v30,25,1\n\t"
"vleif %%v30,26,2\n\t"
"vleif %%v30,27,3\n\t"
"vleif %%v31,28,0\n\t"
"vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
"vfchesb %%v8,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
"vfchesb %%v8,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v0,%%v2\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
: [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");


"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfchesb %%v5,%%v17,%%v16 \n\t"
"vfchesb %%v6,%%v19,%%v18 \n\t"
"vfchesb %%v7,%%v21,%%v20 \n\t"
"vfchesb %%v8,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vsel %%v18,%%v20,%%v21,%%v7 \n\t"
"vsel %%v7,%%v28,%%v29,%%v7 \n\t"
"vsel %%v19,%%v22,%%v23,%%v8 \n\t"
"vsel %%v8,%%v30,%%v31,%%v8 \n\t"

"vfchesb %%v20,%%v17,%%v16 \n\t"
"vfchesb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v5,%%v5,%%v6,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v6,%%v7,%%v8,%%v21 \n\t"

"vfchesb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"

"vfchesb %%v7,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfchesb %%v5,%%v17,%%v16 \n\t"
"vfchesb %%v6,%%v19,%%v18 \n\t"
"vfchesb %%v7,%%v21,%%v20 \n\t"
"vfchesb %%v8,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vsel %%v18,%%v20,%%v21,%%v7 \n\t"
"vsel %%v7,%%v28,%%v29,%%v7 \n\t"
"vsel %%v19,%%v22,%%v23,%%v8 \n\t"
"vsel %%v8,%%v30,%%v31,%%v8 \n\t"

"vfchesb %%v20,%%v17,%%v16 \n\t"
"vfchesb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v5,%%v5,%%v6,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v6,%%v7,%%v8,%%v21 \n\t"

"vfchesb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"

"vfchesb %%v7,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"veslg %%v3,%%v0,32 \n\t"
"vfchsb %%v4,%%v3,%%v0 \n\t"
"vchlg %%v5,%%v2,%%v1 \n\t"
"vfcesb %%v6,%%v0,%%v3 \n\t"
"vn %%v5,%%v5,%%v6 \n\t"
"vo %%v4,%%v4,%%v5 \n\t"
"vsel %%v0,%%v0,%%v3,%%v4 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v1,%%v2,%%v4 \n\t"

"vrepf %%v2,%%v0,2 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcsb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vstef %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchsb %%v4,%%v0,%%v2 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(iamin),"=m"(*amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return iamin;
return iamin;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;
BLASLONG min = 0;


if (n <= 0 || inc_x <= 0) return (min);

if (inc_x == 1) {
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;
BLASLONG min = 0;


BLASLONG n1 = n & -64;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (min);


min = isamin_kernel_64(n1, x, &minf);
if (inc_x == 1) {


i = n1;
}
else
{
minf = ABS(x[0]);
i++;
}
BLASLONG n1 = n & -64;
if (n1 > 0) {


while (i < n) {
if (ABS(x[i]) < minf) {
min = i;
minf = ABS(x[i]);
}
i++;
}
return (min + 1);
min = isamin_kernel_64(n1, x, &minf);


i = n1;
} else { } else {
minf = ABS(x[0]);
i++;
}

while (i < n) {
if (ABS(x[i]) < minf) {
min = i;
minf = ABS(x[i]);
}
i++;
}
return (min + 1);


min = 0;
minf = ABS(x[0]);
} else {


BLASLONG n1 = n & -4;
while (j < n1) {
min = 0;
minf = ABS(x[0]);


if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
min = j + 1;
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
min = j + 2;
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
min = j + 3;
minf = ABS(x[i + 3 * inc_x]);
}
BLASLONG n1 = n & -4;
while (j < n1) {


i += inc_x * 4;
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
min = j + 1;
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
min = j + 2;
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
min = j + 3;
minf = ABS(x[i + 3 * inc_x]);
}


j += 4;
i += inc_x * 4;


}
j += 4;


}


while (j < n) {
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (min + 1);
while (j < n) {
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
i += inc_x;
j++;
} }
return (min + 1);
}
} }

+ 221
- 237
kernel/zarch/ismax.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,259 +27,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
{
BLASLONG imax;
static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) {
BLASLONG imax;


__asm__ volatile (
"vl %%v0,0(%3) \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,2,1 \n\t"
"vleig %%v2,1,0 \n\t"
"vleig %%v2,3,1 \n\t"
"vrepig %%v3,32 \n\t"
"vzero %%v4 \n\t"
"vleif %%v24,0,0 \n\t"
"vleif %%v24,1,1 \n\t"
"vleif %%v24,2,2 \n\t"
"vleif %%v24,3,3 \n\t"
"vleif %%v25,4,0 \n\t"
"vleif %%v25,5,1 \n\t"
"vleif %%v25,6,2 \n\t"
"vleif %%v25,7,3 \n\t"
"vleif %%v26,8,0 \n\t"
"vleif %%v26,9,1 \n\t"
"vleif %%v26,10,2 \n\t"
"vleif %%v26,11,3 \n\t"
"vleif %%v27,12,0 \n\t"
"vleif %%v27,13,1 \n\t"
"vleif %%v27,14,2 \n\t"
"vleif %%v27,15,3 \n\t"
"vleif %%v28,16,0 \n\t"
"vleif %%v28,17,1 \n\t"
"vleif %%v28,18,2 \n\t"
"vleif %%v28,19,3 \n\t"
"vleif %%v29,20,0 \n\t"
"vleif %%v29,21,1 \n\t"
"vleif %%v29,22,2 \n\t"
"vleif %%v29,23,3 \n\t"
"vleif %%v30,24,0 \n\t"
"vleif %%v30,25,1 \n\t"
"vleif %%v30,26,2 \n\t"
"vleif %%v30,27,3 \n\t"
"vleif %%v31,28,0 \n\t"
"vleif %%v31,29,1 \n\t"
"vleif %%v31,30,2 \n\t"
"vleif %%v31,31,3 \n\t"
"srlg %%r0,%2,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
"vrepig %%v3,32\n\t"
"vzero %%v4\n\t"
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"vleif %%v28,16,0\n\t"
"vleif %%v28,17,1\n\t"
"vleif %%v28,18,2\n\t"
"vleif %%v28,19,3\n\t"
"vleif %%v29,20,0\n\t"
"vleif %%v29,21,1\n\t"
"vleif %%v29,22,2\n\t"
"vleif %%v29,23,3\n\t"
"vleif %%v30,24,0\n\t"
"vleif %%v30,25,1\n\t"
"vleif %%v30,26,2\n\t"
"vleif %%v30,27,3\n\t"
"vleif %%v31,28,0\n\t"
"vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
"vfchesb %%v8,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
"vfchesb %%v8,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[max],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[imax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v2,%%v0\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[max]\n\t"
"vlgvg %[imax],%%v1,0\n\t"
"2:\n\t"
"nop"
: [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");


"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vfchesb %%v5,%%v16,%%v17 \n\t"
"vfchesb %%v6,%%v18,%%v19 \n\t"
"vfchesb %%v7,%%v20,%%v21 \n\t"
"vfchesb %%v8,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vsel %%v18,%%v20,%%v21,%%v7 \n\t"
"vsel %%v7,%%v28,%%v29,%%v7 \n\t"
"vsel %%v19,%%v22,%%v23,%%v8 \n\t"
"vsel %%v8,%%v30,%%v31,%%v8 \n\t"

"vfchesb %%v20,%%v16,%%v17 \n\t"
"vfchesb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v5,%%v5,%%v6,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v6,%%v7,%%v8,%%v21 \n\t"

"vfchesb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"

"vfchesb %%v7,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vfchesb %%v5,%%v16,%%v17 \n\t"
"vfchesb %%v6,%%v18,%%v19 \n\t"
"vfchesb %%v7,%%v20,%%v21 \n\t"
"vfchesb %%v8,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vsel %%v18,%%v20,%%v21,%%v7 \n\t"
"vsel %%v7,%%v28,%%v29,%%v7 \n\t"
"vsel %%v19,%%v22,%%v23,%%v8 \n\t"
"vsel %%v8,%%v30,%%v31,%%v8 \n\t"

"vfchesb %%v20,%%v16,%%v17 \n\t"
"vfchesb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v5,%%v5,%%v6,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v6,%%v7,%%v8,%%v21 \n\t"

"vfchesb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"

"vfchesb %%v7,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"veslg %%v3,%%v0,32 \n\t"
"vfchsb %%v4,%%v0,%%v3 \n\t"
"vchlg %%v5,%%v2,%%v1 \n\t"
"vfcesb %%v6,%%v0,%%v3 \n\t"
"vn %%v5,%%v5,%%v6 \n\t"
"vo %%v4,%%v4,%%v5 \n\t"
"vsel %%v0,%%v0,%%v3,%%v4 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v1,%%v2,%%v4 \n\t"

"vrepf %%v2,%%v0,2 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcsb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vstef %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchsb %%v4,%%v2,%%v0 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(imax),"=m"(*max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return imax;
return imax;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;
BLASLONG max = 0;


if (n <= 0 || inc_x <= 0) return (max);

if (inc_x == 1) {
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;
BLASLONG max = 0;


BLASLONG n1 = n & -64;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (max);


max = ismax_kernel_64(n1, x, &maxf);
if (inc_x == 1) {


i = n1;
}
else
{
maxf = x[0];
i++;
}
BLASLONG n1 = n & -64;
if (n1 > 0) {


while (i < n) {
if (x[i] > maxf) {
max = i;
maxf = x[i];
}
i++;
}
return (max + 1);
max = ismax_kernel_64(n1, x, &maxf);


i = n1;
} else { } else {
maxf = x[0];
i++;
}

while (i < n) {
if (x[i] > maxf) {
max = i;
maxf = x[i];
}
i++;
}
return (max + 1);


max = 0;
maxf = x[0];
} else {


BLASLONG n1 = n & -4;
while (j < n1) {
max = 0;
maxf = x[0];


if (x[i] > maxf) {
max = j;
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
max = j + 1;
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
max = j + 2;
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
max = j + 3;
maxf = x[i + 3 * inc_x];
}
BLASLONG n1 = n & -4;
while (j < n1) {


i += inc_x * 4;
if (x[i] > maxf) {
max = j;
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
max = j + 1;
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
max = j + 2;
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
max = j + 3;
maxf = x[i + 3 * inc_x];
}


j += 4;
i += inc_x * 4;


}
j += 4;


}


while (j < n) {
if (x[i] > maxf) {
max = j;
maxf = x[i];
}
i += inc_x;
j++;
}
return (max + 1);
while (j < n) {
if (x[i] > maxf) {
max = j;
maxf = x[i];
}
i += inc_x;
j++;
} }
return (max + 1);
}
} }

+ 221
- 237
kernel/zarch/ismin.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,259 +27,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
{
BLASLONG imin;
static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) {
BLASLONG imin;


__asm__ volatile (
"vl %%v0,0(%3) \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,2,1 \n\t"
"vleig %%v2,1,0 \n\t"
"vleig %%v2,3,1 \n\t"
"vrepig %%v3,32 \n\t"
"vzero %%v4 \n\t"
"vleif %%v24,0,0 \n\t"
"vleif %%v24,1,1 \n\t"
"vleif %%v24,2,2 \n\t"
"vleif %%v24,3,3 \n\t"
"vleif %%v25,4,0 \n\t"
"vleif %%v25,5,1 \n\t"
"vleif %%v25,6,2 \n\t"
"vleif %%v25,7,3 \n\t"
"vleif %%v26,8,0 \n\t"
"vleif %%v26,9,1 \n\t"
"vleif %%v26,10,2 \n\t"
"vleif %%v26,11,3 \n\t"
"vleif %%v27,12,0 \n\t"
"vleif %%v27,13,1 \n\t"
"vleif %%v27,14,2 \n\t"
"vleif %%v27,15,3 \n\t"
"vleif %%v28,16,0 \n\t"
"vleif %%v28,17,1 \n\t"
"vleif %%v28,18,2 \n\t"
"vleif %%v28,19,3 \n\t"
"vleif %%v29,20,0 \n\t"
"vleif %%v29,21,1 \n\t"
"vleif %%v29,22,2 \n\t"
"vleif %%v29,23,3 \n\t"
"vleif %%v30,24,0 \n\t"
"vleif %%v30,25,1 \n\t"
"vleif %%v30,26,2 \n\t"
"vleif %%v30,27,3 \n\t"
"vleif %%v31,28,0 \n\t"
"vleif %%v31,29,1 \n\t"
"vleif %%v31,30,2 \n\t"
"vleif %%v31,31,3 \n\t"
"srlg %%r0,%2,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
"vrepig %%v3,32\n\t"
"vzero %%v4\n\t"
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"vleif %%v28,16,0\n\t"
"vleif %%v28,17,1\n\t"
"vleif %%v28,18,2\n\t"
"vleif %%v28,19,3\n\t"
"vleif %%v29,20,0\n\t"
"vleif %%v29,21,1\n\t"
"vleif %%v29,22,2\n\t"
"vleif %%v29,23,3\n\t"
"vleif %%v30,24,0\n\t"
"vleif %%v30,25,1\n\t"
"vleif %%v30,26,2\n\t"
"vleif %%v30,27,3\n\t"
"vleif %%v31,28,0\n\t"
"vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
"vfchesb %%v8,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
"vfchesb %%v8,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[min],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[imin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v0,%%v2\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[min]\n\t"
"vlgvg %[imin],%%v1,0\n\t"
"2:\n\t"
"nop"
: [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");


"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vfchesb %%v5,%%v17,%%v16 \n\t"
"vfchesb %%v6,%%v19,%%v18 \n\t"
"vfchesb %%v7,%%v21,%%v20 \n\t"
"vfchesb %%v8,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vsel %%v18,%%v20,%%v21,%%v7 \n\t"
"vsel %%v7,%%v28,%%v29,%%v7 \n\t"
"vsel %%v19,%%v22,%%v23,%%v8 \n\t"
"vsel %%v8,%%v30,%%v31,%%v8 \n\t"

"vfchesb %%v20,%%v17,%%v16 \n\t"
"vfchesb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v5,%%v5,%%v6,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v6,%%v7,%%v8,%%v21 \n\t"

"vfchesb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"

"vfchesb %%v7,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vfchesb %%v5,%%v17,%%v16 \n\t"
"vfchesb %%v6,%%v19,%%v18 \n\t"
"vfchesb %%v7,%%v21,%%v20 \n\t"
"vfchesb %%v8,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vsel %%v18,%%v20,%%v21,%%v7 \n\t"
"vsel %%v7,%%v28,%%v29,%%v7 \n\t"
"vsel %%v19,%%v22,%%v23,%%v8 \n\t"
"vsel %%v8,%%v30,%%v31,%%v8 \n\t"

"vfchesb %%v20,%%v17,%%v16 \n\t"
"vfchesb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v5,%%v5,%%v6,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v6,%%v7,%%v8,%%v21 \n\t"

"vfchesb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"

"vfchesb %%v7,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"veslg %%v3,%%v0,32 \n\t"
"vfchsb %%v4,%%v3,%%v0 \n\t"
"vchlg %%v5,%%v2,%%v1 \n\t"
"vfcesb %%v6,%%v0,%%v3 \n\t"
"vn %%v5,%%v5,%%v6 \n\t"
"vo %%v4,%%v4,%%v5 \n\t"
"vsel %%v0,%%v0,%%v3,%%v4 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v1,%%v2,%%v4 \n\t"

"vrepf %%v2,%%v0,2 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcsb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vstef %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchsb %%v4,%%v0,%%v2 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(imin),"=m"(*min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return imin;
return imin;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;
BLASLONG min = 0;


if (n <= 0 || inc_x <= 0) return (min);

if (inc_x == 1) {
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;
BLASLONG min = 0;


BLASLONG n1 = n & -64;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (min);


min = ismin_kernel_64(n1, x, &minf);
if (inc_x == 1) {


i = n1;
}
else
{
minf = x[0];
i++;
}
BLASLONG n1 = n & -64;
if (n1 > 0) {


while (i < n) {
if (x[i] < minf) {
min = i;
minf = x[i];
}
i++;
}
return (min + 1);
min = ismin_kernel_64(n1, x, &minf);


i = n1;
} else { } else {
minf = x[0];
i++;
}

while (i < n) {
if (x[i] < minf) {
min = i;
minf = x[i];
}
i++;
}
return (min + 1);


min = 0;
minf = x[0];
} else {


BLASLONG n1 = n & -4;
while (j < n1) {
min = 0;
minf = x[0];


if (x[i] < minf) {
min = j;
minf = x[i];
}
if (x[i + inc_x] < minf) {
min = j + 1;
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
min = j + 2;
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
min = j + 3;
minf = x[i + 3 * inc_x];
}
BLASLONG n1 = n & -4;
while (j < n1) {


i += inc_x * 4;
if (x[i] < minf) {
min = j;
minf = x[i];
}
if (x[i + inc_x] < minf) {
min = j + 1;
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
min = j + 2;
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
min = j + 3;
minf = x[i + 3 * inc_x];
}


j += 4;
i += inc_x * 4;


}
j += 4;


}


while (j < n) {
if (x[i] < minf) {
min = j;
minf = x[i];
}
i += inc_x;
j++;
}
return (min + 1);
while (j < n) {
if (x[i] < minf) {
min = j;
minf = x[i];
}
i += inc_x;
j++;
} }
return (min + 1);
}
} }

+ 204
- 205
kernel/zarch/izamax.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,220 +27,219 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))

static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
{
BLASLONG iamax;

__asm__ volatile (
"vleg %%v0,0(%3),0 \n\t"
"vleg %%v1,8(%3),0 \n\t"
"vleg %%v0,16(%3),1 \n\t"
"vleg %%v1,24(%3),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v1,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
"vrepig %%v2,8 \n\t"
"vzero %%v3 \n\t"
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"srlg %%r0,%2,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"

"vleg %%v16,0(%%r1,%3),0 \n\t"
"vleg %%v17,8(%%r1,%3),0 \n\t"
"vleg %%v16,16(%%r1,%3),1 \n\t"
"vleg %%v17,24(%%r1,%3),1 \n\t"
"vleg %%v18,32(%%r1,%3),0 \n\t"
"vleg %%v19,40(%%r1,%3),0 \n\t"
"vleg %%v18,48(%%r1,%3),1 \n\t"
"vleg %%v19,56(%%r1,%3),1 \n\t"
"vleg %%v20,64(%%r1,%3),0 \n\t"
"vleg %%v21,72(%%r1,%3),0 \n\t"
"vleg %%v20,80(%%r1,%3),1 \n\t"
"vleg %%v21,88(%%r1,%3),1 \n\t"
"vleg %%v22,96(%%r1,%3),0 \n\t"
"vleg %%v23,104(%%r1,%3),0 \n\t"
"vleg %%v22,112(%%r1,%3),1 \n\t"
"vleg %%v23,120(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchedb %%v4,%%v16,%%v17 \n\t"
"vfchedb %%v5,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"

"vfchedb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchedb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"vleg %%v16,128(%%r1,%3),0 \n\t"
"vleg %%v17,136(%%r1,%3),0 \n\t"
"vleg %%v16,144(%%r1,%3),1 \n\t"
"vleg %%v17,152(%%r1,%3),1 \n\t"
"vleg %%v18,160(%%r1,%3),0 \n\t"
"vleg %%v19,168(%%r1,%3),0 \n\t"
"vleg %%v18,176(%%r1,%3),1 \n\t"
"vleg %%v19,184(%%r1,%3),1 \n\t"
"vleg %%v20,192(%%r1,%3),0 \n\t"
"vleg %%v21,200(%%r1,%3),0 \n\t"
"vleg %%v20,208(%%r1,%3),1 \n\t"
"vleg %%v21,216(%%r1,%3),1 \n\t"
"vleg %%v22,224(%%r1,%3),0 \n\t"
"vleg %%v23,232(%%r1,%3),0 \n\t"
"vleg %%v22,240(%%r1,%3),1 \n\t"
"vleg %%v23,248(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchedb %%v4,%%v16,%%v17 \n\t"
"vfchedb %%v5,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"

"vfchedb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchedb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v2,%%v0 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);

return iamax;

#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))

/*
 * Inline-assembly kernel used by the unit-stride path of CNAME. It walks the
 * interleaved (re, im) pairs of x in chunks of 16 complex elements (256 bytes
 * per loop pass) and keeps the element whose |re| + |im| is largest, going by
 * the mnemonics used (vflpdb = absolute value, vfadb = add, vfchedb = compare
 * high-or-equal, vsel = select).
 *
 * n     element count. The asm shifts it right by 4 to form the loop trip
 *       count and then counts it down, which is why it is a read-write
 *       early-clobber operand ("+&r"). The caller in this file passes
 *       n & -16, and only when that value is > 0.
 *       NOTE(review): a trip count of 0 looks unsupported by the brctg loop;
 *       confirm before calling with n < 16.
 * x     base address of the data. The "m" input operand declares that
 *       n * 2 FLOATs starting at x are read.
 * amax  receives the |re| + |im| value of the selected element.
 *
 * Returns the index of the selected element. The caller adds 1 to its final
 * index before returning, so this is presumably 0-based - TODO confirm.
 *
 * NOTE(review): the 8-byte load strides and the *db (double precision)
 * instructions suggest FLOAT is double in this file - confirm.
 */
static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) {
BLASLONG iamax;

/* Prologue: seed the running value (v0) with |re| + |im| of the first two
   complex elements (bytes 0..31 of x), seed the running indices (v1) with
   0 and 1, set the per-half index step (v2 = 8), clear the index base (v3)
   and fill v24..v27 with the lane indices 0..7. */
__asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v1,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v1,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v1,%%v1\n\t"
"vfadb %%v0,%%v0,%%v1\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
"vrepig %%v2,8\n\t"
"vzero %%v3\n\t"
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
/* Trip count = n >> 4 (16 complex elements per pass); r1 is the running
   byte offset into x and starts at 0. */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First half of the pass: bytes 0..127 at offset r1 (8 complex elements).
   Real parts go to the even registers v16/v18/v20/v22 and imaginary parts
   to the odd registers v17/v19/v21/v23, two elements per register. */
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
/* Pairwise reduction of the four sum vectors; each value select is paired
   with a select on the matching index vector using the same mask. */
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Merge this half into the running value/index and advance the index
   base by 8. */
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* Second half of the pass: bytes 128..255 at offset r1 (next 8 complex
   elements), same sequence as above. */
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* Advance the byte offset by 256 and branch back while the trip count
   is non-zero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Epilogue: reduce the two remaining lanes of v0/v1 to one value and one
   index. When both lanes hold the same value the fall-through path stores
   it and takes the index via vmnlg (presumably the smaller of the two
   indices - TODO confirm); otherwise label 1 selects by comparison. Both
   paths store the value to *amax and move the index into iamax. */
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v2,%%v0\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
/* Outputs: the index register, the *amax store target, and n (consumed as
   the loop counter). Inputs: the n * 2 FLOATs read from x, and x itself in
   an address register ("a"). Clobbers: condition code, r1 and every vector
   register named in the template. */
: [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
"v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");

return iamax;
} }


BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT maxf = 0;
BLASLONG max = 0;
BLASLONG inc_x2;
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT maxf = 0;
BLASLONG max = 0;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0)
return (max);

if (inc_x == 1) {


if (n <= 0 || inc_x <= 0) return(max);
if (inc_x == 1) {
BLASLONG n1 = n & -16;
if (n1 > 0) {


BLASLONG n1 = n & -16;
if (n1 > 0) {
max = izamax_kernel_16(n1, x, &maxf);
ix = n1 * 2;
i = n1;
} else {
maxf = CABS1(x, 0);
ix += 2;
i++;
}


max = izamax_kernel_16(n1, x, &maxf);
ix = n1 * 2;
i = n1;
while (i < n) {
if (CABS1(x, ix) > maxf) {
max = i;
maxf = CABS1(x, ix);
} }
else
{
maxf = CABS1(x,0);
ix += 2;
i++;
}

while(i < n)
{
if( CABS1(x,ix) > maxf )
{
max = i;
maxf = CABS1(x,ix);
}
ix += 2;
i++;
ix += 2;
i++;
} }
return (max + 1);
return (max + 1);

} else {


} else {
max = 0; max = 0;
maxf = CABS1(x,0);
maxf = CABS1(x, 0);
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;
ix += inc_x2;
i++;

while(i < n)
{
if( CABS1(x,ix) > maxf )
{
max = i;
maxf = CABS1(x,ix);
}
ix += inc_x2;
i++;

BLASLONG n1 = n & -4;
while (i < n1) {

if (CABS1(x, ix) > maxf) {
max = i;
maxf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) > maxf) {
max = i + 1;
maxf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + 2 * inc_x2) > maxf) {
max = i + 2;
maxf = CABS1(x, ix + 2 * inc_x2);
}
if (CABS1(x, ix + 3 * inc_x2) > maxf) {
max = i + 3;
maxf = CABS1(x, ix + 3 * inc_x2);
}

ix += inc_x2 * 4;

i += 4;

} }
return (max + 1);

while (i < n) {
if (CABS1(x, ix) > maxf) {
max = i;
maxf = CABS1(x, ix);
}
ix += inc_x2;
i++;
} }
return (max + 1);
}
} }



+ 204
- 205
kernel/zarch/izamin.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,220 +27,219 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))

static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
{
BLASLONG iamin;

__asm__ volatile (
"vleg %%v0,0(%3),0 \n\t"
"vleg %%v1,8(%3),0 \n\t"
"vleg %%v0,16(%3),1 \n\t"
"vleg %%v1,24(%3),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v1,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
"vrepig %%v2,8 \n\t"
"vzero %%v3 \n\t"
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"srlg %%r0,%2,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"

"vleg %%v16,0(%%r1,%3),0 \n\t"
"vleg %%v17,8(%%r1,%3),0 \n\t"
"vleg %%v16,16(%%r1,%3),1 \n\t"
"vleg %%v17,24(%%r1,%3),1 \n\t"
"vleg %%v18,32(%%r1,%3),0 \n\t"
"vleg %%v19,40(%%r1,%3),0 \n\t"
"vleg %%v18,48(%%r1,%3),1 \n\t"
"vleg %%v19,56(%%r1,%3),1 \n\t"
"vleg %%v20,64(%%r1,%3),0 \n\t"
"vleg %%v21,72(%%r1,%3),0 \n\t"
"vleg %%v20,80(%%r1,%3),1 \n\t"
"vleg %%v21,88(%%r1,%3),1 \n\t"
"vleg %%v22,96(%%r1,%3),0 \n\t"
"vleg %%v23,104(%%r1,%3),0 \n\t"
"vleg %%v22,112(%%r1,%3),1 \n\t"
"vleg %%v23,120(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchedb %%v4,%%v17,%%v16 \n\t"
"vfchedb %%v5,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"

"vfchedb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchedb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"vleg %%v16,128(%%r1,%3),0 \n\t"
"vleg %%v17,136(%%r1,%3),0 \n\t"
"vleg %%v16,144(%%r1,%3),1 \n\t"
"vleg %%v17,152(%%r1,%3),1 \n\t"
"vleg %%v18,160(%%r1,%3),0 \n\t"
"vleg %%v19,168(%%r1,%3),0 \n\t"
"vleg %%v18,176(%%r1,%3),1 \n\t"
"vleg %%v19,184(%%r1,%3),1 \n\t"
"vleg %%v20,192(%%r1,%3),0 \n\t"
"vleg %%v21,200(%%r1,%3),0 \n\t"
"vleg %%v20,208(%%r1,%3),1 \n\t"
"vleg %%v21,216(%%r1,%3),1 \n\t"
"vleg %%v22,224(%%r1,%3),0 \n\t"
"vleg %%v23,232(%%r1,%3),0 \n\t"
"vleg %%v22,240(%%r1,%3),1 \n\t"
"vleg %%v23,248(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchedb %%v4,%%v17,%%v16 \n\t"
"vfchedb %%v5,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"

"vfchedb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchedb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v0,%%v2 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(iamin),"=m"(*amin)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);

return iamin;

#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))

/*
 * Inline-assembly kernel used by the unit-stride path of CNAME. It walks the
 * interleaved (re, im) pairs of x in chunks of 16 complex elements (256 bytes
 * per loop pass) and keeps the element whose |re| + |im| is smallest, going by
 * the mnemonics used (vflpdb = absolute value, vfadb = add, vfchedb = compare
 * high-or-equal with the operands ordered so the smaller value is selected,
 * vsel = select).
 *
 * n     element count. The asm shifts it right by 4 to form the loop trip
 *       count and then counts it down, which is why it is a read-write
 *       early-clobber operand ("+&r"). The caller in this file passes
 *       n & -16, and only when that value is > 0.
 *       NOTE(review): a trip count of 0 looks unsupported by the brctg loop;
 *       confirm before calling with n < 16.
 * x     base address of the data. The "m" input operand declares that
 *       n * 2 FLOATs starting at x are read.
 * amin  receives the |re| + |im| value of the selected element.
 *
 * Returns the index of the selected element. The caller adds 1 to its final
 * index before returning, so this is presumably 0-based - TODO confirm.
 *
 * NOTE(review): the 8-byte load strides and the *db (double precision)
 * instructions suggest FLOAT is double in this file - confirm.
 */
static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) {
BLASLONG iamin;

/* Prologue: seed the running value (v0) with |re| + |im| of the first two
   complex elements (bytes 0..31 of x), seed the running indices (v1) with
   0 and 1, set the per-half index step (v2 = 8), clear the index base (v3)
   and fill v24..v27 with the lane indices 0..7. */
__asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v1,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v1,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v1,%%v1\n\t"
"vfadb %%v0,%%v0,%%v1\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
"vrepig %%v2,8\n\t"
"vzero %%v3\n\t"
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
/* Trip count = n >> 4 (16 complex elements per pass); r1 is the running
   byte offset into x and starts at 0. */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First half of the pass: bytes 0..127 at offset r1 (8 complex elements).
   Real parts go to the even registers v16/v18/v20/v22 and imaginary parts
   to the odd registers v17/v19/v21/v23, two elements per register. */
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
/* Pairwise reduction of the four sum vectors; each value select is paired
   with a select on the matching index vector using the same mask. */
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Merge this half into the running value/index and advance the index
   base by 8. */
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* Second half of the pass: bytes 128..255 at offset r1 (next 8 complex
   elements), same sequence as above. */
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* Advance the byte offset by 256 and branch back while the trip count
   is non-zero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Epilogue: reduce the two remaining lanes of v0/v1 to one value and one
   index. When both lanes hold the same value the fall-through path stores
   it and takes the index via vmnlg (presumably the smaller of the two
   indices - TODO confirm); otherwise label 1 selects by comparison. Both
   paths store the value to *amin and move the index into iamin. */
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v0,%%v2\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
/* Outputs: the index register, the *amin store target, and n (consumed as
   the loop counter). Inputs: the n * 2 FLOATs read from x, and x itself in
   an address register ("a"). Clobbers: condition code, r1 and every vector
   register named in the template. */
: [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
"v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");

return iamin;
} }


BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT minf = 0;
BLASLONG min = 0;
BLASLONG inc_x2;
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT minf = 0;
BLASLONG min = 0;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0)
return (min);

if (inc_x == 1) {


if (n <= 0 || inc_x <= 0) return(min);
if (inc_x == 1) {
BLASLONG n1 = n & -16;
if (n1 > 0) {


BLASLONG n1 = n & -16;
if (n1 > 0) {
min = izamin_kernel_16(n1, x, &minf);
ix = n1 * 2;
i = n1;
} else {
minf = CABS1(x, 0);
ix += 2;
i++;
}


min = izamin_kernel_16(n1, x, &minf);
ix = n1 * 2;
i = n1;
while (i < n) {
if (CABS1(x, ix) < minf) {
min = i;
minf = CABS1(x, ix);
} }
else
{
minf = CABS1(x,0);
ix += 2;
i++;
}

while(i < n)
{
if( CABS1(x,ix) < minf )
{
min = i;
minf = CABS1(x,ix);
}
ix += 2;
i++;
ix += 2;
i++;
} }
return (min + 1);
return (min + 1);

} else {


} else {
min = 0; min = 0;
minf = CABS1(x,0);
minf = CABS1(x, 0);
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;
ix += inc_x2;
i++;

while(i < n)
{
if( CABS1(x,ix) < minf )
{
min = i;
minf = CABS1(x,ix);
}
ix += inc_x2;
i++;

BLASLONG n1 = n & -4;
while (i < n1) {

if (CABS1(x, ix) < minf) {
min = i;
minf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) < minf) {
min = i + 1;
minf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + 2 * inc_x2) < minf) {
min = i + 2;
minf = CABS1(x, ix + 2 * inc_x2);
}
if (CABS1(x, ix + 3 * inc_x2) < minf) {
min = i + 3;
minf = CABS1(x, ix + 3 * inc_x2);
}

ix += inc_x2 * 4;

i += 4;

} }
return (min + 1);

while (i < n) {
if (CABS1(x, ix) < minf) {
min = i;
minf = CABS1(x, ix);
}
ix += inc_x2;
i++;
} }
return (min + 1);
}
} }



+ 104
- 121
kernel/zarch/samax.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,142 +28,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf #define ABS fabsf
#endif

static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x)
{
FLOAT amax;

__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmaxsb %%v16,%%v16,%%v24,8 \n\t"
"vfmaxsb %%v17,%%v17,%%v25,8 \n\t"
"vfmaxsb %%v18,%%v18,%%v26,8 \n\t"
"vfmaxsb %%v19,%%v19,%%v27,8 \n\t"
"vfmaxsb %%v20,%%v20,%%v28,8 \n\t"
"vfmaxsb %%v21,%%v21,%%v29,8 \n\t"
"vfmaxsb %%v22,%%v22,%%v30,8 \n\t"
"vfmaxsb %%v23,%%v23,%%v31,8 \n\t"

"vfmaxsb %%v16,%%v16,%%v20,8 \n\t"
"vfmaxsb %%v17,%%v17,%%v21,8 \n\t"
"vfmaxsb %%v18,%%v18,%%v22,8 \n\t"
"vfmaxsb %%v19,%%v19,%%v23,8 \n\t"

"vfmaxsb %%v16,%%v16,%%v18,8 \n\t"
"vfmaxsb %%v17,%%v17,%%v19,8 \n\t"

"vfmaxsb %%v16,%%v16,%%v17,8 \n\t"

"vfmaxsb %%v0,%%v0,%%v16,8 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"veslg %%v16,%%v0,32 \n\t"
"vfmaxsb %%v0,%%v0,%%v16,8 \n\t"

"vrepf %%v16,%%v0,2 \n\t"
"wfmaxsb %%v0,%%v0,%%v16,8 \n\t"
"lper %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return amax;
}
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;


if (n <= 0 || inc_x <= 0) return (maxf);
/*
 * Vector kernel behind samax: reduces the first n elements of x to a single
 * value using vfmaxsb/wfmaxsb and returns it after a final lper.
 *
 * n is shifted right by 6 to form the loop count, and every pass consumes
 * 256 bytes of x (16 vector loads of 16 bytes). The visible caller only
 * passes n1 = n & -64 with n1 > 0; brctg decrements before it tests, so a
 * zero count must not reach this loop.
 *
 * NOTE(review): the trailing ",8" on vfmaxsb/wfmaxsb is the instruction's
 * mode field - presumably the absolute-value comparison; confirm against the
 * z/Architecture Principles of Operation.
 */
static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) {
FLOAT amax;

/* v0 is the running result, seeded from the first 16 bytes of x */
__asm__("vl %%v0,0(%[x])\n\t"
/* n >>= 6 gives the pass count; r1 = 0 is the running byte offset */
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* load 256 bytes of x at offset r1 into v16..v31 */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
/* pairwise reduction tree: 16 -> 8 vectors */
"vfmaxsb %%v16,%%v16,%%v24,8\n\t"
"vfmaxsb %%v17,%%v17,%%v25,8\n\t"
"vfmaxsb %%v18,%%v18,%%v26,8\n\t"
"vfmaxsb %%v19,%%v19,%%v27,8\n\t"
"vfmaxsb %%v20,%%v20,%%v28,8\n\t"
"vfmaxsb %%v21,%%v21,%%v29,8\n\t"
"vfmaxsb %%v22,%%v22,%%v30,8\n\t"
"vfmaxsb %%v23,%%v23,%%v31,8\n\t"
/* 8 -> 4 vectors */
"vfmaxsb %%v16,%%v16,%%v20,8\n\t"
"vfmaxsb %%v17,%%v17,%%v21,8\n\t"
"vfmaxsb %%v18,%%v18,%%v22,8\n\t"
"vfmaxsb %%v19,%%v19,%%v23,8\n\t"
/* 4 -> 2 -> 1 vector, then fold into the running result v0 */
"vfmaxsb %%v16,%%v16,%%v18,8\n\t"
"vfmaxsb %%v17,%%v17,%%v19,8\n\t"
"vfmaxsb %%v16,%%v16,%%v17,8\n\t"
"vfmaxsb %%v0,%%v0,%%v16,8\n\t"
/* advance 256 bytes; brctg decrements n and loops while it is non-zero */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* horizontal reduction of v0: combine the two words of each doubleword,
   then combine word 2 into word 0 */
"veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,8\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,8\n\t"
/* load-positive of the scalar result into the output register */
"lper %[amax],%%f0"
/* n is read-write and early-clobbered because the asm shifts and counts it down */
: [amax] "=f"(amax),[n] "+&r"(n)
/* the "m" operand tells the compiler the asm reads n FLOATs starting at x */
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

return amax;
}


if (inc_x == 1) {
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;


BLASLONG n1 = n & -64;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (maxf);


maxf = samax_kernel_64(n1, x);
if (inc_x == 1) {


i = n1;
}
else
{
maxf=ABS(x[0]);
i++;
}
BLASLONG n1 = n & -64;
if (n1 > 0) {


while (i < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i++;
}
return (maxf);
maxf = samax_kernel_64(n1, x);


i = n1;
} else { } else {
maxf = ABS(x[0]);
i++;
}

while (i < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i++;
}
return (maxf);


maxf=ABS(x[0]);
} else {


BLASLONG n1 = n & -4;
while (j < n1) {
maxf = ABS(x[0]);


if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
maxf = ABS(x[i + 3 * inc_x]);
}
BLASLONG n1 = n & -4;
while (j < n1) {


i += inc_x * 4;
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
maxf = ABS(x[i + 3 * inc_x]);
}


j += 4;
i += inc_x * 4;


}
j += 4;


}


while (j < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (maxf);
while (j < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i += inc_x;
j++;
} }
return (maxf);
}
} }

+ 104
- 121
kernel/zarch/samin.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,142 +28,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf #define ABS fabsf
#endif

static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x)
{
FLOAT amin;

__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfminsb %%v16,%%v16,%%v24,8 \n\t"
"vfminsb %%v17,%%v17,%%v25,8 \n\t"
"vfminsb %%v18,%%v18,%%v26,8 \n\t"
"vfminsb %%v19,%%v19,%%v27,8 \n\t"
"vfminsb %%v20,%%v20,%%v28,8 \n\t"
"vfminsb %%v21,%%v21,%%v29,8 \n\t"
"vfminsb %%v22,%%v22,%%v30,8 \n\t"
"vfminsb %%v23,%%v23,%%v31,8 \n\t"

"vfminsb %%v16,%%v16,%%v20,8 \n\t"
"vfminsb %%v17,%%v17,%%v21,8 \n\t"
"vfminsb %%v18,%%v18,%%v22,8 \n\t"
"vfminsb %%v19,%%v19,%%v23,8 \n\t"

"vfminsb %%v16,%%v16,%%v18,8 \n\t"
"vfminsb %%v17,%%v17,%%v19,8 \n\t"

"vfminsb %%v16,%%v16,%%v17,8 \n\t"

"vfminsb %%v0,%%v0,%%v16,8 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"veslg %%v16,%%v0,32 \n\t"
"vfminsb %%v0,%%v0,%%v16,8 \n\t"

"vrepf %%v16,%%v0,2 \n\t"
"wfminsb %%v0,%%v0,%%v16,8 \n\t"
"lper %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return amin;
}
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;


if (n <= 0 || inc_x <= 0) return (minf);
/*
 * Vector kernel behind samin: reduces the first n elements of x to a single
 * value using vfminsb/wfminsb and returns it after a final lper.
 *
 * n is shifted right by 6 to form the loop count, and every pass consumes
 * 256 bytes of x (16 vector loads of 16 bytes). The visible caller only
 * passes n1 = n & -64 with n1 > 0; brctg decrements before it tests, so a
 * zero count must not reach this loop.
 *
 * NOTE(review): the trailing ",8" on vfminsb/wfminsb is the instruction's
 * mode field - presumably the absolute-value comparison; confirm against the
 * z/Architecture Principles of Operation.
 */
static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) {
FLOAT amin;

/* v0 is the running result, seeded from the first 16 bytes of x */
__asm__("vl %%v0,0(%[x])\n\t"
/* n >>= 6 gives the pass count; r1 = 0 is the running byte offset */
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* load 256 bytes of x at offset r1 into v16..v31 */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
/* pairwise reduction tree: 16 -> 8 vectors */
"vfminsb %%v16,%%v16,%%v24,8\n\t"
"vfminsb %%v17,%%v17,%%v25,8\n\t"
"vfminsb %%v18,%%v18,%%v26,8\n\t"
"vfminsb %%v19,%%v19,%%v27,8\n\t"
"vfminsb %%v20,%%v20,%%v28,8\n\t"
"vfminsb %%v21,%%v21,%%v29,8\n\t"
"vfminsb %%v22,%%v22,%%v30,8\n\t"
"vfminsb %%v23,%%v23,%%v31,8\n\t"
/* 8 -> 4 vectors */
"vfminsb %%v16,%%v16,%%v20,8\n\t"
"vfminsb %%v17,%%v17,%%v21,8\n\t"
"vfminsb %%v18,%%v18,%%v22,8\n\t"
"vfminsb %%v19,%%v19,%%v23,8\n\t"
/* 4 -> 2 -> 1 vector, then fold into the running result v0 */
"vfminsb %%v16,%%v16,%%v18,8\n\t"
"vfminsb %%v17,%%v17,%%v19,8\n\t"
"vfminsb %%v16,%%v16,%%v17,8\n\t"
"vfminsb %%v0,%%v0,%%v16,8\n\t"
/* advance 256 bytes; brctg decrements n and loops while it is non-zero */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* horizontal reduction of v0: combine the two words of each doubleword,
   then combine word 2 into word 0 */
"veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,8\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,8\n\t"
/* load-positive of the scalar result into the output register */
"lper %[amin],%%f0"
/* n is read-write and early-clobbered because the asm shifts and counts it down */
: [amin] "=f"(amin),[n] "+&r"(n)
/* the "m" operand tells the compiler the asm reads n FLOATs starting at x */
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

return amin;
}


if (inc_x == 1) {
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;


BLASLONG n1 = n & -64;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (minf);


minf = samin_kernel_64(n1, x);
if (inc_x == 1) {


i = n1;
}
else
{
minf=ABS(x[0]);
i++;
}
BLASLONG n1 = n & -64;
if (n1 > 0) {


while (i < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
i++;
}
return (minf);
minf = samin_kernel_64(n1, x);


i = n1;
} else { } else {
minf = ABS(x[0]);
i++;
}

while (i < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
i++;
}
return (minf);


minf=ABS(x[0]);
} else {


BLASLONG n1 = n & -4;
while (j < n1) {
minf = ABS(x[0]);


if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
minf = ABS(x[i + 3 * inc_x]);
}
BLASLONG n1 = n & -4;
while (j < n1) {


i += inc_x * 4;
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
minf = ABS(x[i + 3 * inc_x]);
}


j += 4;
i += inc_x * 4;


}
j += 4;


}


while (j < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (minf);
while (j < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
i += inc_x;
j++;
} }
return (minf);
}
} }

+ 123
- 129
kernel/zarch/sasum.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,147 +28,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x)
{
FLOAT asum;

__asm__ (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"

"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"

"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"

"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"

"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"

"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v2 \n\t"
"vfasb %%v0,%%v0,%%v3 \n\t"
"veslg %%v1,%%v0,32 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vrepf %%v1,%%v0,2 \n\t"
"aebr %%f0,%%f1 \n\t"
"ler %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);

return asum;
#define ABS fabsf

/*
 * Vector kernel behind sasum: accumulates the load-positive (vflpsb) of the
 * first n elements of x and returns the total.
 *
 * Eight independent accumulators (v24..v31) are kept across the loop and only
 * summed together after it. n is shifted right by 6 to form the loop count;
 * each pass handles 256 bytes of x as two 128-byte halves. The visible caller
 * passes n1 = n & -64 with n1 > 0; brctg decrements before it tests, so a
 * zero count must not reach this loop.
 */
static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) {
FLOAT asum;

/* clear the eight accumulators */
__asm__("vzero %%v24\n\t"
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
/* n >>= 6 gives the pass count; r1 = 0 is the running byte offset */
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* first half: bytes 0..127 of this pass */
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
/* load-positive in place */
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
/* add each vector into its own accumulator */
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
/* second half: bytes 128..255 of this pass, same sequence */
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
/* advance 256 bytes; brctg decrements n and loops while it is non-zero */
"agfi %%r1,256\n\t"
"brctg %[n],0b\n\t"
/* collapse the eight accumulators into v24 */
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v24,%%v24,%%v27\n\t"
"vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v24,%%v24,%%v29\n\t"
"vfasb %%v24,%%v24,%%v30\n\t"
"vfasb %%v24,%%v24,%%v31\n\t"
/* horizontal sum of v24: add the two words of each doubleword, then add
   word 2 into word 0 */
"veslg %%v25,%%v24,32\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vrepf %%v25,%%v24,2\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
/* store word 0 of v24 straight into asum, which is a memory output */
"vstef %%v24,%[asum],0"
/* n is read-write and early-clobbered because the asm shifts and counts it down */
: [asum] "=m"(asum),[n] "+&r"(n)
/* the "m" operand tells the compiler the asm reads n FLOATs starting at x */
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

return asum;
} }


FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT sumf = 0.0;
BLASLONG n1;
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT sumf = 0.0;
BLASLONG n1;


if (n <= 0 || inc_x <= 0) return sumf;

if (inc_x == 1) {

n1 = n & -64;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return sumf;


sumf = sasum_kernel_64(n1, x);
i = n1;
}
if (inc_x == 1) {


while (i < n) {
sumf += ABS(x[i]);
i++;
}
n1 = n & -64;


} else {
BLASLONG n1 = n & -4;
register FLOAT sum1, sum2;
sum1 = 0.0;
sum2 = 0.0;
while (j < n1) {
if (n1 > 0) {


sum1 += ABS(x[i]);
sum2 += ABS(x[i + inc_x]);
sum1 += ABS(x[i + 2 * inc_x]);
sum2 += ABS(x[i + 3 * inc_x]);
sumf = sasum_kernel_64(n1, x);
i = n1;
}


i += inc_x * 4;
j += 4;
while (i < n) {
sumf += ABS(x[i]);
i++;
}


}
sumf = sum1 + sum2;
while (j < n) {
} else {
BLASLONG n1 = n & -4;
register FLOAT sum1, sum2;
sum1 = 0.0;
sum2 = 0.0;
while (j < n1) {


sumf += ABS(x[i]);
i += inc_x;
j++;
}
sum1 += ABS(x[i]);
sum2 += ABS(x[i + inc_x]);
sum1 += ABS(x[i + 2 * inc_x]);
sum2 += ABS(x[i + 3 * inc_x]);


i += inc_x * 4;
j += 4;


} }
return sumf;
}
sumf = sum1 + sum2;
while (j < n) {


sumf += ABS(x[i]);
i += inc_x;
j++;
}


}
return sumf;
}

+ 118
- 135
kernel/zarch/saxpy.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,158 +27,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
"vlrepf %%v0,%3 \n\t"
"srlg %%r0,%0,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vfmasb %%v16,%%v0,%%v16,%%v20 \n\t"
"vfmasb %%v17,%%v0,%%v17,%%v21 \n\t"
"vfmasb %%v18,%%v0,%%v18,%%v22 \n\t"
"vfmasb %%v19,%%v0,%%v19,%%v23 \n\t"

"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,80(%%r1,%1) \n\t"
"vl %%v26,96(%%r1,%1) \n\t"
"vl %%v27,112(%%r1,%1) \n\t"
"vl %%v28,64(%%r1,%2) \n\t"
"vl %%v29,80(%%r1,%2) \n\t"
"vl %%v30,96(%%r1,%2) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"

"vfmasb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmasb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmasb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmasb %%v23,%%v0,%%v27,%%v31 \n\t"

"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"

"vl %%v16,128(%%r1,%1) \n\t"
"vl %%v17,144(%%r1,%1) \n\t"
"vl %%v18,160(%%r1,%1) \n\t"
"vl %%v19,176(%%r1,%1) \n\t"
"vl %%v20,128(%%r1,%2) \n\t"
"vl %%v21,144(%%r1,%2) \n\t"
"vl %%v22,160(%%r1,%2) \n\t"
"vl %%v23,176(%%r1,%2) \n\t"
"vfmasb %%v16,%%v0,%%v16,%%v20 \n\t"
"vfmasb %%v17,%%v0,%%v17,%%v21 \n\t"
"vfmasb %%v18,%%v0,%%v18,%%v22 \n\t"
"vfmasb %%v19,%%v0,%%v19,%%v23 \n\t"

"vl %%v24,192(%%r1,%1) \n\t"
"vl %%v25,208(%%r1,%1) \n\t"
"vl %%v26,224(%%r1,%1) \n\t"
"vl %%v27,240(%%r1,%1) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"

"vfmasb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmasb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmasb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmasb %%v23,%%v0,%%v27,%%v31 \n\t"
"vst %%v16,128(%%r1,%2) \n\t"
"vst %%v17,144(%%r1,%2) \n\t"
"vst %%v18,160(%%r1,%2) \n\t"
"vst %%v19,176(%%r1,%2) \n\t"
"vst %%v20,192(%%r1,%2) \n\t"
"vst %%v21,208(%%r1,%2) \n\t"
"vst %%v22,224(%%r1,%2) \n\t"
"vst %%v23,240(%%r1,%2) \n\t"

"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
/*
 * Vector kernel behind saxpy: updates y in place with *alpha * x + y over the
 * first n elements, using fused multiply-add (vfmasb).
 *
 * *alpha is replicated into every word of v0 once. n is shifted right by 6 to
 * form the loop count; each pass handles 256 bytes of x and of y as two
 * 128-byte halves. The visible caller passes n1 = n & -64 only when n1 is
 * non-zero; brctg decrements before it tests, so a zero count must not reach
 * this loop.
 */
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
/* v0 = *alpha replicated across the vector */
__asm__("vlrepf %%v0,%[alpha]\n\t"
/* n >>= 6 gives the pass count; r1 = 0 is the running byte offset */
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
/* first half: x into v16..v19 / v24..v27, y into v20..v23 / v28..v31 */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,0(%%r1,%[y])\n\t"
"vl %%v21,16(%%r1,%[y])\n\t"
"vl %%v22,32(%%r1,%[y])\n\t"
"vl %%v23,48(%%r1,%[y])\n\t"
"vl %%v24,64(%%r1,%[x])\n\t"
"vl %%v25,80(%%r1,%[x])\n\t"
"vl %%v26,96(%%r1,%[x])\n\t"
"vl %%v27,112(%%r1,%[x])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
/* x register = alpha * x + y */
"vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
"vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
"vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
"vfmasb %%v19,%%v0,%%v19,%%v23\n\t"
"vfmasb %%v24,%%v0,%%v24,%%v28\n\t"
"vfmasb %%v25,%%v0,%%v25,%%v29\n\t"
"vfmasb %%v26,%%v0,%%v26,%%v30\n\t"
"vfmasb %%v27,%%v0,%%v27,%%v31\n\t"
/* write the results back over y */
"vst %%v16,0(%%r1,%[y])\n\t"
"vst %%v17,16(%%r1,%[y])\n\t"
"vst %%v18,32(%%r1,%[y])\n\t"
"vst %%v19,48(%%r1,%[y])\n\t"
"vst %%v24,64(%%r1,%[y])\n\t"
"vst %%v25,80(%%r1,%[y])\n\t"
"vst %%v26,96(%%r1,%[y])\n\t"
"vst %%v27,112(%%r1,%[y])\n\t"
/* second half: bytes 128..255 of this pass, same sequence */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,128(%%r1,%[y])\n\t"
"vl %%v21,144(%%r1,%[y])\n\t"
"vl %%v22,160(%%r1,%[y])\n\t"
"vl %%v23,176(%%r1,%[y])\n\t"
"vl %%v24,192(%%r1,%[x])\n\t"
"vl %%v25,208(%%r1,%[x])\n\t"
"vl %%v26,224(%%r1,%[x])\n\t"
"vl %%v27,240(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[y])\n\t"
"vl %%v29,208(%%r1,%[y])\n\t"
"vl %%v30,224(%%r1,%[y])\n\t"
"vl %%v31,240(%%r1,%[y])\n\t"
"vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
"vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
"vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
"vfmasb %%v19,%%v0,%%v19,%%v23\n\t"
"vfmasb %%v24,%%v0,%%v24,%%v28\n\t"
"vfmasb %%v25,%%v0,%%v25,%%v29\n\t"
"vfmasb %%v26,%%v0,%%v26,%%v30\n\t"
"vfmasb %%v27,%%v0,%%v27,%%v31\n\t"
"vst %%v16,128(%%r1,%[y])\n\t"
"vst %%v17,144(%%r1,%[y])\n\t"
"vst %%v18,160(%%r1,%[y])\n\t"
"vst %%v19,176(%%r1,%[y])\n\t"
"vst %%v24,192(%%r1,%[y])\n\t"
"vst %%v25,208(%%r1,%[y])\n\t"
"vst %%v26,224(%%r1,%[y])\n\t"
"vst %%v27,240(%%r1,%[y])\n\t"
/* advance 256 bytes; brctg decrements n and loops while it is non-zero */
"agfi %%r1,256\n\t"
"brctg %[n],0b"
/* "+m" on y: the asm both reads and writes n FLOATs at y; n is read-write
   and early-clobbered because the asm shifts and counts it down */
: "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n)
/* the "m" operand on x tells the compiler the asm reads n FLOATs at x */
: [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),
[alpha] "m"(*alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
} }


int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;


if ( n <= 0 ) return 0 ;
if (n <= 0)
return 0;


if ( (inc_x == 1) && (inc_y == 1) )
{
if ((inc_x == 1) && (inc_y == 1)) {


BLASLONG n1 = n & -64;
BLASLONG n1 = n & -64;


if ( n1 )
saxpy_kernel_64(n1, x, y , &da);
if (n1)
saxpy_kernel_64(n1, x, y, &da);


i = n1;
while(i < n)
{

y[i] += da * x[i] ;
i++ ;

}
return 0 ;
i = n1;
while (i < n) {


y[i] += da * x[i];
i++;


} }
return 0;


BLASLONG n1 = n & -4;
}


while(i < n1)
{
BLASLONG n1 = n & -4;


FLOAT m1 = da * x[ix] ;
FLOAT m2 = da * x[ix+inc_x] ;
FLOAT m3 = da * x[ix+2*inc_x] ;
FLOAT m4 = da * x[ix+3*inc_x] ;
while (i < n1) {


y[iy] += m1 ;
y[iy+inc_y] += m2 ;
y[iy+2*inc_y] += m3 ;
y[iy+3*inc_y] += m4 ;
FLOAT m1 = da * x[ix];
FLOAT m2 = da * x[ix + inc_x];
FLOAT m3 = da * x[ix + 2 * inc_x];
FLOAT m4 = da * x[ix + 3 * inc_x];


ix += inc_x*4 ;
iy += inc_y*4 ;
i+=4 ;
y[iy] += m1;
y[iy + inc_y] += m2;
y[iy + 2 * inc_y] += m3;
y[iy + 3 * inc_y] += m4;


}
ix += inc_x * 4;
iy += inc_y * 4;
i += 4;


while(i < n)
{
}


y[iy] += da * x[ix] ;
ix += inc_x ;
iy += inc_y ;
i++ ;
while (i < n) {


}
return 0 ;
}
y[iy] += da * x[ix];
ix += inc_x;
iy += inc_y;
i++;


}
return 0;


}

+ 35
- 41
kernel/zarch/scopy.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,59 +27,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
"srlg %%r0,%0,6 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
"mvc 0(256,%%r2),0(%%r1) \n\t"
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","r2"
);
/*
 * Block-copy kernel behind scopy: copies the first n FLOATs of x to y in
 * 256-byte chunks using mvc.
 *
 * n is shifted right by 6 to form the chunk count. The visible caller passes
 * n1 = n & -64 with n1 > 0; brctg decrements before it tests, so a zero count
 * must not reach this loop.
 *
 * x and y are advanced inside the asm, so both are read-write,
 * early-clobbered address operands. n is shifted and counted down in place,
 * so it must be declared as a read-write operand too: the template names
 * %[n], and without a matching [n] operand GCC rejects the statement as an
 * undefined named operand.
 */
static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) {
  __asm__("srlg %[n],%[n],6\n\t"          /* chunk count = n / 64 */
          "0:\n\t"
          "pfd 1, 1024(%[x])\n\t"
          "pfd 2, 1024(%[y])\n\t"
          "mvc 0(256,%[y]),0(%[x])\n\t"   /* copy one 256-byte chunk */
          "la %[x],256(%[x])\n\t"         /* advance both pointers */
          "la %[y],256(%[y])\n\t"
          "brctg %[n],0b"                 /* --n, loop while non-zero */
          /* "=m" on y: the asm writes n FLOATs starting at y */
          : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),
            [n] "+&r"(n)
          /* "m" on x: the asm reads n FLOATs starting at x */
          : "m"(*(const FLOAT (*)[n]) x)
          : "cc");
}


int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;


if (n <= 0) return 0;

if ((inc_x == 1) && (inc_y == 1)) {

BLASLONG n1 = n & -64;
if (n1 > 0) {
scopy_kernel_64(n1, x, y);
i = n1;
}
if (n <= 0)
return 0;


while (i < n) {
y[i] = x[i];
i++;
if ((inc_x == 1) && (inc_y == 1)) {


}
BLASLONG n1 = n & -64;
if (n1 > 0) {
scopy_kernel_64(n1, x, y);
i = n1;
}


while (i < n) {
y[i] = x[i];
i++;


} else {
}


while (i < n) {
} else {


y[iy] = x[ix];
ix += inc_x;
iy += inc_y;
i++;
while (i < n) {


}
y[iy] = x[ix];
ix += inc_x;
iy += inc_y;
i++;


} }
return 0;


}
return 0;


} }

+ 96
- 92
kernel/zarch/sdot.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018,The OpenBLAS Project
Copyright (c) 2013-2019,The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms,with or without Redistribution and use in source and binary forms,with or without
modification,are permitted provided that the following conditions are modification,are permitted provided that the following conditions are
@@ -27,114 +27,118 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
FLOAT dot;

__asm__ volatile (
"vzero %%v0 \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"

"vl %%v24,0(%%r1,%3) \n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,16(%%r1,%3) \n\t"
"vfmasb %%v0,%%v17,%%v25,%%v0 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmasb %%v0,%%v18,%%v26,%%v0 \n\t"
"vl %%v27,48(%%r1,%3) \n\t"
"vfmasb %%v0,%%v19,%%v27,%%v0 \n\t"
"vl %%v28,64(%%r1,%3) \n\t"
"vfmasb %%v0,%%v20,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%3) \n\t"
"vfmasb %%v0,%%v21,%%v29,%%v0 \n\t"
"vl %%v30,96(%%r1,%3) \n\t"
"vfmasb %%v0,%%v22,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vfmasb %%v0,%%v23,%%v31,%%v0 \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"vrepf %%v1,%%v0,1 \n\t"
"vrepf %%v2,%%v0,2 \n\t"
"vrepf %%v3,%%v0,3 \n\t"
"aebr %%f0,%%f1 \n\t"
"aebr %%f0,%%f2 \n\t"
"aebr %%f0,%%f3 \n\t"
"ler %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return dot;
/*
 * Vector kernel behind sdot: returns the sum of x[i] * y[i] over the first n
 * elements, using fused multiply-add (vfmasb).
 *
 * Eight independent accumulators (v0..v7) are kept across the loop and only
 * summed together after it. n is shifted right by 5 to form the loop count;
 * each pass handles 128 bytes of x and of y. The visible caller passes
 * n1 = n & -32 only when n1 is non-zero; brctg decrements before it tests, so
 * a zero count must not reach this loop.
 */
static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
FLOAT dot;

/* clear the eight accumulators */
__asm__("vzero %%v0\n\t"
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
/* n >>= 5 gives the pass count; r1 = 0 is the running byte offset */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"pfd 1,1024(%%r1,%[y])\n\t"
/* 128 bytes of x into v16..v23 */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/* 128 bytes of y into v24..v31 */
"vl %%v24,0(%%r1,%[y])\n\t"
"vl %%v25,16(%%r1,%[y])\n\t"
"vl %%v26,32(%%r1,%[y])\n\t"
"vl %%v27,48(%%r1,%[y])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
/* accumulator k += x vector k * y vector k */
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"vfmasb %%v1,%%v17,%%v25,%%v1\n\t"
"vfmasb %%v2,%%v18,%%v26,%%v2\n\t"
"vfmasb %%v3,%%v19,%%v27,%%v3\n\t"
"vfmasb %%v4,%%v20,%%v28,%%v4\n\t"
"vfmasb %%v5,%%v21,%%v29,%%v5\n\t"
"vfmasb %%v6,%%v22,%%v30,%%v6\n\t"
"vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
/* advance 128 bytes; brctg decrements n and loops while it is non-zero */
"agfi %%r1,128\n\t"
"brctg %[n],0b\n\t"
/* collapse the eight accumulators into v0 */
"vfasb %%v0,%%v0,%%v1\n\t"
"vfasb %%v0,%%v0,%%v2\n\t"
"vfasb %%v0,%%v0,%%v3\n\t"
"vfasb %%v0,%%v0,%%v4\n\t"
"vfasb %%v0,%%v0,%%v5\n\t"
"vfasb %%v0,%%v0,%%v6\n\t"
"vfasb %%v0,%%v0,%%v7\n\t"
/* horizontal sum: replicate words 1..3 of v0 and add them to word 0 via
   the scalar registers f0..f3 */
"vrepf %%v1,%%v0,1\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepf %%v3,%%v0,3\n\t"
"aebr %%f0,%%f1\n\t"
"aebr %%f0,%%f2\n\t"
"aebr %%f0,%%f3\n\t"
"ler %[dot],%%f0"
/* n is read-write and early-clobbered because the asm shifts and counts it down */
: [dot] "=f"(dot),[n] "+&r"(n)
/* the "m" operands tell the compiler the asm reads n FLOATs at x and at y */
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y),
[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");

return dot;
} }


FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;


FLOAT dot = 0.0 ;
FLOAT dot = 0.0;


if ( n <= 0 ) return(dot);
if (n <= 0)
return (dot);


if ( (inc_x == 1) && (inc_y == 1) )
{
if ((inc_x == 1) && (inc_y == 1)) {


BLASLONG n1 = n & -32;
BLASLONG n1 = n & -32;


if ( n1 )
dot = sdot_kernel_32(n1,x,y);
if (n1)
dot = sdot_kernel_32(n1, x, y);


i = n1;
while(i < n)
{
i = n1;
while (i < n) {


dot += y[i] * x[i] ;
i++ ;
dot += y[i] * x[i];
i++;


}
return(dot);
}
return (dot);


}


}
BLASLONG n1 = n & -2;


BLASLONG n1 = n & -2;
while (i < n1) {


while(i < n1)
{
dot += y[iy] * x[ix] + y[iy + inc_y] * x[ix + inc_x];
ix += inc_x * 2;
iy += inc_y * 2;
i += 2;


dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x];
ix += inc_x*2 ;
iy += inc_y*2 ;
i+=2 ;
}


}
while (i < n) {


while(i < n)
{
dot += y[iy] * x[ix];
ix += inc_x;
iy += inc_y;
i++;


dot += y[iy] * x[ix] ;
ix += inc_x ;
iy += inc_y ;
i++ ;

}
return(dot);
}
return (dot);


} }



+ 538
- 619
kernel/zarch/sgemv_n_4.c
File diff suppressed because it is too large
View File


+ 657
- 723
kernel/zarch/sgemv_t_4.c
File diff suppressed because it is too large
View File


+ 103
- 116
kernel/zarch/smax.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,136 +27,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x)
{
FLOAT max;

__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmaxsb %%v16,%%v16,%%v24,0 \n\t"
"vfmaxsb %%v17,%%v17,%%v25,0 \n\t"
"vfmaxsb %%v18,%%v18,%%v26,0 \n\t"
"vfmaxsb %%v19,%%v19,%%v27,0 \n\t"
"vfmaxsb %%v20,%%v20,%%v28,0 \n\t"
"vfmaxsb %%v21,%%v21,%%v29,0 \n\t"
"vfmaxsb %%v22,%%v22,%%v30,0 \n\t"
"vfmaxsb %%v23,%%v23,%%v31,0 \n\t"

"vfmaxsb %%v16,%%v16,%%v20,0 \n\t"
"vfmaxsb %%v17,%%v17,%%v21,0 \n\t"
"vfmaxsb %%v18,%%v18,%%v22,0 \n\t"
"vfmaxsb %%v19,%%v19,%%v23,0 \n\t"

"vfmaxsb %%v16,%%v16,%%v18,0 \n\t"
"vfmaxsb %%v17,%%v17,%%v19,0 \n\t"

"vfmaxsb %%v16,%%v16,%%v17,0 \n\t"

"vfmaxsb %%v0,%%v0,%%v16,0 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"veslg %%v16,%%v0,32 \n\t"
"vfmaxsb %%v0,%%v0,%%v16,0 \n\t"

"vrepf %%v16,%%v0,2 \n\t"
"wfmaxsb %%v0,%%v0,%%v16,0 \n\t"
"ler %0,%%f0 "
:"=f"(max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return max;
/*
 * Vector kernel: returns the maximum of the first n elements of x.
 *
 * n - element count. The loop runs n >> 6 times and every pass reads
 *     256 bytes (sixteen 16-byte vector loads), so FLOAT is presumably a
 *     4-byte float and n is expected to be a non-zero multiple of 64; the
 *     visible caller passes n & -64 and only when that value is > 0.
 * x - contiguous input vector; only read.
 *
 * Operand notes: n is an in/out ("+&r") operand because srlg overwrites it
 * with the trip count; the unnamed "m" input describes the x[0..n-1] read to
 * the compiler; x uses the "a" (address register) constraint because it is
 * used as a base register in every memory operand.
 */
static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) {
FLOAT max;

/* v0 is the running maximum, seeded with the first 16 bytes of x */
__asm__("vl %%v0,0(%[x])\n\t"
/* n = n / 64 loop trips; r1 = byte offset into x, starting at 0 */
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* prefetch hint for the data 1024 bytes ahead of the current offset */
"pfd 1, 1024(%%r1,%[x])\n\t"
/* load 256 bytes of x into v16..v31 */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
/* pairwise reduction tree: 16 vectors -> 8 -> 4 -> 2 -> 1 (v16) */
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
"vfmaxsb %%v17,%%v17,%%v25,0\n\t"
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
"vfmaxsb %%v19,%%v19,%%v27,0\n\t"
"vfmaxsb %%v20,%%v20,%%v28,0\n\t"
"vfmaxsb %%v21,%%v21,%%v29,0\n\t"
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
"vfmaxsb %%v23,%%v23,%%v31,0\n\t"
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
"vfmaxsb %%v17,%%v17,%%v21,0\n\t"
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
"vfmaxsb %%v19,%%v19,%%v23,0\n\t"
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
"vfmaxsb %%v17,%%v17,%%v19,0\n\t"
"vfmaxsb %%v16,%%v16,%%v17,0\n\t"
/* fold this pass into the running maximum */
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
/* advance 256 bytes and loop while the trip count is non-zero */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* horizontal reduction of the lanes of v0 down to its first element */
"veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
/* copy the scalar result from f0 into the output register */
"ler %[max],%%f0"
: [max] "=f"(max),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

return max;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;

if (n <= 0 || inc_x <= 0) return (maxf);


if (inc_x == 1) {
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;


BLASLONG n1 = n & -64;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (maxf);


maxf = smax_kernel_64(n1, x);
if (inc_x == 1) {


i = n1;
}
else
{
maxf=x[0];
i++;
}
BLASLONG n1 = n & -64;
if (n1 > 0) {


while (i < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i++;
}
return (maxf);
maxf = smax_kernel_64(n1, x);


i = n1;
} else { } else {
maxf = x[0];
i++;
}


maxf=x[0];
while (i < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i++;
}
return (maxf);

} else {


BLASLONG n1 = n & -4;
while (j < n1) {
maxf = x[0];


if (x[i] > maxf) {
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
maxf = x[i + 3 * inc_x];
}
BLASLONG n1 = n & -4;
while (j < n1) {


i += inc_x * 4;
if (x[i] > maxf) {
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
maxf = x[i + 3 * inc_x];
}


j += 4;
i += inc_x * 4;


}
j += 4;


}


while (j < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i += inc_x;
j++;
}
return (maxf);
while (j < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i += inc_x;
j++;
} }
return (maxf);
}
} }

+ 103
- 116
kernel/zarch/smin.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,136 +27,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x)
{
FLOAT min;

__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfminsb %%v16,%%v16,%%v24,0 \n\t"
"vfminsb %%v17,%%v17,%%v25,0 \n\t"
"vfminsb %%v18,%%v18,%%v26,0 \n\t"
"vfminsb %%v19,%%v19,%%v27,0 \n\t"
"vfminsb %%v20,%%v20,%%v28,0 \n\t"
"vfminsb %%v21,%%v21,%%v29,0 \n\t"
"vfminsb %%v22,%%v22,%%v30,0 \n\t"
"vfminsb %%v23,%%v23,%%v31,0 \n\t"

"vfminsb %%v16,%%v16,%%v20,0 \n\t"
"vfminsb %%v17,%%v17,%%v21,0 \n\t"
"vfminsb %%v18,%%v18,%%v22,0 \n\t"
"vfminsb %%v19,%%v19,%%v23,0 \n\t"

"vfminsb %%v16,%%v16,%%v18,0 \n\t"
"vfminsb %%v17,%%v17,%%v19,0 \n\t"

"vfminsb %%v16,%%v16,%%v17,0 \n\t"

"vfminsb %%v0,%%v0,%%v16,0 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"veslg %%v16,%%v0,32 \n\t"
"vfminsb %%v0,%%v0,%%v16,0 \n\t"

"vrepf %%v16,%%v0,2 \n\t"
"wfminsb %%v0,%%v0,%%v16,0 \n\t"
"ler %0,%%f0 "
:"=f"(min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return min;
/*
 * Vector kernel: returns the minimum of the first n elements of x.
 *
 * n - element count. The loop runs n >> 6 times and every pass reads
 *     256 bytes (sixteen 16-byte vector loads), so FLOAT is presumably a
 *     4-byte float and n is expected to be a non-zero multiple of 64; the
 *     visible caller passes n & -64 and only when that value is > 0.
 * x - contiguous input vector; only read.
 *
 * Operand notes: n is an in/out ("+&r") operand because srlg overwrites it
 * with the trip count; the unnamed "m" input describes the x[0..n-1] read to
 * the compiler; x uses the "a" (address register) constraint because it is
 * used as a base register in every memory operand.
 */
static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) {
FLOAT min;

/* v0 is the running minimum, seeded with the first 16 bytes of x */
__asm__("vl %%v0,0(%[x])\n\t"
/* n = n / 64 loop trips; r1 = byte offset into x, starting at 0 */
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* prefetch hint for the data 1024 bytes ahead of the current offset */
"pfd 1, 1024(%%r1,%[x])\n\t"
/* load 256 bytes of x into v16..v31 */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
/* pairwise reduction tree: 16 vectors -> 8 -> 4 -> 2 -> 1 (v16) */
"vfminsb %%v16,%%v16,%%v24,0\n\t"
"vfminsb %%v17,%%v17,%%v25,0\n\t"
"vfminsb %%v18,%%v18,%%v26,0\n\t"
"vfminsb %%v19,%%v19,%%v27,0\n\t"
"vfminsb %%v20,%%v20,%%v28,0\n\t"
"vfminsb %%v21,%%v21,%%v29,0\n\t"
"vfminsb %%v22,%%v22,%%v30,0\n\t"
"vfminsb %%v23,%%v23,%%v31,0\n\t"
"vfminsb %%v16,%%v16,%%v20,0\n\t"
"vfminsb %%v17,%%v17,%%v21,0\n\t"
"vfminsb %%v18,%%v18,%%v22,0\n\t"
"vfminsb %%v19,%%v19,%%v23,0\n\t"
"vfminsb %%v16,%%v16,%%v18,0\n\t"
"vfminsb %%v17,%%v17,%%v19,0\n\t"
"vfminsb %%v16,%%v16,%%v17,0\n\t"
/* fold this pass into the running minimum */
"vfminsb %%v0,%%v0,%%v16,0\n\t"
/* advance 256 bytes and loop while the trip count is non-zero */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* horizontal reduction of the lanes of v0 down to its first element */
"veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,0\n\t"
/* copy the scalar result from f0 into the output register */
"ler %[min],%%f0"
: [min] "=f"(min),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

return min;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;

if (n <= 0 || inc_x <= 0) return (minf);


if (inc_x == 1) {
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT minf = 0.0;


BLASLONG n1 = n & -64;
if (n1 > 0) {
if (n <= 0 || inc_x <= 0)
return (minf);


minf = smin_kernel_64(n1, x);
if (inc_x == 1) {


i = n1;
}
else
{
minf=x[0];
i++;
}
BLASLONG n1 = n & -64;
if (n1 > 0) {


while (i < n) {
if (x[i] < minf) {
minf = x[i];
}
i++;
}
return (minf);
minf = smin_kernel_64(n1, x);


i = n1;
} else { } else {
minf = x[0];
i++;
}


minf=x[0];
while (i < n) {
if (x[i] < minf) {
minf = x[i];
}
i++;
}
return (minf);

} else {


BLASLONG n1 = n & -4;
while (j < n1) {
minf = x[0];


if (x[i] < minf) {
minf = x[i];
}
if (x[i + inc_x] < minf) {
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
minf = x[i + 3 * inc_x];
}
BLASLONG n1 = n & -4;
while (j < n1) {


i += inc_x * 4;
if (x[i] < minf) {
minf = x[i];
}
if (x[i + inc_x] < minf) {
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
minf = x[i + 3 * inc_x];
}


j += 4;
i += inc_x * 4;


}
j += 4;


}


while (j < n) {
if (x[i] < minf) {
minf = x[i];
}
i += inc_x;
j++;
}
return (minf);
while (j < n) {
if (x[i] < minf) {
minf = x[i];
}
i += inc_x;
j++;
} }
return (minf);
}
} }

+ 180
- 201
kernel/zarch/srot.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,220 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
"vlrepf %%v0,%3 \n\t"
"vlrepf %%v1,%4 \n\t"
"srlg %%r0,%0,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
/*
 * Vector kernel: applies a plane rotation in place to the first n elements
 * of x and y:
 *     x[i] = c * x[i] + s * y[i]
 *     y[i] = c * y[i] - s * x[i]      (using the old x[i])
 *
 * n    - element count. The loop runs n >> 6 times and every pass handles
 *        256 bytes of each vector (four unrolled groups of 64 bytes), so
 *        FLOAT is presumably a 4-byte float and n is expected to be a
 *        non-zero multiple of 64; the visible caller passes n & -64 and only
 *        when that value is > 0.
 * x, y - contiguous vectors, both read and written ("+m" operands).
 * c, s - rotation coefficients, passed by pointer so that they can be used
 *        as memory operands of vlrepf.
 *
 * Register use: v0 = c and v1 = s replicated across all lanes, v24..v27 = x,
 * v16..v19 = y, v28..v31 = new x, v20..v23 = new y, r1 = byte offset.
 */
static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
__asm__("vlrepf %%v0,%[c]\n\t"
"vlrepf %%v1,%[s]\n\t"
/* n = n / 64 loop trips; r1 = byte offset, starting at 0 */
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* prefetch hints for the data 1024 bytes ahead in both vectors */
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
/* group 1: bytes 0..63 of x and y */
"vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%[y])\n\t"
/* first parts: v28..v31 = x*c, v20..v23 = x*s */
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
/* store the rotated values back over x and y */
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%[y])\n\t"
/* group 2: bytes 64..127, same sequence */
"vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v19, 112(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
/* group 3: bytes 128..191, same sequence */
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%[y])\n\t"
/* group 4: bytes 192..255, same sequence */
"vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v19, 240(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%[y])\n\t"
/* advance 256 bytes and loop while the trip count is non-zero */
"agfi %%r1,256\n\t"
"brctg %[n],0b"
/* outputs: both vectors are read-modify-write; n is consumed as counter */
: "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }


int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT temp;
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT c, FLOAT s) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;


if ( n <= 0 ) return(0);
FLOAT temp;


if ( (inc_x == 1) && (inc_y == 1) )
{
if (n <= 0)
return (0);


BLASLONG n1 = n & -64;
if ( n1 > 0 )
{
FLOAT cosa,sina;
cosa=c;
sina=s;
srot_kernel_64(n1, x, y, &cosa, &sina);
i=n1;
}
if ((inc_x == 1) && (inc_y == 1)) {


while(i < n)
{
temp = c*x[i] + s*y[i] ;
y[i] = c*y[i] - s*x[i] ;
x[i] = temp ;

i++ ;
BLASLONG n1 = n & -64;
if (n1 > 0) {
FLOAT cosa, sina;
cosa = c;
sina = s;
srot_kernel_64(n1, x, y, &cosa, &sina);
i = n1;
}


}
while (i < n) {
temp = c * x[i] + s * y[i];
y[i] = c * y[i] - s * x[i];
x[i] = temp;


i++;


} }
else
{


while(i < n)
{
temp = c*x[ix] + s*y[iy] ;
y[iy] = c*y[iy] - s*x[ix] ;
x[ix] = temp ;
} else {


ix += inc_x ;
iy += inc_y ;
i++ ;
while (i < n) {
temp = c * x[ix] + s * y[iy];
y[iy] = c * y[iy] - s * x[ix];
x[ix] = temp;


}
ix += inc_x;
iy += inc_y;
i++;


} }
return(0);

}


}
return (0);


}

+ 120
- 148
kernel/zarch/sscal.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,175 +27,147 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x)
{
__asm__ volatile (
"vlrepf %%v0,%1 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%2) \n\t"
"vfmsb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%2) \n\t"
"vfmsb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%2) \n\t"
"vfmsb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%2) \n\t"
"vfmsb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%2) \n\t"
"vfmsb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 64(%%r1,%2) \n\t"
"vl %%v25, 80(%%r1,%2) \n\t"
"vfmsb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 80(%%r1,%2) \n\t"
"vl %%v26, 96(%%r1,%2) \n\t"
"vfmsb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 96(%%r1,%2) \n\t"
"vl %%v27, 112(%%r1,%2) \n\t"
"vfmsb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 112(%%r1,%2) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v24","v25","v26","v27"
);
/*
 * Vector kernel: scales the first n elements of x in place, x[i] = da * x[i].
 *
 * n  - element count. The loop runs n >> 5 times and every pass handles
 *      128 bytes (eight 16-byte vectors), so FLOAT is presumably a 4-byte
 *      float and n is expected to be a non-zero multiple of 32; the visible
 *      caller passes n & -32 and only when that value is > 0.
 * da - scale factor; used as a memory operand so vlrepf can replicate it
 *      across all lanes of v0.
 * x  - contiguous vector, read and written ("+m" operand).
 *
 * Each load/multiply/store triple uses its own register (v24..v31), so the
 * eight triples within one pass do not depend on each other.
 */
static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) {
__asm__("vlrepf %%v0,%[da]\n\t"
/* n = n / 32 loop trips; r1 = byte offset into x, starting at 0 */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* prefetch hint for the data 1024 bytes ahead of the current offset */
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[x])\n\t"
"vfmsb %%v24,%%v24,%%v0\n\t"
"vst %%v24,0(%%r1,%[x])\n\t"
"vl %%v25,16(%%r1,%[x])\n\t"
"vfmsb %%v25,%%v25,%%v0\n\t"
"vst %%v25,16(%%r1,%[x])\n\t"
"vl %%v26,32(%%r1,%[x])\n\t"
"vfmsb %%v26,%%v26,%%v0\n\t"
"vst %%v26,32(%%r1,%[x])\n\t"
"vl %%v27,48(%%r1,%[x])\n\t"
"vfmsb %%v27,%%v27,%%v0\n\t"
"vst %%v27,48(%%r1,%[x])\n\t"
"vl %%v28,64(%%r1,%[x])\n\t"
"vfmsb %%v28,%%v28,%%v0\n\t"
"vst %%v28,64(%%r1,%[x])\n\t"
"vl %%v29,80(%%r1,%[x])\n\t"
"vfmsb %%v29,%%v29,%%v0\n\t"
"vst %%v29,80(%%r1,%[x])\n\t"
"vl %%v30,96(%%r1,%[x])\n\t"
"vfmsb %%v30,%%v30,%%v0\n\t"
"vst %%v30,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vfmsb %%v31,%%v31,%%v0\n\t"
"vst %%v31,112(%%r1,%[x])\n\t"
/* advance 128 bytes and loop while the trip count is non-zero */
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n)
: [x] "a"(x),[da] "m"(da)
: "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }


static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x)
{
__asm__ volatile(
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"

"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
/*
 * Vector kernel: overwrites the first n elements of x with zero (the
 * da == 0 case of the scaling routine, per the visible caller).
 *
 * n - element count. The loop runs n >> 5 times and every pass stores
 *     128 bytes (eight 16-byte vectors), so FLOAT is presumably a 4-byte
 *     float and n is expected to be a non-zero multiple of 32; the visible
 *     caller passes n & -32 and only when that value is > 0.
 * x - contiguous vector; declared as a write-only "=m" output because the
 *     old contents are never loaded.
 */
static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) {
/* v0 = all zeros, reused for every store */
__asm__("vzero %%v0\n\t"
/* n = n / 32 loop trips; r1 = byte offset into x, starting at 0 */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* prefetch hint for the data 1024 bytes ahead of the current offset */
"pfd 2, 1024(%%r1,%[x])\n\t"
"vst %%v0,0(%%r1,%[x])\n\t"
"vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v0,32(%%r1,%[x])\n\t"
"vst %%v0,48(%%r1,%[x])\n\t"
"vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v0,112(%%r1,%[x])\n\t"
/* advance 128 bytes and loop while the trip count is non-zero */
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n)
: [x] "a"(x)
: "cc", "r1", "v0");
} }


int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0,j=0;
if ( n <= 0 || inc_x <=0 )
return(0);

if ( inc_x == 1 )
{

if ( da == 0.0 )
{

BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
sscal_kernel_32_zero(n1, x);
j=n1;
}

while(j < n)
{

x[j]=0.0;
j++;
}

}
else
{

BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
sscal_kernel_32(n1, da, x);
j=n1;
}
while(j < n)
{

x[j] = da * x[j] ;
j++;
}
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0, j = 0;
if (n <= 0 || inc_x <= 0)
return (0);


if (inc_x == 1) {


if (da == 0.0) {

BLASLONG n1 = n & -32;
if (n1 > 0) {

sscal_kernel_32_zero(n1, x);
j = n1;
}

while (j < n) {

x[j] = 0.0;
j++;
}

} else {

BLASLONG n1 = n & -32;
if (n1 > 0) {
sscal_kernel_32(n1, da, x);
j = n1;
}
while (j < n) {

x[j] = da * x[j];
j++;
}
} }
else
{


if ( da == 0.0 )
{
} else {


BLASLONG n1 = n & -2;
if (da == 0.0) {


while (j < n1) {
BLASLONG n1 = n & -2;


x[i]=0.0;
x[i + inc_x]=0.0;
while (j < n1) {


i += inc_x * 2;
j += 2;
x[i] = 0.0;
x[i + inc_x] = 0.0;


}
while(j < n)
{
i += inc_x * 2;
j += 2;


x[i]=0.0;
i += inc_x ;
j++;
}
}
while (j < n) {


}
else
{
BLASLONG n1 = n & -2;
x[i] = 0.0;
i += inc_x;
j++;
}


while (j < n1) {
} else {
BLASLONG n1 = n & -2;


x[i] = da * x[i] ;
x[i + inc_x] = da * x[i + inc_x];
while (j < n1) {


i += inc_x * 2;
j += 2;
x[i] = da * x[i];
x[i + inc_x] = da * x[i + inc_x];


}
i += inc_x * 2;
j += 2;


while(j < n)
{
}


x[i] = da * x[i] ;
i += inc_x ;
j++;
}
}
while (j < n) {


x[i] = da * x[i];
i += inc_x;
j++;
}
} }
return 0;

}


}
return 0;


}

+ 108
- 122
kernel/zarch/sswap.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,138 +27,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"srlg %%r0,%0,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"

"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"

"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"

"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
/*
 * Exchange the contents of x and y using vector loads and stores.
 *
 * n is shifted right by 6 to obtain the loop count, and every iteration
 * moves 256 bytes of each array (sixteen 16-byte vl/vst pairs at offsets
 * 0..240), with r1 as the running byte offset.
 * NOTE(review): this matches 64 elements per iteration if FLOAT is a
 * 4-byte float in this file - confirm.
 *
 * The loop count is only tested by brctg at the bottom of the loop, so
 * the body always runs at least once; the caller in this file passes
 * n1 = n & -64 and only calls the kernel when n1 > 0.
 *
 * Both arrays are declared as "+m" operands covering n elements, and
 * n itself is an early-clobbered read/write register because srlg and
 * brctg modify it in place.
 */
static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__("srlg %[n],%[n],6\n\t"
/* r1 = 0: byte offset into both arrays */
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* prefetch hints for the data 1024 bytes ahead of the current offset */
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
/* hold the whole 256-byte chunk of x in v16..v31 */
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%[x])\n\t"
/* copy the first 128 bytes of y into x through v0..v7 */
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v7, 112(%%r1,%[y])\n\t"
"vst %%v0, 0(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v7, 112(%%r1,%[x])\n\t"
/* copy the second 128 bytes of y into x, reusing v0..v7 */
"vl %%v0, 128(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v7, 240(%%r1,%[y])\n\t"
"vst %%v0, 128(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v7, 240(%%r1,%[x])\n\t"
/* write the saved x chunk (v16..v31) into y */
"vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v31, 240(%%r1,%[y])\n\t"
/* advance the byte offset and loop while the count is non-zero */
"agfi %%r1,256\n\t"
"brctg %[n],0b"
: "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
} }


int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT temp;

if ( n <= 0 ) return(0);

if ( (inc_x == 1) && (inc_y == 1 ))
{
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT temp;


BLASLONG n1 = n & -64;
if ( n1 > 0 )
{
sswap_kernel_64(n1, x, y);
i=n1;
}
if (n <= 0)
return (0);


while(i < n)
{
temp = y[i];
y[i] = x[i] ;
x[i] = temp;
i++ ;
if ((inc_x == 1) && (inc_y == 1)) {


}
BLASLONG n1 = n & -64;
if (n1 > 0) {
sswap_kernel_64(n1, x, y);
i = n1;
}


while (i < n) {
temp = y[i];
y[i] = x[i];
x[i] = temp;
i++;


} }
else
{


while(i < n)
{
temp = y[iy];
y[iy] = x[ix] ;
x[ix] = temp;
ix += inc_x ;
iy += inc_y ;
i++ ;
} else {


}
while (i < n) {
temp = y[iy];
y[iy] = x[ix];
x[ix] = temp;
ix += inc_x;
iy += inc_y;
i++;


} }
return(0);

}


}
return (0);


}

+ 157
- 176
kernel/zarch/zamax.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,184 +28,165 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))

static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT amax;

__asm__ volatile (
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v16,%%v16 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
"vleg %%v24,128(%%r1,%2),0 \n\t"
"vleg %%v25,136(%%r1,%2),0 \n\t"
"vleg %%v24,144(%%r1,%2),1 \n\t"
"vleg %%v25,152(%%r1,%2),1 \n\t"
"vleg %%v26,160(%%r1,%2),0 \n\t"
"vleg %%v27,168(%%r1,%2),0 \n\t"
"vleg %%v26,176(%%r1,%2),1 \n\t"
"vleg %%v27,184(%%r1,%2),1 \n\t"
"vleg %%v28,192(%%r1,%2),0 \n\t"
"vleg %%v29,200(%%r1,%2),0 \n\t"
"vleg %%v28,208(%%r1,%2),1 \n\t"
"vleg %%v29,216(%%r1,%2),1 \n\t"
"vleg %%v30,224(%%r1,%2),0 \n\t"
"vleg %%v31,232(%%r1,%2),0 \n\t"
"vleg %%v30,240(%%r1,%2),1 \n\t"
"vleg %%v31,248(%%r1,%2),1 \n\t"

"vflpdb %%v16,%%v16 \n\t"
"vflpdb %%v17,%%v17 \n\t"
"vflpdb %%v18,%%v18 \n\t"
"vflpdb %%v19,%%v19 \n\t"
"vflpdb %%v20,%%v20 \n\t"
"vflpdb %%v21,%%v21 \n\t"
"vflpdb %%v22,%%v22 \n\t"
"vflpdb %%v23,%%v23 \n\t"
"vflpdb %%v24,%%v24 \n\t"
"vflpdb %%v25,%%v25 \n\t"
"vflpdb %%v26,%%v26 \n\t"
"vflpdb %%v27,%%v27 \n\t"
"vflpdb %%v28,%%v28 \n\t"
"vflpdb %%v29,%%v29 \n\t"
"vflpdb %%v30,%%v30 \n\t"
"vflpdb %%v31,%%v31 \n\t"

"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v18,%%v18,%%v19 \n\t"
"vfadb %%v20,%%v20,%%v21 \n\t"
"vfadb %%v22,%%v22,%%v23 \n\t"
"vfadb %%v24,%%v24,%%v25 \n\t"
"vfadb %%v26,%%v26,%%v27 \n\t"
"vfadb %%v28,%%v28,%%v29 \n\t"
"vfadb %%v30,%%v30,%%v31 \n\t"
"vfmaxdb %%v16,%%v16,%%v24,0 \n\t"
"vfmaxdb %%v18,%%v18,%%v26,0 \n\t"
"vfmaxdb %%v20,%%v20,%%v28,0 \n\t"
"vfmaxdb %%v22,%%v22,%%v30,0 \n\t"

"vfmaxdb %%v16,%%v16,%%v20,0 \n\t"
"vfmaxdb %%v18,%%v18,%%v22,0 \n\t"

"vfmaxdb %%v16,%%v16,%%v18,0 \n\t"

"vfmaxdb %%v0,%%v0,%%v16,0 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v16,%%v0,1 \n\t"
"wfmaxdb %%v0,%%v0,%%v16,0 \n\t"
"ldr %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return amax;
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))

/*
 * Return the largest |re| + |im| over n complex elements stored as
 * interleaved (re, im) pairs in x - the same quantity the CABS1 macro
 * in this file computes for one element.
 *
 * n is shifted right by 4 to obtain the loop count; each iteration
 * consumes 256 bytes (16 complex elements of two 8-byte values each).
 * The loop count is only tested by brctg at the bottom of the loop, so
 * the body always runs at least once; the caller in this file passes
 * n1 = n & -16 and only calls the kernel when n1 > 0.
 *
 * The "m" input operand covers all 2 * n values read through x, and n
 * is an early-clobbered read/write register because srlg and brctg
 * modify it in place.
 */
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) {
FLOAT amax;

/* seed v0 with |re| + |im| of the first two complex elements */
__asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v16,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
"srlg %[n],%[n],4\n\t"
/* r1 = 0: byte offset into x */
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* prefetch hint for the data 1024 bytes ahead of the current offset */
"pfd 1, 1024(%%r1,%[x])\n\t"
/* de-interleave: even registers collect the values at offsets 0 and 16
   of each 32-byte group, odd registers the values at offsets 8 and 24 */
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vleg %%v24,128(%%r1,%[x]),0\n\t"
"vleg %%v25,136(%%r1,%[x]),0\n\t"
"vleg %%v24,144(%%r1,%[x]),1\n\t"
"vleg %%v25,152(%%r1,%[x]),1\n\t"
"vleg %%v26,160(%%r1,%[x]),0\n\t"
"vleg %%v27,168(%%r1,%[x]),0\n\t"
"vleg %%v26,176(%%r1,%[x]),1\n\t"
"vleg %%v27,184(%%r1,%[x]),1\n\t"
"vleg %%v28,192(%%r1,%[x]),0\n\t"
"vleg %%v29,200(%%r1,%[x]),0\n\t"
"vleg %%v28,208(%%r1,%[x]),1\n\t"
"vleg %%v29,216(%%r1,%[x]),1\n\t"
"vleg %%v30,224(%%r1,%[x]),0\n\t"
"vleg %%v31,232(%%r1,%[x]),0\n\t"
"vleg %%v30,240(%%r1,%[x]),1\n\t"
"vleg %%v31,248(%%r1,%[x]),1\n\t"
/* absolute value of every loaded component */
"vflpdb %%v16,%%v16\n\t"
"vflpdb %%v17,%%v17\n\t"
"vflpdb %%v18,%%v18\n\t"
"vflpdb %%v19,%%v19\n\t"
"vflpdb %%v20,%%v20\n\t"
"vflpdb %%v21,%%v21\n\t"
"vflpdb %%v22,%%v22\n\t"
"vflpdb %%v23,%%v23\n\t"
"vflpdb %%v24,%%v24\n\t"
"vflpdb %%v25,%%v25\n\t"
"vflpdb %%v26,%%v26\n\t"
"vflpdb %%v27,%%v27\n\t"
"vflpdb %%v28,%%v28\n\t"
"vflpdb %%v29,%%v29\n\t"
"vflpdb %%v30,%%v30\n\t"
"vflpdb %%v31,%%v31\n\t"
/* pairwise sums: |re| + |im| for each element */
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v18,%%v18,%%v19\n\t"
"vfadb %%v20,%%v20,%%v21\n\t"
"vfadb %%v22,%%v22,%%v23\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vfadb %%v26,%%v26,%%v27\n\t"
"vfadb %%v28,%%v28,%%v29\n\t"
"vfadb %%v30,%%v30,%%v31\n\t"
/* tree reduction of the eight sums, then fold into the running max v0 */
"vfmaxdb %%v16,%%v16,%%v24,0\n\t"
"vfmaxdb %%v18,%%v18,%%v26,0\n\t"
"vfmaxdb %%v20,%%v20,%%v28,0\n\t"
"vfmaxdb %%v22,%%v22,%%v30,0\n\t"
"vfmaxdb %%v16,%%v16,%%v20,0\n\t"
"vfmaxdb %%v18,%%v18,%%v22,0\n\t"
"vfmaxdb %%v16,%%v16,%%v18,0\n\t"
"vfmaxdb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* combine the two lanes of v0 and move the result to amax */
"vrepg %%v16,%%v0,1\n\t"
"wfmaxdb %%v0,%%v0,%%v16,0\n\t"
"ldr %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

return amax;
} }

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT maxf = 0.0;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0) return (maxf);

if (inc_x == 1) {

BLASLONG n1 = n & -16;
if (n1 > 0) {

maxf = zamax_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
maxf=CABS1(x,0);
ix += 2;
i++;
}

while (i < n) {
if (CABS1(x,ix) > maxf) {
maxf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (maxf);
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT maxf = 0.0;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0)
return (maxf);

if (inc_x == 1) {

BLASLONG n1 = n & -16;
if (n1 > 0) {


maxf = zamax_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
} else { } else {
maxf=CABS1(x,0);
inc_x2 = 2 * inc_x;

BLASLONG n1 = n & -4;
while (i < n1) {

if (CABS1(x,ix) > maxf) {
maxf = CABS1(x,ix);
}
if (CABS1(x,ix+inc_x2) > maxf) {
maxf = CABS1(x,ix+inc_x2);
}
if (CABS1(x,ix+inc_x2*2) > maxf) {
maxf = CABS1(x,ix+inc_x2*2);
}
if (CABS1(x,ix+inc_x2*3) > maxf) {
maxf = CABS1(x,ix+inc_x2*3);
}

ix += inc_x2 * 4;

i += 4;

}


while (i < n) {
if (CABS1(x,ix) > maxf) {
maxf = CABS1(x,ix);
}
ix += inc_x2;
i++;
}
return (maxf);
maxf = CABS1(x, 0);
ix += 2;
i++;
}

while (i < n) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (maxf);

} else {

maxf = CABS1(x, 0);
inc_x2 = 2 * inc_x;

BLASLONG n1 = n & -4;
while (i < n1) {

if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) > maxf) {
maxf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + inc_x2 * 2) > maxf) {
maxf = CABS1(x, ix + inc_x2 * 2);
}
if (CABS1(x, ix + inc_x2 * 3) > maxf) {
maxf = CABS1(x, ix + inc_x2 * 3);
}

ix += inc_x2 * 4;

i += 4;

}

while (i < n) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
ix += inc_x2;
i++;
} }
return (maxf);
}
} }

+ 166
- 186
kernel/zarch/zamax_z13.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,194 +28,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))

static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT amax;

__asm__ volatile (
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v16,%%v16 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"

"vfchdb %%v26,%%v24,%%v25 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"

"vfchdb %%v27,%%v26,%%v0 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"

"vleg %%v16,128(%%r1,%2),0 \n\t"
"vleg %%v17,136(%%r1,%2),0 \n\t"
"vleg %%v16,144(%%r1,%2),1 \n\t"
"vleg %%v17,152(%%r1,%2),1 \n\t"
"vleg %%v18,160(%%r1,%2),0 \n\t"
"vleg %%v19,168(%%r1,%2),0 \n\t"
"vleg %%v18,176(%%r1,%2),1 \n\t"
"vleg %%v19,184(%%r1,%2),1 \n\t"
"vleg %%v20,192(%%r1,%2),0 \n\t"
"vleg %%v21,200(%%r1,%2),0 \n\t"
"vleg %%v20,208(%%r1,%2),1 \n\t"
"vleg %%v21,216(%%r1,%2),1 \n\t"
"vleg %%v22,224(%%r1,%2),0 \n\t"
"vleg %%v23,232(%%r1,%2),0 \n\t"
"vleg %%v22,240(%%r1,%2),1 \n\t"
"vleg %%v23,248(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"

"vfchdb %%v26,%%v24,%%v25 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"

"vfchdb %%v27,%%v26,%%v0 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);

return amax;
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))

/*
 * Return the largest |re| + |im| over n complex elements stored as
 * interleaved (re, im) pairs in x - the same quantity the CABS1 macro
 * in this file computes for one element.  This variant builds the
 * maximum from compare-high (vfchdb) plus select (vsel).
 *
 * n is shifted right by 4 to obtain the loop count; each iteration
 * consumes 256 bytes (16 complex elements of two 8-byte values each),
 * processed as two 128-byte halves.  The loop count is only tested by
 * brctg at the bottom of the loop, so the body always runs at least
 * once; the caller in this file passes n1 = n & -16 and only calls the
 * kernel when n1 > 0.
 *
 * The statement is not volatile and has no "memory" clobber, so the
 * "m" input operand is the only thing telling the compiler which
 * memory is read.  It therefore has to cover all 2 * n values of the
 * n complex elements, not just the first n.
 */
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) {
FLOAT amax;

/* seed v0 with |re| + |im| of the first two complex elements */
__asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v16,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
"srlg %[n],%[n],4\n\t"
/* r1 = 0: byte offset into x */
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* prefetch hint for the data 1024 bytes ahead of the current offset */
"pfd 1, 1024(%%r1,%[x])\n\t"
/* first half: de-interleave 8 complex elements from bytes 0..127 */
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
/* v16..v19 = |re| + |im| for the eight elements */
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
/* each vfchdb/vsel pair keeps the larger lane of its two inputs */
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v24,%%v25\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v26,%%v0\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
/* second half: same sequence for bytes 128..255 */
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v24,%%v25\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v26,%%v0\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* combine the two lanes of v0 and move the result to amax */
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
/* 2 * n values: the asm reads both parts of all n complex elements */
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27");

return amax;
} }

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT maxf = 0.0;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0) return (maxf);

if (inc_x == 1) {

BLASLONG n1 = n & -16;
if (n1 > 0) {

maxf = zamax_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
maxf=CABS1(x,0);
ix += 2;
i++;
}

while (i < n) {
if (CABS1(x,ix) > maxf) {
maxf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (maxf);
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT maxf = 0.0;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0)
return (maxf);

if (inc_x == 1) {

BLASLONG n1 = n & -16;
if (n1 > 0) {


maxf = zamax_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
} else { } else {
maxf=CABS1(x,0);
inc_x2 = 2 * inc_x;

BLASLONG n1 = n & -4;
while (i < n1) {

if (CABS1(x,ix) > maxf) {
maxf = CABS1(x,ix);
}
if (CABS1(x,ix+inc_x2) > maxf) {
maxf = CABS1(x,ix+inc_x2);
}
if (CABS1(x,ix+inc_x2*2) > maxf) {
maxf = CABS1(x,ix+inc_x2*2);
}
if (CABS1(x,ix+inc_x2*3) > maxf) {
maxf = CABS1(x,ix+inc_x2*3);
}

ix += inc_x2 * 4;

i += 4;

}


while (i < n) {
if (CABS1(x,ix) > maxf) {
maxf = CABS1(x,ix);
}
ix += inc_x2;
i++;
}
return (maxf);
maxf = CABS1(x, 0);
ix += 2;
i++;
}

while (i < n) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (maxf);

} else {

maxf = CABS1(x, 0);
inc_x2 = 2 * inc_x;

BLASLONG n1 = n & -4;
while (i < n1) {

if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) > maxf) {
maxf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + inc_x2 * 2) > maxf) {
maxf = CABS1(x, ix + inc_x2 * 2);
}
if (CABS1(x, ix + inc_x2 * 3) > maxf) {
maxf = CABS1(x, ix + inc_x2 * 3);
}

ix += inc_x2 * 4;

i += 4;

}

while (i < n) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
ix += inc_x2;
i++;
} }
return (maxf);
}
} }

+ 149
- 168
kernel/zarch/zamin.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,184 +28,165 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))

static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT amin;

__asm__ volatile (
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v16,%%v16 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
"vleg %%v24,128(%%r1,%2),0 \n\t"
"vleg %%v25,136(%%r1,%2),0 \n\t"
"vleg %%v24,144(%%r1,%2),1 \n\t"
"vleg %%v25,152(%%r1,%2),1 \n\t"
"vleg %%v26,160(%%r1,%2),0 \n\t"
"vleg %%v27,168(%%r1,%2),0 \n\t"
"vleg %%v26,176(%%r1,%2),1 \n\t"
"vleg %%v27,184(%%r1,%2),1 \n\t"
"vleg %%v28,192(%%r1,%2),0 \n\t"
"vleg %%v29,200(%%r1,%2),0 \n\t"
"vleg %%v28,208(%%r1,%2),1 \n\t"
"vleg %%v29,216(%%r1,%2),1 \n\t"
"vleg %%v30,224(%%r1,%2),0 \n\t"
"vleg %%v31,232(%%r1,%2),0 \n\t"
"vleg %%v30,240(%%r1,%2),1 \n\t"
"vleg %%v31,248(%%r1,%2),1 \n\t"

"vflpdb %%v16,%%v16 \n\t"
"vflpdb %%v17,%%v17 \n\t"
"vflpdb %%v18,%%v18 \n\t"
"vflpdb %%v19,%%v19 \n\t"
"vflpdb %%v20,%%v20 \n\t"
"vflpdb %%v21,%%v21 \n\t"
"vflpdb %%v22,%%v22 \n\t"
"vflpdb %%v23,%%v23 \n\t"
"vflpdb %%v24,%%v24 \n\t"
"vflpdb %%v25,%%v25 \n\t"
"vflpdb %%v26,%%v26 \n\t"
"vflpdb %%v27,%%v27 \n\t"
"vflpdb %%v28,%%v28 \n\t"
"vflpdb %%v29,%%v29 \n\t"
"vflpdb %%v30,%%v30 \n\t"
"vflpdb %%v31,%%v31 \n\t"

"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v18,%%v18,%%v19 \n\t"
"vfadb %%v20,%%v20,%%v21 \n\t"
"vfadb %%v22,%%v22,%%v23 \n\t"
"vfadb %%v24,%%v24,%%v25 \n\t"
"vfadb %%v26,%%v26,%%v27 \n\t"
"vfadb %%v28,%%v28,%%v29 \n\t"
"vfadb %%v30,%%v30,%%v31 \n\t"
"vfmindb %%v16,%%v16,%%v24,0 \n\t"
"vfmindb %%v18,%%v18,%%v26,0 \n\t"
"vfmindb %%v20,%%v20,%%v28,0 \n\t"
"vfmindb %%v22,%%v22,%%v30,0 \n\t"

"vfmindb %%v16,%%v16,%%v20,0 \n\t"
"vfmindb %%v18,%%v18,%%v22,0 \n\t"

"vfmindb %%v16,%%v16,%%v18,0 \n\t"

"vfmindb %%v0,%%v0,%%v16,0 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v16,%%v0,1 \n\t"
"wfmindb %%v0,%%v0,%%v16,0 \n\t"
"ldr %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return amin;
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))

/*
 * Return the smallest |re| + |im| over n complex elements stored as
 * interleaved (re, im) pairs in x - the same quantity the CABS1 macro
 * in this file computes for one element.
 *
 * n is shifted right by 4 to obtain the loop count; each iteration
 * consumes 256 bytes (16 complex elements of two 8-byte values each).
 * The loop count is only tested by brctg at the bottom of the loop, so
 * the body always runs at least once; the caller in this file passes
 * n1 = n & -16 and only calls the kernel when n1 > 0.
 *
 * The "m" input operand covers all 2 * n values read through x, and n
 * is an early-clobbered read/write register because srlg and brctg
 * modify it in place.
 */
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) {
FLOAT amin;

/* seed v0 with |re| + |im| of the first two complex elements */
__asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v16,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
"srlg %[n],%[n],4\n\t"
/* r1 = 0: byte offset into x */
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* prefetch hint for the data 1024 bytes ahead of the current offset */
"pfd 1, 1024(%%r1,%[x])\n\t"
/* de-interleave: even registers collect the values at offsets 0 and 16
   of each 32-byte group, odd registers the values at offsets 8 and 24 */
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vleg %%v24,128(%%r1,%[x]),0\n\t"
"vleg %%v25,136(%%r1,%[x]),0\n\t"
"vleg %%v24,144(%%r1,%[x]),1\n\t"
"vleg %%v25,152(%%r1,%[x]),1\n\t"
"vleg %%v26,160(%%r1,%[x]),0\n\t"
"vleg %%v27,168(%%r1,%[x]),0\n\t"
"vleg %%v26,176(%%r1,%[x]),1\n\t"
"vleg %%v27,184(%%r1,%[x]),1\n\t"
"vleg %%v28,192(%%r1,%[x]),0\n\t"
"vleg %%v29,200(%%r1,%[x]),0\n\t"
"vleg %%v28,208(%%r1,%[x]),1\n\t"
"vleg %%v29,216(%%r1,%[x]),1\n\t"
"vleg %%v30,224(%%r1,%[x]),0\n\t"
"vleg %%v31,232(%%r1,%[x]),0\n\t"
"vleg %%v30,240(%%r1,%[x]),1\n\t"
"vleg %%v31,248(%%r1,%[x]),1\n\t"
/* absolute value of every loaded component */
"vflpdb %%v16,%%v16\n\t"
"vflpdb %%v17,%%v17\n\t"
"vflpdb %%v18,%%v18\n\t"
"vflpdb %%v19,%%v19\n\t"
"vflpdb %%v20,%%v20\n\t"
"vflpdb %%v21,%%v21\n\t"
"vflpdb %%v22,%%v22\n\t"
"vflpdb %%v23,%%v23\n\t"
"vflpdb %%v24,%%v24\n\t"
"vflpdb %%v25,%%v25\n\t"
"vflpdb %%v26,%%v26\n\t"
"vflpdb %%v27,%%v27\n\t"
"vflpdb %%v28,%%v28\n\t"
"vflpdb %%v29,%%v29\n\t"
"vflpdb %%v30,%%v30\n\t"
"vflpdb %%v31,%%v31\n\t"
/* pairwise sums: |re| + |im| for each element */
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v18,%%v18,%%v19\n\t"
"vfadb %%v20,%%v20,%%v21\n\t"
"vfadb %%v22,%%v22,%%v23\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vfadb %%v26,%%v26,%%v27\n\t"
"vfadb %%v28,%%v28,%%v29\n\t"
"vfadb %%v30,%%v30,%%v31\n\t"
/* tree reduction of the eight sums, then fold into the running min v0 */
"vfmindb %%v16,%%v16,%%v24,0\n\t"
"vfmindb %%v18,%%v18,%%v26,0\n\t"
"vfmindb %%v20,%%v20,%%v28,0\n\t"
"vfmindb %%v22,%%v22,%%v30,0\n\t"
"vfmindb %%v16,%%v16,%%v20,0\n\t"
"vfmindb %%v18,%%v18,%%v22,0\n\t"
"vfmindb %%v16,%%v16,%%v18,0\n\t"
"vfmindb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* combine the two lanes of v0 and move the result to amin */
"vrepg %%v16,%%v0,1\n\t"
"wfmindb %%v0,%%v0,%%v16,0\n\t"
"ldr %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

return amin;
} }

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT minf = 0.0;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0) return (minf);

if (inc_x == 1) {

BLASLONG n1 = n & -16;
if (n1 > 0) {

minf = zamin_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
minf=CABS1(x,0);
ix += 2;
i++;
}

while (i < n) {
if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (minf);
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT minf = 0.0;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0)
return (minf);

if (inc_x == 1) {

BLASLONG n1 = n & -16;
if (n1 > 0) {


minf = zamin_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
} else { } else {
minf = CABS1(x, 0);
ix += 2;
i++;
}

while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (minf);


minf=CABS1(x,0);
inc_x2 = 2 * inc_x;
} else {


BLASLONG n1 = n & -4;
while (i < n1) {
minf = CABS1(x, 0);
inc_x2 = 2 * inc_x;


if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
if (CABS1(x,ix+inc_x2) < minf) {
minf = CABS1(x,ix+inc_x2);
}
if (CABS1(x,ix+inc_x2*2) < minf) {
minf = CABS1(x,ix+inc_x2*2);
}
if (CABS1(x,ix+inc_x2*3) < minf) {
minf = CABS1(x,ix+inc_x2*3);
}
BLASLONG n1 = n & -4;
while (i < n1) {


ix += inc_x2 * 4;
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) < minf) {
minf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + inc_x2 * 2) < minf) {
minf = CABS1(x, ix + inc_x2 * 2);
}
if (CABS1(x, ix + inc_x2 * 3) < minf) {
minf = CABS1(x, ix + inc_x2 * 3);
}


i += 4;
ix += inc_x2 * 4;


}
i += 4;


}


while (i < n) {
if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
ix += inc_x2;
i++;
}
return (minf);
while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
ix += inc_x2;
i++;
} }
return (minf);
}
} }

+ 158
- 178
kernel/zarch/zamin_z13.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,194 +28,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))

static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT amin;

__asm__ volatile (
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v16,%%v16 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"

"vfchdb %%v26,%%v25,%%v24 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"

"vfchdb %%v27,%%v0,%%v26 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"

"vleg %%v16,128(%%r1,%2),0 \n\t"
"vleg %%v17,136(%%r1,%2),0 \n\t"
"vleg %%v16,144(%%r1,%2),1 \n\t"
"vleg %%v17,152(%%r1,%2),1 \n\t"
"vleg %%v18,160(%%r1,%2),0 \n\t"
"vleg %%v19,168(%%r1,%2),0 \n\t"
"vleg %%v18,176(%%r1,%2),1 \n\t"
"vleg %%v19,184(%%r1,%2),1 \n\t"
"vleg %%v20,192(%%r1,%2),0 \n\t"
"vleg %%v21,200(%%r1,%2),0 \n\t"
"vleg %%v20,208(%%r1,%2),1 \n\t"
"vleg %%v21,216(%%r1,%2),1 \n\t"
"vleg %%v22,224(%%r1,%2),0 \n\t"
"vleg %%v23,232(%%r1,%2),0 \n\t"
"vleg %%v22,240(%%r1,%2),1 \n\t"
"vleg %%v23,248(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"

"vfchdb %%v26,%%v25,%%v24 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"

"vfchdb %%v27,%%v0,%%v26 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);

return amin;
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))

/*
 * Vector kernel for the unit-stride path of the complex "absolute minimum".
 *
 * Returns the smallest value of |re| + |im| (the CABS1 measure) over the
 * first n complex elements of x, i.e. over x[0 .. 2*n-1].
 *
 * n must be a non-zero multiple of 16: the loop count is n >> 4 and the
 * loop is closed by a decrement-and-branch, so a count of zero would not
 * terminate normally.  The visible caller only passes (n & -16) when that
 * is greater than zero.
 *
 * The double-precision ("...db") vector forms are used throughout, so FLOAT
 * is presumably double in this file -- confirm against the build config.
 */
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) {
  FLOAT amin;

  __asm__(
      /* Seed the running minimum in v0 with CABS1 of the first two complex
         elements: v0 = {re0, re1}, v16 = {im0, im1}, then |.| and add. */
      "vleg %%v0,0(%[x]),0\n\t"
      "vleg %%v16,8(%[x]),0\n\t"
      "vleg %%v0,16(%[x]),1\n\t"
      "vleg %%v16,24(%[x]),1\n\t"
      "vflpdb %%v0,%%v0\n\t"
      "vflpdb %%v16,%%v16\n\t"
      "vfadb %%v0,%%v0,%%v16\n\t"
      /* n becomes the iteration count (16 complex elements = 256 bytes per
         iteration); r1 is the running byte offset into x. */
      "srlg %[n],%[n],4\n\t"
      "xgr %%r1,%%r1\n\t"
      "0:\n\t"
      "pfd 1, 1024(%%r1,%[x])\n\t"
      /* First half: 8 complex elements.  Even registers collect the real
         parts, odd registers the imaginary parts, two elements each. */
      "vleg %%v16,0(%%r1,%[x]),0\n\t"
      "vleg %%v17,8(%%r1,%[x]),0\n\t"
      "vleg %%v16,16(%%r1,%[x]),1\n\t"
      "vleg %%v17,24(%%r1,%[x]),1\n\t"
      "vleg %%v18,32(%%r1,%[x]),0\n\t"
      "vleg %%v19,40(%%r1,%[x]),0\n\t"
      "vleg %%v18,48(%%r1,%[x]),1\n\t"
      "vleg %%v19,56(%%r1,%[x]),1\n\t"
      "vleg %%v20,64(%%r1,%[x]),0\n\t"
      "vleg %%v21,72(%%r1,%[x]),0\n\t"
      "vleg %%v20,80(%%r1,%[x]),1\n\t"
      "vleg %%v21,88(%%r1,%[x]),1\n\t"
      "vleg %%v22,96(%%r1,%[x]),0\n\t"
      "vleg %%v23,104(%%r1,%[x]),0\n\t"
      "vleg %%v22,112(%%r1,%[x]),1\n\t"
      "vleg %%v23,120(%%r1,%[x]),1\n\t"
      /* Absolute values, then |re| + |im| into v16..v19. */
      "vflpdb %%v16, %%v16\n\t"
      "vflpdb %%v17, %%v17\n\t"
      "vflpdb %%v18, %%v18\n\t"
      "vflpdb %%v19, %%v19\n\t"
      "vflpdb %%v20, %%v20\n\t"
      "vflpdb %%v21, %%v21\n\t"
      "vflpdb %%v22, %%v22\n\t"
      "vflpdb %%v23, %%v23\n\t"
      "vfadb %%v16,%%v16,%%v17\n\t"
      "vfadb %%v17,%%v18,%%v19\n\t"
      "vfadb %%v18,%%v20,%%v21\n\t"
      "vfadb %%v19,%%v22,%%v23\n\t"
      /* Pairwise minimum tree: the compare builds a "b > a" mask and the
         select then keeps a where the mask is set, b elsewhere. */
      "vfchdb %%v24,%%v17,%%v16\n\t"
      "vfchdb %%v25,%%v19,%%v18\n\t"
      "vsel %%v24,%%v16,%%v17,%%v24\n\t"
      "vsel %%v25,%%v18,%%v19,%%v25\n\t"
      "vfchdb %%v26,%%v25,%%v24\n\t"
      "vsel %%v26,%%v24,%%v25,%%v26\n\t"
      /* Fold this half's minimum into the running minimum v0. */
      "vfchdb %%v27,%%v0,%%v26\n\t"
      "vsel %%v0,%%v26,%%v0,%%v27\n\t"
      /* Second half: the next 8 complex elements, same sequence. */
      "vleg %%v16,128(%%r1,%[x]),0\n\t"
      "vleg %%v17,136(%%r1,%[x]),0\n\t"
      "vleg %%v16,144(%%r1,%[x]),1\n\t"
      "vleg %%v17,152(%%r1,%[x]),1\n\t"
      "vleg %%v18,160(%%r1,%[x]),0\n\t"
      "vleg %%v19,168(%%r1,%[x]),0\n\t"
      "vleg %%v18,176(%%r1,%[x]),1\n\t"
      "vleg %%v19,184(%%r1,%[x]),1\n\t"
      "vleg %%v20,192(%%r1,%[x]),0\n\t"
      "vleg %%v21,200(%%r1,%[x]),0\n\t"
      "vleg %%v20,208(%%r1,%[x]),1\n\t"
      "vleg %%v21,216(%%r1,%[x]),1\n\t"
      "vleg %%v22,224(%%r1,%[x]),0\n\t"
      "vleg %%v23,232(%%r1,%[x]),0\n\t"
      "vleg %%v22,240(%%r1,%[x]),1\n\t"
      "vleg %%v23,248(%%r1,%[x]),1\n\t"
      "vflpdb %%v16, %%v16\n\t"
      "vflpdb %%v17, %%v17\n\t"
      "vflpdb %%v18, %%v18\n\t"
      "vflpdb %%v19, %%v19\n\t"
      "vflpdb %%v20, %%v20\n\t"
      "vflpdb %%v21, %%v21\n\t"
      "vflpdb %%v22, %%v22\n\t"
      "vflpdb %%v23, %%v23\n\t"
      "vfadb %%v16,%%v16,%%v17\n\t"
      "vfadb %%v17,%%v18,%%v19\n\t"
      "vfadb %%v18,%%v20,%%v21\n\t"
      "vfadb %%v19,%%v22,%%v23\n\t"
      "vfchdb %%v24,%%v17,%%v16\n\t"
      "vfchdb %%v25,%%v19,%%v18\n\t"
      "vsel %%v24,%%v16,%%v17,%%v24\n\t"
      "vsel %%v25,%%v18,%%v19,%%v25\n\t"
      "vfchdb %%v26,%%v25,%%v24\n\t"
      "vsel %%v26,%%v24,%%v25,%%v26\n\t"
      "vfchdb %%v27,%%v0,%%v26\n\t"
      "vsel %%v0,%%v26,%%v0,%%v27\n\t"
      "agfi %%r1, 256\n\t"
      "brctg %[n], 0b\n\t"
      /* Horizontal step: minimum of the two lanes of v0, left in lane 0,
         which is then copied out through f0. */
      "vrepg %%v16,%%v0,1\n\t"
      "wfchdb %%v17,%%v16,%%v0\n\t"
      "vsel %%v0,%%v0,%%v16,%%v17\n\t"
      "ldr %[amin],%%f0"
      : [amin] "=f"(amin),[n] "+&r"(n)
      /* The asm reads n complex elements = n * 2 FLOATs; the memory operand
         must cover all of them so the compiler knows the whole range is
         read by this statement. */
      : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
      : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
        "v23", "v24", "v25", "v26", "v27");

  return amin;
}

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT minf = 0.0;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0) return (minf);

if (inc_x == 1) {

BLASLONG n1 = n & -16;
if (n1 > 0) {

minf = zamin_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
minf=CABS1(x,0);
ix += 2;
i++;
}

while (i < n) {
if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (minf);
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT minf = 0.0;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0)
return (minf);

if (inc_x == 1) {

BLASLONG n1 = n & -16;
if (n1 > 0) {


minf = zamin_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
} else { } else {
minf = CABS1(x, 0);
ix += 2;
i++;
}

while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (minf);


minf=CABS1(x,0);
inc_x2 = 2 * inc_x;
} else {


BLASLONG n1 = n & -4;
while (i < n1) {
minf = CABS1(x, 0);
inc_x2 = 2 * inc_x;


if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
if (CABS1(x,ix+inc_x2) < minf) {
minf = CABS1(x,ix+inc_x2);
}
if (CABS1(x,ix+inc_x2*2) < minf) {
minf = CABS1(x,ix+inc_x2*2);
}
if (CABS1(x,ix+inc_x2*3) < minf) {
minf = CABS1(x,ix+inc_x2*3);
}
BLASLONG n1 = n & -4;
while (i < n1) {


ix += inc_x2 * 4;
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) < minf) {
minf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + inc_x2 * 2) < minf) {
minf = CABS1(x, ix + inc_x2 * 2);
}
if (CABS1(x, ix + inc_x2 * 3) < minf) {
minf = CABS1(x, ix + inc_x2 * 3);
}


i += 4;
ix += inc_x2 * 4;


}
i += 4;


}


while (i < n) {
if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
ix += inc_x2;
i++;
}
return (minf);
while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
ix += inc_x2;
i++;
} }
return (minf);
}
} }

+ 110
- 122
kernel/zarch/zasum.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -28,138 +28,126 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>


#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif


static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT asum;
__asm__ (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v2 \n\t"
"vfadb %%v0,%%v0,%%v3 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
return asum;
/*
 * Vector kernel for the unit-stride path of the complex absolute sum.
 *
 * Returns the sum of |x[k]| over the n * 2 FLOAT values starting at x
 * (n complex elements, real and imaginary parts treated alike).
 *
 * The loop count is n >> 4 and each iteration consumes 256 bytes, so n is
 * expected to be a non-zero multiple of 16; the visible caller passes
 * (n & -16) only when that is greater than zero.
 *
 * NOTE(review): the "...db" instruction forms are the double-precision
 * ones, so FLOAT is presumably double here -- confirm against the build.
 */
static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
FLOAT asum;
/* v24..v31 are eight independent two-lane accumulators, cleared first. */
__asm__("vzero %%v24\n\t"
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
/* n becomes the iteration count; r1 is the running byte offset into x. */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First 128 bytes of the iteration: load, take absolute values, and add
   one vector into each of the eight accumulators. */
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v24,%%v24,%%v16\n\t"
"vfadb %%v25,%%v25,%%v17\n\t"
"vfadb %%v26,%%v26,%%v18\n\t"
"vfadb %%v27,%%v27,%%v19\n\t"
"vfadb %%v28,%%v28,%%v20\n\t"
"vfadb %%v29,%%v29,%%v21\n\t"
"vfadb %%v30,%%v30,%%v22\n\t"
"vfadb %%v31,%%v31,%%v23\n\t"
/* Second 128 bytes of the iteration, same sequence. */
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v24,%%v24,%%v16\n\t"
"vfadb %%v25,%%v25,%%v17\n\t"
"vfadb %%v26,%%v26,%%v18\n\t"
"vfadb %%v27,%%v27,%%v19\n\t"
"vfadb %%v28,%%v28,%%v20\n\t"
"vfadb %%v29,%%v29,%%v21\n\t"
"vfadb %%v30,%%v30,%%v22\n\t"
"vfadb %%v31,%%v31,%%v23\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b\n\t"
/* Reduce the eight accumulators into v24, then add its two lanes and
   store lane 0 directly into the asum variable. */
"vfadb %%v24,%%v24,%%v25\n\t"
"vfadb %%v24,%%v24,%%v26\n\t"
"vfadb %%v24,%%v24,%%v27\n\t"
"vfadb %%v24,%%v24,%%v28\n\t"
"vfadb %%v24,%%v24,%%v29\n\t"
"vfadb %%v24,%%v24,%%v30\n\t"
"vfadb %%v24,%%v24,%%v31\n\t"
"vrepg %%v25,%%v24,1\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vsteg %%v24,%[asum],0"
/* asum is a memory output because the result is stored with vsteg; n is
   read-write and early-clobber because the asm shifts and decrements it. */
: [asum] "=m"(asum),[n] "+&r"(n)
/* The memory input covers all n * 2 FLOATs the loop reads. */
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return asum;
} }


FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0;
BLASLONG ip=0;
FLOAT sumf = 0.0;
BLASLONG n1;
BLASLONG inc_x2;
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ip = 0;
FLOAT sumf = 0.0;
BLASLONG n1;
BLASLONG inc_x2;


if (n <= 0 || inc_x <= 0) return(sumf);
if (n <= 0 || inc_x <= 0)
return (sumf);


if ( inc_x == 1 )
{
if (inc_x == 1) {


n1 = n & -16;
if ( n1 > 0 )
{
n1 = n & -16;
if (n1 > 0) {


sumf = zasum_kernel_16(n1, x);
i=n1;
ip=2*n1;
}

while(i < n)
{
sumf += ABS(x[ip]) + ABS(x[ip+1]);
i++;
ip+=2;
}
sumf = zasum_kernel_16(n1, x);
i = n1;
ip = 2 * n1;
}


while (i < n) {
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
i++;
ip += 2;
} }
else
{
inc_x2 = 2* inc_x;


while(i < n)
{
sumf += ABS(x[ip]) + ABS(x[ip+1]);
ip+=inc_x2;
i++;
}
} else {
inc_x2 = 2 * inc_x;


while (i < n) {
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
ip += inc_x2;
i++;
} }
return(sumf);
}



}
return (sumf);
}

+ 112
- 120
kernel/zarch/zaxpy.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,144 +27,136 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
__asm__(
#if !defined(CONJ) #if !defined(CONJ)
"vlrepg %%v0,0(%3) \n\t"
"vleg %%v1,8(%3),0 \n\t"
"wflcdb %%v1,%%v1 \n\t"
"vleg %%v1,8(%3),1 \n\t"
#else
"vleg %%v0,0(%3),1 \n\t"
"vflcdb %%v0,%%v0 \n\t"
"vleg %%v0,0(%3),0 \n\t"
"vlrepg %%v1,8(%3) \n\t"
"vlrepg %%v0,0(%[alpha])\n\t"
"vleg %%v1,8(%[alpha]),0\n\t"
"wflcdb %%v1,%%v1\n\t"
"vleg %%v1,8(%[alpha]),1\n\t"
#else
"vleg %%v0,0(%[alpha]),1\n\t"
"vflcdb %%v0,%%v0\n\t"
"vleg %%v0,0(%[alpha]),0\n\t"
"vlrepg %%v1,8(%[alpha])\n\t"
#endif #endif
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vpdi %%v24,%%v16,%%v16,4 \n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t"

"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t"

"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"

"vst %%v28,0(%%r1,%2) \n\t"
"vst %%v29,16(%%r1,%2) \n\t"
"vst %%v30,32(%%r1,%2) \n\t"
"vst %%v31,48(%%r1,%2) \n\t"

"vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,80(%%r1,%1) \n\t"
"vl %%v18,96(%%r1,%1) \n\t"
"vl %%v19,112(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vpdi %%v24,%%v16,%%v16,4 \n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t"

"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t"

"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"

"vst %%v28,64(%%r1,%2) \n\t"
"vst %%v29,80(%%r1,%2) \n\t"
"vst %%v30,96(%%r1,%2) \n\t"
"vst %%v31,112(%%r1,%2) \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v8,0(%%r1,%[x])\n\t"
"vl %%v9,16(%%r1,%[x])\n\t"
"vl %%v10,32(%%r1,%[x])\n\t"
"vl %%v11,48(%%r1,%[x])\n\t"
"vl %%v12,0(%%r1,%[y])\n\t"
"vl %%v13,16(%%r1,%[y])\n\t"
"vl %%v14,32(%%r1,%[y])\n\t"
"vl %%v15,48(%%r1,%[y])\n\t"
"vl %%v16,64(%%r1,%[x])\n\t"
"vl %%v17,80(%%r1,%[x])\n\t"
"vl %%v18,96(%%r1,%[x])\n\t"
"vl %%v19,112(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[y])\n\t"
"vl %%v21,80(%%r1,%[y])\n\t"
"vl %%v22,96(%%r1,%[y])\n\t"
"vl %%v23,112(%%r1,%[y])\n\t"
"vpdi %%v24,%%v8,%%v8,4\n\t"
"vpdi %%v25,%%v9,%%v9,4\n\t"
"vpdi %%v26,%%v10,%%v10,4\n\t"
"vpdi %%v27,%%v11,%%v11,4\n\t"
"vpdi %%v28,%%v16,%%v16,4\n\t"
"vpdi %%v29,%%v17,%%v17,4\n\t"
"vpdi %%v30,%%v18,%%v18,4\n\t"
"vpdi %%v31,%%v19,%%v19,4\n\t"
"vfmadb %%v8,%%v8,%%v0,%%v12\n\t"
"vfmadb %%v9,%%v9,%%v0,%%v13\n\t"
"vfmadb %%v10,%%v10,%%v0,%%v14\n\t"
"vfmadb %%v11,%%v11,%%v0,%%v15\n\t"
"vfmadb %%v16,%%v16,%%v0,%%v20\n\t"
"vfmadb %%v17,%%v17,%%v0,%%v21\n\t"
"vfmadb %%v18,%%v18,%%v0,%%v22\n\t"
"vfmadb %%v19,%%v19,%%v0,%%v23\n\t"
"vfmadb %%v8,%%v24,%%v1,%%v8\n\t"
"vfmadb %%v9,%%v25,%%v1,%%v9\n\t"
"vfmadb %%v10,%%v26,%%v1,%%v10\n\t"
"vfmadb %%v11,%%v27,%%v1,%%v11\n\t"
"vfmadb %%v16,%%v28,%%v1,%%v16\n\t"
"vfmadb %%v17,%%v29,%%v1,%%v17\n\t"
"vfmadb %%v18,%%v30,%%v1,%%v18\n\t"
"vfmadb %%v19,%%v31,%%v1,%%v19\n\t"
"vst %%v8,0(%%r1,%[y])\n\t"
"vst %%v9,16(%%r1,%[y])\n\t"
"vst %%v10,32(%%r1,%[y])\n\t"
"vst %%v11,48(%%r1,%[y])\n\t"
"vst %%v16,64(%%r1,%[y])\n\t"
"vst %%v17,80(%%r1,%[y])\n\t"
"vst %%v18,96(%%r1,%[y])\n\t"
"vst %%v19,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x),
"m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
"v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
} }


int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT da[2] __attribute__ ((aligned(16)));
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT da[2] __attribute__ ((aligned(16)));


if (n <= 0) return (0);
if (n <= 0)
return (0);


if ((inc_x == 1) && (inc_y == 1)) {
if ((inc_x == 1) && (inc_y == 1)) {


BLASLONG n1 = n & -8;
BLASLONG n1 = n & -8;


if (n1) {
da[0] = da_r;
da[1] = da_i;
zaxpy_kernel_8(n1, x, y, da);
ix = 2 * n1;
}
i = n1;
while (i < n) {
if (n1) {
da[0] = da_r;
da[1] = da_i;
zaxpy_kernel_8(n1, x, y, da);
ix = 2 * n1;
}
i = n1;
while (i < n) {
#if !defined(CONJ) #if !defined(CONJ)
y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
#else #else
y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
#endif #endif
i++;
ix += 2;

}
return (0);

i++;
ix += 2;


} }
return (0);


inc_x *= 2;
inc_y *= 2;
}


while (i < n) {
inc_x *= 2;
inc_y *= 2;

while (i < n) {


#if !defined(CONJ) #if !defined(CONJ)
y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
#else #else
y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
#endif #endif
ix += inc_x;
iy += inc_y;
i++;
ix += inc_x;
iy += inc_y;
i++;


}
return (0);
}
return (0);


} }



+ 45
- 57
kernel/zarch/zcopy.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,73 +27,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
"srlg %%r0,%0,4 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
"mvc 0(256,%%r2),0(%%r1) \n\t"
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","r2"
);
/*
 * Block-copy kernel for the unit-stride path of the complex copy.
 *
 * Copies n complex elements (n * 2 FLOATs) from x to y in chunks of
 * 256 bytes using a storage-to-storage move.  The loop count is n >> 4
 * and the loop is closed by a decrement-and-branch, so n must be a
 * non-zero multiple of 16; the visible caller passes (n & -16) only when
 * that is greater than zero.
 *
 * NOTE(review): 256 bytes per 16 elements implies 16-byte complex
 * elements, i.e. FLOAT is presumably double -- confirm against the build.
 */
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
  __asm__("srlg %[n],%[n],4\n\t"
          "0:\n\t"
          "pfd 1, 1024(%[x])\n\t"
          "pfd 2, 1024(%[y])\n\t"
          "mvc 0(256,%[y]),0(%[x])\n\t"
          /* Advance both pointers by one chunk; "la" leaves the condition
             code untouched. */
          "la %[x],256(%[x])\n\t"
          "la %[y],256(%[y])\n\t"
          "brctg %[n],0b"
          /* x, y and n are all modified inside the asm, so each is a
             read-write, early-clobber register operand.  n must be listed
             here: the template refers to it as %[n]. */
          : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y),
            [n] "+&r"(n)
          : "m"(*(const FLOAT (*)[n * 2]) x)
          : "cc");
}


int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;


if ( n <= 0 ) return(0);
if (n <= 0)
return (0);


if ( (inc_x == 1) && (inc_y == 1 ))
{
if ((inc_x == 1) && (inc_y == 1)) {


BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
zcopy_kernel_16(n1, x, y);
i=n1;
ix=n1*2;
iy=n1*2;
}

while(i < n)
{
y[iy] = x[iy] ;
y[iy+1] = x[ix+1] ;
ix+=2;
iy+=2;
i++ ;

}
BLASLONG n1 = n & -16;
if (n1 > 0) {
zcopy_kernel_16(n1, x, y);
i = n1;
ix = n1 * 2;
iy = n1 * 2;
}


while (i < n) {
y[iy] = x[iy];
y[iy + 1] = x[ix + 1];
ix += 2;
iy += 2;
i++;


} }
else
{


BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_y2 = 2 * inc_y;
} else {


while(i < n)
{
y[iy] = x[ix] ;
y[iy+1] = x[ix+1] ;
ix += inc_x2 ;
iy += inc_y2 ;
i++ ;
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_y2 = 2 * inc_y;


}
while (i < n) {
y[iy] = x[ix];
y[iy + 1] = x[ix + 1];
ix += inc_x2;
iy += inc_y2;
i++;


} }
return(0);

}

return (0);
} }

+ 120
- 126
kernel/zarch/zdot.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,152 +27,146 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
__asm__ volatile(
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"vzero %%v28 \n\t"
"vzero %%v29 \n\t"
"vzero %%v30 \n\t"
"vzero %%v31 \n\t"
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t"

"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25 \n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26 \n\t"
"vfmadb %%v27,%%v21,%%v1,%%v27 \n\t"
"vfmadb %%v28,%%v18,%%v2,%%v28 \n\t"
"vfmadb %%v29,%%v22,%%v2,%%v29 \n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30 \n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31 \n\t"

"vl %%v16, 64(%%r1,%1) \n\t"
"vl %%v17, 80(%%r1,%1) \n\t"
"vl %%v18, 96(%%r1,%1) \n\t"
"vl %%v19, 112(%%r1,%1) \n\t"
"vl %%v0, 64(%%r1,%2) \n\t"
"vl %%v1, 80(%%r1,%2) \n\t"
"vl %%v2, 96(%%r1,%2) \n\t"
"vl %%v3, 112(%%r1,%2) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t"

"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25 \n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26 \n\t"
"vfmadb %%v27,%%v21,%%v1,%%v27 \n\t"
"vfmadb %%v28,%%v18,%%v2,%%v28 \n\t"
"vfmadb %%v29,%%v22,%%v2,%%v29 \n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30 \n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31 \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"vfadb %%v24,%%v24,%%v26 \n\t"
"vfadb %%v24,%%v24,%%v28 \n\t"
"vfadb %%v24,%%v24,%%v30 \n\t"
"vfadb %%v25,%%v25,%%v27 \n\t"
"vfadb %%v25,%%v25,%%v29 \n\t"
"vfadb %%v25,%%v25,%%v31 \n\t"
"vsteg %%v24,0(%3),0 \n\t"
"vsteg %%v24,8(%3),1 \n\t"
"vsteg %%v25,16(%3),1 \n\t"
"vsteg %%v25,24(%3),0 "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
/*
 * Vector kernel for the unit-stride path of the complex dot product.
 *
 * Processes n complex elements of x and y and overwrites d[0..3] with four
 * partial sums, in the same layout the scalar tail loop in the caller
 * accumulates into:
 *   d[0] = sum of x_re * y_re      d[1] = sum of x_im * y_im
 *   d[2] = sum of x_re * y_im      d[3] = sum of x_im * y_re
 * The caller combines them into the (conjugated or plain) result.
 *
 * The loop count is n >> 3 and each iteration consumes 128 bytes from each
 * input, so n is expected to be a non-zero multiple of 8; the visible
 * caller passes (n & -8) only when it is non-zero.
 *
 * NOTE(review): the "...db" instruction forms are the double-precision
 * ones, so FLOAT is presumably double here -- confirm against the build.
 */
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
/* v24..v31 are eight two-lane accumulators: even registers collect
   {x_re*y_re, x_im*y_im}, odd registers collect {x_im*y_re, x_re*y_im}. */
__asm__("vzero %%v24\n\t"
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
/* n becomes the iteration count; r1 is the byte offset shared by x and y. */
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%[y])\n\t"
/* First four complex elements: v16..v19 from x, v0..v3 from y. */
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
/* v20..v23 are the x vectors with their two 64-bit halves swapped. */
"vpdi %%v20,%%v16,%%v16,4\n\t"
"vpdi %%v21,%%v17,%%v17,4\n\t"
"vpdi %%v22,%%v18,%%v18,4\n\t"
"vpdi %%v23,%%v19,%%v19,4\n\t"
/* Fused multiply-add of x (plain and swapped) with y into the accumulators. */
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26\n\t"
"vfmadb %%v27,%%v21,%%v1,%%v27\n\t"
"vfmadb %%v28,%%v18,%%v2,%%v28\n\t"
"vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
/* Next four complex elements, same sequence. */
"vl %%v16, 64(%%r1,%[x])\n\t"
"vl %%v17, 80(%%r1,%[x])\n\t"
"vl %%v18, 96(%%r1,%[x])\n\t"
"vl %%v19, 112(%%r1,%[x])\n\t"
"vl %%v0, 64(%%r1,%[y])\n\t"
"vl %%v1, 80(%%r1,%[y])\n\t"
"vl %%v2, 96(%%r1,%[y])\n\t"
"vl %%v3, 112(%%r1,%[y])\n\t"
"vpdi %%v20,%%v16,%%v16,4\n\t"
"vpdi %%v21,%%v17,%%v17,4\n\t"
"vpdi %%v22,%%v18,%%v18,4\n\t"
"vpdi %%v23,%%v19,%%v19,4\n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26\n\t"
"vfmadb %%v27,%%v21,%%v1,%%v27\n\t"
"vfmadb %%v28,%%v18,%%v2,%%v28\n\t"
"vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b\n\t"
/* Reduce the even accumulators into v24 and the odd ones into v25. */
"vfadb %%v24,%%v24,%%v26\n\t"
"vfadb %%v24,%%v24,%%v28\n\t"
"vfadb %%v24,%%v24,%%v30\n\t"
"vfadb %%v25,%%v25,%%v27\n\t"
"vfadb %%v25,%%v25,%%v29\n\t"
"vfadb %%v25,%%v25,%%v31\n\t"
/* Store the four lanes; note v25's lanes go out in swapped order so that
   d[2] receives lane 1 and d[3] receives lane 0. */
"vsteg %%v24,0(%[d]),0\n\t"
"vsteg %%v24,8(%[d]),1\n\t"
"vsteg %%v25,16(%[d]),1\n\t"
"vsteg %%v25,24(%[d]),0"
/* d[0..3] is write-only; n is read-write and early-clobber because the
   asm shifts and decrements it. */
: "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n)
/* Both memory inputs cover the n * 2 FLOATs read from each vector. */
: [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x),
"m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }


OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i;
BLASLONG ix, iy;
OPENBLAS_COMPLEX_FLOAT result;
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
if (n <= 0) {
CREAL(result) = 0.0;
CIMAG(result) = 0.0;
return (result);
}
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y) {
BLASLONG i;
BLASLONG ix, iy;
OPENBLAS_COMPLEX_FLOAT result;
FLOAT dot[4] __attribute__ ((aligned(16))) = {
0.0, 0.0, 0.0, 0.0};
if (n <= 0) {
CREAL(result) = 0.0;
CIMAG(result) = 0.0;
return (result);


if ((inc_x == 1) && (inc_y == 1)) {
}


BLASLONG n1 = n & -8;
if ((inc_x == 1) && (inc_y == 1)) {


if (n1)
zdot_kernel_8(n1, x, y, dot);
BLASLONG n1 = n & -8;


i = n1;
BLASLONG j = i * 2;
if (n1)
zdot_kernel_8(n1, x, y, dot);


while (i < n) {
i = n1;
BLASLONG j = i * 2;


dot[0] += x[j] * y[j];
dot[1] += x[j + 1] * y[j + 1];
dot[2] += x[j] * y[j + 1];
dot[3] += x[j + 1] * y[j];
while (i < n) {


j += 2;
i++;
dot[0] += x[j] * y[j];
dot[1] += x[j + 1] * y[j + 1];
dot[2] += x[j] * y[j + 1];
dot[3] += x[j + 1] * y[j];


}
j += 2;
i++;


}


} else {
i = 0;
ix = 0;
iy = 0;
inc_x <<= 1;
inc_y <<= 1;
while (i < n) {
} else {
i = 0;
ix = 0;
iy = 0;
inc_x <<= 1;
inc_y <<= 1;
while (i < n) {


dot[0] += x[ix] * y[iy];
dot[1] += x[ix + 1] * y[iy + 1];
dot[2] += x[ix] * y[iy + 1];
dot[3] += x[ix + 1] * y[iy];
dot[0] += x[ix] * y[iy];
dot[1] += x[ix + 1] * y[iy + 1];
dot[2] += x[ix] * y[iy + 1];
dot[3] += x[ix + 1] * y[iy];


ix += inc_x;
iy += inc_y;
i++;
ix += inc_x;
iy += inc_y;
i++;


}
} }
}


#if !defined(CONJ) #if !defined(CONJ)
CREAL(result) = dot[0] - dot[1];
CIMAG(result) = dot[2] + dot[3];
CREAL(result) = dot[0] - dot[1];
CIMAG(result) = dot[2] + dot[3];
#else #else
CREAL(result) = dot[0] + dot[1];
CIMAG(result) = dot[2] - dot[3];
CREAL(result) = dot[0] + dot[1];
CIMAG(result) = dot[2] - dot[3];


#endif #endif


return (result);
return (result);


} }



+ 544
- 603
kernel/zarch/zgemv_n_4.c
File diff suppressed because it is too large
View File


+ 536
- 563
kernel/zarch/zgemv_t_4.c
File diff suppressed because it is too large
View File


+ 196
- 217
kernel/zarch/zrot.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,230 +27,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
"vlrepg %%v0,%3 \n\t"
"vlrepg %%v1,%4 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
/*
 * Apply a plane rotation to two contiguous vectors with z/Architecture
 * vector instructions:
 *     x_new = c * x + s * y
 *     y_new = c * y - s * x
 * The factors *c and *s are replicated into both 8-byte lanes of a vector
 * register, so every FLOAT of x and y is rotated independently (real and
 * imaginary parts are treated alike).
 *
 * n    - element count. It is shifted right by 4 to obtain the number of loop
 *        passes; each pass covers 256 bytes of x and 256 bytes of y (16
 *        elements of two 8-byte values -- assumes FLOAT is double). The loop
 *        is count-driven (brctg) and has no zero check, so the caller is
 *        expected to pass a non-zero multiple of 16.
 * x, y - vectors, updated in place.
 * c, s - pointers to the rotation coefficients.
 */
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
__asm__("vlrepg %%v0,%[c]\n\t" /* v0 = {c, c} */
"vlrepg %%v1,%[s]\n\t" /* v1 = {s, s} */
"srlg %[n],%[n],4\n\t" /* n = number of 256-byte passes */
"xgr %%r1,%%r1\n\t" /* r1 = running byte offset, starts at 0 */
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead (code 2: store access) */
"pfd 2, 1024(%%r1,%[y])\n\t"
/* bytes 0..63: x -> v24-v27, y -> v16-v19 */
"vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%[y])\n\t"
/* 1st parts: v28-v31 = x*c, v20-v23 = x*s */
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts: v28-v31 = y*s + x*c (new x), v20-v23 = y*c - x*s (new y) */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
/* write the rotated values back to x and y */
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%[y])\n\t"
/* bytes 64..127: same load / multiply / fused update / store sequence */
"vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v19, 112(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
/* bytes 128..191 */
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%[y])\n\t"
/* bytes 192..255 */
"vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v19, 240(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t" /* advance the byte offset to the next 256-byte chunk */
"brctg %[n],0b" /* decrement the pass counter, loop while it is non-zero */
/* outputs: x and y are read-write memory of n*2 FLOATs; n is consumed as the counter */
: "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
/* inputs: base addresses in address registers, c and s read from memory */
: [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s)
/* clobbers: condition code, offset register r1 and every vector register used */
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }


int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT temp[2];
BLASLONG inc_x2;
BLASLONG inc_y2;

if ( n <= 0 ) return(0);

if ( (inc_x == 1) && (inc_y == 1) )
{

BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
FLOAT cosa,sina;
cosa=c;
sina=s;
zrot_kernel_16(n1, x, y, &cosa, &sina);
i=n1;
ix=2*n1;
}

while(i < n)
{
temp[0] = c*x[ix] + s*y[ix] ;
temp[1] = c*x[ix+1] + s*y[ix+1] ;
y[ix] = c*y[ix] - s*x[ix] ;
y[ix+1] = c*y[ix+1] - s*x[ix+1] ;
x[ix] = temp[0] ;
x[ix+1] = temp[1] ;

ix += 2 ;
i++ ;
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT c, FLOAT s) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT temp[2];
BLASLONG inc_x2;
BLASLONG inc_y2;

if (n <= 0)
return (0);

if ((inc_x == 1) && (inc_y == 1)) {

BLASLONG n1 = n & -16;
if (n1 > 0) {
FLOAT cosa, sina;
cosa = c;
sina = s;
zrot_kernel_16(n1, x, y, &cosa, &sina);
i = n1;
ix = 2 * n1;
}


}
while (i < n) {
temp[0] = c * x[ix] + s * y[ix];
temp[1] = c * x[ix + 1] + s * y[ix + 1];
y[ix] = c * y[ix] - s * x[ix];
y[ix + 1] = c * y[ix + 1] - s * x[ix + 1];
x[ix] = temp[0];
x[ix + 1] = temp[1];


ix += 2;
i++;


} }
else
{
inc_x2 = 2 * inc_x ;
inc_y2 = 2 * inc_y ;
while(i < n)
{
temp[0] = c*x[ix] + s*y[iy] ;
temp[1] = c*x[ix+1] + s*y[iy+1] ;
y[iy] = c*y[iy] - s*x[ix] ;
y[iy+1] = c*y[iy+1] - s*x[ix+1] ;
x[ix] = temp[0] ;
x[ix+1] = temp[1] ;

ix += inc_x2 ;
iy += inc_y2 ;
i++ ;


}
} else {
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
while (i < n) {
temp[0] = c * x[ix] + s * y[iy];
temp[1] = c * x[ix + 1] + s * y[iy + 1];
y[iy] = c * y[iy] - s * x[ix];
y[iy + 1] = c * y[iy + 1] - s * x[ix + 1];
x[ix] = temp[0];
x[ix + 1] = temp[1];

ix += inc_x2;
iy += inc_y2;
i++;


} }
return(0);
}


}
return (0);


}

+ 323
- 353
kernel/zarch/zscal.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013 - 2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,426 +27,396 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


/*
 * Scale a contiguous complex vector in place by the complex factor
 * alpha[0] + i*alpha[1], using z/Architecture vector instructions.
 *
 * Positional asm operands: %0 = n, %1 = alpha, %2 = x.
 * n is shifted right by 3 to get the pass count; each pass handles 128 bytes
 * (8 elements of two 8-byte values -- assumes FLOAT is double). The loop is
 * count-driven (brctg) with no zero check, so the caller is expected to pass
 * a non-zero multiple of 8.
 */
static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vlrepg %%v0,0(%1) \n\t" /* v0 = {alpha_r, alpha_r} */
"vleg %%v1,8(%1),0 \n\t" /* v1[0] = alpha_i */
"wflcdb %%v1,%%v1 \n\t" /* v1[0] = -alpha_i */
"vleg %%v1,8(%1),1 \n\t" /* v1[1] = alpha_i, so v1 = {-alpha_i, alpha_i} */
"srlg %%r0,%0,3 \n\t" /* r0 = n / 8 loop passes */
"xgr %%r1,%%r1 \n\t" /* r1 = running byte offset, starts at 0 */
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t" /* prefetch 1024 bytes ahead (code 2: store access) */

/* load 8 elements {re, im} into v16-v23 */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* v24-v31 = the same elements with the two doublewords swapped: {im, re} */
"vpdi %%v24,%%v16,%%v16,4 \n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t"
"vpdi %%v28,%%v20,%%v20,4 \n\t"
"vpdi %%v29,%%v21,%%v21,4 \n\t"
"vpdi %%v30,%%v22,%%v22,4 \n\t"
"vpdi %%v31,%%v23,%%v23,4 \n\t"

/* v16-v23 = {re*alpha_r, im*alpha_r} */
"vfmdb %%v16,%%v16,%%v0 \n\t"
"vfmdb %%v17,%%v17,%%v0 \n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t"
"vfmdb %%v19,%%v19,%%v0 \n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t"
"vfmdb %%v21,%%v21,%%v0 \n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t"
"vfmdb %%v23,%%v23,%%v0 \n\t"
/* add {im*(-alpha_i), re*alpha_i}: result is the complex product */
"vfmadb %%v16,%%v24,%%v1,%%v16 \n\t"
"vfmadb %%v17,%%v25,%%v1,%%v17 \n\t"
"vfmadb %%v18,%%v26,%%v1,%%v18 \n\t"
"vfmadb %%v19,%%v27,%%v1,%%v19 \n\t"
"vfmadb %%v20,%%v28,%%v1,%%v20 \n\t"
"vfmadb %%v21,%%v29,%%v1,%%v21 \n\t"
"vfmadb %%v22,%%v30,%%v1,%%v22 \n\t"
"vfmadb %%v23,%%v31,%%v1,%%v23 \n\t"

/* store the scaled elements back over x */
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"

"agfi %%r1,128 \n\t" /* advance the byte offset by one 128-byte chunk */
"brctg %%r0,0b " /* decrement r0, loop while it is non-zero */
: /* no outputs; the "memory" clobber below covers the stores to x */
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vleg %%v0,8(%1),0 \n\t"
"wflcdb %%v0,%%v0 \n\t"
"vleg %%v0,8(%1),1 \n\t"
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vpdi %%v16,%%v16,%%v16,4 \n\t"
"vpdi %%v17,%%v17,%%v17,4 \n\t"
"vpdi %%v18,%%v18,%%v18,4 \n\t"
"vpdi %%v19,%%v19,%%v19,4 \n\t"
"vpdi %%v20,%%v20,%%v20,4 \n\t"
"vpdi %%v21,%%v21,%%v21,4 \n\t"
"vpdi %%v22,%%v22,%%v22,4 \n\t"
"vpdi %%v23,%%v23,%%v23,4 \n\t"

"vfmdb %%v16,%%v16,%%v0 \n\t"
"vfmdb %%v17,%%v17,%%v0 \n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t"
"vfmdb %%v19,%%v19,%%v0 \n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t"
"vfmdb %%v21,%%v21,%%v0 \n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t"
"vfmdb %%v23,%%v23,%%v0 \n\t"

"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
/*
 * Scale a contiguous complex vector in place by the complex factor
 * alpha[0] + i*alpha[1], using z/Architecture vector instructions.
 *
 * n     - element count; shifted right by 3 to get the pass count. Each pass
 *         handles 128 bytes (8 elements of two 8-byte values -- assumes FLOAT
 *         is double). The loop is count-driven (brctg) with no zero check, so
 *         the caller is expected to pass a non-zero multiple of 8.
 * alpha - two FLOATs: real part at offset 0, imaginary part at offset 8.
 * x     - vector, updated in place.
 */
static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) {
__asm__("vlrepg %%v0,0(%[alpha])\n\t" /* v0 = {alpha_r, alpha_r} */
"vleg %%v1,8(%[alpha]),0\n\t" /* v1[0] = alpha_i */
"wflcdb %%v1,%%v1\n\t" /* v1[0] = -alpha_i */
"vleg %%v1,8(%[alpha]),1\n\t" /* v1[1] = alpha_i, so v1 = {-alpha_i, alpha_i} */
"srlg %[n],%[n],3\n\t" /* n = number of 128-byte passes */
"xgr %%r1,%%r1\n\t" /* r1 = running byte offset, starts at 0 */
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead (code 2: store access) */
/* load 8 elements {re, im} into v16-v23 */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/* v24-v31 = the same elements with the two doublewords swapped: {im, re} */
"vpdi %%v24,%%v16,%%v16,4\n\t"
"vpdi %%v25,%%v17,%%v17,4\n\t"
"vpdi %%v26,%%v18,%%v18,4\n\t"
"vpdi %%v27,%%v19,%%v19,4\n\t"
"vpdi %%v28,%%v20,%%v20,4\n\t"
"vpdi %%v29,%%v21,%%v21,4\n\t"
"vpdi %%v30,%%v22,%%v22,4\n\t"
"vpdi %%v31,%%v23,%%v23,4\n\t"
/* v16-v23 = {re*alpha_r, im*alpha_r} */
"vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t"
"vfmdb %%v19,%%v19,%%v0\n\t"
"vfmdb %%v20,%%v20,%%v0\n\t"
"vfmdb %%v21,%%v21,%%v0\n\t"
"vfmdb %%v22,%%v22,%%v0\n\t"
"vfmdb %%v23,%%v23,%%v0\n\t"
/* add {im*(-alpha_i), re*alpha_i}: result is the complex product */
"vfmadb %%v16,%%v24,%%v1,%%v16\n\t"
"vfmadb %%v17,%%v25,%%v1,%%v17\n\t"
"vfmadb %%v18,%%v26,%%v1,%%v18\n\t"
"vfmadb %%v19,%%v27,%%v1,%%v19\n\t"
"vfmadb %%v20,%%v28,%%v1,%%v20\n\t"
"vfmadb %%v21,%%v29,%%v1,%%v21\n\t"
"vfmadb %%v22,%%v30,%%v1,%%v22\n\t"
"vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
/* store the scaled elements back over x */
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t" /* advance the byte offset by one 128-byte chunk */
"brctg %[n],0b" /* decrement the pass counter, loop while it is non-zero */
/* outputs: x is read-write memory of n*2 FLOATs; n is consumed as the counter */
: "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
/* inputs: x and alpha as address registers; alpha[0..1] declared as read memory */
: [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
/* clobbers: condition code, offset register r1 and every vector register used */
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }


static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vlrepg %%v0,0(%1) \n\t"
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"

"vfmdb %%v16,%%v16,%%v0 \n\t"
"vfmdb %%v17,%%v17,%%v0 \n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t"
"vfmdb %%v19,%%v19,%%v0 \n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t"
"vfmdb %%v21,%%v21,%%v0 \n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t"
"vfmdb %%v23,%%v23,%%v0 \n\t"

"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
/*
 * Scale a contiguous complex vector in place by a purely imaginary factor:
 * only alpha[1] (offset 8) is read, the real part of alpha is not used.
 * Each element {re, im} becomes {-alpha_i*im, alpha_i*re}.
 *
 * n is shifted right by 3 to get the pass count; each pass handles 128 bytes
 * (8 elements of two 8-byte values -- assumes FLOAT is double). The loop is
 * count-driven (brctg) with no zero check, so the caller is expected to pass
 * a non-zero multiple of 8.
 */
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
__asm__("vleg %%v0,8(%[alpha]),0\n\t" /* v0[0] = alpha_i */
"wflcdb %%v0,%%v0\n\t" /* v0[0] = -alpha_i */
"vleg %%v0,8(%[alpha]),1\n\t" /* v0[1] = alpha_i, so v0 = {-alpha_i, alpha_i} */
"srlg %[n],%[n],3\n\t" /* n = number of 128-byte passes */
"xgr %%r1,%%r1\n\t" /* r1 = running byte offset, starts at 0 */
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead (code 2: store access) */
/* load 8 elements {re, im} into v16-v23 */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/* swap the two doublewords of every register in place: {im, re} */
"vpdi %%v16,%%v16,%%v16,4\n\t"
"vpdi %%v17,%%v17,%%v17,4\n\t"
"vpdi %%v18,%%v18,%%v18,4\n\t"
"vpdi %%v19,%%v19,%%v19,4\n\t"
"vpdi %%v20,%%v20,%%v20,4\n\t"
"vpdi %%v21,%%v21,%%v21,4\n\t"
"vpdi %%v22,%%v22,%%v22,4\n\t"
"vpdi %%v23,%%v23,%%v23,4\n\t"
/* multiply by {-alpha_i, alpha_i}: {-alpha_i*im, alpha_i*re} */
"vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t"
"vfmdb %%v19,%%v19,%%v0\n\t"
"vfmdb %%v20,%%v20,%%v0\n\t"
"vfmdb %%v21,%%v21,%%v0\n\t"
"vfmdb %%v22,%%v22,%%v0\n\t"
"vfmdb %%v23,%%v23,%%v0\n\t"
/* store the scaled elements back over x */
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t" /* advance the byte offset by one 128-byte chunk */
"brctg %[n],0b" /* decrement the pass counter, loop while it is non-zero */
/* outputs: x is read-write memory of n*2 FLOATs; n is consumed as the counter */
: "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
/* inputs: x and alpha as address registers; alpha[0..1] declared as read memory */
: [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
/* clobbers: condition code, offset register r1 and the vector registers used */
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
} }


static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x)
{
__asm__ volatile(
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"

"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
/*
 * Scale a contiguous complex vector in place by a purely real factor:
 * only alpha[0] is read, and every FLOAT of x (real and imaginary parts
 * alike) is multiplied by it.
 *
 * n is shifted right by 3 to get the pass count; each pass handles 128 bytes
 * (8 elements of two 8-byte values -- assumes FLOAT is double). The loop is
 * count-driven (brctg) with no zero check, so the caller is expected to pass
 * a non-zero multiple of 8.
 */
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
__asm__("vlrepg %%v0,0(%[alpha])\n\t" /* v0 = {alpha_r, alpha_r} */
"srlg %[n],%[n],3\n\t" /* n = number of 128-byte passes */
"xgr %%r1,%%r1\n\t" /* r1 = running byte offset, starts at 0 */
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead (code 2: store access) */
/* load 128 bytes of x into v16-v23 */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/* multiply both lanes of every register by alpha_r */
"vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t"
"vfmdb %%v19,%%v19,%%v0\n\t"
"vfmdb %%v20,%%v20,%%v0\n\t"
"vfmdb %%v21,%%v21,%%v0\n\t"
"vfmdb %%v22,%%v22,%%v0\n\t"
"vfmdb %%v23,%%v23,%%v0\n\t"
/* store the scaled values back over x */
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t" /* advance the byte offset by one 128-byte chunk */
"brctg %[n],0b" /* decrement the pass counter, loop while it is non-zero */
/* outputs: x is read-write memory of n*2 FLOATs; n is consumed as the counter */
: "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
/* inputs: x and alpha as address registers; alpha[0..1] declared as read memory */
: [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
/* clobbers: condition code, offset register r1 and the vector registers used */
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
} }


static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i;
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_x3 = inc_x2 + inc_x;
FLOAT t0, t1, t2, t3;
FLOAT da_r = alpha[0];
FLOAT da_i = alpha[1];

for (i = 0; i < n; i += 4)
{
t0 = da_r * x[0] - da_i * x[1];
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1];

x[1] = da_i * x[0] + da_r * x[1];
x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1];
x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1];
x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1];

x[0] = t0;
x[inc_x] = t1;
x[inc_x2] = t2;
x[inc_x3] = t3;

x += 4 * inc_x;
}
/*
 * Overwrite a contiguous complex vector with zeros (scaling by 0 + 0i).
 * The previous contents of x are never read.
 *
 * n is shifted right by 3 to get the pass count; each pass stores 128 bytes
 * (8 elements of two 8-byte values -- assumes FLOAT is double). The loop is
 * count-driven (brctg) with no zero check, so the caller is expected to pass
 * a non-zero multiple of 8.
 */
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
__asm__("vzero %%v0\n\t" /* v0 = all-zero vector */
"srlg %[n],%[n],3\n\t" /* n = number of 128-byte passes */
"xgr %%r1,%%r1\n\t" /* r1 = running byte offset, starts at 0 */
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead (code 2: store access) */
/* eight 16-byte stores of the zero register */
"vst %%v0,0(%%r1,%[x])\n\t"
"vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v0,32(%%r1,%[x])\n\t"
"vst %%v0,48(%%r1,%[x])\n\t"
"vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v0,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t" /* advance the byte offset by one 128-byte chunk */
"brctg %[n],0b" /* decrement the pass counter, loop while it is non-zero */
/* outputs: x is write-only memory of n*2 FLOATs; n is consumed as the counter */
: "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
: [x] "a"(x)
: "cc", "r1", "v0");
} }


int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0, j = 0;
FLOAT temp0;
FLOAT temp1;
FLOAT alpha[2] __attribute__ ((aligned(16)));

if (inc_x != 1) {
inc_x <<= 1;

if (da_r == 0.0) {

BLASLONG n1 = n & -2;

if (da_i == 0.0) {
/*
 * Strided complex scaling in plain C: multiplies elements of x in place by
 * alpha[0] + i*alpha[1], four elements per pass.
 *
 * n     - element count; passes continue, four elements at a time, while the
 *         number already handled is below n.
 * alpha - real part at alpha[0], imaginary part at alpha[1].
 * x     - first element; the real part is at x[k], the imaginary at x[k + 1].
 * inc_x - distance, in FLOATs, between consecutive elements.
 *
 * Within a pass all four new real parts are computed first, then the four
 * imaginary parts are stored, and only then are the real parts written back,
 * so each imaginary part is derived from the not-yet-overwritten real part.
 */
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
                               BLASLONG inc_x) {
  const FLOAT re = alpha[0];
  const FLOAT im = alpha[1];
  const BLASLONG off1 = inc_x;
  const BLASLONG off2 = 2 * inc_x;
  const BLASLONG off3 = off2 + inc_x;
  BLASLONG handled = 0;

  while (handled < n) {
    /* New real parts, held back until the imaginary parts are done. */
    const FLOAT new_re0 = re * x[0] - im * x[1];
    const FLOAT new_re1 = re * x[off1] - im * x[off1 + 1];
    const FLOAT new_re2 = re * x[off2] - im * x[off2 + 1];
    const FLOAT new_re3 = re * x[off3] - im * x[off3 + 1];

    /* Imaginary parts still read the original real parts. */
    x[1] = im * x[0] + re * x[1];
    x[off1 + 1] = im * x[off1] + re * x[off1 + 1];
    x[off2 + 1] = im * x[off2] + re * x[off2 + 1];
    x[off3 + 1] = im * x[off3] + re * x[off3 + 1];

    x[0] = new_re0;
    x[off1] = new_re1;
    x[off2] = new_re2;
    x[off3] = new_re3;

    x += 4 * inc_x;
    handled += 4;
  }
}


while (j < n1) {
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0, j = 0;
FLOAT temp0;
FLOAT temp1;
FLOAT alpha[2] __attribute__ ((aligned(16)));


x[i] = 0.0;
x[i + 1] = 0.0;
x[i + inc_x] = 0.0;
x[i + 1 + inc_x] = 0.0;
i += 2 * inc_x;
j += 2;
if (inc_x != 1) {
inc_x <<= 1;


}
if (da_r == 0.0) {


while (j < n) {
BLASLONG n1 = n & -2;


x[i] = 0.0;
x[i + 1] = 0.0;
i += inc_x;
j++;
if (da_i == 0.0) {


}
while (j < n1) {


} else {
x[i] = 0.0;
x[i + 1] = 0.0;
x[i + inc_x] = 0.0;
x[i + 1 + inc_x] = 0.0;
i += 2 * inc_x;
j += 2;


while (j < n1) {
}


temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
temp1 = -da_i * x[i + 1 + inc_x];
x[i + 1 + inc_x] = da_i * x[i + inc_x];
x[i + inc_x] = temp1;
i += 2 * inc_x;
j += 2;
while (j < n) {


}
x[i] = 0.0;
x[i + 1] = 0.0;
i += inc_x;
j++;


while (j < n) {
}


temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
i += inc_x;
j++;
} else {


}
while (j < n1) {


temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
temp1 = -da_i * x[i + 1 + inc_x];
x[i + 1 + inc_x] = da_i * x[i + inc_x];
x[i + inc_x] = temp1;
i += 2 * inc_x;
j += 2;


}


}
while (j < n) {


} else {
temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
i += inc_x;
j++;


}


if (da_i == 0.0) {
BLASLONG n1 = n & -2;
}


while (j < n1) {
} else {


temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
temp1 = da_r * x[i + inc_x];
x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x];
x[i + inc_x] = temp1;
i += 2 * inc_x;
j += 2;
if (da_i == 0.0) {
BLASLONG n1 = n & -2;


}
while (j < n1) {


while (j < n) {
temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
temp1 = da_r * x[i + inc_x];
x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x];
x[i + inc_x] = temp1;
i += 2 * inc_x;
j += 2;


temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
i += inc_x;
j++;
}


}
while (j < n) {


} else {
temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
i += inc_x;
j++;


BLASLONG n1 = n & -8;
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
zscal_kernel_inc_8(n1, alpha, x, inc_x);
j = n1;
i = n1 * inc_x;
}
}


while (j < n) {
} else {


temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
i += inc_x;
j++;
BLASLONG n1 = n & -8;
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
zscal_kernel_inc_8(n1, alpha, x, inc_x);
j = n1;
i = n1 * inc_x;
}


}
while (j < n) {


}
temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
i += inc_x;
j++;


} }


return (0);
}
}


}


BLASLONG n1 = n & -8;
if (n1 > 0) {
return (0);
}


alpha[0] = da_r;
alpha[1] = da_i;
BLASLONG n1 = n & -8;
if (n1 > 0) {


if (da_r == 0.0)
if (da_i == 0)
zscal_kernel_8_zero(n1, x);
else
zscal_kernel_8_zero_r(n1, alpha, x);
else
if (da_i == 0)
zscal_kernel_8_zero_i(n1, alpha, x);
else
zscal_kernel_8(n1, alpha, x);
alpha[0] = da_r;
alpha[1] = da_i;


i = n1 << 1;
j = n1;
}
if (da_r == 0.0)
if (da_i == 0)
zscal_kernel_8_zero(n1, x);
else
zscal_kernel_8_zero_r(n1, alpha, x);
else if (da_i == 0)
zscal_kernel_8_zero_i(n1, alpha, x);
else
zscal_kernel_8(n1, alpha, x);


i = n1 << 1;
j = n1;
}


if (da_r == 0.0) {
if (da_r == 0.0) {


if (da_i == 0.0) {
if (da_i == 0.0) {


while (j < n) {
while (j < n) {


x[i] = 0.0;
x[i + 1] = 0.0;
i += 2;
j++;
x[i] = 0.0;
x[i + 1] = 0.0;
i += 2;
j++;


}
}


} else {
} else {


while (j < n) {
while (j < n) {


temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
i += 2;
j++;
temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
i += 2;
j++;


}
}


}
}


} else {
} else {


if (da_i == 0.0) {
if (da_i == 0.0) {


while (j < n) {
while (j < n) {


temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
i += 2;
j++;
temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
i += 2;
j++;


}
}


} else {
} else {


while (j < n) {
while (j < n) {


temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
i += 2;
j++;
temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
i += 2;
j++;


}

}
}


} }


return (0);
}

return (0);
} }

+ 124
- 139
kernel/zarch/zswap.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@@ -27,157 +27,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h" #include "common.h"


static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"

"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"

"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"

"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
/*
 * Exchange the contents of two contiguous vectors using z/Architecture
 * vector loads and stores.
 *
 * n    - element count; shifted right by 4 to get the pass count. Each pass
 *        swaps 256 bytes of x with 256 bytes of y (16 elements of two 8-byte
 *        values -- assumes FLOAT is double). The loop is count-driven (brctg)
 *        with no zero check, so the caller is expected to pass a non-zero
 *        multiple of 16.
 * x, y - vectors, both updated in place.
 */
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__("srlg %[n],%[n],4\n\t" /* n = number of 256-byte passes */
"xgr %%r1,%%r1\n\t" /* r1 = running byte offset, starts at 0 */
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead (code 2: store access) */
"pfd 2, 1024(%%r1,%[y])\n\t"
/* hold the whole 256-byte chunk of x in v16-v31 */
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%[x])\n\t"
/* copy the first 128 bytes of y into x through v0-v7 */
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v7, 112(%%r1,%[y])\n\t"
"vst %%v0, 0(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v7, 112(%%r1,%[x])\n\t"
/* copy the second 128 bytes of y into x, reusing v0-v7 */
"vl %%v0, 128(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v7, 240(%%r1,%[y])\n\t"
"vst %%v0, 128(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v7, 240(%%r1,%[x])\n\t"
/* finally write the saved x chunk (v16-v31) over y */
"vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v31, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t" /* advance the byte offset to the next 256-byte chunk */
"brctg %[n],0b" /* decrement the pass counter, loop while it is non-zero */
/* outputs: x and y are read-write memory of n*2 FLOATs; n is consumed as the counter */
: "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
/* inputs: base addresses in address registers */
: [x] "a"(x),[y] "a"(y)
/* clobbers: condition code, offset register r1 and every vector register used */
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
} }


int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT temp[2];
BLASLONG inc_x2, inc_y2;

if ( n <= 0 ) return(0);

if ( (inc_x == 1) && (inc_y == 1 ))
{

BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
zswap_kernel_16(n1, x, y);
i=n1;
ix = 2* n1;
iy = 2* n1;
}

while(i < n)
{

temp[0] = x[ix] ;
temp[1] = x[ix+1] ;
x[ix] = y[iy] ;
x[ix+1] = y[iy+1] ;
y[iy] = temp[0] ;
y[iy+1] = temp[1] ;

ix += 2 ;
iy += 2 ;
i++ ;
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT temp[2];
BLASLONG inc_x2, inc_y2;

if (n <= 0)
return (0);

if ((inc_x == 1) && (inc_y == 1)) {

BLASLONG n1 = n & -16;
if (n1 > 0) {
zswap_kernel_16(n1, x, y);
i = n1;
ix = 2 * n1;
iy = 2 * n1;
}


while (i < n) {


}
temp[0] = x[ix];
temp[1] = x[ix + 1];
x[ix] = y[iy];
x[ix + 1] = y[iy + 1];
y[iy] = temp[0];
y[iy + 1] = temp[1];


ix += 2;
iy += 2;
i++;


} }
else
{


inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
} else {


while(i < n)
{
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;


temp[0] = x[ix] ;
temp[1] = x[ix+1] ;
x[ix] = y[iy] ;
x[ix+1] = y[iy+1] ;
y[iy] = temp[0] ;
y[iy+1] = temp[1] ;
while (i < n) {


ix += inc_x2 ;
iy += inc_y2 ;
i++ ;
temp[0] = x[ix];
temp[1] = x[ix + 1];
x[ix] = y[iy];
x[ix + 1] = y[iy + 1];
y[iy] = temp[0];
y[iy + 1] = temp[1];


}
ix += inc_x2;
iy += inc_y2;
i++;


} }
return(0);

}


}
return (0);


}

Loading…
Cancel
Save