(Z13) BLAS1 microkernels can be inlined by gcc. Refactoring, fixes, tuning. tags/v0.3.0
| @@ -39,19 +39,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { | static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { | ||||
| FLOAT asum ; | |||||
| FLOAT asum ; | |||||
| __asm__ ( | __asm__ ( | ||||
| "pfd 1, 0(%3) \n\t" | |||||
| "sllg %%r0,%2,3 \n\t" | |||||
| "agr %%r0,%3 \n\t" | |||||
| "pfd 1, 0(%[ptr_x]) \n\t" | |||||
| "sllg %%r0,%[n],3 \n\t" | |||||
| "agr %%r0,%[ptr_x] \n\t" | |||||
| "vzero %%v0 \n\t" | "vzero %%v0 \n\t" | ||||
| "vzero %%v1 \n\t" | "vzero %%v1 \n\t" | ||||
| "vzero %%v2 \n\t" | "vzero %%v2 \n\t" | ||||
| "vzero %%v3 \n\t" | "vzero %%v3 \n\t" | ||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | |||||
| "pfd 1, 256(%1 ) \n\t" | |||||
| "vlm %%v24,%%v31, 0(%1 ) \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%[ptr_temp] ) \n\t" | |||||
| "vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | "vflpdb %%v24, %%v24 \n\t" | ||||
| "vflpdb %%v25, %%v25 \n\t" | "vflpdb %%v25, %%v25 \n\t" | ||||
| @@ -71,7 +71,7 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| "vfadb %%v2,%%v2,%%v30 \n\t" | "vfadb %%v2,%%v2,%%v30 \n\t" | ||||
| "vfadb %%v3,%%v3,%%v31 \n\t" | "vfadb %%v3,%%v3,%%v31 \n\t" | ||||
| "vlm %%v24,%%v31, 128(%1) \n\t" | |||||
| "vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | "vflpdb %%v24, %%v24 \n\t" | ||||
| "vflpdb %%v25, %%v25 \n\t" | "vflpdb %%v25, %%v25 \n\t" | ||||
| @@ -81,7 +81,7 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| "vflpdb %%v29, %%v29 \n\t" | "vflpdb %%v29, %%v29 \n\t" | ||||
| "vflpdb %%v30, %%v30 \n\t" | "vflpdb %%v30, %%v30 \n\t" | ||||
| "vflpdb %%v31, %%v31 \n\t" | "vflpdb %%v31, %%v31 \n\t" | ||||
| "la %1,256(%1) \n\t" | |||||
| "la %[ptr_temp],256(%[ptr_temp]) \n\t" | |||||
| "vfadb %%v0,%%v0,%%v24 \n\t" | "vfadb %%v0,%%v0,%%v24 \n\t" | ||||
| "vfadb %%v1,%%v1,%%v25 \n\t" | "vfadb %%v1,%%v1,%%v25 \n\t" | ||||
| "vfadb %%v2,%%v2,%%v26 \n\t" | "vfadb %%v2,%%v2,%%v26 \n\t" | ||||
| @@ -91,16 +91,16 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| "vfadb %%v2,%%v2,%%v30 \n\t" | "vfadb %%v2,%%v2,%%v30 \n\t" | ||||
| "vfadb %%v3,%%v3,%%v31 \n\t" | "vfadb %%v3,%%v3,%%v31 \n\t" | ||||
| "clgrjl %1,%%r0,1b \n\t" | |||||
| "clgrjl %[ptr_temp],%%r0,1b \n\t" | |||||
| "vfadb %%v24,%%v0,%%v1 \n\t" | "vfadb %%v24,%%v0,%%v1 \n\t" | ||||
| "vfadb %%v25,%%v2,%%v3 \n\t" | "vfadb %%v25,%%v2,%%v3 \n\t" | ||||
| "vfadb %%v0,%%v25,%%v24 \n\t" | "vfadb %%v0,%%v25,%%v24 \n\t" | ||||
| "vrepg %%v1,%%v0,1 \n\t" | "vrepg %%v1,%%v0,1 \n\t" | ||||
| "adbr %%f0,%%f1 \n\t" | "adbr %%f0,%%f1 \n\t" | ||||
| "ldr %0,%%f0 \n\t" | |||||
| : "=f"(asum),"+&a"(x) | |||||
| : "r"(n), "1"(x) | |||||
| : "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| "ldr %[asum],%%f0 \n\t" | |||||
| : [asum] "=f"(asum),[ptr_temp] "+&a"(x) | |||||
| : [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x) | |||||
| : "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | ); | ||||
| return asum; | return asum; | ||||
| @@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #define Z13_D 1 | |||||
| #define PREFETCH_INS 1 | #define PREFETCH_INS 1 | ||||
| #if defined(Z13_A) | #if defined(Z13_A) | ||||
| #include <vecintrin.h> | #include <vecintrin.h> | ||||
| static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) | |||||
| { | { | ||||
| BLASLONG i = 0; | BLASLONG i = 0; | ||||
| __vector double v_a = {*alpha,*alpha}; | |||||
| __vector double v_a = {alpha,alpha}; | |||||
| __vector double * v_y=(__vector double *)y; | __vector double * v_y=(__vector double *)y; | ||||
| __vector double * v_x=(__vector double *)x; | __vector double * v_x=(__vector double *)x; | ||||
| @@ -60,256 +60,53 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| } | } | ||||
| } | } | ||||
| #elif defined(Z13_B) | |||||
| static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| __asm__ volatile( | |||||
| #if defined(PREFETCH_INS) | |||||
| "pfd 1, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| #endif | |||||
| "vlrepg %%v0 , 0(%3) \n\t" | |||||
| "srlg %3,%0,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "vlr %%v1,%%v0 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| #if defined(PREFETCH_INS) | |||||
| "pfd 1, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| #endif | |||||
| "vl %%v24, 0(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%1) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" | |||||
| "vst %%v16, 0(%%r1,%2) \n\t" | |||||
| "vl %%v25, 16(%%r1,%2) \n\t" | |||||
| "vl %%v17, 16(%%r1,%1) \n\t" | |||||
| "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t" | |||||
| "vst %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v26, 32(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%1) \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" | |||||
| "vst %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v27, 48(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%1) \n\t" | |||||
| "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t" | |||||
| "vst %%v19, 48(%%r1,%2) \n\t" | |||||
| "vl %%v24,( 0+64)(%%r1,%2) \n\t" | |||||
| "vl %%v16,( 0+64)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" | |||||
| "vst %%v16,( 0+64)(%%r1,%2) \n\t" | |||||
| "vl %%v25, (16+64)(%%r1,%2) \n\t" | |||||
| "vl %%v17, (16+64)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t" | |||||
| "vst %%v17, (16+64)(%%r1,%2) \n\t" | |||||
| "vl %%v26, (32+64)(%%r1,%2) \n\t" | |||||
| "vl %%v18, (32+64)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" | |||||
| "vst %%v18, (32+64)(%%r1,%2) \n\t" | |||||
| "vl %%v27, (48+64)(%%r1,%2) \n\t" | |||||
| "vl %%v19, (48+64)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t" | |||||
| "vst %%v19, (48+64)(%%r1,%2) \n\t" | |||||
| "vl %%v24,( 0+128)(%%r1,%2) \n\t" | |||||
| "vl %%v16,( 0+128)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" | |||||
| "vst %%v16,( 0+128)(%%r1,%2) \n\t" | |||||
| "vl %%v25, (16+128)(%%r1,%2) \n\t" | |||||
| "vl %%v17, (16+128)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t" | |||||
| "vst %%v17, (16+128)(%%r1,%2) \n\t" | |||||
| "vl %%v26, (32+128)(%%r1,%2) \n\t" | |||||
| "vl %%v18, (32+128)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" | |||||
| "vst %%v18, (32+128)(%%r1,%2) \n\t" | |||||
| "vl %%v27, (48+128)(%%r1,%2) \n\t" | |||||
| "vl %%v19, (48+128)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t" | |||||
| "vst %%v19, (48+128)(%%r1,%2) \n\t" | |||||
| "vl %%v24,( 0+192)(%%r1,%2) \n\t" | |||||
| "vl %%v16,( 0+192)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" | |||||
| "vst %%v16,( 0+192)(%%r1,%2) \n\t" | |||||
| "vl %%v25, (16+192)(%%r1,%2) \n\t" | |||||
| "vl %%v17, (16+192)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t" | |||||
| "vst %%v17, (16+192)(%%r1,%2) \n\t" | |||||
| "vl %%v26, (32+192)(%%r1,%2) \n\t" | |||||
| "vl %%v18, (32+192)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" | |||||
| "vst %%v18, (32+192)(%%r1,%2) \n\t" | |||||
| "vl %%v27, (48+192)(%%r1,%2) \n\t" | |||||
| "vl %%v19, (48+192)(%%r1,%1) \n\t" | |||||
| "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t" | |||||
| "vst %%v19, (48+192)(%%r1,%2) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %3,1b" | |||||
| : | |||||
| :"r"(n),"a"(x),"a"(y),"a"(alpha) | |||||
| :"cc", "memory", "r1" ,"v0" ,"v16","v17","v18","v19", "v24","v25","v26","v27" | |||||
| ); | |||||
| } | |||||
| #elif defined(Z13_C) | |||||
| static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| __asm__ volatile( | |||||
| #if defined(PREFETCH_INS) | |||||
| "pfd 1, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| #endif | |||||
| "vlrepg %%v0 , 0(%3) \n\t" | |||||
| "srlg %3,%0,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "vlr %%v1,%%v0 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| #if defined(PREFETCH_INS) | |||||
| "pfd 1, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| #endif | |||||
| "vl %%v16, 0(%%r1,%1) \n\t" | |||||
| "vl %%v17, 16(%%r1,%1) \n\t" | |||||
| "vl %%v18, 32(%%r1,%1) \n\t" | |||||
| "vl %%v19, 48(%%r1,%1) \n\t" | |||||
| "vl %%v24, 0(%%r1,%2) \n\t" | |||||
| "vl %%v25, 16(%%r1,%2) \n\t" | |||||
| "vl %%v26, 32(%%r1,%2) \n\t" | |||||
| "vl %%v27, 48(%%r1,%2) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" | |||||
| "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" | |||||
| "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t" | |||||
| "vst %%v16, 0(%%r1,%2) \n\t" | |||||
| "vst %%v17, 16(%%r1,%2) \n\t" | |||||
| "vst %%v18, 32(%%r1,%2) \n\t" | |||||
| "vst %%v19, 48(%%r1,%2) \n\t" | |||||
| "vl %%v24, 64(%%r1,%1) \n\t" | |||||
| "vl %%v25, 80(%%r1,%1) \n\t" | |||||
| "vl %%v26, 96(%%r1,%1) \n\t" | |||||
| "vl %%v27, 112(%%r1,%1) \n\t" | |||||
| "vl %%v16, 64(%%r1,%2) \n\t" | |||||
| "vl %%v17, 80(%%r1,%2) \n\t" | |||||
| "vl %%v18, 96(%%r1,%2) \n\t" | |||||
| "vl %%v19, 112(%%r1,%2) \n\t" | |||||
| "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t" | |||||
| "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t" | |||||
| "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t" | |||||
| "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t" | |||||
| "vst %%v24, 64(%%r1,%2) \n\t" | |||||
| "vst %%v25, 80(%%r1,%2) \n\t" | |||||
| "vst %%v26, 96(%%r1,%2) \n\t" | |||||
| "vst %%v27, 112(%%r1,%2) \n\t" | |||||
| "vl %%v16, (0+128)(%%r1,%1) \n\t" | |||||
| "vl %%v17, (16+128)(%%r1,%1) \n\t" | |||||
| "vl %%v18, (32+128)(%%r1,%1) \n\t" | |||||
| "vl %%v19, (48+128)(%%r1,%1) \n\t" | |||||
| "vl %%v24, (0+128)(%%r1,%2) \n\t" | |||||
| "vl %%v25, (16+128)(%%r1,%2) \n\t" | |||||
| "vl %%v26, (32+128)(%%r1,%2) \n\t" | |||||
| "vl %%v27, (48+128)(%%r1,%2) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" | |||||
| "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" | |||||
| "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t" | |||||
| "vst %%v16, (0+128)(%%r1,%2) \n\t" | |||||
| "vst %%v17, (16+128)(%%r1,%2) \n\t" | |||||
| "vst %%v18, (32+128)(%%r1,%2) \n\t" | |||||
| "vst %%v19, (48+128)(%%r1,%2) \n\t" | |||||
| "vl %%v24, (64+128)(%%r1,%1) \n\t" | |||||
| "vl %%v25, (80+128)(%%r1,%1) \n\t" | |||||
| "vl %%v26, (96+128)(%%r1,%1) \n\t" | |||||
| "vl %%v27, (112+128)(%%r1,%1) \n\t" | |||||
| "vl %%v16, (64+128)(%%r1,%2) \n\t" | |||||
| "vl %%v17, (80+128)(%%r1,%2) \n\t" | |||||
| "vl %%v18, (96+128)(%%r1,%2) \n\t" | |||||
| "vl %%v19, (112+128)(%%r1,%2) \n\t" | |||||
| "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t" | |||||
| "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t" | |||||
| "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t" | |||||
| "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t" | |||||
| "vst %%v24, (64+128)(%%r1,%2) \n\t" | |||||
| "vst %%v25, (80+128)(%%r1,%2) \n\t" | |||||
| "vst %%v26, (96+128)(%%r1,%2) \n\t" | |||||
| "vst %%v27, (112+128)(%%r1,%2) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %3,1b" | |||||
| : | |||||
| :"r"(n),"a"(x),"a"(y),"a"(alpha) | |||||
| :"cc", "memory", "r1" ,"v0","v1","v16","v17","v18","v19", "v24","v25","v26","v27" | |||||
| ); | |||||
| } | |||||
| #elif defined(Z13_D) | |||||
| static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| #else | |||||
| static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) | |||||
| { | { | ||||
| __asm__ volatile( | __asm__ volatile( | ||||
| #if defined(PREFETCH_INS) | #if defined(PREFETCH_INS) | ||||
| "pfd 1, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| #endif | |||||
| "vlrepg %%v0 , 0(%3) \n\t" | |||||
| "srlg %3,%0,5 \n\t" | |||||
| "vlr %%v1,%%v0 \n\t" | |||||
| "pfd 1, 0(%[x_tmp]) \n\t" | |||||
| "pfd 2, 0(%[y_tmp]) \n\t" | |||||
| #endif | |||||
| "lgdr %%r0,%[alpha] \n\t" | |||||
| "vlvgp %%v0,%%r0,%%r0 \n\t" | |||||
| "srlg %%r0,%[n],5 \n\t" | |||||
| "vlr %%v1,%%v0 \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | "1: \n\t" | ||||
| #if defined(PREFETCH_INS) | #if defined(PREFETCH_INS) | ||||
| "pfd 1, 256(%1) \n\t" | |||||
| "pfd 2, 256(%2) \n\t" | |||||
| "pfd 1, 256(%[x_tmp]) \n\t" | |||||
| "pfd 2, 256(%[y_tmp]) \n\t" | |||||
| #endif | #endif | ||||
| "vlm %%v16,%%v23, 0(%1) \n\t" | |||||
| "vlm %%v24, %%v31, 0(%2) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" | |||||
| "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" | |||||
| "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t" | |||||
| "vfmadb %%v20,%%v0,%%v20,%%v28 \n\t" | |||||
| "vfmadb %%v21,%%v1,%%v21,%%v29 \n\t" | |||||
| "vfmadb %%v22,%%v0,%%v22,%%v30 \n\t" | |||||
| "vfmadb %%v23,%%v1,%%v23,%%v31 \n\t" | |||||
| "vstm %%v16,%%v23, 0(%2) \n\t" | |||||
| "vlm %%v24,%%v31, 128(%1) \n\t" | |||||
| "vlm %%v16,%%v23, 128(%2) \n\t" | |||||
| "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t" | |||||
| "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t" | |||||
| "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t" | |||||
| "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t" | |||||
| "vfmadb %%v28,%%v0,%%v28,%%v20 \n\t" | |||||
| "vfmadb %%v29,%%v1,%%v29,%%v21 \n\t" | |||||
| "vfmadb %%v30,%%v0,%%v30,%%v22 \n\t" | |||||
| "vfmadb %%v31,%%v1,%%v31,%%v23 \n\t" | |||||
| "la %1,256(%1) \n\t" | |||||
| "vstm %%v24, %%v31, 128(%2) \n\t" | |||||
| "la %2,256(%2) \n\t" | |||||
| "brctg %3,1b" | |||||
| : | |||||
| :"r"(n),"a"(x),"a"(y),"a"(alpha) | |||||
| :"cc", "memory", "v0","v1","v16","v17","v18","v19","v20","v21", | |||||
| "vlm %%v16,%%v23, 0(%[x_tmp]) \n\t" | |||||
| "vlm %%v24, %%v31, 0(%[y_tmp]) \n\t" | |||||
| "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" | |||||
| "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t" | |||||
| "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" | |||||
| "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t" | |||||
| "vfmadb %%v20,%%v0,%%v20,%%v28 \n\t" | |||||
| "vfmadb %%v21,%%v1,%%v21,%%v29 \n\t" | |||||
| "vfmadb %%v22,%%v0,%%v22,%%v30 \n\t" | |||||
| "vfmadb %%v23,%%v1,%%v23,%%v31 \n\t" | |||||
| "vstm %%v16,%%v23, 0(%[y_tmp]) \n\t" | |||||
| "vlm %%v24,%%v31, 128(%[x_tmp]) \n\t" | |||||
| "vlm %%v16,%%v23, 128(%[y_tmp]) \n\t" | |||||
| "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t" | |||||
| "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t" | |||||
| "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t" | |||||
| "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t" | |||||
| "vfmadb %%v28,%%v0,%%v28,%%v20 \n\t" | |||||
| "vfmadb %%v29,%%v1,%%v29,%%v21 \n\t" | |||||
| "vfmadb %%v30,%%v0,%%v30,%%v22 \n\t" | |||||
| "vfmadb %%v31,%%v1,%%v31,%%v23 \n\t" | |||||
| "la %[x_tmp],256(%[x_tmp]) \n\t" | |||||
| "vstm %%v24, %%v31, 128(%[y_tmp]) \n\t" | |||||
| "la %[y_tmp],256(%[y_tmp]) \n\t" | |||||
| "brctg %%r0,1b" | |||||
| : [mem_y] "+m" (*(double (*)[n])y), [x_tmp] "+&a"(x), [y_tmp] "+&a"(y) | |||||
| : [mem_x] "m" (*(const double (*)[n])x), [n] "r"(n), [alpha] "f"(alpha) | |||||
| :"cc", "r0", "v0","v1","v16","v17","v18","v19","v20","v21", | |||||
| "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | ||||
| ); | ); | ||||
| @@ -334,7 +131,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| BLASLONG n1 = n & -32; | BLASLONG n1 = n & -32; | ||||
| if ( n1 ) | if ( n1 ) | ||||
| daxpy_kernel_32(n1, x, y , &da ); | |||||
| daxpy_kernel_32(n1, x, y , da ); | |||||
| i = n1; | i = n1; | ||||
| while(i < n) | while(i < n) | ||||
| @@ -30,83 +30,84 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(Z13mvc) | #if defined(Z13mvc) | ||||
| static void __attribute__ ((noinline)) dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| __asm__ volatile( | __asm__ volatile( | ||||
| "pfd 1, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| "pfd 1, 0(%[ptr_x]) \n\t" | |||||
| "pfd 2, 0(%[ptr_y]) \n\t" | |||||
| "srlg %[n_tmp],%[n_tmp],5 \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | "1: \n\t" | ||||
| "mvc 0(256,%2),0(%1) \n\t" | |||||
| "la %1,256(%1) \n\t" | |||||
| "la %2,256(%2) \n\t" | |||||
| "brctg %%r0,1b" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y) | |||||
| : "cc", "memory","r0" | |||||
| "mvc 0(256,%[ptr_y]),0(%[ptr_x]) \n\t" | |||||
| "la %[ptr_x],256(%[ptr_x]) \n\t" | |||||
| "la %[ptr_y],256(%[ptr_y]) \n\t" | |||||
| "brctg %[n_tmp],1b" | |||||
| : [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n) | |||||
| : [mem_x] "m" (*(const double (*)[n])x), | |||||
| [ptr_x] "a"(x), [ptr_y] "a"(y) | |||||
| : "cc" | |||||
| ); | ); | ||||
| return; | return; | ||||
| } | } | ||||
| #else | #else | ||||
| static void __attribute__ ((noinline)) dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| __asm__ volatile( | __asm__ volatile( | ||||
| "pfd 1, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "pfd 1, 0(%[ptr_x]) \n\t" | |||||
| "pfd 2, 0(%[ptr_y]) \n\t" | |||||
| "srlg %[n_tmp],%[n_tmp],5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | |||||
| "pfd 1, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" | |||||
| "vst %%v24, 0(%%r1,%2) \n\t" | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vst %%v25, 16(%%r1,%2) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vst %%v26, 32(%%r1,%2) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vst %%v27, 48(%%r1,%2) \n\t" | |||||
| "vl %%v24, 64(%%r1,%1) \n\t" | |||||
| "vst %%v24, 64(%%r1,%2) \n\t" | |||||
| "vl %%v25, 80(%%r1,%1) \n\t" | |||||
| "vst %%v25, 80(%%r1,%2) \n\t" | |||||
| "vl %%v26, 96(%%r1,%1) \n\t" | |||||
| "vst %%v26, 96(%%r1,%2) \n\t" | |||||
| "vl %%v27, 112(%%r1,%1) \n\t" | |||||
| "vst %%v27, 112(%%r1,%2) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vst %%v24, 128(%%r1,%2) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vst %%v25, 144(%%r1,%2) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vst %%v26, 160(%%r1,%2) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vst %%v27, 176(%%r1,%2) \n\t" | |||||
| "vl %%v24, 192(%%r1,%1) \n\t" | |||||
| "vst %%v24, 192(%%r1,%2) \n\t" | |||||
| "vl %%v25, 208(%%r1,%1) \n\t" | |||||
| "vst %%v25, 208(%%r1,%2) \n\t" | |||||
| "vl %%v26, 224(%%r1,%1) \n\t" | |||||
| "vst %%v26, 224(%%r1,%2) \n\t" | |||||
| "vl %%v27, 240(%%r1,%1) \n\t" | |||||
| "vst %%v27, 240(%%r1,%2) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %%r0,1b" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y) | |||||
| : "cc", "memory","r0","r1", "v24","v25","v26","v27" | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%%r1,%[ptr_x]) \n\t" | |||||
| "pfd 2, 256(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v24, 64(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v25, 80(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v26, 96(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v27, 112(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v27, 112(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v24, 192(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v25, 208(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v26, 224(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v27, 240(%%r1,%[ptr_y]) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %[n_tmp],1b" | |||||
| : [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n) | |||||
| : [mem_x] "m" (*(const double (*)[n])x), [ptr_x] "a"(x), [ptr_y] "a"(y) | |||||
| : "cc", "r1", "v24","v25","v26","v27" | |||||
| ); | ); | ||||
| return; | return; | ||||
| @@ -30,65 +30,67 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(Z13) | #if defined(Z13) | ||||
| static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | { | ||||
| FLOAT dot; | FLOAT dot; | ||||
| __asm__ volatile( | __asm__ volatile( | ||||
| "pfd 1, 0(%2) \n\t" | |||||
| "pfd 1, 0(%3) \n\t" | |||||
| "pfd 1, 0(%[ptr_x_tmp]) \n\t" | |||||
| "pfd 1, 0(%[ptr_y_tmp]) \n\t" | |||||
| "vzero %%v24 \n\t" | "vzero %%v24 \n\t" | ||||
| "vzero %%v25 \n\t" | "vzero %%v25 \n\t" | ||||
| "vzero %%v26 \n\t" | "vzero %%v26 \n\t" | ||||
| "vzero %%v27 \n\t" | "vzero %%v27 \n\t" | ||||
| "srlg %1,%1,4 \n\t" | |||||
| "srlg %[n_tmp],%[n_tmp],4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | "xgr %%r1,%%r1 \n\t" | ||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | |||||
| "pfd 1, 256(%%r1,%2) \n\t" | |||||
| "pfd 1, 256(%%r1,%3) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vl %%v28, 0(%%r1,%3) \n\t" | |||||
| "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" | |||||
| "vl %%v29, 16(%%r1,%3) \n\t" | |||||
| "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" | |||||
| "vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" | |||||
| "vl %%v30, 32(%%r1,%3) \n\t" | |||||
| "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" | |||||
| "vl %%v31, 48(%%r1,%3) \n\t" | |||||
| "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" | |||||
| "vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" | |||||
| "vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" | |||||
| "vl %%v16, 64(%%r1,%2) \n\t" | |||||
| "vl %%v17, 80(%%r1,%2) \n\t" | |||||
| "vl %%v18, 96(%%r1,%2) \n\t" | |||||
| "vl %%v19, 112(%%r1,%2) \n\t" | |||||
| "vl %%v28, 64(%%r1,%3) \n\t" | |||||
| "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" | |||||
| "vl %%v29, 80(%%r1,%3) \n\t" | |||||
| "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" | |||||
| "vl %%v16, 64(%%r1 ,%[ptr_x_tmp]) \n\t" | |||||
| "vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "vl %%v19, 112(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" | |||||
| "vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" | |||||
| "vl %%v30, 96(%%r1,%3) \n\t" | |||||
| "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" | |||||
| "vl %%v31, 112(%%r1,%3) \n\t" | |||||
| "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" | |||||
| "vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" | |||||
| "vl %%v31, 112(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" | |||||
| "la %%r1,128(%%r1) \n\t" | |||||
| "brctg %1,1b \n\t" | |||||
| "vfadb %%v24,%%v25,%%v24 \n\t" | |||||
| "vfadb %%v24,%%v26,%%v24 \n\t" | |||||
| "vfadb %%v24,%%v27,%%v24 \n\t" | |||||
| "vrepg %%v1,%%v24,1 \n\t" | |||||
| "vfadb %%v1,%%v24,%%v1 \n\t" | |||||
| "ldr %0, %%f1 \n\t" | |||||
| : "=f"(dot) ,"+&r"(n) | |||||
| : "a"(x),"a"(y) | |||||
| :"cc" , "r1","v16", "v17","v18","v19","v20","v21","v22","v23", | |||||
| "la %%r1,128(%%r1) \n\t" | |||||
| "brctg %[n_tmp],1b \n\t" | |||||
| "vfadb %%v24,%%v25,%%v24 \n\t" | |||||
| "vfadb %%v24,%%v26,%%v24 \n\t" | |||||
| "vfadb %%v24,%%v27,%%v24 \n\t" | |||||
| "vrepg %%v1,%%v24,1 \n\t" | |||||
| "vfadb %%v1,%%v24,%%v1 \n\t" | |||||
| "ldr %[dot], %%f1 \n\t" | |||||
| : [dot] "=f"(dot) ,[n_tmp] "+&r"(n) | |||||
| : [mem_x] "m"( *(const double (*)[n])x), | |||||
| [mem_y] "m"( *(const double (*)[n])y), | |||||
| [ptr_x_tmp]"a"(x), [ptr_y_tmp] "a"(y) | |||||
| :"cc" , "r1","f1","v16", "v17","v18","v19","v20","v21","v22","v23", | |||||
| "v24","v25","v26","v27","v28","v29","v30","v31" | "v24","v25","v26","v27","v28","v29","v30","v31" | ||||
| ); | ); | ||||
| @@ -99,14 +101,14 @@ static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| #else | #else | ||||
| static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y ) | |||||
| static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y ) | |||||
| { | { | ||||
| BLASLONG register i = 0; | BLASLONG register i = 0; | ||||
| FLOAT dot = 0.0; | FLOAT dot = 0.0; | ||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| dot += y[i] * x[i] | |||||
| dot += y[i] * x[i] | |||||
| + y[i+1] * x[i+1] | + y[i+1] * x[i+1] | ||||
| + y[i+2] * x[i+2] | + y[i+2] * x[i+2] | ||||
| + y[i+3] * x[i+3] | + y[i+3] * x[i+3] | ||||
| @@ -114,8 +116,17 @@ static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y ) | |||||
| + y[i+5] * x[i+5] | + y[i+5] * x[i+5] | ||||
| + y[i+6] * x[i+6] | + y[i+6] * x[i+6] | ||||
| + y[i+7] * x[i+7] ; | + y[i+7] * x[i+7] ; | ||||
| dot += y[i+8] * x[i+8] | |||||
| + y[i+9] * x[i+9] | |||||
| + y[i+10] * x[i+10] | |||||
| + y[i+11] * x[i+11] | |||||
| + y[i+12] * x[i+12] | |||||
| + y[i+13] * x[i+13] | |||||
| + y[i+14] * x[i+14] | |||||
| + y[i+15] * x[i+15] ; | |||||
| i+=8 ; | |||||
| i+=16 ; | |||||
| } | } | ||||
| return dot; | return dot; | ||||
| @@ -138,10 +149,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| BLASLONG n1 = n & -16; | BLASLONG n1 = n & -16; | ||||
| if ( n1 ) | |||||
| dot = ddot_kernel_8(n1, x, y ); | |||||
| if ( n1 ){ | |||||
| dot = ddot_kernel_16(n1, x, y ); | |||||
| i = n1; | |||||
| } | |||||
| i = n1; | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| @@ -24,44 +24,41 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *****************************************************************************/ | *****************************************************************************/ | ||||
| #include "common.h" | #include "common.h" | ||||
| static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) | |||||
| static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) | |||||
| { | { | ||||
| __asm__ ( | __asm__ ( | ||||
| "pfd 2, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "vlrepg %%v0,0(%3) \n\t" | |||||
| "vlrepg %%v1,0(%4) \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| "pfd 2, 0(%[ptr_x]) \n\t" | |||||
| "pfd 2, 0(%[ptr_y]) \n\t" | |||||
| "lgdr %%r1,%[cos] \n\t" | |||||
| "vlvgp %%v0,%%r1,%%r1 \n\t" | |||||
| "lgdr %%r1,%[sin] \n\t" | |||||
| "vlvgp %%v1,%%r1,%%r1 \n\t" | |||||
| "srlg %[n_tmp],%[n_tmp],5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | "xgr %%r1,%%r1 \n\t" | ||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%[ptr_x]) \n\t" | |||||
| "pfd 2, 256(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | /* 2nd parts*/ | ||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | ||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | ||||
| @@ -71,35 +68,33 @@ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLO | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | ||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | ||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | ||||
| "vst %%v28, 0(%%r1,%1) \n\t" | |||||
| "vst %%v29, 16(%%r1,%1) \n\t" | |||||
| "vst %%v30, 32(%%r1,%1) \n\t" | |||||
| "vst %%v31, 48(%%r1,%1) \n\t" | |||||
| "vst %%v20, 0(%%r1,%2) \n\t" | |||||
| "vst %%v21, 16(%%r1,%2) \n\t" | |||||
| "vst %%v22, 32(%%r1,%2) \n\t" | |||||
| "vst %%v23, 48(%%r1,%2) \n\t" | |||||
| "vl %%v24, 64(%%r1,%1) \n\t" | |||||
| "vl %%v25, 80(%%r1,%1) \n\t" | |||||
| "vl %%v26, 96(%%r1,%1) \n\t" | |||||
| "vl %%v27, 112(%%r1,%1) \n\t" | |||||
| "vl %%v16, 64(%%r1,%2) \n\t" | |||||
| "vl %%v17, 80(%%r1,%2) \n\t" | |||||
| "vl %%v18, 96(%%r1,%2) \n\t" | |||||
| "vl %%v19, 112(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v27, 112(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v19, 112(%%r1,%[ptr_y]) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | /* 2nd parts*/ | ||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | ||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | ||||
| @@ -109,35 +104,33 @@ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLO | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | ||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | ||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | ||||
| "vst %%v28, 64(%%r1,%1) \n\t" | |||||
| "vst %%v29, 80(%%r1,%1) \n\t" | |||||
| "vst %%v30, 96(%%r1,%1) \n\t" | |||||
| "vst %%v31, 112(%%r1,%1) \n\t" | |||||
| "vst %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%2) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v16, 128(%%r1,%2) \n\t" | |||||
| "vl %%v17, 144(%%r1,%2) \n\t" | |||||
| "vl %%v18, 160(%%r1,%2) \n\t" | |||||
| "vl %%v19, 176(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v30, 96(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | /* 2nd parts*/ | ||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | ||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | ||||
| @@ -147,35 +140,33 @@ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLO | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | ||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | ||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | ||||
| "vst %%v28, 128(%%r1,%1) \n\t" | |||||
| "vst %%v29, 144(%%r1,%1) \n\t" | |||||
| "vst %%v30, 160(%%r1,%1) \n\t" | |||||
| "vst %%v31, 176(%%r1,%1) \n\t" | |||||
| "vst %%v20, 128(%%r1,%2) \n\t" | |||||
| "vst %%v21, 144(%%r1,%2) \n\t" | |||||
| "vst %%v22, 160(%%r1,%2) \n\t" | |||||
| "vst %%v23, 176(%%r1,%2) \n\t" | |||||
| "vl %%v24, 192(%%r1,%1) \n\t" | |||||
| "vl %%v25, 208(%%r1,%1) \n\t" | |||||
| "vl %%v26, 224(%%r1,%1) \n\t" | |||||
| "vl %%v27, 240(%%r1,%1) \n\t" | |||||
| "vl %%v16, 192(%%r1,%2) \n\t" | |||||
| "vl %%v17, 208(%%r1,%2) \n\t" | |||||
| "vl %%v18, 224(%%r1,%2) \n\t" | |||||
| "vl %%v19, 240(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | /* 2nd parts*/ | ||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | ||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | ||||
| @@ -185,34 +176,29 @@ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLO | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | ||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | ||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | ||||
| "vst %%v28, 192(%%r1,%1) \n\t" | |||||
| "vst %%v29, 208(%%r1,%1) \n\t" | |||||
| "vst %%v30, 224(%%r1,%1) \n\t" | |||||
| "vst %%v31, 240(%%r1,%1) \n\t" | |||||
| "vst %%v20, 192(%%r1,%2) \n\t" | |||||
| "vst %%v21, 208(%%r1,%2) \n\t" | |||||
| "vst %%v22, 224(%%r1,%2) \n\t" | |||||
| "vst %%v23, 240(%%r1,%2) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %%r0,1b" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y),"a"(c),"a"(s) | |||||
| : "cc", "memory","r0","r1" ,"v0","v1","v16", | |||||
| "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %[n_tmp],1b" | |||||
| : [mem_x] "+m" (*(double (*)[n])x), | |||||
| [mem_y] "+m" (*(double (*)[n])y), | |||||
| [n_tmp] "+&r"(n) | |||||
| : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) | |||||
| : "cc", "r1" ,"v0","v1","v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | ||||
| ); | ); | ||||
| return; | return; | ||||
| } | } | ||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | ||||
| { | { | ||||
| BLASLONG i=0; | BLASLONG i=0; | ||||
| @@ -228,10 +214,8 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| BLASLONG n1 = n & -32; | BLASLONG n1 = n & -32; | ||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| FLOAT cosa,sina; | |||||
| cosa=c; | |||||
| sina=s; | |||||
| drot_kernel_32(n1, x, y, &cosa, &sina); | |||||
| drot_kernel_32(n1, x, y, c, s); | |||||
| i=n1; | i=n1; | ||||
| } | } | ||||
| @@ -245,7 +229,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| } | } | ||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -267,4 +250,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| } | } | ||||
| @@ -28,78 +28,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(Z13) | |||||
| static void __attribute__ ((noinline)) dscal_kernel_8( BLASLONG n, FLOAT da , FLOAT *x ) | |||||
| static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) | |||||
| { | { | ||||
| __asm__ ("pfd 2, 0(%1) \n\t" | |||||
| "vrepg %%v0 , %%v0,0 \n\t" | |||||
| "sllg %%r0,%0,3 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%1) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" | |||||
| "vfmdb %%v24,%%v24,%%v0 \n\t" | |||||
| "vst %%v24, 0(%%r1,%1) \n\t" | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vfmdb %%v25,%%v25,%%v0 \n\t" | |||||
| "vst %%v25, 16(%%r1,%1) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vfmdb %%v26,%%v26,%%v0 \n\t" | |||||
| "vst %%v26, 32(%%r1,%1) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vfmdb %%v27,%%v27,%%v0 \n\t" | |||||
| "vst %%v27, 48(%%r1,%1) \n\t" | |||||
| "vl %%v24, 64(%%r1,%1) \n\t" | |||||
| "vfmdb %%v24,%%v24,%%v0 \n\t" | |||||
| "vst %%v24, 64(%%r1,%1) \n\t" | |||||
| "vl %%v25, 80(%%r1,%1) \n\t" | |||||
| "vfmdb %%v25,%%v25,%%v0 \n\t" | |||||
| "vst %%v25, 80(%%r1,%1) \n\t" | |||||
| "vl %%v26, 96(%%r1,%1) \n\t" | |||||
| "vfmdb %%v26,%%v26,%%v0 \n\t" | |||||
| "vst %%v26, 96(%%r1,%1) \n\t" | |||||
| "vl %%v27, 112(%%r1,%1) \n\t" | |||||
| "vfmdb %%v27,%%v27,%%v0 \n\t" | |||||
| "vst %%v27, 112(%%r1,%1) \n\t" | |||||
| "la %%r1,128(%%r1) \n\t" | |||||
| "clgrjl %%r1,%%r0,1b \n\t" | |||||
| : | |||||
| :"r"(n),"a"(x),"f"(da) | |||||
| :"cc" , "memory" ,"r0","r1","v0","v24","v25","v26","v27" | |||||
| /* faster than sequence of triples(vl vfmd vst) (tested OPENBLAS_LOOPS=10000) */ | |||||
| __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" | |||||
| "lgdr %%r0,%[alpha] \n\t" | |||||
| "vlvgp %%v0,%%r0,%%r0 \n\t" | |||||
| "vlr %%v1,%%v0 \n\t" | |||||
| "sllg %%r0,%[n],3 \n\t" | |||||
| "agr %%r0,%[x_ptr] \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%[x_ptr]) \n\t" | |||||
| "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v16,%%v16,%%v0 \n\t" | |||||
| "vfmdb %%v17,%%v17,%%v1 \n\t" | |||||
| "vfmdb %%v18,%%v18,%%v0 \n\t" | |||||
| "vfmdb %%v19,%%v19,%%v1 \n\t" | |||||
| "vfmdb %%v20,%%v20,%%v0 \n\t" | |||||
| "vfmdb %%v21,%%v21,%%v1 \n\t" | |||||
| "vfmdb %%v22,%%v22,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v23,%%v1 \n\t" | |||||
| "vstm %%v16,%%v23, 0(%[x_ptr]) \n\t" | |||||
| "vlm %%v24,%%v31,128(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v24,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v25,%%v25,%%v1 \n\t" | |||||
| "vfmdb %%v26,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v27,%%v27,%%v1 \n\t" | |||||
| "vfmdb %%v28,%%v28,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v29,%%v1 \n\t" | |||||
| "vfmdb %%v30,%%v30,%%v0 \n\t" | |||||
| "vfmdb %%v31,%%v31,%%v1 \n\t" | |||||
| "vstm %%v24,%%v31,128(%[x_ptr]) \n\t" | |||||
| "la %[x_ptr], 256(%[x_ptr]) \n\t" | |||||
| "clgrjl %[x_ptr],%%r0,1b \n\t" | |||||
| : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) | |||||
| : [n] "r"(n),[alpha] "f"(da) | |||||
| :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", | |||||
| "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | ); | ||||
| } | |||||
| } | |||||
| static void __attribute__ ((noinline)) dscal_kernel_8_zero( BLASLONG n, FLOAT da , FLOAT *x ) | |||||
| static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x ) | |||||
| { | { | ||||
| __asm__ ("pfd 2, 0(%1) \n\t" | |||||
| "vzero %%v0 \n\t" | |||||
| "sllg %%r0,%0,3 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%1) \n\t" | |||||
| "vst %%v0, 0(%%r1,%1) \n\t" | |||||
| "vst %%v0, 16(%%r1,%1) \n\t" | |||||
| "vst %%v0, 32(%%r1,%1) \n\t" | |||||
| "vst %%v0, 48(%%r1,%1) \n\t" | |||||
| "vst %%v0, 64(%%r1,%1) \n\t" | |||||
| "vst %%v0, 80(%%r1,%1) \n\t" | |||||
| "vst %%v0, 96(%%r1,%1) \n\t" | |||||
| "vst %%v0, 112(%%r1,%1) \n\t" | |||||
| "la %%r1,128(%%r1) \n\t" | |||||
| "clgrjl %%r1,%%r0,1b \n\t" | |||||
| : | |||||
| :"r"(n),"a"(x),"f"(da) | |||||
| :"cc" , "memory" ,"r0","r1","v0" | |||||
| __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" | |||||
| "vzero %%v24 \n\t" | |||||
| "sllg %%r0,%[n],3 \n\t" | |||||
| "vzero %%v25 \n\t" | |||||
| "agr %%r0,%[x_ptr] \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%[x_ptr]) \n\t" | |||||
| "vst %%v24, 0(%[x_ptr]) \n\t" | |||||
| "vst %%v25, 16(%[x_ptr]) \n\t" | |||||
| "vst %%v24, 32(%[x_ptr]) \n\t" | |||||
| "vst %%v25, 48(%[x_ptr]) \n\t" | |||||
| "vst %%v24, 64(%[x_ptr]) \n\t" | |||||
| "vst %%v25, 80(%[x_ptr]) \n\t" | |||||
| "vst %%v24, 96(%[x_ptr]) \n\t" | |||||
| "vst %%v25, 112(%[x_ptr]) \n\t" | |||||
| "vst %%v24, 128(%[x_ptr]) \n\t" | |||||
| "vst %%v25, 144(%[x_ptr]) \n\t" | |||||
| "vst %%v24, 160(%[x_ptr]) \n\t" | |||||
| "vst %%v25, 176(%[x_ptr]) \n\t" | |||||
| "vst %%v24, 192(%[x_ptr]) \n\t" | |||||
| "vst %%v25, 208(%[x_ptr]) \n\t" | |||||
| "vst %%v24, 224(%[x_ptr]) \n\t" | |||||
| "vst %%v25, 240(%[x_ptr]) \n\t" | |||||
| "la %[x_ptr],256(%[x_ptr]) \n\t" | |||||
| "clgrjl %[x_ptr],%%r0,1b \n\t" | |||||
| : [mem] "=m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) | |||||
| : [n] "r"(n) | |||||
| :"cc" , "r0", "v24" ,"v25" | |||||
| ); | ); | ||||
| } | } | ||||
| #endif | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | ||||
| { | { | ||||
| @@ -114,11 +123,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| if ( da == 0.0 ) | if ( da == 0.0 ) | ||||
| { | { | ||||
| BLASLONG n1 = n & -16; | |||||
| BLASLONG n1 = n & -32; | |||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| dscal_kernel_8_zero(n1 , da , x); | |||||
| dscal_kernel_32_zero(n1 , x); | |||||
| j=n1; | j=n1; | ||||
| } | } | ||||
| @@ -133,10 +142,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| else | else | ||||
| { | { | ||||
| BLASLONG n1 = n & -16; | |||||
| BLASLONG n1 = n & -32; | |||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| dscal_kernel_8(n1 , da , x); | |||||
| dscal_kernel_32(n1 , da , x); | |||||
| j=n1; | j=n1; | ||||
| } | } | ||||
| while(j < n) | while(j < n) | ||||
| @@ -29,299 +29,205 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #define Z13_SWAP_C 1 | |||||
| #if defined(Z13_SWAP_A) | #if defined(Z13_SWAP_A) | ||||
| static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | { | ||||
| __asm__ volatile( | __asm__ volatile( | ||||
| "pfd 1, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| "pfd 1, 0(%[ptr_x]) \n\t" | |||||
| "pfd 2, 0(%[ptr_y]) \n\t" | |||||
| "srlg %[n_tmp],%[n_tmp],5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | "xgr %%r1,%%r1 \n\t" | ||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%[ptr_x]) \n\t" | |||||
| "pfd 2, 256(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" | |||||
| "vst %%v24, 0(%%r1,%2) \n\t" | |||||
| "vst %%v16, 0(%%r1,%1) \n\t" | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vst %%v25, 16(%%r1,%2) \n\t" | |||||
| "vst %%v17, 16(%%r1,%1) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vst %%v26, 32(%%r1,%2) \n\t" | |||||
| "vst %%v18, 32(%%r1,%1) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vst %%v27, 48(%%r1,%2) \n\t" | |||||
| "vst %%v19, 48(%%r1,%1) \n\t" | |||||
| "vl %%v28, 64(%%r1,%1) \n\t" | |||||
| "vl %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v28, 64(%%r1,%2) \n\t" | |||||
| "vst %%v20, 64(%%r1,%1) \n\t" | |||||
| "vl %%v29, 80(%%r1,%1) \n\t" | |||||
| "vl %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v29, 80(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%1) \n\t" | |||||
| "vl %%v30, 96(%%r1,%1) \n\t" | |||||
| "vl %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v30, 96(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%1) \n\t" | |||||
| "vl %%v31, 112(%%r1,%1) \n\t" | |||||
| "vl %%v23, 112(%%r1,%2) \n\t" | |||||
| "vst %%v31, 112(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%1) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v16, 128(%%r1,%2) \n\t" | |||||
| "vst %%v24, 128(%%r1,%2) \n\t" | |||||
| "vst %%v16, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v17, 144(%%r1,%2) \n\t" | |||||
| "vst %%v25, 144(%%r1,%2) \n\t" | |||||
| "vst %%v17, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v18, 160(%%r1,%2) \n\t" | |||||
| "vst %%v26, 160(%%r1,%2) \n\t" | |||||
| "vst %%v18, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v19, 176(%%r1,%2) \n\t" | |||||
| "vst %%v27, 176(%%r1,%2) \n\t" | |||||
| "vst %%v19, 176(%%r1,%1) \n\t" | |||||
| "vl %%v28, 192(%%r1,%1) \n\t" | |||||
| "vl %%v20, 192(%%r1,%2) \n\t" | |||||
| "vst %%v28, 192(%%r1,%2) \n\t" | |||||
| "vst %%v20, 192(%%r1,%1) \n\t" | |||||
| "vl %%v29, 208(%%r1,%1) \n\t" | |||||
| "vl %%v21, 208(%%r1,%2) \n\t" | |||||
| "vst %%v29, 208(%%r1,%2) \n\t" | |||||
| "vst %%v21, 208(%%r1,%1) \n\t" | |||||
| "vl %%v30, 224(%%r1,%1) \n\t" | |||||
| "vl %%v22, 224(%%r1,%2) \n\t" | |||||
| "vst %%v30, 224(%%r1,%2) \n\t" | |||||
| "vst %%v22, 224(%%r1,%1) \n\t" | |||||
| "vl %%v31, 240(%%r1,%1) \n\t" | |||||
| "vl %%v23, 240(%%r1,%2) \n\t" | |||||
| "vst %%v31, 240(%%r1,%2) \n\t" | |||||
| "vst %%v23, 240(%%r1,%1) \n\t" | |||||
| "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %%r0,1b" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y) | |||||
| : "cc", "memory" ,"r0","r1", "v16","v17","v18","v19","v20","v21","v22","v23" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %[n_tmp],1b" | |||||
| : [mem_x] "+m" (*(double (*)[n])x), | |||||
| [mem_y] "+m" (*(double (*)[n])y), | |||||
| [n_tmp] "+&r"(n) | |||||
| : [ptr_x] "a"(x), [ptr_y] "a"(y) | |||||
| : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" | |||||
| ,"v24","v25","v26","v27","v28","v29","v30","v31" | ,"v24","v25","v26","v27","v28","v29","v30","v31" | ||||
| ); | ); | ||||
| return; | return; | ||||
| } | } | ||||
| #elif defined(Z13_SWAP_B) | |||||
| static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "pfd 2, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vl %%v28, 64(%%r1,%1) \n\t" | |||||
| "vl %%v29, 80(%%r1,%1) \n\t" | |||||
| "vl %%v30, 96(%%r1,%1) \n\t" | |||||
| "vl %%v31, 112(%%r1,%1) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vl %%v20, 64(%%r1,%2) \n\t" | |||||
| "vl %%v21, 80(%%r1,%2) \n\t" | |||||
| "vl %%v22, 96(%%r1,%2) \n\t" | |||||
| "vl %%v23, 112(%%r1,%2) \n\t" | |||||
| "vst %%v24, 0(%%r1,%2) \n\t" | |||||
| "vst %%v25, 16(%%r1,%2) \n\t" | |||||
| "vst %%v26, 32(%%r1,%2) \n\t" | |||||
| "vst %%v27, 48(%%r1,%2) \n\t" | |||||
| "vst %%v28, 64(%%r1,%2) \n\t" | |||||
| "vst %%v29, 80(%%r1,%2) \n\t" | |||||
| "vst %%v30, 96(%%r1,%2) \n\t" | |||||
| "vst %%v31, 112(%%r1,%2)\n\t" | |||||
| "vst %%v16, 0(%%r1,%1) \n\t" | |||||
| "vst %%v17, 16(%%r1,%1) \n\t" | |||||
| "vst %%v18, 32(%%r1,%1) \n\t" | |||||
| "vst %%v19, 48(%%r1,%1) \n\t" | |||||
| "vst %%v20, 64(%%r1,%1) \n\t" | |||||
| "vst %%v21, 80(%%r1,%1) \n\t" | |||||
| "vst %%v22, 96(%%r1,%1) \n\t" | |||||
| "vst %%v23, 112(%%r1,%1)\n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v28, 192(%%r1,%1) \n\t" | |||||
| "vl %%v29, 208(%%r1,%1) \n\t" | |||||
| "vl %%v30, 224(%%r1,%1) \n\t" | |||||
| "vl %%v31, 240(%%r1,%1) \n\t" | |||||
| "vl %%v16, 128(%%r1,%2) \n\t" | |||||
| "vl %%v17, 144(%%r1,%2) \n\t" | |||||
| "vl %%v18, 160(%%r1,%2) \n\t" | |||||
| "vl %%v19, 176(%%r1,%2) \n\t" | |||||
| "vl %%v20, 192(%%r1,%2) \n\t" | |||||
| "vl %%v21, 208(%%r1,%2) \n\t" | |||||
| "vl %%v22, 224(%%r1,%2) \n\t" | |||||
| "vl %%v23, 240(%%r1,%2) \n\t" | |||||
| "vst %%v24, 128(%%r1,%2) \n\t" | |||||
| "vst %%v25, 144(%%r1,%2) \n\t" | |||||
| "vst %%v26, 160(%%r1,%2) \n\t" | |||||
| "vst %%v27, 176(%%r1,%2) \n\t" | |||||
| "vst %%v28, 192(%%r1,%2) \n\t" | |||||
| "vst %%v29, 208(%%r1,%2) \n\t" | |||||
| "vst %%v30, 224(%%r1,%2) \n\t" | |||||
| "vst %%v31, 240(%%r1,%2) \n\t" | |||||
| "vst %%v16, 128(%%r1,%1) \n\t" | |||||
| "vst %%v17, 144(%%r1,%1) \n\t" | |||||
| "vst %%v18, 160(%%r1,%1) \n\t" | |||||
| "vst %%v19, 176(%%r1,%1) \n\t" | |||||
| "vst %%v20, 192(%%r1,%1) \n\t" | |||||
| "vst %%v21, 208(%%r1,%1) \n\t" | |||||
| "vst %%v22, 224(%%r1,%1) \n\t" | |||||
| "vst %%v23, 240(%%r1,%1) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %%r0,1b" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y) | |||||
| : "cc", "memory","r0","r1", "v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| #else | |||||
| #elif defined(Z13_SWAP_C) | |||||
| static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | ||||
| { | { | ||||
| __asm__ volatile( | __asm__ volatile( | ||||
| "pfd 2, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "srlg %%r0,%0,5 \n\t" | |||||
| "pfd 2, 0(%[ptr_x]) \n\t" | |||||
| "pfd 2, 0(%[ptr_y]) \n\t" | |||||
| "srlg %[n_tmp],%[n_tmp],5 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | "xgr %%r1,%%r1 \n\t" | ||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | "1: \n\t" | ||||
| "pfd 2, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "pfd 2, 256(%%r1,%[ptr_x]) \n\t" | |||||
| "pfd 2, 256(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v16, 0(%%r1,%1) \n\t" | |||||
| "vl %%v17, 16(%%r1,%1) \n\t" | |||||
| "vl %%v18, 32(%%r1,%1) \n\t" | |||||
| "vl %%v19, 48(%%r1,%1) \n\t" | |||||
| "vl %%v20, 64(%%r1,%1) \n\t" | |||||
| "vl %%v21, 80(%%r1,%1) \n\t" | |||||
| "vl %%v22, 96(%%r1,%1) \n\t" | |||||
| "vl %%v23, 112(%%r1,%1) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v28, 192(%%r1,%1) \n\t" | |||||
| "vl %%v29, 208(%%r1,%1) \n\t" | |||||
| "vl %%v30, 224(%%r1,%1) \n\t" | |||||
| "vl %%v31, 240(%%r1,%1) \n\t" | |||||
| "vl %%v0, 0(%%r1,%2) \n\t" | |||||
| "vl %%v1, 16(%%r1,%2) \n\t" | |||||
| "vl %%v2, 32(%%r1,%2) \n\t" | |||||
| "vl %%v3, 48(%%r1,%2) \n\t" | |||||
| "vl %%v4, 64(%%r1,%2) \n\t" | |||||
| "vl %%v5, 80(%%r1,%2) \n\t" | |||||
| "vl %%v6, 96(%%r1,%2) \n\t" | |||||
| "vl %%v7, 112(%%r1,%2) \n\t" | |||||
| "vst %%v0, 0(%%r1,%1) \n\t" | |||||
| "vst %%v1, 16(%%r1,%1) \n\t" | |||||
| "vst %%v2, 32(%%r1,%1) \n\t" | |||||
| "vst %%v3, 48(%%r1,%1) \n\t" | |||||
| "vst %%v4, 64(%%r1,%1) \n\t" | |||||
| "vst %%v5, 80(%%r1,%1) \n\t" | |||||
| "vst %%v6, 96(%%r1,%1) \n\t" | |||||
| "vst %%v7, 112(%%r1,%1) \n\t" | |||||
| "vl %%v0, 128(%%r1,%2) \n\t" | |||||
| "vl %%v1, 144(%%r1,%2) \n\t" | |||||
| "vl %%v2, 160(%%r1,%2) \n\t" | |||||
| "vl %%v3, 176(%%r1,%2) \n\t" | |||||
| "vl %%v4, 192(%%r1,%2) \n\t" | |||||
| "vl %%v5, 208(%%r1,%2) \n\t" | |||||
| "vl %%v6, 224(%%r1,%2) \n\t" | |||||
| "vl %%v7, 240(%%r1,%2) \n\t" | |||||
| "vst %%v0, 128(%%r1,%1) \n\t" | |||||
| "vst %%v1, 144(%%r1,%1) \n\t" | |||||
| "vst %%v2, 160(%%r1,%1) \n\t" | |||||
| "vst %%v3, 176(%%r1,%1) \n\t" | |||||
| "vst %%v4, 192(%%r1,%1) \n\t" | |||||
| "vst %%v5, 208(%%r1,%1) \n\t" | |||||
| "vst %%v6, 224(%%r1,%1) \n\t" | |||||
| "vst %%v7, 240(%%r1,%1) \n\t" | |||||
| "vst %%v16, 0(%%r1,%2) \n\t" | |||||
| "vst %%v17, 16(%%r1,%2) \n\t" | |||||
| "vst %%v18, 32(%%r1,%2) \n\t" | |||||
| "vst %%v19, 48(%%r1,%2) \n\t" | |||||
| "vst %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%2) \n\t" | |||||
| "vst %%v24, 128(%%r1,%2) \n\t" | |||||
| "vst %%v25, 144(%%r1,%2) \n\t" | |||||
| "vst %%v26, 160(%%r1,%2) \n\t" | |||||
| "vst %%v27, 176(%%r1,%2) \n\t" | |||||
| "vst %%v28, 192(%%r1,%2) \n\t" | |||||
| "vst %%v29, 208(%%r1,%2) \n\t" | |||||
| "vst %%v30, 224(%%r1,%2) \n\t" | |||||
| "vst %%v31, 240(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %%r0,1b" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y) | |||||
| : "cc", "memory","r0","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %[n_tmp],1b" | |||||
| : [mem_x] "+m" (*(double (*)[n])x), | |||||
| [mem_y] "+m" (*(double (*)[n])y), | |||||
| [n_tmp] "+&r"(n) | |||||
| : [ptr_x] "a"(x), [ptr_y] "a"(y) | |||||
| : "cc", "memory","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | ||||
| ); | ); | ||||
| return; | return; | ||||
| @@ -43,15 +43,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| * Warning: requirements n>0 and n % 32 == 0 | * Warning: requirements n>0 and n % 32 == 0 | ||||
| * @param n | * @param n | ||||
| * @param x pointer to the vector | * @param x pointer to the vector | ||||
| * @param minf (out) maximum absolute value .( only for output ) | |||||
| * @param maxf (out) maximum absolute value .( only for output ) | |||||
| * @return index | * @return index | ||||
| */ | */ | ||||
| static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { | static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { | ||||
| BLASLONG index; | BLASLONG index; | ||||
| __asm__( | __asm__( | ||||
| "pfd 1, 0(%4) \n\t" | |||||
| "sllg %%r0,%3,3 \n\t" | |||||
| "agr %%r0,%4 \n\t" | |||||
| "pfd 1, 0(%[ptr_x]) \n\t" | |||||
| "sllg %%r0,%[n],3 \n\t" | |||||
| "agr %%r0,%[ptr_x] \n\t" | |||||
| "vleig %%v20,0,0 \n\t" | "vleig %%v20,0,0 \n\t" | ||||
| "vleig %%v20,1,1 \n\t" | "vleig %%v20,1,1 \n\t" | ||||
| "vleig %%v21,2,0 \n\t" | "vleig %%v21,2,0 \n\t" | ||||
| @@ -61,13 +61,13 @@ static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||||
| "vleig %%v23,6,0 \n\t" | "vleig %%v23,6,0 \n\t" | ||||
| "vleig %%v23,7,1 \n\t" | "vleig %%v23,7,1 \n\t" | ||||
| "vrepig %%v4,8 \n\t" | "vrepig %%v4,8 \n\t" | ||||
| "vzero %%v5 \n\t" | |||||
| "vzero %%v18 \n\t" | |||||
| "vzero %%v19 \n\t" | |||||
| "vzero %%v5 \n\t" | |||||
| "vzero %%v18 \n\t" | |||||
| "vzero %%v19 \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | "1: \n\t" | ||||
| "pfd 1, 256(%2 ) \n\t" | |||||
| "vlm %%v24,%%v31, 0(%2 ) \n\t" | |||||
| "pfd 1, 256(%[ptr_tmp] ) \n\t" | |||||
| "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | "vflpdb %%v24, %%v24 \n\t" | ||||
| "vflpdb %%v25, %%v25 \n\t" | "vflpdb %%v25, %%v25 \n\t" | ||||
| "vflpdb %%v26, %%v26 \n\t" | "vflpdb %%v26, %%v26 \n\t" | ||||
| @@ -89,24 +89,24 @@ static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||||
| "vsel %%v26,%%v23,%%v22,%%v17 \n\t" | "vsel %%v26,%%v23,%%v22,%%v17 \n\t" | ||||
| "vsel %%v27,%%v31,%%v30,%%v17 \n\t" | "vsel %%v27,%%v31,%%v30,%%v17 \n\t" | ||||
| "vfchdb %%v28, %%v3,%%v0 \n\t" | |||||
| "vfchdb %%v29,%%v27, %%v25 \n\t" | |||||
| "vsel %%v1,%%v2,%%v1,%%v28 \n\t" | |||||
| "vsel %%v0,%%v3,%%v0,%%v28 \n\t" | |||||
| "vsel %%v24,%%v26,%%v24,%%v29 \n\t" | |||||
| "vsel %%v25,%%v27,%%v25,%%v29 \n\t" | |||||
| "vag %%v1,%%v1,%%v5 \n\t" | |||||
| "vag %%v24,%%v24,%%v5 \n\t" | |||||
| "vag %%v24,%%v24,%%v4 \n\t" | |||||
| "vfchdb %%v16,%%v25 , %%v0 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vsel %%v29,%%v25,%%v0,%%v16 \n\t" | |||||
| "vsel %%v28,%%v24,%%v1,%%v16 \n\t" | |||||
| "vfchdb %%v17, %%v29,%%v18 \n\t" | |||||
| "vsel %%v19,%%v28,%%v19,%%v17 \n\t" | |||||
| "vfchdb %%v28, %%v3,%%v0 \n\t" | |||||
| "vfchdb %%v29,%%v27, %%v25 \n\t" | |||||
| "vsel %%v1,%%v2,%%v1,%%v28 \n\t" | |||||
| "vsel %%v0,%%v3,%%v0,%%v28 \n\t" | |||||
| "vsel %%v24,%%v26,%%v24,%%v29 \n\t" | |||||
| "vsel %%v25,%%v27,%%v25,%%v29 \n\t" | |||||
| "vag %%v1,%%v1,%%v5 \n\t" | |||||
| "vag %%v24,%%v24,%%v5 \n\t" | |||||
| "vag %%v24,%%v24,%%v4 \n\t" | |||||
| "vfchdb %%v16,%%v25 , %%v0 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vsel %%v29,%%v25,%%v0,%%v16 \n\t" | |||||
| "vsel %%v28,%%v24,%%v1,%%v16 \n\t" | |||||
| "vfchdb %%v17, %%v29,%%v18 \n\t" | |||||
| "vsel %%v19,%%v28,%%v19,%%v17 \n\t" | |||||
| "vsel %%v18,%%v29,%%v18,%%v17 \n\t" | "vsel %%v18,%%v29,%%v18,%%v17 \n\t" | ||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vlm %%v24,%%v31,128(%2 ) \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | "vflpdb %%v24, %%v24 \n\t" | ||||
| "vflpdb %%v25, %%v25 \n\t" | "vflpdb %%v25, %%v25 \n\t" | ||||
| "vflpdb %%v26, %%v26 \n\t" | "vflpdb %%v26, %%v26 \n\t" | ||||
| @@ -134,37 +134,38 @@ static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||||
| "vsel %%v0,%%v3,%%v0,%%v28 \n\t" | "vsel %%v0,%%v3,%%v0,%%v28 \n\t" | ||||
| "vsel %%v24,%%v26,%%v24,%%v29 \n\t" | "vsel %%v24,%%v26,%%v24,%%v29 \n\t" | ||||
| "vsel %%v25,%%v27,%%v25,%%v29 \n\t" | "vsel %%v25,%%v27,%%v25,%%v29 \n\t" | ||||
| "vag %%v1,%%v1,%%v5 \n\t" | |||||
| "vag %%v24,%%v24,%%v5 \n\t" | |||||
| "la %2,256(%2) \n\t" | |||||
| "vag %%v24,%%v24,%%v4 \n\t" | |||||
| "vag %%v1,%%v1,%%v5 \n\t" | |||||
| "vag %%v24,%%v24,%%v5 \n\t" | |||||
| "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" | |||||
| "vag %%v24,%%v24,%%v4 \n\t" | |||||
| "vfchdb %%v16,%%v25 , %%v0 \n\t" | "vfchdb %%v16,%%v25 , %%v0 \n\t" | ||||
| "vag %%v5,%%v5,%%v4 \n\t" | "vag %%v5,%%v5,%%v4 \n\t" | ||||
| "vsel %%v29,%%v25,%%v0,%%v16 \n\t" | |||||
| "vsel %%v28,%%v24,%%v1,%%v16 \n\t" | |||||
| "vsel %%v29,%%v25,%%v0,%%v16 \n\t" | |||||
| "vsel %%v28,%%v24,%%v1,%%v16 \n\t" | |||||
| "vfchdb %%v17, %%v29,%%v18 \n\t" | "vfchdb %%v17, %%v29,%%v18 \n\t" | ||||
| "vsel %%v19,%%v28,%%v19,%%v17 \n\t" | "vsel %%v19,%%v28,%%v19,%%v17 \n\t" | ||||
| "vsel %%v18,%%v29,%%v18,%%v17 \n\t" | "vsel %%v18,%%v29,%%v18,%%v17 \n\t" | ||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "clgrjl %2,%%r0,1b \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "clgrjl %[ptr_tmp],%%r0,1b \n\t" | |||||
| "vrepg %%v26,%%v18,1 \n\t" | |||||
| "vrepg %%v5,%%v19,1 \n\t" | |||||
| "wfcdb %%v26,%%v18 \n\t" | |||||
| "vrepg %%v26,%%v18,1 \n\t" | |||||
| "vrepg %%v5,%%v19,1 \n\t" | |||||
| "wfcdb %%v26,%%v18 \n\t" | |||||
| "jne 2f \n\t" | "jne 2f \n\t" | ||||
| "vsteg %%v18,%1,0 \n\t" | |||||
| "vmnlg %%v1,%%v5,%%v19 \n\t" | |||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "j 3f \n\t" | |||||
| "2: \n\t" | |||||
| "vsteg %%v18,%[maxf],0 \n\t" | |||||
| "vmnlg %%v1,%%v5,%%v19 \n\t" | |||||
| "j 3f \n\t" | |||||
| "2: \n\t" | |||||
| "wfchdb %%v16,%%v26,%%v18 \n\t" | "wfchdb %%v16,%%v26,%%v18 \n\t" | ||||
| "vsel %%v1,%%v5,%%v19,%%v16 \n\t" | "vsel %%v1,%%v5,%%v19,%%v16 \n\t" | ||||
| "vsel %%v0,%%v26,%%v18,%%v16 \n\t" | "vsel %%v0,%%v26,%%v18,%%v16 \n\t" | ||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "std %%f0,%1 \n\t" | |||||
| "3: " | |||||
| : "=r"(index) ,"=m"(*maxf) , "+&a"(x) | |||||
| : "r"(n), "2"(x) | |||||
| "std %%f0,%[maxf] \n\t" | |||||
| "3: \n\t" | |||||
| "vlgvg %[index],%%v1,0 \n\t" | |||||
| : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) | |||||
| : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) | |||||
| : "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", | : "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", | ||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | ||||
| ); | ); | ||||
| @@ -48,9 +48,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | ||||
| BLASLONG index; | BLASLONG index; | ||||
| __asm__( | __asm__( | ||||
| "pfd 1, 0(%4) \n\t" | |||||
| "sllg %%r0,%3,3 \n\t" | |||||
| "agr %%r0,%4 \n\t" | |||||
| "pfd 1, 0(%[ptr_x]) \n\t" | |||||
| "sllg %%r0,%[n],3 \n\t" | |||||
| "agr %%r0,%[ptr_x] \n\t" | |||||
| "vleig %%v20,0,0 \n\t" | "vleig %%v20,0,0 \n\t" | ||||
| "vleig %%v20,1,1 \n\t" | "vleig %%v20,1,1 \n\t" | ||||
| "vleig %%v21,2,0 \n\t" | "vleig %%v21,2,0 \n\t" | ||||
| @@ -60,14 +60,14 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||||
| "vleig %%v23,6,0 \n\t" | "vleig %%v23,6,0 \n\t" | ||||
| "vleig %%v23,7,1 \n\t" | "vleig %%v23,7,1 \n\t" | ||||
| "vrepig %%v4,8 \n\t" | "vrepig %%v4,8 \n\t" | ||||
| "vlrepg %%v18,0(%4) \n\t" | |||||
| "vzero %%v5 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vzero %%v19 \n\t" | |||||
| "vlrepg %%v18,0(%[ptr_x]) \n\t" | |||||
| "vzero %%v5 \n\t" | |||||
| "vflpdb %%v18, %%v18 \n\t" | |||||
| "vzero %%v19 \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | "1: \n\t" | ||||
| "pfd 1, 256(%2 ) \n\t" | |||||
| "vlm %%v24,%%v31, 0(%2 ) \n\t" | |||||
| "pfd 1, 256(%[ptr_tmp] ) \n\t" | |||||
| "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | "vflpdb %%v24, %%v24 \n\t" | ||||
| "vflpdb %%v25, %%v25 \n\t" | "vflpdb %%v25, %%v25 \n\t" | ||||
| @@ -99,22 +99,22 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||||
| "vsel %%v24,%%v26,%%v24,%%v29 \n\t" | "vsel %%v24,%%v26,%%v24,%%v29 \n\t" | ||||
| "vsel %%v25,%%v27,%%v25,%%v29 \n\t" | "vsel %%v25,%%v27,%%v25,%%v29 \n\t" | ||||
| "vag %%v1,%%v1,%%v5 \n\t" | |||||
| "vag %%v24,%%v24,%%v5 \n\t" | |||||
| "vag %%v24,%%v24,%%v4 \n\t" | |||||
| "vag %%v1,%%v1,%%v5 \n\t" | |||||
| "vag %%v24,%%v24,%%v5 \n\t" | |||||
| "vag %%v24,%%v24,%%v4 \n\t" | |||||
| "vfchdb %%v16, %%v0,%%v25 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vfchdb %%v16, %%v0,%%v25 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vsel %%v29,%%v25,%%v0,%%v16 \n\t" | "vsel %%v29,%%v25,%%v0,%%v16 \n\t" | ||||
| "vsel %%v28,%%v24,%%v1,%%v16 \n\t" | |||||
| "vsel %%v28,%%v24,%%v1,%%v16 \n\t" | |||||
| "vfchdb %%v17,%%v18, %%v29 \n\t" | |||||
| "vsel %%v19,%%v28,%%v19,%%v17 \n\t" | |||||
| "vfchdb %%v17,%%v18, %%v29 \n\t" | |||||
| "vsel %%v19,%%v28,%%v19,%%v17 \n\t" | |||||
| "vsel %%v18,%%v29,%%v18,%%v17 \n\t" | "vsel %%v18,%%v29,%%v18,%%v17 \n\t" | ||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vlm %%v24,%%v31,128(%2 ) \n\t" | |||||
| "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | "vflpdb %%v24, %%v24 \n\t" | ||||
| "vflpdb %%v25, %%v25 \n\t" | "vflpdb %%v25, %%v25 \n\t" | ||||
| "vflpdb %%v26, %%v26 \n\t" | "vflpdb %%v26, %%v26 \n\t" | ||||
| @@ -147,7 +147,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||||
| "vag %%v1,%%v1,%%v5 \n\t" | "vag %%v1,%%v1,%%v5 \n\t" | ||||
| "vag %%v24,%%v24,%%v5 \n\t" | "vag %%v24,%%v24,%%v5 \n\t" | ||||
| "la %2,256(%2) \n\t" | |||||
| "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" | |||||
| "vag %%v24,%%v24,%%v4 \n\t" | "vag %%v24,%%v24,%%v4 \n\t" | ||||
| "vfchdb %%v16, %%v0,%%v25 \n\t" | "vfchdb %%v16, %%v0,%%v25 \n\t" | ||||
| @@ -161,27 +161,28 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | "vag %%v5,%%v5,%%v4 \n\t" | ||||
| "clgrjl %2,%%r0,1b \n\t" | |||||
| "clgrjl %[ptr_tmp],%%r0,1b \n\t" | |||||
| "vrepg %%v26,%%v18,1 \n\t" | "vrepg %%v26,%%v18,1 \n\t" | ||||
| "vrepg %%v5,%%v19,1 \n\t" | "vrepg %%v5,%%v19,1 \n\t" | ||||
| "wfcdb %%v26,%%v18 \n\t" | "wfcdb %%v26,%%v18 \n\t" | ||||
| "jne 2f \n\t" | "jne 2f \n\t" | ||||
| "vsteg %%v18,%1,0 \n\t" | |||||
| "vmnlg %%v1,%%v5,%%v19 \n\t" | |||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "j 3f \n\t" | |||||
| "2: \n\t" | |||||
| "vsteg %%v18,%[minf],0 \n\t" | |||||
| "vmnlg %%v1,%%v5,%%v19 \n\t" | |||||
| "j 3f \n\t" | |||||
| "2: \n\t" | |||||
| "wfchdb %%v16,%%v18 ,%%v26 \n\t " | "wfchdb %%v16,%%v18 ,%%v26 \n\t " | ||||
| "vsel %%v1,%%v5,%%v19,%%v16 \n\t" | "vsel %%v1,%%v5,%%v19,%%v16 \n\t" | ||||
| "vsel %%v0,%%v26,%%v18,%%v16 \n\t" | "vsel %%v0,%%v26,%%v18,%%v16 \n\t" | ||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "std %%f0,%1 \n\t" | |||||
| "3:" | |||||
| "std %%f0,%[minf] \n\t" | |||||
| "3: \n\t" | |||||
| "vlgvg %[index],%%v1,0 \n\t" | |||||
| : "+r"(index) ,"=m"(*minf),"+&a"(x) | |||||
| : "r"(n), "2"(x) | |||||
| : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) | |||||
| : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) | |||||
| : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", | : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", | ||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | ||||
| @@ -37,16 +37,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /** | /** | ||||
| * Find maximum index | * Find maximum index | ||||
| * Warning: requirements n>0 and n % 8 == 0 | |||||
| * Warning: requirements n>0 and n % 16 == 0 | |||||
| * @param n | * @param n | ||||
| * @param x pointer to the vector | * @param x pointer to the vector | ||||
| * @param minf (out) maximum absolute value .( only for output ) | |||||
| * @param maxf (out) maximum absolute value .( only for output ) | |||||
| * @return index | * @return index | ||||
| */ | */ | ||||
| static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||||
| static BLASLONG ziamax_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||||
| BLASLONG index; | BLASLONG index; | ||||
| __asm__( | __asm__( | ||||
| "pfd 1, 0(%4) \n\t" | |||||
| "pfd 1, 0(%[ptr_x]) \n\t" | |||||
| "vleig %%v16,0,0 \n\t" | "vleig %%v16,0,0 \n\t" | ||||
| "vleig %%v16,1,1 \n\t" | "vleig %%v16,1,1 \n\t" | ||||
| "vleig %%v17,2,0 \n\t" | "vleig %%v17,2,0 \n\t" | ||||
| @@ -65,32 +65,32 @@ static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||||
| "vleig %%v23,15,1 \n\t" | "vleig %%v23,15,1 \n\t" | ||||
| "sllg %%r0,%3,4 \n\t" | |||||
| "agr %%r0,%4 \n\t" | |||||
| "vzero %%v6 \n\t" | |||||
| "vzero %%v7 \n\t" | |||||
| "vrepig %%v4,16 \n\t" | |||||
| "vzero %%v5 \n\t" | |||||
| "sllg %%r0,%[n],4 \n\t" | |||||
| "agr %%r0,%[ptr_x] \n\t" | |||||
| "vzero %%v6 \n\t" | |||||
| "vzero %%v7 \n\t" | |||||
| "vrepig %%v4,16 \n\t" | |||||
| "vzero %%v5 \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | |||||
| "pfd 1, 256(%2 ) \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%[ptr_tmp] ) \n\t" | |||||
| "vleg %%v24 , 0( %2),0 \n\t" | |||||
| "vleg %%v25 , 8( %2),0 \n\t" | |||||
| "vleg %%v24 , 16( %2),1 \n\t" | |||||
| "vleg %%v25 , 24( %2),1 \n\t" | |||||
| "vleg %%v26 , 32( %2),0 \n\t" | |||||
| "vleg %%v27 , 40( %2),0 \n\t" | |||||
| "vleg %%v26 , 48( %2),1 \n\t" | |||||
| "vleg %%v27 , 56( %2),1 \n\t" | |||||
| "vleg %%v28 , 64( %2),0 \n\t" | |||||
| "vleg %%v29 , 72( %2),0 \n\t" | |||||
| "vleg %%v28 , 80( %2),1 \n\t" | |||||
| "vleg %%v29 , 88( %2),1 \n\t" | |||||
| "vleg %%v30 , 96( %2),0 \n\t" | |||||
| "vleg %%v31 ,104( %2),0 \n\t" | |||||
| "vleg %%v30 ,112( %2),1 \n\t" | |||||
| "vleg %%v31 ,120( %2),1 \n\t" | |||||
| "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v30 ,112(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | "vflpdb %%v24, %%v24 \n\t" | ||||
| "vflpdb %%v25, %%v25 \n\t" | "vflpdb %%v25, %%v25 \n\t" | ||||
| "vflpdb %%v26, %%v26 \n\t" | "vflpdb %%v26, %%v26 \n\t" | ||||
| @@ -100,28 +100,28 @@ static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||||
| "vflpdb %%v30, %%v30 \n\t" | "vflpdb %%v30, %%v30 \n\t" | ||||
| "vflpdb %%v31, %%v31 \n\t" | "vflpdb %%v31, %%v31 \n\t" | ||||
| "vfadb %%v0,%%v24,%%v25 \n\t" | |||||
| "vfadb %%v1,%%v26,%%v27 \n\t" | |||||
| "vfadb %%v2,%%v28,%%v29 \n\t" | |||||
| "vfadb %%v3,%%v30,%%v31 \n\t" | |||||
| "vfadb %%v0,%%v24,%%v25 \n\t" | |||||
| "vfadb %%v1,%%v26,%%v27 \n\t" | |||||
| "vfadb %%v2,%%v28,%%v29 \n\t" | |||||
| "vfadb %%v3,%%v30,%%v31 \n\t" | |||||
| "vleg %%v24 , 128( %2),0 \n\t" | |||||
| "vleg %%v25 , 136( %2),0 \n\t" | |||||
| "vleg %%v24 , 144( %2),1 \n\t" | |||||
| "vleg %%v25 , 152( %2),1 \n\t" | |||||
| "vleg %%v26 , 160( %2),0 \n\t" | |||||
| "vleg %%v27 , 168( %2),0 \n\t" | |||||
| "vleg %%v26 , 176( %2),1 \n\t" | |||||
| "vleg %%v27 , 184( %2),1 \n\t" | |||||
| "vleg %%v28 , 192( %2),0 \n\t" | |||||
| "vleg %%v29 , 200( %2),0 \n\t" | |||||
| "vleg %%v28 , 208( %2),1 \n\t" | |||||
| "vleg %%v29 , 216( %2),1 \n\t" | |||||
| "vleg %%v30 , 224( %2),0 \n\t" | |||||
| "vleg %%v31 , 232( %2),0 \n\t" | |||||
| "vleg %%v30 , 240( %2),1 \n\t" | |||||
| "vleg %%v31 , 248( %2),1 \n\t" | |||||
| "vleg %%v24 , 128(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v25 , 136(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v24 , 144(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v25 , 152(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v26 , 160(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v27 , 168(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v26 , 176(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v27 , 184(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v28 , 192(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v29 , 200(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v28 , 208(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v29 , 216(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v30 , 224(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v31 , 232(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v30 , 240(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v31 , 248(%[ptr_tmp]),1 \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | "vflpdb %%v24, %%v24 \n\t" | ||||
| "vflpdb %%v25, %%v25 \n\t" | "vflpdb %%v25, %%v25 \n\t" | ||||
| "vflpdb %%v26, %%v26 \n\t" | "vflpdb %%v26, %%v26 \n\t" | ||||
| @@ -131,70 +131,70 @@ static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||||
| "vflpdb %%v30, %%v30 \n\t" | "vflpdb %%v30, %%v30 \n\t" | ||||
| "vflpdb %%v31, %%v31 \n\t" | "vflpdb %%v31, %%v31 \n\t" | ||||
| "vfadb %%v24,%%v24,%%v25 \n\t" | |||||
| "vfadb %%v26,%%v26,%%v27 \n\t" | |||||
| "vfadb %%v28,%%v28,%%v29 \n\t" | |||||
| "vfadb %%v30,%%v30,%%v31 \n\t" | |||||
| "vfadb %%v24,%%v24,%%v25 \n\t" | |||||
| "vfadb %%v26,%%v26,%%v27 \n\t" | |||||
| "vfadb %%v28,%%v28,%%v29 \n\t" | |||||
| "vfadb %%v30,%%v30,%%v31 \n\t" | |||||
| "vfchdb %%v25,%%v1,%%v0 \n\t" | "vfchdb %%v25,%%v1,%%v0 \n\t" | ||||
| "vsel %%v29,%%v17,%%v16,%%v25 \n\t" | |||||
| "vsel %%v31,%%v1,%%v0,%%v25 \n\t" | |||||
| "vsel %%v29,%%v17,%%v16,%%v25 \n\t" | |||||
| "vsel %%v31,%%v1,%%v0,%%v25 \n\t" | |||||
| "vfchdb %%v27,%%v3,%%v2 \n\t " | "vfchdb %%v27,%%v3,%%v2 \n\t " | ||||
| "vsel %%v0,%%v19,%%v18,%%v27 \n\t" | |||||
| "vsel %%v1,%%v3,%%v2,%%v27 \n\t" | |||||
| "vsel %%v0,%%v19,%%v18,%%v27 \n\t" | |||||
| "vsel %%v1,%%v3,%%v2,%%v27 \n\t" | |||||
| "vfchdb %%v25,%%v26,%%v24 \n\t " | |||||
| "vsel %%v2,%%v21,%%v20,%%v25 \n\t" | |||||
| "vsel %%v3,%%v26,%%v24,%%v25 \n\t" | |||||
| "vfchdb %%v25,%%v26,%%v24 \n\t" | |||||
| "vsel %%v2,%%v21,%%v20,%%v25 \n\t" | |||||
| "vsel %%v3,%%v26,%%v24,%%v25 \n\t" | |||||
| "vfchdb %%v27,%%v30,%%v28 \n\t " | |||||
| "vsel %%v25,%%v23,%%v22,%%v27 \n\t" | |||||
| "vsel %%v27,%%v30,%%v28,%%v27 \n\t" | |||||
| "vfchdb %%v27,%%v30,%%v28 \n\t" | |||||
| "vsel %%v25,%%v23,%%v22,%%v27 \n\t" | |||||
| "vsel %%v27,%%v30,%%v28,%%v27 \n\t" | |||||
| "vfchdb %%v24, %%v1,%%v31 \n\t" | |||||
| "vsel %%v26,%%v0,%%v29,%%v24 \n\t" | |||||
| "vsel %%v28,%%v1,%%v31,%%v24 \n\t" | |||||
| "vfchdb %%v24, %%v1,%%v31 \n\t" | |||||
| "vsel %%v26,%%v0,%%v29,%%v24 \n\t" | |||||
| "vsel %%v28,%%v1,%%v31,%%v24 \n\t" | |||||
| "vfchdb %%v30, %%v27,%%v3 \n\t" | |||||
| "vsel %%v29,%%v25,%%v2,%%v30 \n\t" | |||||
| "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" | |||||
| "vfchdb %%v30, %%v27,%%v3 \n\t" | |||||
| "vsel %%v29,%%v25,%%v2,%%v30 \n\t" | |||||
| "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" | |||||
| "la %2,256(%2) \n\t" | |||||
| "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" | |||||
| "vfchdb %%v0, %%v31,%%v28 \n\t" | |||||
| "vsel %%v25,%%v29,%%v26,%%v0 \n\t" | |||||
| "vsel %%v27,%%v31,%%v28,%%v0 \n\t" | |||||
| "vfchdb %%v0, %%v31,%%v28 \n\t" | |||||
| "vsel %%v25,%%v29,%%v26,%%v0 \n\t" | |||||
| "vsel %%v27,%%v31,%%v28,%%v0 \n\t" | |||||
| "vag %%v25,%%v25,%%v5 \n\t" | |||||
| "vag %%v25,%%v25,%%v5 \n\t" | |||||
| //cmp with previous | //cmp with previous | ||||
| "vfchdb %%v30, %%v27,%%v6 \n\t" | |||||
| "vsel %%v7,%%v25,%%v7,%%v30 \n\t" | |||||
| "vsel %%v6,%%v27,%%v6,%%v30 \n\t" | |||||
| "vfchdb %%v30, %%v27,%%v6 \n\t" | |||||
| "vsel %%v7,%%v25,%%v7,%%v30 \n\t" | |||||
| "vsel %%v6,%%v27,%%v6,%%v30 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "clgrjl %2,%%r0,1b \n\t" | |||||
| "clgrjl %[ptr_tmp],%%r0,1b \n\t" | |||||
| //xtract index | //xtract index | ||||
| "vrepg %%v26,%%v6,1 \n\t" | |||||
| "vrepg %%v5,%%v7,1 \n\t" | |||||
| "vrepg %%v26,%%v6,1 \n\t" | |||||
| "vrepg %%v5,%%v7,1 \n\t" | |||||
| "wfcdb %%v26,%%v6 \n\t" | "wfcdb %%v26,%%v6 \n\t" | ||||
| "jne 2f \n\t" | |||||
| "vsteg %%v6,%1,0 \n\t" | |||||
| "jne 2f \n\t" | |||||
| "vsteg %%v6,%[maxf],0 \n\t" | |||||
| "vmnlg %%v1,%%v5,%%v7 \n\t" | "vmnlg %%v1,%%v5,%%v7 \n\t" | ||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "j 3 \n\t" | |||||
| "2: \n\t" | |||||
| "wfchdb %%v16,%%v26,%%v6 \n\t" | |||||
| "vsel %%v1,%%v5,%%v7,%%v16 \n\t" | |||||
| "vsel %%v0,%%v26,%%v6,%%v16 \n\t" | |||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "std %%f0,%1 \n\t" | |||||
| "3: \n\t" | |||||
| : "=r"(index),"=m"(*maxf),"+&a"(x) | |||||
| : "r"(n), "2"(x) | |||||
| "vlgvg %[index],%%v1,0 \n\t" | |||||
| "j 3 \n\t" | |||||
| "2: \n\t" | |||||
| "wfchdb %%v16,%%v26,%%v6 \n\t" | |||||
| "vsel %%v1,%%v5,%%v7,%%v16 \n\t" | |||||
| "vsel %%v0,%%v26,%%v6,%%v16 \n\t" | |||||
| "vlgvg %[index],%%v1,0 \n\t" | |||||
| "std %%f0,%[maxf] \n\t" | |||||
| "3: \n\t" | |||||
| : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) | |||||
| : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) | |||||
| : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", | : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", | ||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | ||||
| @@ -220,12 +220,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| if (inc_x == 1) { | if (inc_x == 1) { | ||||
| BLASLONG n1 = n & -8; | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | if (n1 > 0) { | ||||
| max = ziamax_kernel_8_TUNED(n1, x, &maxf); | |||||
| max = ziamax_kernel_16_TUNED(n1, x, &maxf); | |||||
| i = n1; | i = n1; | ||||
| ix = n1 << 1; | |||||
| } | } | ||||
| while(i < n) | while(i < n) | ||||
| @@ -35,16 +35,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /** | /** | ||||
| * Find minimum index | * Find minimum index | ||||
| * Warning: requirements n>0 and n % 8 == 0 | |||||
| * Warning: requirements n>0 and n % 16 == 0 | |||||
| * @param n | * @param n | ||||
| * @param x pointer to the vector | * @param x pointer to the vector | ||||
| * @param minf (out) minimum absolute value .( only for output ) | * @param minf (out) minimum absolute value .( only for output ) | ||||
| * @return minimum index | * @return minimum index | ||||
| */ | */ | ||||
| static BLASLONG ziamin_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||||
| static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||||
| BLASLONG index ; | BLASLONG index ; | ||||
| __asm__( | __asm__( | ||||
| "pfd 1, 0(%4) \n\t" | |||||
| "pfd 1, 0(%[ptr_x]) \n\t" | |||||
| "vleig %%v16,0,0 \n\t" | "vleig %%v16,0,0 \n\t" | ||||
| "vleig %%v16,1,1 \n\t" | "vleig %%v16,1,1 \n\t" | ||||
| "vleig %%v17,2,0 \n\t" | "vleig %%v17,2,0 \n\t" | ||||
| @@ -61,143 +61,143 @@ static BLASLONG ziamin_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||||
| "vleig %%v22,13,1 \n\t" | "vleig %%v22,13,1 \n\t" | ||||
| "vleig %%v23,14,0 \n\t" | "vleig %%v23,14,0 \n\t" | ||||
| "vleig %%v23,15,1 \n\t" | "vleig %%v23,15,1 \n\t" | ||||
| "ld %%f6,0(%4) \n\t" | |||||
| "lpdbr %%f6,%%f6 \n\t" | |||||
| "ld %%f7,8(%4) \n\t" | |||||
| "lpdbr %%f7,%%f7 \n\t" | |||||
| "adbr %%f6,%%f7 \n\t" | |||||
| "sllg %%r0,%3,4 \n\t" | |||||
| "agr %%r0,%4 \n\t" | |||||
| "vrepg %%v6,%%v6,0 \n\t" | |||||
| "vzero %%v7 \n\t" | |||||
| "vrepig %%v4,16 \n\t" | |||||
| "vzero %%v5 \n\t" | |||||
| "ld %%f6,0(%[ptr_x]) \n\t" | |||||
| "lpdbr %%f6,%%f6 \n\t" | |||||
| "ld %%f7,8(%[ptr_x]) \n\t" | |||||
| "lpdbr %%f7,%%f7 \n\t" | |||||
| "adbr %%f6,%%f7 \n\t" | |||||
| "sllg %%r0,%[n],4 \n\t" | |||||
| "agr %%r0,%[ptr_x] \n\t" | |||||
| "vrepg %%v6,%%v6,0 \n\t" | |||||
| "vzero %%v7 \n\t" | |||||
| "vrepig %%v4,16 \n\t" | |||||
| "vzero %%v5 \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | |||||
| "pfd 1, 256(%2 ) \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%[ptr_tmp] ) \n\t" | |||||
| "vleg %%v24 , 0( %2),0 \n\t" | |||||
| "vleg %%v25 , 8( %2),0 \n\t" | |||||
| "vleg %%v24 , 16( %2),1 \n\t" | |||||
| "vleg %%v25 , 24( %2),1 \n\t" | |||||
| "vleg %%v26 , 32( %2),0 \n\t" | |||||
| "vleg %%v27 , 40( %2),0 \n\t" | |||||
| "vleg %%v26 , 48( %2),1 \n\t" | |||||
| "vleg %%v27 , 56( %2),1 \n\t" | |||||
| "vleg %%v28 , 64( %2),0 \n\t" | |||||
| "vleg %%v29 , 72( %2),0 \n\t" | |||||
| "vleg %%v28 , 80( %2),1 \n\t" | |||||
| "vleg %%v29 , 88( %2),1 \n\t" | |||||
| "vleg %%v30 , 96( %2),0 \n\t" | |||||
| "vleg %%v31 ,104( %2),0 \n\t" | |||||
| "vleg %%v30 ,112( %2),1 \n\t" | |||||
| "vleg %%v31 ,120( %2),1 \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | |||||
| "vflpdb %%v25, %%v25 \n\t" | |||||
| "vflpdb %%v26, %%v26 \n\t" | |||||
| "vflpdb %%v27, %%v27 \n\t" | |||||
| "vflpdb %%v28, %%v28 \n\t" | |||||
| "vflpdb %%v29, %%v29 \n\t" | |||||
| "vflpdb %%v30, %%v30 \n\t" | |||||
| "vflpdb %%v31, %%v31 \n\t" | |||||
| "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v30 ,112(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | |||||
| "vflpdb %%v25, %%v25 \n\t" | |||||
| "vflpdb %%v26, %%v26 \n\t" | |||||
| "vflpdb %%v27, %%v27 \n\t" | |||||
| "vflpdb %%v28, %%v28 \n\t" | |||||
| "vflpdb %%v29, %%v29 \n\t" | |||||
| "vflpdb %%v30, %%v30 \n\t" | |||||
| "vflpdb %%v31, %%v31 \n\t" | |||||
| "vfadb %%v0,%%v24,%%v25 \n\t" | |||||
| "vfadb %%v1,%%v26,%%v27 \n\t" | |||||
| "vfadb %%v2,%%v28,%%v29 \n\t" | |||||
| "vfadb %%v3,%%v30,%%v31 \n\t" | |||||
| "vfadb %%v0,%%v24,%%v25 \n\t" | |||||
| "vfadb %%v1,%%v26,%%v27 \n\t" | |||||
| "vfadb %%v2,%%v28,%%v29 \n\t" | |||||
| "vfadb %%v3,%%v30,%%v31 \n\t" | |||||
| "vleg %%v24 ,128( %2),0 \n\t" | |||||
| "vleg %%v25 ,136( %2),0 \n\t" | |||||
| "vleg %%v24 ,144( %2),1 \n\t" | |||||
| "vleg %%v25 ,152( %2),1 \n\t" | |||||
| "vleg %%v26 ,160( %2),0 \n\t" | |||||
| "vleg %%v27 ,168( %2),0 \n\t" | |||||
| "vleg %%v26 ,176( %2),1 \n\t" | |||||
| "vleg %%v27 ,184( %2),1 \n\t" | |||||
| "vleg %%v28 ,192( %2),0 \n\t" | |||||
| "vleg %%v29 ,200( %2),0 \n\t" | |||||
| "vleg %%v28 ,208( %2),1 \n\t" | |||||
| "vleg %%v29 ,216( %2),1 \n\t" | |||||
| "vleg %%v30 ,224( %2),0 \n\t" | |||||
| "vleg %%v31 ,232( %2),0 \n\t" | |||||
| "vleg %%v30 ,240( %2),1 \n\t" | |||||
| "vleg %%v31 ,248( %2),1 \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | |||||
| "vflpdb %%v25, %%v25 \n\t" | |||||
| "vflpdb %%v26, %%v26 \n\t" | |||||
| "vflpdb %%v27, %%v27 \n\t" | |||||
| "vflpdb %%v28, %%v28 \n\t" | |||||
| "vflpdb %%v29, %%v29 \n\t" | |||||
| "vflpdb %%v30, %%v30 \n\t" | |||||
| "vflpdb %%v31, %%v31 \n\t" | |||||
| "vleg %%v24 ,128(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v25 ,136(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v24 ,144(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v25 ,152(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v26 ,160(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v27 ,168(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v26 ,176(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v27 ,184(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v28 ,192(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v29 ,200(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v28 ,208(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v29 ,216(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v30 ,224(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v31 ,232(%[ptr_tmp]),0 \n\t" | |||||
| "vleg %%v30 ,240(%[ptr_tmp]),1 \n\t" | |||||
| "vleg %%v31 ,248(%[ptr_tmp]),1 \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | |||||
| "vflpdb %%v25, %%v25 \n\t" | |||||
| "vflpdb %%v26, %%v26 \n\t" | |||||
| "vflpdb %%v27, %%v27 \n\t" | |||||
| "vflpdb %%v28, %%v28 \n\t" | |||||
| "vflpdb %%v29, %%v29 \n\t" | |||||
| "vflpdb %%v30, %%v30 \n\t" | |||||
| "vflpdb %%v31, %%v31 \n\t" | |||||
| "vfadb %%v24,%%v24,%%v25 \n\t" | |||||
| "vfadb %%v26,%%v26,%%v27 \n\t" | |||||
| "vfadb %%v28,%%v28,%%v29 \n\t" | |||||
| "vfadb %%v30,%%v30,%%v31 \n\t" | |||||
| "vfadb %%v24,%%v24,%%v25 \n\t" | |||||
| "vfadb %%v26,%%v26,%%v27 \n\t" | |||||
| "vfadb %%v28,%%v28,%%v29 \n\t" | |||||
| "vfadb %%v30,%%v30,%%v31 \n\t" | |||||
| "vfchdb %%v25,%%v0 ,%%v1 \n\t" | |||||
| "vsel %%v29,%%v17,%%v16,%%v25 \n\t" | |||||
| "vsel %%v31,%%v1,%%v0,%%v25 \n\t" | |||||
| "vfchdb %%v25,%%v0 ,%%v1 \n\t" | |||||
| "vsel %%v29,%%v17,%%v16,%%v25 \n\t" | |||||
| "vsel %%v31,%%v1,%%v0,%%v25 \n\t" | |||||
| "vfchdb %%v27,%%v2,%%v3 \n\t" | |||||
| "vsel %%v0,%%v19,%%v18,%%v27 \n\t" | |||||
| "vsel %%v1,%%v3,%%v2,%%v27 \n\t" | |||||
| "vfchdb %%v27,%%v2,%%v3 \n\t" | |||||
| "vsel %%v0,%%v19,%%v18,%%v27 \n\t" | |||||
| "vsel %%v1,%%v3,%%v2,%%v27 \n\t" | |||||
| "vfchdb %%v25,%%v24,%%v26 \n\t" | |||||
| "vsel %%v2,%%v21,%%v20,%%v25 \n\t" | |||||
| "vsel %%v3,%%v26,%%v24,%%v25 \n\t" | |||||
| "vfchdb %%v25,%%v24,%%v26 \n\t" | |||||
| "vsel %%v2,%%v21,%%v20,%%v25 \n\t" | |||||
| "vsel %%v3,%%v26,%%v24,%%v25 \n\t" | |||||
| "vfchdb %%v27,%%v28,%%v30 \n\t" | |||||
| "vsel %%v25,%%v23,%%v22,%%v27 \n\t" | |||||
| "vsel %%v27,%%v30,%%v28,%%v27 \n\t" | |||||
| "vfchdb %%v27,%%v28,%%v30 \n\t" | |||||
| "vsel %%v25,%%v23,%%v22,%%v27 \n\t" | |||||
| "vsel %%v27,%%v30,%%v28,%%v27 \n\t" | |||||
| "vfchdb %%v24,%%v31, %%v1 \n\t" | |||||
| "vsel %%v26,%%v0,%%v29,%%v24 \n\t" | |||||
| "vsel %%v28,%%v1,%%v31,%%v24 \n\t" | |||||
| "vfchdb %%v24,%%v31, %%v1 \n\t" | |||||
| "vsel %%v26,%%v0,%%v29,%%v24 \n\t" | |||||
| "vsel %%v28,%%v1,%%v31,%%v24 \n\t" | |||||
| "vfchdb %%v30,%%v3, %%v27 \n\t" | |||||
| "vsel %%v29,%%v25,%%v2,%%v30 \n\t" | |||||
| "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" | |||||
| "vfchdb %%v30,%%v3, %%v27 \n\t" | |||||
| "vsel %%v29,%%v25,%%v2,%%v30 \n\t" | |||||
| "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" | |||||
| "la %2,256(%2) \n\t" | |||||
| "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" | |||||
| "vfchdb %%v0,%%v28, %%v31 \n\t" | |||||
| "vsel %%v25,%%v29,%%v26,%%v0 \n\t" | |||||
| "vsel %%v27,%%v31,%%v28,%%v0 \n\t" | |||||
| "vfchdb %%v0,%%v28, %%v31 \n\t" | |||||
| "vsel %%v25,%%v29,%%v26,%%v0 \n\t" | |||||
| "vsel %%v27,%%v31,%%v28,%%v0 \n\t" | |||||
| "vag %%v25,%%v25,%%v5 \n\t" | |||||
| "vag %%v25,%%v25,%%v5 \n\t" | |||||
| //cmp with previous | //cmp with previous | ||||
| "vfchdb %%v30,%%v6 , %%v27 \n\t" | |||||
| "vsel %%v7,%%v25,%%v7,%%v30 \n\t" | |||||
| "vsel %%v6,%%v27,%%v6,%%v30 \n\t" | |||||
| "vfchdb %%v30,%%v6 , %%v27 \n\t" | |||||
| "vsel %%v7,%%v25,%%v7,%%v30 \n\t" | |||||
| "vsel %%v6,%%v27,%%v6,%%v30 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "vag %%v5,%%v5,%%v4 \n\t" | |||||
| "clgrjl %2,%%r0,1b \n\t" | |||||
| "clgrjl %[ptr_tmp],%%r0,1b \n\t" | |||||
| //xtract index | //xtract index | ||||
| "vrepg %%v26,%%v6,1 \n\t" | |||||
| "vrepg %%v5,%%v7,1 \n\t" | |||||
| "wfcdb %%v26,%%v6 \n\t" | |||||
| "jne 2f \n\t" | |||||
| "vsteg %%v6,%1,0 \n\t" | |||||
| "vrepg %%v26,%%v6,1 \n\t" | |||||
| "vrepg %%v5,%%v7,1 \n\t" | |||||
| "wfcdb %%v26,%%v6 \n\t" | |||||
| "jne 2f \n\t" | |||||
| "vsteg %%v6,%[minf],0 \n\t" | |||||
| "vmnlg %%v1,%%v5,%%v7 \n\t" | "vmnlg %%v1,%%v5,%%v7 \n\t" | ||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "j 3f \n\t" | |||||
| "vlgvg %[index],%%v1,0 \n\t" | |||||
| "j 3f \n\t" | |||||
| "2: \n\t" | "2: \n\t" | ||||
| "wfchdb %%v16,%%v6 ,%%v26 \n\t" | |||||
| "vsel %%v1,%%v5,%%v7,%%v16 \n\t" | |||||
| "vsel %%v0,%%v26,%%v6,%%v16 \n\t" | |||||
| "vlgvg %0,%%v1,0 \n\t" | |||||
| "std %%f0,%1 \n\t" | |||||
| "wfchdb %%v16,%%v6 ,%%v26 \n\t" | |||||
| "vsel %%v1,%%v5,%%v7,%%v16 \n\t" | |||||
| "vsel %%v0,%%v26,%%v6,%%v16 \n\t" | |||||
| "vlgvg %[index],%%v1,0 \n\t" | |||||
| "std %%f0,%[minf] \n\t" | |||||
| "3: \n\t" | "3: \n\t" | ||||
| : "+r"(index) ,"=m"(*minf), "+&a"(x) | |||||
| : "r"(n), "2"(x) | |||||
| : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) | |||||
| : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) | |||||
| : "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", | : "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", | ||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | ||||
| @@ -224,12 +224,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| if (inc_x == 1) { | if (inc_x == 1) { | ||||
| BLASLONG n1 = n & -8; | |||||
| BLASLONG n1 = n & -16; | |||||
| if (n1 > 0) { | if (n1 > 0) { | ||||
| min = ziamin_kernel_8_TUNED(n1, x, &minf); | |||||
| min = ziamin_kernel_16_TUNED(n1, x, &minf); | |||||
| i = n1; | i = n1; | ||||
| ix = n1 << 1; | |||||
| } | } | ||||
| else { | else { | ||||
| //assign minf | //assign minf | ||||
| @@ -44,65 +44,65 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { | |||||
| FLOAT asum; | FLOAT asum; | ||||
| __asm__ ( | __asm__ ( | ||||
| "pfd 1, 0(%3) \n\t" | |||||
| "sllg %%r0,%2,4 \n\t" | |||||
| "agr %%r0,%3 \n\t" | |||||
| "vzero %%v0 \n\t" | |||||
| "vzero %%v1 \n\t" | |||||
| "vzero %%v22 \n\t" | |||||
| "vzero %%v23 \n\t" | |||||
| "pfd 1, 0(%[ptr_x]) \n\t" | |||||
| "sllg %%r0,%[n],4 \n\t" | |||||
| "agr %%r0,%[ptr_x] \n\t" | |||||
| "vzero %%v0 \n\t" | |||||
| "vzero %%v1 \n\t" | |||||
| "vzero %%v22 \n\t" | |||||
| "vzero %%v23 \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | |||||
| "pfd 1, 256(%1 ) \n\t" | |||||
| "vlm %%v24,%%v31,0(%1) \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%[ptr_tmp] ) \n\t" | |||||
| "vlm %%v24,%%v31,0(%[ptr_tmp]) \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | |||||
| "vflpdb %%v25, %%v25 \n\t" | |||||
| "vflpdb %%v26, %%v26 \n\t" | |||||
| "vflpdb %%v27, %%v27 \n\t" | |||||
| "vflpdb %%v28, %%v28 \n\t" | |||||
| "vflpdb %%v29, %%v29 \n\t" | |||||
| "vflpdb %%v30, %%v30 \n\t" | |||||
| "vflpdb %%v31, %%v31 \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | |||||
| "vflpdb %%v25, %%v25 \n\t" | |||||
| "vflpdb %%v26, %%v26 \n\t" | |||||
| "vflpdb %%v27, %%v27 \n\t" | |||||
| "vflpdb %%v28, %%v28 \n\t" | |||||
| "vflpdb %%v29, %%v29 \n\t" | |||||
| "vflpdb %%v30, %%v30 \n\t" | |||||
| "vflpdb %%v31, %%v31 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v24 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v25 \n\t" | |||||
| "vfadb %%v23,%%v23,%%v26 \n\t" | |||||
| "vfadb %%v22,%%v22,%%v27 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v28 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v29 \n\t" | |||||
| "vfadb %%v23,%%v23,%%v30 \n\t" | |||||
| "vfadb %%v22,%%v22,%%v31 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v24 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v25 \n\t" | |||||
| "vfadb %%v23,%%v23,%%v26 \n\t" | |||||
| "vfadb %%v22,%%v22,%%v27 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v28 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v29 \n\t" | |||||
| "vfadb %%v23,%%v23,%%v30 \n\t" | |||||
| "vfadb %%v22,%%v22,%%v31 \n\t" | |||||
| "vlm %%v24,%%v31, 128(%1 ) \n\t" | |||||
| "vlm %%v24,%%v31, 128(%[ptr_tmp]) \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | |||||
| "vflpdb %%v25, %%v25 \n\t" | |||||
| "vflpdb %%v26, %%v26 \n\t" | |||||
| "vflpdb %%v27, %%v27 \n\t" | |||||
| "vflpdb %%v28, %%v28 \n\t" | |||||
| "vflpdb %%v29, %%v29 \n\t" | |||||
| "vflpdb %%v30, %%v30 \n\t" | |||||
| "vflpdb %%v31, %%v31 \n\t" | |||||
| "la %1,256(%1) \n\t" | |||||
| "vfadb %%v0,%%v0,%%v24 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v25 \n\t" | |||||
| "vfadb %%v23,%%v23,%%v26 \n\t" | |||||
| "vfadb %%v22,%%v22,%%v27 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v28 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v29 \n\t" | |||||
| "vfadb %%v23,%%v23,%%v30 \n\t" | |||||
| "vfadb %%v22,%%v22,%%v31 \n\t" | |||||
| "vflpdb %%v24, %%v24 \n\t" | |||||
| "vflpdb %%v25, %%v25 \n\t" | |||||
| "vflpdb %%v26, %%v26 \n\t" | |||||
| "vflpdb %%v27, %%v27 \n\t" | |||||
| "vflpdb %%v28, %%v28 \n\t" | |||||
| "vflpdb %%v29, %%v29 \n\t" | |||||
| "vflpdb %%v30, %%v30 \n\t" | |||||
| "vflpdb %%v31, %%v31 \n\t" | |||||
| "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" | |||||
| "vfadb %%v0,%%v0,%%v24 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v25 \n\t" | |||||
| "vfadb %%v23,%%v23,%%v26 \n\t" | |||||
| "vfadb %%v22,%%v22,%%v27 \n\t" | |||||
| "vfadb %%v0,%%v0,%%v28 \n\t" | |||||
| "vfadb %%v1,%%v1,%%v29 \n\t" | |||||
| "vfadb %%v23,%%v23,%%v30 \n\t" | |||||
| "vfadb %%v22,%%v22,%%v31 \n\t" | |||||
| "clgrjl %1,%%r0,1b \n\t" | |||||
| "vfadb %%v24,%%v0,%%v1 \n\t" | |||||
| "vfadb %%v25,%%v23,%%v22 \n\t" | |||||
| "vfadb %%v0,%%v25,%%v24 \n\t" | |||||
| "vrepg %%v1,%%v0,1 \n\t" | |||||
| "adbr %%f0,%%f1 \n\t" | |||||
| "ldr %0 ,%%f0" | |||||
| : "=f"(asum),"+&a"(x) | |||||
| : "r"(n), "1"(x) | |||||
| "clgrjl %[ptr_tmp],%%r0,1b \n\t" | |||||
| "vfadb %%v24,%%v0,%%v1 \n\t" | |||||
| "vfadb %%v25,%%v23,%%v22 \n\t" | |||||
| "vfadb %%v0,%%v25,%%v24 \n\t" | |||||
| "vrepg %%v1,%%v0,1 \n\t" | |||||
| "adbr %%f0,%%f1 \n\t" | |||||
| "ldr %[asum] ,%%f0" | |||||
| : [asum] "=f"(asum),[ptr_tmp] "+&a"(x) | |||||
| : [mem] "m"( *(const double (*)[2*n])x ), [n] "r"(n), [ptr_x] "a"(x) | |||||
| : "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | : "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | ||||
| ); | ); | ||||
| return asum; | return asum; | ||||
| @@ -28,36 +28,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { | |||||
| __asm__ ("pfd 1, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "vlrepg %%v28 , 0(%3) \n\t" | |||||
| "vlrepg %%v29, 8(%3) \n\t" | |||||
| "srlg %3,%0,3 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) { | |||||
| __asm__ ("pfd 1, 0(%[x_tmp]) \n\t" | |||||
| "pfd 2, 0(%[y_tmp]) \n\t" | |||||
| "lgdr %%r1,%[alpha_r] \n\t" | |||||
| "vlvgp %%v28,%%r1,%%r1 \n\t" | |||||
| "lgdr %%r1,%[alpha_i] \n\t" | |||||
| "vlvgp %%v29,%%r1,%%r1 \n\t" | |||||
| "sllg %[tmp],%[tmp],4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | |||||
| "pfd 1, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "vleg %%v16 , 0(%%r1,%2),0 \n\t" | |||||
| "vleg %%v17 , 8(%%r1,%2),0 \n\t" | |||||
| "vleg %%v16 , 16(%%r1,%2),1 \n\t" | |||||
| "vleg %%v17 , 24(%%r1,%2),1 \n\t" | |||||
| "vleg %%v18 , 32(%%r1,%2),0 \n\t" | |||||
| "vleg %%v19 , 40(%%r1,%2),0 \n\t" | |||||
| "vleg %%v18 , 48(%%r1,%2),1 \n\t" | |||||
| "vleg %%v19 , 56(%%r1,%2),1 \n\t" | |||||
| "vleg %%v24 , 0(%%r1,%1),0 \n\t" | |||||
| "vleg %%v25 , 8(%%r1,%1),0 \n\t" | |||||
| "vleg %%v24 , 16(%%r1,%1),1 \n\t" | |||||
| "vleg %%v25 , 24(%%r1,%1),1 \n\t" | |||||
| "vleg %%v26 , 32(%%r1,%1),0 \n\t" | |||||
| "vleg %%v27 , 40(%%r1,%1),0 \n\t" | |||||
| "vleg %%v26 , 48(%%r1,%1),1 \n\t" | |||||
| "vleg %%v27 , 56(%%r1,%1),1 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%%r1,%[x_tmp]) \n\t" | |||||
| "pfd 2, 256(%%r1,%[y_tmp]) \n\t" | |||||
| "vleg %%v16 , 0(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vleg %%v17 , 8(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vleg %%v16 , 16(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v17 , 24(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v18 , 32(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vleg %%v19 , 40(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vleg %%v18 , 48(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v19 , 56(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v24 , 0(%%r1,%[x_tmp]),0 \n\t" | |||||
| "vleg %%v25 , 8(%%r1,%[x_tmp]),0 \n\t" | |||||
| "vleg %%v24 , 16(%%r1,%[x_tmp]),1 \n\t" | |||||
| "vleg %%v25 , 24(%%r1,%[x_tmp]),1 \n\t" | |||||
| "vleg %%v26 , 32(%%r1,%[x_tmp]),0 \n\t" | |||||
| "vleg %%v27 , 40(%%r1,%[x_tmp]),0 \n\t" | |||||
| "vleg %%v26 , 48(%%r1,%[x_tmp]),1 \n\t" | |||||
| "vleg %%v27 , 56(%%r1,%[x_tmp]),1 \n\t" | |||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| "vfmsdb %%v16, %%v25, %%v29,%%v16 \n\t" | "vfmsdb %%v16, %%v25, %%v29,%%v16 \n\t" | ||||
| "vfmadb %%v17, %%v24, %%v29, %%v17 \n\t" | "vfmadb %%v17, %%v24, %%v29, %%v17 \n\t" | ||||
| @@ -79,35 +79,35 @@ static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOA | |||||
| "vfmsdb %%v19, %%v26, %%v29, %%v19 \n\t" | "vfmsdb %%v19, %%v26, %%v29, %%v19 \n\t" | ||||
| #endif | #endif | ||||
| "vsteg %%v16 , 0(%%r1,%2),0 \n\t" | |||||
| "vsteg %%v17 , 8(%%r1,%2),0 \n\t" | |||||
| "vsteg %%v16 , 16(%%r1,%2),1 \n\t" | |||||
| "vsteg %%v17 , 24(%%r1,%2),1 \n\t" | |||||
| "vsteg %%v18 , 32(%%r1,%2),0 \n\t" | |||||
| "vsteg %%v19 , 40(%%r1,%2),0 \n\t" | |||||
| "vsteg %%v18 , 48(%%r1,%2),1 \n\t" | |||||
| "vsteg %%v19 , 56(%%r1,%2),1 \n\t" | |||||
| "vleg %%v20 , 64(%%r1,%2),0 \n\t" | |||||
| "vleg %%v21 , 72(%%r1,%2),0 \n\t" | |||||
| "vleg %%v20 , 80(%%r1,%2),1 \n\t" | |||||
| "vleg %%v21 , 88(%%r1,%2),1 \n\t" | |||||
| "vleg %%v22 , 96(%%r1,%2),0 \n\t" | |||||
| "vleg %%v23 , 104(%%r1,%2),0 \n\t" | |||||
| "vleg %%v22 , 112(%%r1,%2),1 \n\t" | |||||
| "vleg %%v23 , 120(%%r1,%2),1 \n\t" | |||||
| "vleg %%v24 , 64(%%r1,%1),0 \n\t" | |||||
| "vleg %%v25 , 72(%%r1,%1),0 \n\t" | |||||
| "vleg %%v24 , 80(%%r1,%1),1 \n\t" | |||||
| "vleg %%v25 , 88(%%r1,%1),1 \n\t" | |||||
| "vleg %%v26 , 96(%%r1,%1),0 \n\t" | |||||
| "vleg %%v27 , 104(%%r1,%1),0 \n\t" | |||||
| "vleg %%v26 , 112(%%r1,%1),1 \n\t" | |||||
| "vleg %%v27 , 120(%%r1,%1),1 \n\t" | |||||
| "vsteg %%v16 , 0(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vsteg %%v17 , 8(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vsteg %%v16 , 16(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vsteg %%v17 , 24(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vsteg %%v18 , 32(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vsteg %%v19 , 40(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vsteg %%v18 , 48(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vsteg %%v19 , 56(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v20 , 64(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vleg %%v21 , 72(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vleg %%v20 , 80(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v21 , 88(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v22 , 96(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vleg %%v23 , 104(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vleg %%v22 , 112(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v23 , 120(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vleg %%v24 , 64(%%r1,%[x_tmp]),0 \n\t" | |||||
| "vleg %%v25 , 72(%%r1,%[x_tmp]),0 \n\t" | |||||
| "vleg %%v24 , 80(%%r1,%[x_tmp]),1 \n\t" | |||||
| "vleg %%v25 , 88(%%r1,%[x_tmp]),1 \n\t" | |||||
| "vleg %%v26 , 96(%%r1,%[x_tmp]),0 \n\t" | |||||
| "vleg %%v27 , 104(%%r1,%[x_tmp]),0 \n\t" | |||||
| "vleg %%v26 , 112(%%r1,%[x_tmp]),1 \n\t" | |||||
| "vleg %%v27 , 120(%%r1,%[x_tmp]),1 \n\t" | |||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| "vfmsdb %%v20, %%v25, %%v29,%%v20 \n\t" | "vfmsdb %%v20, %%v25, %%v29,%%v20 \n\t" | ||||
| "vfmadb %%v21, %%v24, %%v29, %%v21 \n\t" | "vfmadb %%v21, %%v24, %%v29, %%v21 \n\t" | ||||
| @@ -128,21 +128,21 @@ static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOA | |||||
| "vfmadb %%v22, %%v26, %%v28, %%v22 \n\t" | "vfmadb %%v22, %%v26, %%v28, %%v22 \n\t" | ||||
| "vfmsdb %%v23, %%v26, %%v29, %%v23 \n\t" | "vfmsdb %%v23, %%v26, %%v29, %%v23 \n\t" | ||||
| #endif | #endif | ||||
| "vsteg %%v20 , 64(%%r1,%2),0 \n\t" | |||||
| "vsteg %%v21 , 72(%%r1,%2),0 \n\t" | |||||
| "vsteg %%v20 , 80(%%r1,%2),1 \n\t" | |||||
| "vsteg %%v21 , 88(%%r1,%2),1 \n\t" | |||||
| "vsteg %%v22 , 96(%%r1,%2),0 \n\t" | |||||
| "vsteg %%v23 , 104(%%r1,%2),0 \n\t" | |||||
| "vsteg %%v22 , 112(%%r1,%2),1 \n\t" | |||||
| "vsteg %%v23 , 120(%%r1,%2),1 \n\t" | |||||
| "la %%r1,128(%%r1) \n\t" | |||||
| "brctg %3,1b" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y), "a"(alpha) | |||||
| : "cc", "memory", "r1","v16", | |||||
| "vsteg %%v20 , 64(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vsteg %%v21 , 72(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vsteg %%v20 , 80(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vsteg %%v21 , 88(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vsteg %%v22 , 96(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vsteg %%v23 , 104(%%r1,%[y_tmp]),0 \n\t" | |||||
| "vsteg %%v22 , 112(%%r1,%[y_tmp]),1 \n\t" | |||||
| "vsteg %%v23 , 120(%%r1,%[y_tmp]),1 \n\t" | |||||
| "la %%r1,128(%%r1) \n\t" | |||||
| "clgrjl %%r1,%[tmp],1b \n\t" | |||||
| : [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) | |||||
| : [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) | |||||
| : "cc", "r1","v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29" | "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29" | ||||
| ); | ); | ||||
| @@ -151,7 +151,6 @@ static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOA | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { | int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { | ||||
| BLASLONG i = 0; | BLASLONG i = 0; | ||||
| BLASLONG ix = 0, iy = 0; | BLASLONG ix = 0, iy = 0; | ||||
| FLOAT da[2]; | |||||
| if (n <= 0) return (0); | if (n <= 0) return (0); | ||||
| @@ -159,10 +158,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| BLASLONG n1 = n & -8; | BLASLONG n1 = n & -8; | ||||
| if (n1) { | |||||
| da[0] = da_r; | |||||
| da[1] = da_i; | |||||
| zaxpy_kernel_8(n1, x, y, da); | |||||
| if (n1) { | |||||
| zaxpy_kernel_8(n1, x, y, da_r,da_i); | |||||
| ix = 2 * n1; | ix = 2 * n1; | ||||
| } | } | ||||
| i = n1; | i = n1; | ||||
| @@ -27,64 +27,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| static void __attribute__ ((noinline)) zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { | |||||
| __asm__ volatile( | __asm__ volatile( | ||||
| "pfd 1, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "pfd 1, 0(%[ptr_x]) \n\t" | |||||
| "pfd 2, 0(%[ptr_y]) \n\t" | |||||
| "srlg %[n_tmp],%[n_tmp],4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | |||||
| "pfd 1, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" | |||||
| "vst %%v24, 0(%%r1,%2) \n\t" | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vst %%v25, 16(%%r1,%2) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vst %%v26, 32(%%r1,%2) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vst %%v27, 48(%%r1,%2) \n\t" | |||||
| "vl %%v28, 64(%%r1,%1) \n\t" | |||||
| "vst %%v28, 64(%%r1,%2) \n\t" | |||||
| "vl %%v29, 80(%%r1,%1) \n\t" | |||||
| "vst %%v29, 80(%%r1,%2) \n\t" | |||||
| "vl %%v30, 96(%%r1,%1) \n\t" | |||||
| "vst %%v30, 96(%%r1,%2) \n\t" | |||||
| "vl %%v31,112(%%r1,%1) \n\t" | |||||
| "vst %%v31,112(%%r1,%2) \n\t" | |||||
| "vl %%v24,128(%%r1,%1) \n\t" | |||||
| "vst %%v24,128(%%r1,%2) \n\t" | |||||
| "vl %%v25,144(%%r1,%1) \n\t" | |||||
| "vst %%v25,144(%%r1,%2) \n\t" | |||||
| "vl %%v26,160(%%r1,%1) \n\t" | |||||
| "vst %%v26,160(%%r1,%2) \n\t" | |||||
| "vl %%v27,176(%%r1,%1) \n\t" | |||||
| "vst %%v27,176(%%r1,%2) \n\t" | |||||
| "vl %%v28, 192(%%r1,%1) \n\t" | |||||
| "vst %%v28, 192(%%r1,%2) \n\t" | |||||
| "vl %%v29, 208(%%r1,%1) \n\t" | |||||
| "vst %%v29, 208(%%r1,%2) \n\t" | |||||
| "vl %%v30, 224(%%r1,%1) \n\t" | |||||
| "vst %%v30, 224(%%r1,%2) \n\t" | |||||
| "vl %%v31, 240(%%r1,%1) \n\t" | |||||
| "vst %%v31, 240(%%r1,%2) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %%r0,1b" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y) | |||||
| : "cc", "memory","r0","r1","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%%r1,%[ptr_x]) \n\t" | |||||
| "pfd 2, 256(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %[n_tmp],1b" | |||||
| : [mem_y] "=m" (*(double (*)[2*n])y), [n_tmp] "+&r"(n) | |||||
| : [mem_x] "m" (*(const double (*)[2*n])x), [ptr_x] "a"(x), [ptr_y] "a"(y) | |||||
| : "cc", "r1", "v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | ); | ||||
| return; | |||||
| return; | |||||
| } | } | ||||
| @@ -32,75 +32,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { | static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { | ||||
| __asm__ volatile( | __asm__ volatile( | ||||
| "pfd 1, 0(%2) \n\t" | |||||
| "pfd 1, 0(%3) \n\t" | |||||
| "pfd 1, 0(%[ptr_x_tmp]) \n\t" | |||||
| "pfd 1, 0(%[ptr_y_tmp]) \n\t" | |||||
| "vzero %%v24 \n\t" | "vzero %%v24 \n\t" | ||||
| "vzero %%v25 \n\t" | "vzero %%v25 \n\t" | ||||
| "vzero %%v26 \n\t" | "vzero %%v26 \n\t" | ||||
| "vzero %%v27 \n\t" | "vzero %%v27 \n\t" | ||||
| "srlg %1,%1,3 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "srlg %[n_tmp],%[n_tmp],3 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | |||||
| "pfd 1, 256(%%r1,%2) \n\t" | |||||
| "pfd 1, 256(%%r1,%3) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "vl %%v28, 0(%%r1,%3) \n\t" | |||||
| "vl %%v29, 16(%%r1,%3) \n\t" | |||||
| "vl %%v30, 32(%%r1,%3) \n\t" | |||||
| "vl %%v31, 48(%%r1,%3) \n\t" | |||||
| "vpdi %%v20,%%v16,%%v16,4 \n\t" | |||||
| "vpdi %%v21,%%v17,%%v17,4 \n\t" | |||||
| "vpdi %%v22,%%v18,%%v18,4 \n\t" | |||||
| "vpdi %%v23,%%v19,%%v19,4 \n\t" | |||||
| "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" | |||||
| "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" | |||||
| "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" | |||||
| "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" | |||||
| "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" | |||||
| "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" | |||||
| "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" | |||||
| "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" | |||||
| "vl %%v16, 64(%%r1,%2) \n\t" | |||||
| "vl %%v17, 80(%%r1,%2) \n\t" | |||||
| "vl %%v18, 96(%%r1,%2) \n\t" | |||||
| "vl %%v19,112(%%r1,%2) \n\t" | |||||
| "vl %%v28, 64(%%r1,%3) \n\t" | |||||
| "vl %%v29, 80(%%r1,%3) \n\t" | |||||
| "vl %%v30, 96(%%r1,%3) \n\t" | |||||
| "vl %%v31,112(%%r1,%3) \n\t" | |||||
| "vpdi %%v20,%%v16,%%v16,4 \n\t" | |||||
| "vpdi %%v21,%%v17,%%v17,4 \n\t" | |||||
| "vpdi %%v22,%%v18,%%v18,4 \n\t" | |||||
| "vpdi %%v23,%%v19,%%v19,4 \n\t" | |||||
| "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" | |||||
| "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" | |||||
| "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" | |||||
| "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" | |||||
| "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" | |||||
| "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" | |||||
| "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" | |||||
| "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" | |||||
| "la %%r1,128(%%r1) \n\t" | |||||
| "brctg %1,1b \n\t" | |||||
| "vfadb %%v24,%%v26,%%v24 \n\t" | |||||
| "vfadb %%v25,%%v25,%%v27 \n\t" | |||||
| "vsteg %%v24,0(%4),0 \n\t" | |||||
| "vsteg %%v24,8(%4),1 \n\t" | |||||
| "vsteg %%v25,16(%4),1 \n\t" | |||||
| "vsteg %%v25,24(%4),0 \n\t" | |||||
| : "=m"(*d) ,"+&r"(n) | |||||
| : "a"(x), "a"(y), "a"(d) | |||||
| "1: \n\t" | |||||
| "pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vpdi %%v20,%%v16,%%v16,4 \n\t" | |||||
| "vpdi %%v21,%%v17,%%v17,4 \n\t" | |||||
| "vpdi %%v22,%%v18,%%v18,4 \n\t" | |||||
| "vpdi %%v23,%%v19,%%v19,4 \n\t" | |||||
| "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" | |||||
| "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" | |||||
| "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" | |||||
| "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" | |||||
| "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" | |||||
| "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" | |||||
| "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" | |||||
| "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" | |||||
| "vl %%v16, 64(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "vl %%v19,112(%%r1,%[ptr_x_tmp]) \n\t" | |||||
| "vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vl %%v31,112(%%r1,%[ptr_y_tmp]) \n\t" | |||||
| "vpdi %%v20,%%v16,%%v16,4 \n\t" | |||||
| "vpdi %%v21,%%v17,%%v17,4 \n\t" | |||||
| "vpdi %%v22,%%v18,%%v18,4 \n\t" | |||||
| "vpdi %%v23,%%v19,%%v19,4 \n\t" | |||||
| "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" | |||||
| "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" | |||||
| "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" | |||||
| "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" | |||||
| "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" | |||||
| "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" | |||||
| "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" | |||||
| "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" | |||||
| "la %%r1,128(%%r1) \n\t" | |||||
| "brctg %[n_tmp],1b \n\t" | |||||
| "vfadb %%v24,%%v26,%%v24 \n\t" | |||||
| "vfadb %%v25,%%v25,%%v27 \n\t" | |||||
| "vsteg %%v24, 0(%[ptr_d]),0 \n\t" | |||||
| "vsteg %%v24, 8(%[ptr_d]),1 \n\t" | |||||
| "vsteg %%v25,16(%[ptr_d]),1 \n\t" | |||||
| "vsteg %%v25,24(%[ptr_d]),0 \n\t" | |||||
| : [mem_out] "=m"(*(double (*)[4])d ) ,[n_tmp] "+&r"(n) | |||||
| : [mem_x] "m"( *(const double (*)[2*n])x), | |||||
| [mem_y] "m"( *(const double (*)[2*n])y), | |||||
| [ptr_x_tmp] "a"(x), [ptr_y_tmp] "a"(y), [ptr_d] "a"(d) | |||||
| : "cc", "r1","v16", | : "cc", "r1","v16", | ||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | ||||
| ); | ); | ||||
| @@ -150,8 +152,8 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { | |||||
| #endif | #endif | ||||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | ||||
| BLASLONG i; | |||||
| BLASLONG ix, iy; | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ix=0, iy=0; | |||||
| OPENBLAS_COMPLEX_FLOAT result; | OPENBLAS_COMPLEX_FLOAT result; | ||||
| FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; | FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; | ||||
| @@ -164,13 +166,15 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | if ((inc_x == 1) && (inc_y == 1)) { | ||||
| BLASLONG n1 = n & -16; | |||||
| BLASLONG n1 = n & -8; | |||||
| BLASLONG j=0; | |||||
| if (n1) | |||||
| if (n1){ | |||||
| zdot_kernel_8(n1, x, y, dot); | zdot_kernel_8(n1, x, y, dot); | ||||
| i = n1; | |||||
| BLASLONG j = i * 2; | |||||
| i = n1; | |||||
| j = n1 <<1; | |||||
| } | |||||
| while (i < n) { | while (i < n) { | ||||
| @@ -24,41 +24,41 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *****************************************************************************/ | *****************************************************************************/ | ||||
| #include "common.h" | #include "common.h" | ||||
| static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) | |||||
| static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) | |||||
| { | { | ||||
| __asm__ ( | __asm__ ( | ||||
| "pfd 2, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "vlrepg %%v0,0(%3) \n\t" | |||||
| "vlrepg %%v1,0(%4) \n\t" | |||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| "pfd 2, 0(%[ptr_x]) \n\t" | |||||
| "pfd 2, 0(%[ptr_y]) \n\t" | |||||
| "lgdr %%r1,%[cos] \n\t" | |||||
| "vlvgp %%v0,%%r1,%%r1 \n\t" | |||||
| "lgdr %%r1,%[sin] \n\t" | |||||
| "vlvgp %%v1,%%r1,%%r1 \n\t" | |||||
| "sllg %[tmp],%[tmp],4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "vl %%v24, 0(%%r1,%1) \n\t" | |||||
| "vl %%v25, 16(%%r1,%1) \n\t" | |||||
| "vl %%v26, 32(%%r1,%1) \n\t" | |||||
| "vl %%v27, 48(%%r1,%1) \n\t" | |||||
| "vl %%v16, 0(%%r1,%2) \n\t" | |||||
| "vl %%v17, 16(%%r1,%2) \n\t" | |||||
| "vl %%v18, 32(%%r1,%2) \n\t" | |||||
| "vl %%v19, 48(%%r1,%2) \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%[ptr_x]) \n\t" | |||||
| "pfd 2, 256(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | /* 2nd parts*/ | ||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | ||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | ||||
| @@ -68,35 +68,33 @@ static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLO | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | ||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | ||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | ||||
| "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v27,112(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v19,112(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v28, 0(%%r1,%1) \n\t" | |||||
| "vst %%v29, 16(%%r1,%1) \n\t" | |||||
| "vst %%v30, 32(%%r1,%1) \n\t" | |||||
| "vst %%v31, 48(%%r1,%1) \n\t" | |||||
| "vst %%v20, 0(%%r1,%2) \n\t" | |||||
| "vst %%v21, 16(%%r1,%2) \n\t" | |||||
| "vst %%v22, 32(%%r1,%2) \n\t" | |||||
| "vst %%v23, 48(%%r1,%2) \n\t" | |||||
| "vl %%v24, 64(%%r1,%1) \n\t" | |||||
| "vl %%v25, 80(%%r1,%1) \n\t" | |||||
| "vl %%v26, 96(%%r1,%1) \n\t" | |||||
| "vl %%v27,112(%%r1,%1) \n\t" | |||||
| "vl %%v16, 64(%%r1,%2) \n\t" | |||||
| "vl %%v17, 80(%%r1,%2) \n\t" | |||||
| "vl %%v18, 96(%%r1,%2) \n\t" | |||||
| "vl %%v19,112(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | /* 2nd parts*/ | ||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | ||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | ||||
| @@ -106,35 +104,33 @@ static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLO | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | ||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | ||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | ||||
| "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v30, 96(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v28, 64(%%r1,%1) \n\t" | |||||
| "vst %%v29, 80(%%r1,%1) \n\t" | |||||
| "vst %%v30, 96(%%r1,%1) \n\t" | |||||
| "vst %%v31, 112(%%r1,%1) \n\t" | |||||
| "vst %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%2) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v16, 128(%%r1,%2) \n\t" | |||||
| "vl %%v17, 144(%%r1,%2) \n\t" | |||||
| "vl %%v18, 160(%%r1,%2) \n\t" | |||||
| "vl %%v19, 176(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | /* 2nd parts*/ | ||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | ||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | ||||
| @@ -144,35 +140,33 @@ static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLO | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | ||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | ||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | ||||
| "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v28, 128(%%r1,%1) \n\t" | |||||
| "vst %%v29, 144(%%r1,%1) \n\t" | |||||
| "vst %%v30, 160(%%r1,%1) \n\t" | |||||
| "vst %%v31, 176(%%r1,%1) \n\t" | |||||
| "vst %%v20, 128(%%r1,%2) \n\t" | |||||
| "vst %%v21, 144(%%r1,%2) \n\t" | |||||
| "vst %%v22, 160(%%r1,%2) \n\t" | |||||
| "vst %%v23, 176(%%r1,%2) \n\t" | |||||
| "vl %%v24, 192(%%r1,%1) \n\t" | |||||
| "vl %%v25, 208(%%r1,%1) \n\t" | |||||
| "vl %%v26, 224(%%r1,%1) \n\t" | |||||
| "vl %%v27, 240(%%r1,%1) \n\t" | |||||
| "vl %%v16, 192(%%r1,%2) \n\t" | |||||
| "vl %%v17, 208(%%r1,%2) \n\t" | |||||
| "vl %%v18, 224(%%r1,%2) \n\t" | |||||
| "vl %%v19, 240(%%r1,%2) \n\t" | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v28,%%v24,%%v0 \n\t" | |||||
| "vfmdb %%v29,%%v25,%%v0 \n\t" | |||||
| "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v30,%%v26,%%v0 \n\t" | |||||
| "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ | |||||
| "vfmdb %%v31,%%v27,%%v0 \n\t" | |||||
| "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ | |||||
| /* 2nd parts*/ | /* 2nd parts*/ | ||||
| "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" | ||||
| "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ | ||||
| @@ -182,33 +176,29 @@ static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLO | |||||
| "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ | ||||
| "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" | ||||
| "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ | ||||
| "vst %%v28, 192(%%r1,%1) \n\t" | |||||
| "vst %%v29, 208(%%r1,%1) \n\t" | |||||
| "vst %%v30, 224(%%r1,%1) \n\t" | |||||
| "vst %%v31, 240(%%r1,%1) \n\t" | |||||
| "vst %%v20, 192(%%r1,%2) \n\t" | |||||
| "vst %%v21, 208(%%r1,%2) \n\t" | |||||
| "vst %%v22, 224(%%r1,%2) \n\t" | |||||
| "vst %%v23, 240(%%r1,%2) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %%r0,1b" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y),"a"(c),"a"(s) | |||||
| : "cc", "memory","r0","r1" ,"v0","v1","v16", | |||||
| "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "clgrjl %%r1,%[tmp],1b \n\t" | |||||
| : [mem_x] "+m" (*(double (*)[2*n])x), | |||||
| [mem_y] "+m" (*(double (*)[2*n])y), | |||||
| [tmp] "+&r"(n) | |||||
| : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) | |||||
| : "cc","r1" ,"v0","v1","v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | ||||
| ); | ); | ||||
| return; | return; | ||||
| } | } | ||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | ||||
| { | { | ||||
| BLASLONG i=0; | BLASLONG i=0; | ||||
| @@ -224,11 +214,8 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| BLASLONG n1 = n & -16; | BLASLONG n1 = n & -16; | ||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | |||||
| FLOAT cosa,sina; | |||||
| cosa=c; | |||||
| sina=s; | |||||
| zrot_kernel_16(n1, x, y, &cosa, &sina); | |||||
| { | |||||
| zrot_kernel_16(n1, x, y, c, s); | |||||
| i=n1; | i=n1; | ||||
| ix=2*n1; | ix=2*n1; | ||||
| } | } | ||||
| @@ -247,7 +234,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| } | } | ||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -273,4 +259,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| } | } | ||||
| @@ -29,229 +29,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| static void __attribute__ ((noinline)) zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { | |||||
| static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) { | |||||
| __asm__( | __asm__( | ||||
| "pfd 1, 0(%1) \n\t" | |||||
| "sllg %%r0,%0,4 \n\t" | |||||
| "agr %%r0,%2 \n\t" | |||||
| "vlrepg %%v24,0(%1) \n\t" | |||||
| "vlrepg %%v25,8(%1) \n\t" | |||||
| "pfd 1, 0(%[x_ptr]) \n\t" | |||||
| "lgdr %%r0,%[alpha_r] \n\t" | |||||
| "vlvgp %%v24,%%r0,%%r0 \n\t" | |||||
| "lgdr %%r0,%[alpha_i] \n\t" | |||||
| "vlvgp %%v25,%%r0,%%r0 \n\t" | |||||
| "sllg %%r0,%[n],4 \n\t" | |||||
| "agr %%r0,%[x_ptr] \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | |||||
| "pfd 2, 256(%2 ) \n\t" | |||||
| "vleg %%v20 , 0(%2),0 \n\t" | |||||
| "vleg %%v21 , 8(%2),0 \n\t" | |||||
| "vleg %%v20 , 16(%2),1 \n\t" | |||||
| "vleg %%v21 , 24(%2),1 \n\t" | |||||
| "vleg %%v22 , 32(%2),0 \n\t" | |||||
| "vleg %%v23 , 40(%2),0 \n\t" | |||||
| "vleg %%v22 , 48(%2),1 \n\t" | |||||
| "vleg %%v23 , 56(%2),1 \n\t" | |||||
| "vfmdb %%v16, %%v21, %%v25 \n\t" | |||||
| "vfmdb %%v17, %%v20, %%v25 \n\t" | |||||
| "vfmdb %%v18, %%v23, %%v25 \n\t" | |||||
| "vfmdb %%v19, %%v22, %%v25 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%[x_ptr] ) \n\t" | |||||
| "vleg %%v20 , 0(%[x_ptr]),0 \n\t" | |||||
| "vleg %%v21 , 8(%[x_ptr]),0 \n\t" | |||||
| "vleg %%v20 , 16(%[x_ptr]),1 \n\t" | |||||
| "vleg %%v21 , 24(%[x_ptr]),1 \n\t" | |||||
| "vleg %%v22 , 32(%[x_ptr]),0 \n\t" | |||||
| "vleg %%v23 , 40(%[x_ptr]),0 \n\t" | |||||
| "vleg %%v22 , 48(%[x_ptr]),1 \n\t" | |||||
| "vleg %%v23 , 56(%[x_ptr]),1 \n\t" | |||||
| "vfmdb %%v16, %%v21, %%v25 \n\t" | |||||
| "vfmdb %%v17, %%v20, %%v25 \n\t" | |||||
| "vfmdb %%v18, %%v23, %%v25 \n\t" | |||||
| "vfmdb %%v19, %%v22, %%v25 \n\t" | |||||
| "vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t" | "vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t" | ||||
| "vfmadb %%v17, %%v21, %%v24, %%v17 \n\t" | "vfmadb %%v17, %%v21, %%v24, %%v17 \n\t" | ||||
| "vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t" | "vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t" | ||||
| "vfmadb %%v19, %%v23, %%v24, %%v19 \n\t" | "vfmadb %%v19, %%v23, %%v24, %%v19 \n\t" | ||||
| "vsteg %%v16 , 0(%2),0 \n\t" | |||||
| "vsteg %%v17 , 8(%2),0 \n\t" | |||||
| "vsteg %%v16 , 16(%2),1 \n\t" | |||||
| "vsteg %%v17 , 24(%2),1 \n\t" | |||||
| "vsteg %%v18 , 32(%2),0 \n\t" | |||||
| "vsteg %%v19 , 40(%2),0 \n\t" | |||||
| "vsteg %%v18 , 48(%2),1 \n\t" | |||||
| "vsteg %%v19 , 56(%2),1 \n\t" | |||||
| "vleg %%v20 , 64(%2),0 \n\t" | |||||
| "vleg %%v21 , 72(%2),0 \n\t" | |||||
| "vleg %%v20 , 80(%2),1 \n\t" | |||||
| "vleg %%v21 , 88(%2),1 \n\t" | |||||
| "vleg %%v22 , 96(%2),0 \n\t" | |||||
| "vleg %%v23 , 104(%2),0 \n\t" | |||||
| "vleg %%v22 , 112(%2),1 \n\t" | |||||
| "vleg %%v23 , 120(%2),1 \n\t" | |||||
| "vfmdb %%v16, %%v21, %%v25 \n\t" | |||||
| "vfmdb %%v17, %%v20, %%v25 \n\t" | |||||
| "vfmdb %%v18, %%v23, %%v25 \n\t" | |||||
| "vfmdb %%v19, %%v22, %%v25 \n\t" | |||||
| "vsteg %%v16 , 0(%[x_ptr]),0 \n\t" | |||||
| "vsteg %%v17 , 8(%[x_ptr]),0 \n\t" | |||||
| "vsteg %%v16 , 16(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v17 , 24(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v18 , 32(%[x_ptr]),0 \n\t" | |||||
| "vsteg %%v19 , 40(%[x_ptr]),0 \n\t" | |||||
| "vsteg %%v18 , 48(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v19 , 56(%[x_ptr]),1 \n\t" | |||||
| "vleg %%v20 , 64(%[x_ptr]),0 \n\t" | |||||
| "vleg %%v21 , 72(%[x_ptr]),0 \n\t" | |||||
| "vleg %%v20 , 80(%[x_ptr]),1 \n\t" | |||||
| "vleg %%v21 , 88(%[x_ptr]),1 \n\t" | |||||
| "vleg %%v22 , 96(%[x_ptr]),0 \n\t" | |||||
| "vleg %%v23 , 104(%[x_ptr]),0 \n\t" | |||||
| "vleg %%v22 , 112(%[x_ptr]),1 \n\t" | |||||
| "vleg %%v23 , 120(%[x_ptr]),1 \n\t" | |||||
| "vfmdb %%v16, %%v21, %%v25 \n\t" | |||||
| "vfmdb %%v17, %%v20, %%v25 \n\t" | |||||
| "vfmdb %%v18, %%v23, %%v25 \n\t" | |||||
| "vfmdb %%v19, %%v22, %%v25 \n\t" | |||||
| "vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t" | "vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t" | ||||
| "vfmadb %%v17, %%v21, %%v24, %%v17 \n\t" | "vfmadb %%v17, %%v21, %%v24, %%v17 \n\t" | ||||
| "vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t" | "vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t" | ||||
| "vfmadb %%v19, %%v23, %%v24, %%v19 \n\t" | "vfmadb %%v19, %%v23, %%v24, %%v19 \n\t" | ||||
| "vsteg %%v16 , 64(%2),0 \n\t" | |||||
| "vsteg %%v17 , 72(%2),0 \n\t" | |||||
| "vsteg %%v16 , 80(%2),1 \n\t" | |||||
| "vsteg %%v17 , 88(%2),1 \n\t" | |||||
| "vsteg %%v18 , 96(%2),0 \n\t" | |||||
| "vsteg %%v19 , 104(%2),0 \n\t" | |||||
| "vsteg %%v18 , 112(%2),1 \n\t" | |||||
| "vsteg %%v19 , 120(%2),1 \n\t" | |||||
| "vsteg %%v16 , 64(%[x_ptr]),0 \n\t" | |||||
| "vsteg %%v17 , 72(%[x_ptr]),0 \n\t" | |||||
| "vsteg %%v16 , 80(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v17 , 88(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v18 , 96(%[x_ptr]),0 \n\t" | |||||
| "vsteg %%v19 , 104(%[x_ptr]),0 \n\t" | |||||
| "vsteg %%v18 , 112(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v19 , 120(%[x_ptr]),1 \n\t" | |||||
| "la %2,128(%2) \n\t" | |||||
| "clgrjl %2,%%r0,1b \n\t" | |||||
| : | |||||
| : "r"(n), "a"(alpha), "a"(x) | |||||
| "la %[x_ptr],128(%[x_ptr]) \n\t" | |||||
| "clgrjl %[x_ptr],%%r0,1b \n\t" | |||||
| : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) | |||||
| : [n] "r"(n), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) | |||||
| : "cc", "memory","r0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25" | : "cc", "memory","r0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25" | ||||
| ); | ); | ||||
| } | } | ||||
| static void __attribute__ ((noinline)) zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { | |||||
| static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT da_i, FLOAT *x) { | |||||
| __asm__ ( "pfd 2, 0(%1) \n\t" | |||||
| "ld %%f0,8(%2) \n\t" | |||||
| "lcdbr %%f1,%%f0 \n\t" | |||||
| "lgdr %%r0,%%f1 \n\t" | |||||
| "vlvgg %%v0,%%r0,1 \n\t" | |||||
| "vlr %%v16,%%v0 \n\t" | |||||
| "vlr %%v17 ,%%v0 \n\t" | |||||
| "vlr %%v1,%%v0 \n\t" | |||||
| "sllg %%r0,%0,4 \n\t" | |||||
| "agr %%r0,%1 \n\t" | |||||
| __asm__ ( "pfd 2, 0(%1) \n\t" | |||||
| "lgdr %%r0,%[alpha] \n\t" | |||||
| "vlvgg %%v16,%%r0,0 \n\t" | |||||
| "lcdbr %[alpha],%[alpha] \n\t" | |||||
| "lgdr %%r0,%[alpha] \n\t" | |||||
| "vlvgg %%v16,%%r0,1 \n\t" | |||||
| "vlr %%v17 ,%%v16 \n\t" | |||||
| "sllg %%r0,%[n],4 \n\t" | |||||
| "agr %%r0,%[x_ptr] \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | |||||
| "vl %%v24, 0(%1) \n\t" | |||||
| "vfmdb %%v24,%%v24,%%v0 \n\t" | |||||
| "vsteg %%v24, 0(%1),1 \n\t" | |||||
| "vsteg %%v24, 8(%1),0 \n\t" | |||||
| "vl %%v25, 16(%1) \n\t" | |||||
| "vfmdb %%v25,%%v25,%%v1 \n\t" | |||||
| "vsteg %%v25, 16(%1),1 \n\t" | |||||
| "vsteg %%v25, 24(%1),0 \n\t" | |||||
| "vl %%v26, 32(%1) \n\t" | |||||
| "vfmdb %%v26,%%v26,%%v16 \n\t" | |||||
| "vsteg %%v26, 32(%1),1 \n\t" | |||||
| "vsteg %%v26, 40(%1),0 \n\t" | |||||
| "vl %%v27, 48(%1) \n\t" | |||||
| "1: \n\t" | |||||
| "vl %%v24, 0(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v24,%%v24,%%v16 \n\t" | |||||
| "vsteg %%v24, 0(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v24, 8(%[x_ptr]),0 \n\t" | |||||
| "vl %%v25, 16(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v25,%%v25,%%v17 \n\t" | |||||
| "vsteg %%v25, 16(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v25, 24(%[x_ptr]),0 \n\t" | |||||
| "vl %%v26, 32(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v26,%%v26,%%v16 \n\t" | |||||
| "vsteg %%v26, 32(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v26, 40(%[x_ptr]),0 \n\t" | |||||
| "vl %%v27, 48(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v27,%%v27,%%v17 \n\t" | "vfmdb %%v27,%%v27,%%v17 \n\t" | ||||
| "vsteg %%v27, 40(%1),1 \n\t" | |||||
| "vsteg %%v27, 48(%1),0 \n\t" | |||||
| "vl %%v28, 64(%1) \n\t" | |||||
| "vfmdb %%v28,%%v28,%%v0 \n\t" | |||||
| "vsteg %%v28, 64(%1),1 \n\t" | |||||
| "vsteg %%v28, 72(%1),0 \n\t" | |||||
| "vl %%v29, 80(%1) \n\t" | |||||
| "vfmdb %%v29,%%v29,%%v1 \n\t" | |||||
| "vsteg %%v29, 80(%1),1 \n\t" | |||||
| "vsteg %%v29, 88(%1),0 \n\t" | |||||
| "vl %%v30, 96(%1) \n\t" | |||||
| "vfmdb %%v30,%%v30,%%v16 \n\t" | |||||
| "vsteg %%v27, 96(%1),1 \n\t" | |||||
| "vsteg %%v27, 104(%1),0 \n\t" | |||||
| "vl %%v31, 112(%1) \n\t" | |||||
| "vsteg %%v27, 40(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v27, 48(%[x_ptr]),0 \n\t" | |||||
| "vl %%v28, 64(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v28,%%v28,%%v16 \n\t" | |||||
| "vsteg %%v28, 64(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v28, 72(%[x_ptr]),0 \n\t" | |||||
| "vl %%v29, 80(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v29,%%v29,%%v17 \n\t" | |||||
| "vsteg %%v29, 80(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v29, 88(%[x_ptr]),0 \n\t" | |||||
| "vl %%v30, 96(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v30,%%v30,%%v16 \n\t" | |||||
| "vsteg %%v27, 96(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v27, 104(%[x_ptr]),0 \n\t" | |||||
| "vl %%v31, 112(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v31,%%v31,%%v17 \n\t" | "vfmdb %%v31,%%v31,%%v17 \n\t" | ||||
| "vsteg %%v31, 112(%1),1 \n\t" | |||||
| "vsteg %%v31, 120(%1),0 \n\t" | |||||
| "la %1,128(%1) \n\t" | |||||
| "clgrjl %1,%%r0,1b \n\t" | |||||
| : | |||||
| :"r"(n),"a"(x) ,"a"(alpha) | |||||
| :"cc", "memory","r0","f0", "f1","v0","v1","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| "vsteg %%v31, 112(%[x_ptr]),1 \n\t" | |||||
| "vsteg %%v31, 120(%[x_ptr]),0 \n\t" | |||||
| "la %[x_ptr],128(%[x_ptr]) \n\t" | |||||
| "clgrjl %[x_ptr],%%r0,1b \n\t" | |||||
| : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) | |||||
| : [n] "r"(n),[alpha] "f"(da_i) | |||||
| :"cc", "r0","f0", "f1","v16","v17" ,"v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | ); | ||||
| } | } | ||||
| static void __attribute__ ((noinline)) zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { | |||||
| __asm__ ("pfd 2, 0(%1) \n\t" | |||||
| "vlrepg %%v18,0(%2) \n\t" | |||||
| "vlr %%v19,%%v18 \n\t" | |||||
| "vlr %%v16 ,%%v18 \n\t" | |||||
| "vlr %%v17,%%v18 \n\t" | |||||
| "sllg %%r0,%0,4 \n\t" | |||||
| "agr %%r0,%1 \n\t" | |||||
| static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT da_r, FLOAT *x) { | |||||
| __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" | |||||
| "lgdr %%r0,%[alpha] \n\t" | |||||
| "vlvgp %%v18,%%r0,%%r0 \n\t" | |||||
| "vlr %%v19,%%v18 \n\t" | |||||
| "vlr %%v16,%%v18 \n\t" | |||||
| "vlr %%v17,%%v18 \n\t" | |||||
| "sllg %%r0,%[n],4 \n\t" | |||||
| "agr %%r0,%[x_ptr] \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | |||||
| "vl %%v24, 0(%1) \n\t" | |||||
| "vfmdb %%v24,%%v24,%%v18 \n\t" | |||||
| "vst %%v24, 0(%1) \n\t" | |||||
| "vl %%v25, 16(%1) \n\t" | |||||
| "vfmdb %%v25,%%v25,%%v19 \n\t" | |||||
| "vst %%v25, 16(%1) \n\t" | |||||
| "vl %%v26, 32(%1) \n\t" | |||||
| "vfmdb %%v26,%%v26,%%v16 \n\t" | |||||
| "vst %%v26, 32(%1) \n\t" | |||||
| "vl %%v27, 48(%1) \n\t" | |||||
| "vfmdb %%v27,%%v27,%%v17 \n\t" | |||||
| "vst %%v27, 48(%1) \n\t" | |||||
| "vl %%v28, 64(%1) \n\t" | |||||
| "vfmdb %%v28,%%v28,%%v18 \n\t" | |||||
| "vst %%v28, 64(%1) \n\t" | |||||
| "vl %%v29, 80(%1) \n\t" | |||||
| "vfmdb %%v29,%%v29,%%v19 \n\t" | |||||
| "vst %%v29, 80(%1) \n\t" | |||||
| "vl %%v30, 96(%1) \n\t" | |||||
| "vfmdb %%v30,%%v30,%%v16 \n\t" | |||||
| "vst %%v30, 96(%1) \n\t" | |||||
| "vl %%v31, 112(%1) \n\t" | |||||
| "vfmdb %%v31,%%v31,%%v17 \n\t" | |||||
| "vst %%v31, 112(%1) \n\t" | |||||
| "la %1,128(%1) \n\t" | |||||
| "clgrjl %1,%%r0,1b \n\t" | |||||
| : | |||||
| :"r"(n),"a"(x) ,"a"(alpha) | |||||
| :"cc", "memory","r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| "1: \n\t" | |||||
| "vl %%v24, 0(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v24,%%v24,%%v18 \n\t" | |||||
| "vst %%v24, 0(%[x_ptr]) \n\t" | |||||
| "vl %%v25, 16(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v25,%%v25,%%v19 \n\t" | |||||
| "vst %%v25, 16(%[x_ptr]) \n\t" | |||||
| "vl %%v26, 32(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v26,%%v26,%%v16 \n\t" | |||||
| "vst %%v26, 32(%[x_ptr]) \n\t" | |||||
| "vl %%v27, 48(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v27,%%v27,%%v17 \n\t" | |||||
| "vst %%v27, 48(%[x_ptr]) \n\t" | |||||
| "vl %%v28, 64(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v28,%%v28,%%v18 \n\t" | |||||
| "vst %%v28, 64(%[x_ptr]) \n\t" | |||||
| "vl %%v29, 80(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v29,%%v29,%%v19 \n\t" | |||||
| "vst %%v29, 80(%[x_ptr]) \n\t" | |||||
| "vl %%v30, 96(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v30,%%v30,%%v16 \n\t" | |||||
| "vst %%v30, 96(%[x_ptr]) \n\t" | |||||
| "vl %%v31,112(%[x_ptr]) \n\t" | |||||
| "vfmdb %%v31,%%v31,%%v17 \n\t" | |||||
| "vst %%v31,112(%[x_ptr]) \n\t" | |||||
| "la %[x_ptr],128(%[x_ptr]) \n\t" | |||||
| "clgrjl %[x_ptr],%%r0,1b \n\t" | |||||
| : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) | |||||
| : [n] "r"(n),[alpha] "f"(da_r) | |||||
| : "cc", "r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | ); | ||||
| } | } | ||||
| static void __attribute__ ((noinline)) zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { | |||||
| static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { | |||||
| __asm__ ( "pfd 2, 0(%1) \n\t" | |||||
| __asm__ ( "pfd 2, 0(%[x_ptr]) \n\t" | |||||
| "vzero %%v24 \n\t" | "vzero %%v24 \n\t" | ||||
| "vzero %%v25 \n\t" | "vzero %%v25 \n\t" | ||||
| "vzero %%v26 \n\t" | "vzero %%v26 \n\t" | ||||
| "vzero %%v27 \n\t" | "vzero %%v27 \n\t" | ||||
| "sllg %%r0,%0,4 \n\t" | |||||
| "agr %%r0,%1 \n\t" | |||||
| "sllg %%r0,%[n],4 \n\t" | |||||
| "agr %%r0,%[x_ptr] \n\t" | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | "1: \n\t" | ||||
| "pfd 2, 256( %1) \n\t" | |||||
| "vst %%v24, 0( %1) \n\t" | |||||
| "vst %%v25, 16( %1) \n\t" | |||||
| "vst %%v26, 32( %1) \n\t" | |||||
| "vst %%v27, 48( %1) \n\t" | |||||
| "vst %%v24, 64( %1) \n\t" | |||||
| "vst %%v25, 80( %1) \n\t" | |||||
| "vst %%v26, 96( %1) \n\t" | |||||
| "vst %%v27,112( %1) \n\t" | |||||
| "pfd 2, 256( %[x_ptr]) \n\t" | |||||
| "vst %%v24, 0( %[x_ptr]) \n\t" | |||||
| "vst %%v25, 16( %[x_ptr]) \n\t" | |||||
| "vst %%v26, 32( %[x_ptr]) \n\t" | |||||
| "vst %%v27, 48( %[x_ptr]) \n\t" | |||||
| "vst %%v24, 64( %[x_ptr]) \n\t" | |||||
| "vst %%v25, 80( %[x_ptr]) \n\t" | |||||
| "vst %%v26, 96( %[x_ptr]) \n\t" | |||||
| "vst %%v27,112( %[x_ptr]) \n\t" | |||||
| "la %1,128(%1) \n\t" | |||||
| "clgrjl %1,%%r0,1b \n\t" | |||||
| : | |||||
| :"r"(n),"a"(x) | |||||
| :"cc" , "memory" ,"r0","v24","v25","v26","v27" | |||||
| "la %[x_ptr],128(%[x_ptr]) \n\t" | |||||
| "clgrjl %[x_ptr],%%r0,1b \n\t" | |||||
| : [mem] "+m" (*(double (*)[2*n])x),[x_ptr] "+&a"(x) | |||||
| : [n] "r"(n) | |||||
| :"cc" ,"r0","v24","v25","v26","v27" | |||||
| ); | ); | ||||
| } | } | ||||
| static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) __attribute__ ((noinline)); | |||||
| static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) { | |||||
| static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i; | BLASLONG i; | ||||
| BLASLONG inc_x2 = 2 * inc_x; | BLASLONG inc_x2 = 2 * inc_x; | ||||
| BLASLONG inc_x3 = inc_x2 + inc_x; | BLASLONG inc_x3 = inc_x2 + inc_x; | ||||
| FLOAT t0, t1, t2, t3; | |||||
| FLOAT da_r = alpha[0]; | |||||
| FLOAT da_i = alpha[1]; | |||||
| FLOAT t0, t1, t2, t3; | |||||
| for (i = 0; i < n; i += 4) { | for (i = 0; i < n; i += 4) { | ||||
| t0 = da_r * x[0] - da_i * x[1]; | t0 = da_r * x[0] - da_i * x[1]; | ||||
| @@ -280,7 +269,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| BLASLONG i = 0, j = 0; | BLASLONG i = 0, j = 0; | ||||
| FLOAT temp0; | FLOAT temp0; | ||||
| FLOAT temp1; | FLOAT temp1; | ||||
| FLOAT alpha[2]; | |||||
| if (inc_x != 1) { | if (inc_x != 1) { | ||||
| inc_x <<= 1; | inc_x <<= 1; | ||||
| @@ -372,10 +361,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| } else { | } else { | ||||
| BLASLONG n1 = n & -8; | BLASLONG n1 = n & -8; | ||||
| if (n1 > 0) { | |||||
| alpha[0] = da_r; | |||||
| alpha[1] = da_i; | |||||
| zscal_kernel_inc_8(n1, alpha, x, inc_x); | |||||
| if (n1 > 0) { | |||||
| zscal_kernel_inc_8(n1, da_r,da_i, x, inc_x); | |||||
| j = n1; | j = n1; | ||||
| i = n1 * inc_x; | i = n1 * inc_x; | ||||
| } | } | ||||
| @@ -401,19 +388,17 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| BLASLONG n1 = n & -8; | BLASLONG n1 = n & -8; | ||||
| if (n1 > 0) { | if (n1 > 0) { | ||||
| alpha[0] = da_r; | |||||
| alpha[1] = da_i; | |||||
| if (da_r == 0.0) | if (da_r == 0.0) | ||||
| if (da_i == 0) | if (da_i == 0) | ||||
| zscal_kernel_8_zero(n1, x); | zscal_kernel_8_zero(n1, x); | ||||
| else | else | ||||
| zscal_kernel_8_zero_r(n1, alpha, x); | |||||
| zscal_kernel_8_zero_r(n1, da_i, x); | |||||
| else | else | ||||
| if (da_i == 0) | if (da_i == 0) | ||||
| zscal_kernel_8_zero_i(n1, alpha, x); | |||||
| zscal_kernel_8_zero_i(n1, da_r, x); | |||||
| else | else | ||||
| zscal_kernel_8(n1, alpha, x); | |||||
| zscal_kernel_8(n1, da_r,da_i, x); | |||||
| i = n1 << 1; | i = n1 << 1; | ||||
| j = n1; | j = n1; | ||||
| @@ -29,99 +29,211 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(Z13_SWAP_A) | |||||
| static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "pfd 1, 0(%[ptr_x]) \n\t" | |||||
| "pfd 2, 0(%[ptr_y]) \n\t" | |||||
| "srlg %[n_tmp],%[n_tmp],4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "pfd 2, 256(%%r1,%[ptr_x]) \n\t" | |||||
| "pfd 2, 256(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %[n_tmp],1b" | |||||
| : [mem_x] "+m" (*(double (*)[2*n])x), | |||||
| [mem_y] "+m" (*(double (*)[2*n])y), | |||||
| [n_tmp] "+&r"(n) | |||||
| : [ptr_x] "a"(x), [ptr_y] "a"(y) | |||||
| : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" | |||||
| ,"v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| #else | |||||
| static void __attribute__ ((noinline)) zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) | static void __attribute__ ((noinline)) zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) | ||||
| { | { | ||||
| __asm__ volatile( | __asm__ volatile( | ||||
| "pfd 2, 0(%1) \n\t" | |||||
| "pfd 2, 0(%2) \n\t" | |||||
| "srlg %%r0,%0,4 \n\t" | |||||
| "pfd 2, 0(%[ptr_x]) \n\t" | |||||
| "pfd 2, 0(%[ptr_y]) \n\t" | |||||
| "srlg %[n_tmp],%[n_tmp],4 \n\t" | |||||
| "xgr %%r1,%%r1 \n\t" | "xgr %%r1,%%r1 \n\t" | ||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | "1: \n\t" | ||||
| "pfd 2, 256(%%r1,%1) \n\t" | |||||
| "pfd 2, 256(%%r1,%2) \n\t" | |||||
| "pfd 2, 256(%%r1,%[ptr_x]) \n\t" | |||||
| "pfd 2, 256(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v16, 0(%%r1,%1) \n\t" | |||||
| "vl %%v17, 16(%%r1,%1) \n\t" | |||||
| "vl %%v18, 32(%%r1,%1) \n\t" | |||||
| "vl %%v19, 48(%%r1,%1) \n\t" | |||||
| "vl %%v20, 64(%%r1,%1) \n\t" | |||||
| "vl %%v21, 80(%%r1,%1) \n\t" | |||||
| "vl %%v22, 96(%%r1,%1) \n\t" | |||||
| "vl %%v23, 112(%%r1,%1) \n\t" | |||||
| "vl %%v24, 128(%%r1,%1) \n\t" | |||||
| "vl %%v25, 144(%%r1,%1) \n\t" | |||||
| "vl %%v26, 160(%%r1,%1) \n\t" | |||||
| "vl %%v27, 176(%%r1,%1) \n\t" | |||||
| "vl %%v28, 192(%%r1,%1) \n\t" | |||||
| "vl %%v29, 208(%%r1,%1) \n\t" | |||||
| "vl %%v30, 224(%%r1,%1) \n\t" | |||||
| "vl %%v31, 240(%%r1,%1) \n\t" | |||||
| "vl %%v0, 0(%%r1,%2) \n\t" | |||||
| "vl %%v1, 16(%%r1,%2) \n\t" | |||||
| "vl %%v2, 32(%%r1,%2) \n\t" | |||||
| "vl %%v3, 48(%%r1,%2) \n\t" | |||||
| "vl %%v4, 64(%%r1,%2) \n\t" | |||||
| "vl %%v5, 80(%%r1,%2) \n\t" | |||||
| "vl %%v6, 96(%%r1,%2) \n\t" | |||||
| "vl %%v7, 112(%%r1,%2) \n\t" | |||||
| "vst %%v0, 0(%%r1,%1) \n\t" | |||||
| "vst %%v1, 16(%%r1,%1) \n\t" | |||||
| "vst %%v2, 32(%%r1,%1) \n\t" | |||||
| "vst %%v3, 48(%%r1,%1) \n\t" | |||||
| "vst %%v4, 64(%%r1,%1) \n\t" | |||||
| "vst %%v5, 80(%%r1,%1) \n\t" | |||||
| "vst %%v6, 96(%%r1,%1) \n\t" | |||||
| "vst %%v7, 112(%%r1,%1) \n\t" | |||||
| "vl %%v0, 128(%%r1,%2) \n\t" | |||||
| "vl %%v1, 144(%%r1,%2) \n\t" | |||||
| "vl %%v2, 160(%%r1,%2) \n\t" | |||||
| "vl %%v3, 176(%%r1,%2) \n\t" | |||||
| "vl %%v4, 192(%%r1,%2) \n\t" | |||||
| "vl %%v5, 208(%%r1,%2) \n\t" | |||||
| "vl %%v6, 224(%%r1,%2) \n\t" | |||||
| "vl %%v7, 240(%%r1,%2) \n\t" | |||||
| "vst %%v0, 128(%%r1,%1) \n\t" | |||||
| "vst %%v1, 144(%%r1,%1) \n\t" | |||||
| "vst %%v2, 160(%%r1,%1) \n\t" | |||||
| "vst %%v3, 176(%%r1,%1) \n\t" | |||||
| "vst %%v4, 192(%%r1,%1) \n\t" | |||||
| "vst %%v5, 208(%%r1,%1) \n\t" | |||||
| "vst %%v6, 224(%%r1,%1) \n\t" | |||||
| "vst %%v7, 240(%%r1,%1) \n\t" | |||||
| "vst %%v16, 0(%%r1,%2) \n\t" | |||||
| "vst %%v17, 16(%%r1,%2) \n\t" | |||||
| "vst %%v18, 32(%%r1,%2) \n\t" | |||||
| "vst %%v19, 48(%%r1,%2) \n\t" | |||||
| "vst %%v20, 64(%%r1,%2) \n\t" | |||||
| "vst %%v21, 80(%%r1,%2) \n\t" | |||||
| "vst %%v22, 96(%%r1,%2) \n\t" | |||||
| "vst %%v23, 112(%%r1,%2) \n\t" | |||||
| "vst %%v24, 128(%%r1,%2) \n\t" | |||||
| "vst %%v25, 144(%%r1,%2) \n\t" | |||||
| "vst %%v26, 160(%%r1,%2) \n\t" | |||||
| "vst %%v27, 176(%%r1,%2) \n\t" | |||||
| "vst %%v28, 192(%%r1,%2) \n\t" | |||||
| "vst %%v29, 208(%%r1,%2) \n\t" | |||||
| "vst %%v30, 224(%%r1,%2) \n\t" | |||||
| "vst %%v31, 240(%%r1,%2) \n\t" | |||||
| "vl %%v16, 0(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" | |||||
| "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" | |||||
| "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" | |||||
| "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" | |||||
| "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %%r0,1b" | |||||
| : | |||||
| : "r"(n), "a"(x), "a"(y) | |||||
| :"cc", "memory","r0","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| "la %%r1,256(%%r1) \n\t" | |||||
| "brctg %[n_tmp],1b" | |||||
| : [mem_x] "+m" (*(double (*)[2*n])x), | |||||
| [mem_y] "+m" (*(double (*)[2*n])y), | |||||
| [n_tmp] "+&r"(n) | |||||
| : [ptr_x] "a"(x), [ptr_y] "a"(y) | |||||
| : "cc", "memory", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", | |||||
| "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
| ); | ); | ||||
| return; | return; | ||||
| } | } | ||||
| #endif | |||||