SGEMM, DGEMM, CGEMM, ZGEMM functions data prefetch (tags/v0.2.20^2)
| @@ -1082,7 +1082,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| } | } | ||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | ||||
| FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc | |||||
| FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc | |||||
| #ifdef TRMMKERNEL | #ifdef TRMMKERNEL | ||||
| , BLASLONG offset | , BLASLONG offset | ||||
| #endif | #endif | ||||
| @@ -1092,18 +1092,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| BLASLONG off; | BLASLONG off; | ||||
| #endif | #endif | ||||
| FLOAT *pc0, *pc1, *pc2, *pc3; | |||||
| FLOAT *pa0, *pb0; | |||||
| FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0; | |||||
| FLOAT res0, res1, res2, res3, res4, res5, res6, res7; | FLOAT res0, res1, res2, res3, res4, res5, res6, res7; | ||||
| FLOAT res8, res9, res10, res11, res12, res13, res14, res15; | FLOAT res8, res9, res10, res11, res12, res13, res14, res15; | ||||
| FLOAT a0_r, a1_r; | |||||
| FLOAT a0_i, a1_i; | |||||
| FLOAT a0_r, a1_r, a0_i, a1_i, b0_i, b1_i, b2_i, b3_i; | |||||
| FLOAT b0_r, b1_r, b2_r, b3_r; | FLOAT b0_r, b1_r, b2_r, b3_r; | ||||
| FLOAT b0_i, b1_i, b2_i, b3_i; | |||||
| v4f32 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1; | v4f32 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1; | ||||
| v4f32 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi; | v4f32 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi; | ||||
| v4f32 dst0, dst1, dst2, dst3; | |||||
| v4f32 alpha_r, alpha_i; | |||||
| v4f32 dst0, dst1, dst2, dst3, alpha_r, alpha_i; | |||||
| v4f32 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i; | v4f32 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i; | ||||
| v4f32 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i; | v4f32 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i; | ||||
| v4f32 dst0_r, dst0_i, dst1_r, dst1_i; | v4f32 dst0_r, dst0_i, dst1_r, dst1_i; | ||||
| @@ -1122,12 +1118,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| pc2 = pc1 + 2 * ldc; | pc2 = pc1 + 2 * ldc; | ||||
| pc3 = pc2 + 2 * ldc; | pc3 = pc2 + 2 * ldc; | ||||
| pa0 = A; | |||||
| #if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
| off = offset; | off = offset; | ||||
| #endif | #endif | ||||
| pa0 = A; | |||||
| for (i = (m >> 3); i--;) | for (i = (m >> 3); i--;) | ||||
| { | { | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| @@ -1150,6 +1146,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| temp = k; | temp = k; | ||||
| #endif | #endif | ||||
| #ifdef ENABLE_PREFETCH | |||||
| __asm__ __volatile__( | |||||
| "pref 0, 64(%[pa0]) \n\t" | |||||
| "pref 0, 96(%[pa0]) \n\t" | |||||
| "pref 0, 32(%[pb0]) \n\t" | |||||
| : | |||||
| : [pa0] "r" (pa0), [pb0] "r" (pb0) | |||||
| ); | |||||
| #endif | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | ||||
| CGEMM_KERNEL_8X4_MSA(, -, , +, +); | CGEMM_KERNEL_8X4_MSA(, -, , +, +); | ||||
| #endif | #endif | ||||
| @@ -1165,6 +1172,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| for (l = (temp - 1); l--;) | for (l = (temp - 1); l--;) | ||||
| { | { | ||||
| #ifdef ENABLE_PREFETCH | |||||
| __asm__ __volatile__( | |||||
| "pref 0, 64(%[pa0]) \n\t" | |||||
| "pref 0, 96(%[pa0]) \n\t" | |||||
| "pref 0, 32(%[pb0]) \n\t" | |||||
| : | |||||
| : [pa0] "r" (pa0), [pb0] "r" (pb0) | |||||
| ); | |||||
| #endif | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | ||||
| CGEMM_KERNEL_8X4_MSA(+, -, +, +,); | CGEMM_KERNEL_8X4_MSA(+, -, +, +,); | ||||
| #endif | #endif | ||||
| @@ -1340,6 +1358,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| #else | #else | ||||
| CGEMM_SCALE_2X4 | CGEMM_SCALE_2X4 | ||||
| #endif | #endif | ||||
| pc0 += 4; | |||||
| pc1 += 4; | |||||
| pc2 += 4; | |||||
| pc3 += 4; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -1357,11 +1379,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| off += 2; // number of values in A | off += 2; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 4; | |||||
| pc1 += 4; | |||||
| pc2 += 4; | |||||
| pc3 += 4; | |||||
| } | } | ||||
| if (m & 1) | if (m & 1) | ||||
| @@ -1426,6 +1443,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| #else | #else | ||||
| CGEMM_SCALE_1X4 | CGEMM_SCALE_1X4 | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| pc1 += 2; | |||||
| pc2 += 2; | |||||
| pc3 += 2; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -1443,21 +1464,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| off += 1; // number of values in A | off += 1; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| pc1 += 2; | |||||
| pc2 += 2; | |||||
| pc3 += 2; | |||||
| } | } | ||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | #if defined(TRMMKERNEL) && !defined(LEFT) | ||||
| off += 4; // number of values in A | off += 4; // number of values in A | ||||
| #endif | #endif | ||||
| l = k << 3; | |||||
| B = B + l; | |||||
| i = ldc << 3; | |||||
| C = C + i; | |||||
| B += (k << 3); | |||||
| C += (ldc << 3); | |||||
| } | } | ||||
| if (n & 2) | if (n & 2) | ||||
| @@ -1465,12 +1479,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| pc0 = C; | pc0 = C; | ||||
| pc1 = pc0 + 2 * ldc; | pc1 = pc0 + 2 * ldc; | ||||
| pa0 = A; | |||||
| #if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
| off = offset; | off = offset; | ||||
| #endif | #endif | ||||
| pa0 = A; | |||||
| for (i = (m >> 3); i--;) | for (i = (m >> 3); i--;) | ||||
| { | { | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| @@ -1691,6 +1705,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| #else | #else | ||||
| CGEMM_SCALE_2X2 | CGEMM_SCALE_2X2 | ||||
| #endif | #endif | ||||
| pc0 += 4; | |||||
| pc1 += 4; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -1708,9 +1724,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| off += 2; // number of values in A | off += 2; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 4; | |||||
| pc1 += 4; | |||||
| } | } | ||||
| if (m & 1) | if (m & 1) | ||||
| @@ -1775,6 +1788,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| #else | #else | ||||
| CGEMM_SCALE_1X2 | CGEMM_SCALE_1X2 | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| pc1 += 2; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -1792,30 +1807,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| off += 1; // number of values in A | off += 1; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| pc1 += 2; | |||||
| } | } | ||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | #if defined(TRMMKERNEL) && !defined(LEFT) | ||||
| off += 2; // number of values in A | off += 2; // number of values in A | ||||
| #endif | #endif | ||||
| l = k << 2; | |||||
| B = B + l; | |||||
| i = ldc << 2; | |||||
| C = C + i; | |||||
| B += (k << 2); | |||||
| C += (ldc << 2); | |||||
| } | } | ||||
| if (n & 1) | if (n & 1) | ||||
| { | { | ||||
| pc0 = C; | pc0 = C; | ||||
| pa0 = A; | |||||
| #if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
| off = offset; | off = offset; | ||||
| #endif | #endif | ||||
| pa0 = A; | |||||
| for (i = (m >> 3); i--;) | for (i = (m >> 3); i--;) | ||||
| { | { | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| @@ -2036,6 +2047,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| #else | #else | ||||
| CGEMM_SCALE_2X1 | CGEMM_SCALE_2X1 | ||||
| #endif | #endif | ||||
| pc0 += 4; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -2053,8 +2065,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| off += 2; // number of values in A | off += 2; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 4; | |||||
| } | } | ||||
| if (m & 1) | if (m & 1) | ||||
| @@ -2119,6 +2129,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| #else | #else | ||||
| CGEMM_SCALE_1X1 | CGEMM_SCALE_1X1 | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -2136,18 +2147,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| off += 1; // number of values in A | off += 1; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| } | } | ||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | #if defined(TRMMKERNEL) && !defined(LEFT) | ||||
| off += 1; // number of values in A | off += 1; // number of values in A | ||||
| #endif | #endif | ||||
| l = k << 1; | |||||
| B = B + l; | |||||
| i = ldc << 1; | |||||
| C = C + i; | |||||
| B += (k << 1); | |||||
| C += (ldc << 1); | |||||
| } | } | ||||
| return 0; | return 0; | ||||
| @@ -91,6 +91,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| pb0 = B; | pb0 = B; | ||||
| temp = k; | temp = k; | ||||
| #endif | #endif | ||||
| #ifdef ENABLE_PREFETCH | |||||
| __asm__ __volatile__( | |||||
| "pref 0, 32(%[pa0]) \n\t" | |||||
| "pref 0, 32(%[pb0]) \n\t" | |||||
| : | |||||
| : [pa0] "r" (pa0), [pb0] "r" (pb0) | |||||
| ); | |||||
| #endif | |||||
| LD_SP2_INC(pa0, 4, src_a0, src_a1); | LD_SP2_INC(pa0, 4, src_a0, src_a1); | ||||
| LD_SP2_INC(pb0, 4, src_b0, src_b1); | LD_SP2_INC(pb0, 4, src_b0, src_b1); | ||||
| @@ -129,6 +138,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| for (l = ((temp - 1) >> 1); l--;) | for (l = ((temp - 1) >> 1); l--;) | ||||
| { | { | ||||
| #ifdef ENABLE_PREFETCH | |||||
| __asm__ __volatile__( | |||||
| "pref 0, 64(%[pa0]) \n\t" | |||||
| "pref 0, 96(%[pa0]) \n\t" | |||||
| "pref 0, 64(%[pb0]) \n\t" | |||||
| "pref 0, 96(%[pb0]) \n\t" | |||||
| : | |||||
| : [pa0] "r" (pa0), [pb0] "r" (pb0) | |||||
| ); | |||||
| #endif | |||||
| LD_SP2_INC(pa0, 4, src_a0, src_a1); | LD_SP2_INC(pa0, 4, src_a0, src_a1); | ||||
| LD_SP2_INC(pb0, 4, src_b0, src_b1); | LD_SP2_INC(pb0, 4, src_b0, src_b1); | ||||
| @@ -500,6 +521,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| ST_SP(dst2, pc6); | ST_SP(dst2, pc6); | ||||
| ST_SP(dst3, pc7); | ST_SP(dst3, pc7); | ||||
| pc0 += 4; | |||||
| pc1 += 4; | |||||
| pc2 += 4; | |||||
| pc3 += 4; | |||||
| pc4 += 4; | |||||
| pc5 += 4; | |||||
| pc6 += 4; | |||||
| pc7 += 4; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| temp = k - off; | temp = k - off; | ||||
| @@ -516,15 +546,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| off += 4; // number of values in A | off += 4; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 4; | |||||
| pc1 += 4; | |||||
| pc2 += 4; | |||||
| pc3 += 4; | |||||
| pc4 += 4; | |||||
| pc5 += 4; | |||||
| pc6 += 4; | |||||
| pc7 += 4; | |||||
| } | } | ||||
| if (m & 2) | if (m & 2) | ||||
| @@ -763,6 +784,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| pc6[1] += tmp13; | pc6[1] += tmp13; | ||||
| pc7[1] += tmp15; | pc7[1] += tmp15; | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| pc1 += 2; | |||||
| pc2 += 2; | |||||
| pc3 += 2; | |||||
| pc4 += 2; | |||||
| pc5 += 2; | |||||
| pc6 += 2; | |||||
| pc7 += 2; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -780,15 +809,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| off += 2; // number of values in A | off += 2; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| pc1 += 2; | |||||
| pc2 += 2; | |||||
| pc3 += 2; | |||||
| pc4 += 2; | |||||
| pc5 += 2; | |||||
| pc6 += 2; | |||||
| pc7 += 2; | |||||
| } | } | ||||
| if (m & 1) | if (m & 1) | ||||
| @@ -959,6 +979,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| pc6[0] += tmp6; | pc6[0] += tmp6; | ||||
| pc7[0] += tmp7; | pc7[0] += tmp7; | ||||
| #endif | #endif | ||||
| pc0 += 1; | |||||
| pc1 += 1; | |||||
| pc2 += 1; | |||||
| pc3 += 1; | |||||
| pc4 += 1; | |||||
| pc5 += 1; | |||||
| pc6 += 1; | |||||
| pc7 += 1; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -976,24 +1004,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| off += 1; // number of values in A | off += 1; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 1; | |||||
| pc1 += 1; | |||||
| pc2 += 1; | |||||
| pc3 += 1; | |||||
| pc4 += 1; | |||||
| pc5 += 1; | |||||
| pc6 += 1; | |||||
| pc7 += 1; | |||||
| } | } | ||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | #if defined(TRMMKERNEL) && !defined(LEFT) | ||||
| off += 8; // number of values in A | off += 8; // number of values in A | ||||
| #endif | #endif | ||||
| l = (k << 3); | |||||
| B = B + l; | |||||
| i = (ldc << 3); | |||||
| C = C + i; | |||||
| B += (k << 3); | |||||
| C += (ldc << 3); | |||||
| } | } | ||||
| if (n & 4) | if (n & 4) | ||||
| @@ -1003,12 +1021,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| pc2 = pc1 + ldc; | pc2 = pc1 + ldc; | ||||
| pc3 = pc2 + ldc; | pc3 = pc2 + ldc; | ||||
| pa0 = A; | |||||
| #if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
| off = offset; | off = offset; | ||||
| #endif | #endif | ||||
| pa0 = A; | |||||
| for (i = (m >> 3); i--;) | for (i = (m >> 3); i--;) | ||||
| { | { | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| @@ -1145,7 +1163,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| dst6 += res6 * v_alpha; | dst6 += res6 * v_alpha; | ||||
| dst7 += res7 * v_alpha; | dst7 += res7 * v_alpha; | ||||
| #endif | #endif | ||||
| ST_SP2_INC(dst0, dst1, pc0, 4); | ST_SP2_INC(dst0, dst1, pc0, 4); | ||||
| ST_SP2_INC(dst2, dst3, pc1, 4); | ST_SP2_INC(dst2, dst3, pc1, 4); | ||||
| ST_SP2_INC(dst4, dst5, pc2, 4); | ST_SP2_INC(dst4, dst5, pc2, 4); | ||||
| @@ -1268,6 +1285,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| pa0 += 4; | pa0 += 4; | ||||
| pb0 += 4; | pb0 += 4; | ||||
| } | } | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| dst0 = res0 * v_alpha; | dst0 = res0 * v_alpha; | ||||
| dst1 = res1 * v_alpha; | dst1 = res1 * v_alpha; | ||||
| @@ -1289,6 +1307,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| ST_SP(dst2, pc2); | ST_SP(dst2, pc2); | ||||
| ST_SP(dst3, pc3); | ST_SP(dst3, pc3); | ||||
| pc0 += 4; | |||||
| pc1 += 4; | |||||
| pc2 += 4; | |||||
| pc3 += 4; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| temp = k - off; | temp = k - off; | ||||
| @@ -1305,10 +1328,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| off += 4; // number of values in A | off += 4; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 4; | |||||
| pc1 += 4; | |||||
| pc2 += 4; | |||||
| pc3 += 4; | |||||
| } | } | ||||
| if (m & 2) | if (m & 2) | ||||
| @@ -1459,6 +1478,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| pc2[1] += tmp5; | pc2[1] += tmp5; | ||||
| pc3[1] += tmp7; | pc3[1] += tmp7; | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| pc1 += 2; | |||||
| pc2 += 2; | |||||
| pc3 += 2; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -1476,11 +1499,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| off += 2; // number of values in A | off += 2; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| pc1 += 2; | |||||
| pc2 += 2; | |||||
| pc3 += 2; | |||||
| } | } | ||||
| if (m & 1) | if (m & 1) | ||||
| @@ -1591,6 +1609,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| pc2[0] += tmp2; | pc2[0] += tmp2; | ||||
| pc3[0] += tmp3; | pc3[0] += tmp3; | ||||
| #endif | #endif | ||||
| pc0 += 1; | |||||
| pc1 += 1; | |||||
| pc2 += 1; | |||||
| pc3 += 1; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -1608,20 +1630,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| off += 1; // number of values in A | off += 1; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 1; | |||||
| pc1 += 1; | |||||
| pc2 += 1; | |||||
| pc3 += 1; | |||||
| } | } | ||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | #if defined(TRMMKERNEL) && !defined(LEFT) | ||||
| off += 4; // number of values in A | off += 4; // number of values in A | ||||
| #endif | #endif | ||||
| l = (k << 2); | |||||
| B = B + l; | |||||
| i = (ldc << 2); | |||||
| C = C + i; | |||||
| B += (k << 2); | |||||
| C += (ldc << 2); | |||||
| } | } | ||||
| if (n & 2) | if (n & 2) | ||||
| @@ -1629,12 +1645,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| pc0 = C; | pc0 = C; | ||||
| pc1 = pc0 + ldc; | pc1 = pc0 + ldc; | ||||
| pa0 = A; | |||||
| #if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
| off = offset; | off = offset; | ||||
| #endif | #endif | ||||
| pa0 = A; | |||||
| for (i = (m >> 3); i--;) | for (i = (m >> 3); i--;) | ||||
| { | { | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| @@ -1847,6 +1863,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| ST_SP(dst0, pc0); | ST_SP(dst0, pc0); | ||||
| ST_SP(dst1, pc1); | ST_SP(dst1, pc1); | ||||
| pc0 += 4; | |||||
| pc1 += 4; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| temp = k - off; | temp = k - off; | ||||
| @@ -1863,8 +1882,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| off += 4; // number of values in A | off += 4; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 4; | |||||
| pc1 += 4; | |||||
| } | } | ||||
| if (m & 2) | if (m & 2) | ||||
| @@ -1967,6 +1984,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| pc0[1] += tmp1; | pc0[1] += tmp1; | ||||
| pc1[1] += tmp3; | pc1[1] += tmp3; | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| pc1 += 2; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -1984,9 +2003,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| off += 2; // number of values in A | off += 2; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| pc1 += 2; | |||||
| } | } | ||||
| if (m & 1) | if (m & 1) | ||||
| @@ -2067,6 +2083,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| pc0[0] += tmp0; | pc0[0] += tmp0; | ||||
| pc1[0] += tmp1; | pc1[0] += tmp1; | ||||
| #endif | #endif | ||||
| pc0 += 1; | |||||
| pc1 += 1; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -2084,28 +2102,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| off += 1; // number of values in A | off += 1; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 1; | |||||
| pc1 += 1; | |||||
| } | } | ||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | #if defined(TRMMKERNEL) && !defined(LEFT) | ||||
| off += 2; // number of values in A | off += 2; // number of values in A | ||||
| #endif | #endif | ||||
| l = (k << 1); | |||||
| B = B + l; | |||||
| i = (ldc << 1); | |||||
| C = C + i; | |||||
| B += (k << 1); | |||||
| C += (ldc << 1); | |||||
| } | } | ||||
| if (n & 1) | if (n & 1) | ||||
| { | { | ||||
| pc0 = C; | pc0 = C; | ||||
| pa0 = A; | |||||
| #if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
| off = offset; | off = offset; | ||||
| #endif | #endif | ||||
| pa0 = A; | |||||
| for (i = (m >> 3); i--;) | for (i = (m >> 3); i--;) | ||||
| { | { | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| @@ -2272,6 +2288,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| #endif | #endif | ||||
| ST_SP(dst0, pc0); | ST_SP(dst0, pc0); | ||||
| pc0 += 4; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| temp = k - off; | temp = k - off; | ||||
| @@ -2288,7 +2306,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| off += 4; // number of values in A | off += 4; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 4; | |||||
| } | } | ||||
| if (m & 2) | if (m & 2) | ||||
| @@ -2359,6 +2376,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| pb0 += 1; | pb0 += 1; | ||||
| } | } | ||||
| tmp0 = alpha * tmp0; | |||||
| tmp1 = alpha * tmp1; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| pc0[0] = tmp0; | pc0[0] = tmp0; | ||||
| pc0[1] = tmp1; | pc0[1] = tmp1; | ||||
| @@ -2366,6 +2386,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| pc0[0] += tmp0; | pc0[0] += tmp0; | ||||
| pc0[1] += tmp1; | pc0[1] += tmp1; | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -2383,8 +2404,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| off += 2; // number of values in A | off += 2; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| } | } | ||||
| if (m & 1) | if (m & 1) | ||||
| @@ -2448,34 +2467,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, | |||||
| #else | #else | ||||
| pc0[0] += alpha * tmp0; | pc0[0] += alpha * tmp0; | ||||
| #endif | #endif | ||||
| #if defined(TRMMKERNEL) | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = k - off; | |||||
| #ifdef LEFT | |||||
| temp -= 1; // number of values in A | |||||
| #else | |||||
| temp -= 1; // number of values in B | |||||
| #endif | |||||
| pa0 += temp * 1; | |||||
| pb0 += temp * 1; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 1; // number of values in A | |||||
| #endif | |||||
| #endif | |||||
| pc0 += 1; | |||||
| } | } | ||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||||
| off += 1; // number of values in A | |||||
| #endif | |||||
| l = (k << 0); | |||||
| B = B + l; | |||||
| i = (ldc << 0); | |||||
| C = C + i; | |||||
| } | } | ||||
| return 0; | return 0; | ||||
| @@ -851,6 +851,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| temp = k; | temp = k; | ||||
| #endif | #endif | ||||
| #ifdef ENABLE_PREFETCH | |||||
| __asm__ __volatile__( | |||||
| "pref 0, 64(%[pa0]) \n\t" | |||||
| "pref 0, 96(%[pa0]) \n\t" | |||||
| "pref 0, 64(%[pb0]) \n\t" | |||||
| "pref 0, 96(%[pb0]) \n\t" | |||||
| : | |||||
| : [pa0] "r" (pa0), [pb0] "r" (pb0) | |||||
| ); | |||||
| #endif | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | ||||
| ZGEMM_KERNEL_4X4_MSA(, -, , +, +); | ZGEMM_KERNEL_4X4_MSA(, -, , +, +); | ||||
| #endif | #endif | ||||
| @@ -866,6 +878,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| for (l = (temp - 1); l--;) | for (l = (temp - 1); l--;) | ||||
| { | { | ||||
| #ifdef ENABLE_PREFETCH | |||||
| __asm__ __volatile__( | |||||
| "pref 0, 64(%[pa0]) \n\t" | |||||
| "pref 0, 96(%[pa0]) \n\t" | |||||
| "pref 0, 64(%[pb0]) \n\t" | |||||
| "pref 0, 96(%[pb0]) \n\t" | |||||
| : | |||||
| : [pa0] "r" (pa0), [pb0] "r" (pb0) | |||||
| ); | |||||
| #endif | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | ||||
| ZGEMM_KERNEL_4X4_MSA(+, -, +, +,); | ZGEMM_KERNEL_4X4_MSA(+, -, +, +,); | ||||
| #endif | #endif | ||||
| @@ -1039,6 +1063,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| #else | #else | ||||
| ZGEMM_SCALE_1X4_MSA | ZGEMM_SCALE_1X4_MSA | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| pc1 += 2; | |||||
| pc2 += 2; | |||||
| pc3 += 2; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -1056,21 +1084,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| off += 1; // number of values in A | off += 1; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| pc1 += 2; | |||||
| pc2 += 2; | |||||
| pc3 += 2; | |||||
| } | } | ||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | #if defined(TRMMKERNEL) && !defined(LEFT) | ||||
| off += 4; // number of values in A | off += 4; // number of values in A | ||||
| #endif | #endif | ||||
| l = k << 3; | |||||
| B = B + l; | |||||
| i = ldc << 3; | |||||
| C = C + i; | |||||
| B += (k << 3); | |||||
| C += (ldc << 3); | |||||
| } | } | ||||
| if (n & 2) | if (n & 2) | ||||
| @@ -1294,6 +1315,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| #else | #else | ||||
| ZGEMM_SCALE_1X2_MSA | ZGEMM_SCALE_1X2_MSA | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| pc1 += 2; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -1311,19 +1334,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| off += 1; // number of values in A | off += 1; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| pc1 += 2; | |||||
| } | } | ||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | #if defined(TRMMKERNEL) && !defined(LEFT) | ||||
| off += 2; // number of values in A | off += 2; // number of values in A | ||||
| #endif | #endif | ||||
| l = k << 2; | |||||
| B = B + l; | |||||
| i = ldc << 2; | |||||
| C = C + i; | |||||
| B += (k << 2); | |||||
| C += (ldc << 2); | |||||
| } | } | ||||
| if (n & 1) | if (n & 1) | ||||
| @@ -1555,6 +1573,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| #else | #else | ||||
| ZGEMM_SCALE_1X1 | ZGEMM_SCALE_1X1 | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -1572,18 +1591,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, | |||||
| off += 1; // number of values in A | off += 1; // number of values in A | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| pc0 += 2; | |||||
| } | } | ||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | #if defined(TRMMKERNEL) && !defined(LEFT) | ||||
| off += 1; // number of values in A | off += 1; // number of values in A | ||||
| #endif | #endif | ||||
| l = k << 1; | |||||
| B = B + l; | |||||
| i = ldc << 1; | |||||
| C = C + i; | |||||
| B += (k << 1); | |||||
| C += (ldc << 1); | |||||
| } | } | ||||
| return 0; | return 0; | ||||
| } | } | ||||