Browse Source

Add vector registers to clobber list to prevent compiler optimization.

The SME-based SGEMMDIRECT kernel uses the vector (z) registers, and adding
    a clobber list informs the compiler not to optimize these registers away.
tags/v0.3.30
Vaisakh K V 6 months ago
parent
commit
04915be829
1 changed file with 18 additions and 3 deletions
  1. +18
    -3
      kernel/arm64/sgemm_direct_arm64_sme1.c

+ 18
- 3
kernel/arm64/sgemm_direct_arm64_sme1.c View File

@@ -7,7 +7,6 @@
#include <stdlib.h>
#include <inttypes.h>
#include <math.h>
#if defined(HAVE_SME)
/* Function prototypes */
@@ -44,7 +43,17 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
m_mod = ceil((double)M/(double)vl_elms) * vl_elms;
float *A_mod = (float *) malloc(m_mod*K*sizeof(float));
/* Prevent compiler optimization by reading from memory instead
* of reading directly from vector (z) registers.
* */
asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15",
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
"z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15",
"z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23",
"z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");
/* Pre-process the left matrix to make it suitable for
matrix sum of outer-product calculation
*/
@@ -52,7 +61,13 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
/* Calculate C = A*B */
sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R);
asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15",
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
"z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15",
"z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23",
"z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");
free(A_mod);
}


Loading…
Cancel
Save