Browse Source

Small Matrix: skylakex: fix build error in old compiler

tags/v0.3.18
Wangyang Guo 4 years ago
parent
commit
44d0032f3b
6 changed files with 14 additions and 14 deletions
  1. +2
    -2
      kernel/x86_64/dgemm_small_kernel_nn_skylakex.c
  2. +1
    -1
      kernel/x86_64/dgemm_small_kernel_nt_skylakex.c
  3. +2
    -2
      kernel/x86_64/dgemm_small_kernel_tn_skylakex.c
  4. +5
    -5
      kernel/x86_64/dgemm_small_kernel_tt_skylakex.c
  5. +1
    -1
      kernel/x86_64/sgemm_small_kernel_nt_skylakex.c
  6. +3
    -3
      kernel/x86_64/sgemm_small_kernel_tt_skylakex.c

+ 2
- 2
kernel/x86_64/dgemm_small_kernel_nn_skylakex.c View File

@@ -372,8 +372,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8,
2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8,
}; };
- __m512i idx_lo = _mm512_loadu_epi64(permute_table);
- __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8);
+ __m512i idx_lo = _mm512_loadu_si512(permute_table);
+ __m512i idx_hi = _mm512_loadu_si512(permute_table + 8);
for (; i < m4; i += 4, mi += 4) { for (; i < m4; i += 4, mi += 4) {
for (j = 0; j < n4; j += 4) { for (j = 0; j < n4; j += 4) {
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);


+ 1
- 1
kernel/x86_64/dgemm_small_kernel_nt_skylakex.c View File

@@ -385,7 +385,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
for (int ii = 0; ii < 8; ii++) { for (int ii = 0; ii < 8; ii++) {
index_n[ii] = ii * ldc; index_n[ii] = ii * ldc;
} }
- __m512i vindex_n = _mm512_loadu_epi64(index_n);
+ __m512i vindex_n = _mm512_loadu_si512(index_n);
for (; i < m4; i += 4) { for (; i < m4; i += 4) {
for (j = 0; j < n32; j += 32) { for (j = 0; j < n32; j += 32) {
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);


+ 2
- 2
kernel/x86_64/dgemm_small_kernel_tn_skylakex.c View File

@@ -105,8 +105,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8,
2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8,
}; };
- __m512i idx_lo = _mm512_loadu_epi64(permute_table);
- __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8);
+ __m512i idx_lo = _mm512_loadu_si512(permute_table);
+ __m512i idx_hi = _mm512_loadu_si512(permute_table + 8);


for (i = 0; i < m4; i += 4) { for (i = 0; i < m4; i += 4) {
for (j = 0; j < n4; j += 4) { for (j = 0; j < n4; j += 4) {


+ 5
- 5
kernel/x86_64/dgemm_small_kernel_tt_skylakex.c View File

@@ -189,8 +189,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
0, 1, 4, 5, 0|8, 1|8, 4|8, 5|8, 0, 1, 4, 5, 0|8, 1|8, 4|8, 5|8,
2, 3, 6, 7, 2|8, 3|8, 6|8, 7|8, 2, 3, 6, 7, 2|8, 3|8, 6|8, 7|8,
}; };
- __m512i idx_lo = _mm512_loadu_epi64(permute_table);
- __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8);
+ __m512i idx_lo = _mm512_loadu_si512(permute_table);
+ __m512i idx_hi = _mm512_loadu_si512(permute_table + 8);


for (i = 0; i < m8; i += 8) { for (i = 0; i < m8; i += 8) {
for (j = 0; j < n16; j += 16) { for (j = 0; j < n16; j += 16) {
@@ -235,8 +235,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8,
2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8,
}; };
- idx_lo = _mm512_loadu_epi64(permute_table2);
- idx_hi = _mm512_loadu_epi64(permute_table2 + 8);
+ idx_lo = _mm512_loadu_si512(permute_table2);
+ idx_hi = _mm512_loadu_si512(permute_table2 + 8);


for (j = 0; j < n32; j += 32) { for (j = 0; j < n32; j += 32) {
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
@@ -289,7 +289,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
for (int ii = 0; ii < 8; ii++) { for (int ii = 0; ii < 8; ii++) {
index_n[ii] = ii * ldc; index_n[ii] = ii * ldc;
} }
- __m512i vindex_n = _mm512_loadu_epi64(index_n);
+ __m512i vindex_n = _mm512_loadu_si512(index_n);
#if !defined(B0) #if !defined(B0)
__m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta));
#endif #endif


+ 1
- 1
kernel/x86_64/sgemm_small_kernel_nt_skylakex.c View File

@@ -385,7 +385,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
for (int ii = 0; ii < 16; ii++) { for (int ii = 0; ii < 16; ii++) {
index_n[ii] = ii * ldc; index_n[ii] = ii * ldc;
} }
- __m512i vindex_n = _mm512_loadu_epi32(index_n);
+ __m512i vindex_n = _mm512_loadu_si512(index_n);
for (; i < m4; i += 4) { for (; i < m4; i += 4) {
for (j = 0; j < n64; j += 64) { for (j = 0; j < n64; j += 64) {
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);


+ 3
- 3
kernel/x86_64/sgemm_small_kernel_tt_skylakex.c View File

@@ -215,8 +215,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b,
0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f,
}; };
- __m512i idx_lo = _mm512_loadu_epi32(permute_table);
- __m512i idx_hi = _mm512_loadu_epi32(permute_table + 16);
+ __m512i idx_lo = _mm512_loadu_si512(permute_table);
+ __m512i idx_hi = _mm512_loadu_si512(permute_table + 16);
__mmask16 kc = 0xcccc; __mmask16 kc = 0xcccc;
__mmask16 k3 = 0x3333; __mmask16 k3 = 0x3333;
__mmask8 mask8 = 0xff; // force use AVX128 instead of SSE __mmask8 mask8 = 0xff; // force use AVX128 instead of SSE
@@ -311,7 +311,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
for (int ii = 0; ii < 16; ii++) { for (int ii = 0; ii < 16; ii++) {
index_n[ii] = ii * ldc; index_n[ii] = ii * ldc;
} }
- __m512i vindex_n = _mm512_loadu_epi32(index_n);
+ __m512i vindex_n = _mm512_loadu_si512(index_n);
#if !defined(B0) #if !defined(B0)
__m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta));
#endif #endif


Loading…
Cancel
Save