@@ -372,8 +372,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp | |||||
0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, | 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, | ||||
2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, | 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, | ||||
}; | }; | ||||
__m512i idx_lo = _mm512_loadu_epi64(permute_table); | |||||
__m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); | |||||
__m512i idx_lo = _mm512_loadu_si512(permute_table); | |||||
__m512i idx_hi = _mm512_loadu_si512(permute_table + 8); | |||||
for (; i < m4; i += 4, mi += 4) { | for (; i < m4; i += 4, mi += 4) { | ||||
for (j = 0; j < n4; j += 4) { | for (j = 0; j < n4; j += 4) { | ||||
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); | DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); | ||||
@@ -385,7 +385,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp | |||||
for (int ii = 0; ii < 8; ii++) { | for (int ii = 0; ii < 8; ii++) { | ||||
index_n[ii] = ii * ldc; | index_n[ii] = ii * ldc; | ||||
} | } | ||||
__m512i vindex_n = _mm512_loadu_epi64(index_n); | |||||
__m512i vindex_n = _mm512_loadu_si512(index_n); | |||||
for (; i < m4; i += 4) { | for (; i < m4; i += 4) { | ||||
for (j = 0; j < n32; j += 32) { | for (j = 0; j < n32; j += 32) { | ||||
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); | DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); | ||||
@@ -105,8 +105,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp | |||||
0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, | 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, | ||||
2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, | 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, | ||||
}; | }; | ||||
__m512i idx_lo = _mm512_loadu_epi64(permute_table); | |||||
__m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); | |||||
__m512i idx_lo = _mm512_loadu_si512(permute_table); | |||||
__m512i idx_hi = _mm512_loadu_si512(permute_table + 8); | |||||
for (i = 0; i < m4; i += 4) { | for (i = 0; i < m4; i += 4) { | ||||
for (j = 0; j < n4; j += 4) { | for (j = 0; j < n4; j += 4) { | ||||
@@ -189,8 +189,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp | |||||
0, 1, 4, 5, 0|8, 1|8, 4|8, 5|8, | 0, 1, 4, 5, 0|8, 1|8, 4|8, 5|8, | ||||
2, 3, 6, 7, 2|8, 3|8, 6|8, 7|8, | 2, 3, 6, 7, 2|8, 3|8, 6|8, 7|8, | ||||
}; | }; | ||||
__m512i idx_lo = _mm512_loadu_epi64(permute_table); | |||||
__m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); | |||||
__m512i idx_lo = _mm512_loadu_si512(permute_table); | |||||
__m512i idx_hi = _mm512_loadu_si512(permute_table + 8); | |||||
for (i = 0; i < m8; i += 8) { | for (i = 0; i < m8; i += 8) { | ||||
for (j = 0; j < n16; j += 16) { | for (j = 0; j < n16; j += 16) { | ||||
@@ -235,8 +235,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp | |||||
0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, | 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, | ||||
2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, | 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, | ||||
}; | }; | ||||
idx_lo = _mm512_loadu_epi64(permute_table2); | |||||
idx_hi = _mm512_loadu_epi64(permute_table2 + 8); | |||||
idx_lo = _mm512_loadu_si512(permute_table2); | |||||
idx_hi = _mm512_loadu_si512(permute_table2 + 8); | |||||
for (j = 0; j < n32; j += 32) { | for (j = 0; j < n32; j += 32) { | ||||
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); | DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); | ||||
@@ -289,7 +289,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp | |||||
for (int ii = 0; ii < 8; ii++) { | for (int ii = 0; ii < 8; ii++) { | ||||
index_n[ii] = ii * ldc; | index_n[ii] = ii * ldc; | ||||
} | } | ||||
__m512i vindex_n = _mm512_loadu_epi64(index_n); | |||||
__m512i vindex_n = _mm512_loadu_si512(index_n); | |||||
#if !defined(B0) | #if !defined(B0) | ||||
__m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); | __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); | ||||
#endif | #endif | ||||
@@ -385,7 +385,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp | |||||
for (int ii = 0; ii < 16; ii++) { | for (int ii = 0; ii < 16; ii++) { | ||||
index_n[ii] = ii * ldc; | index_n[ii] = ii * ldc; | ||||
} | } | ||||
__m512i vindex_n = _mm512_loadu_epi32(index_n); | |||||
__m512i vindex_n = _mm512_loadu_si512(index_n); | |||||
for (; i < m4; i += 4) { | for (; i < m4; i += 4) { | ||||
for (j = 0; j < n64; j += 64) { | for (j = 0; j < n64; j += 64) { | ||||
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); | DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); | ||||
@@ -215,8 +215,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp | |||||
0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, | 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, | ||||
0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, | 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, | ||||
}; | }; | ||||
__m512i idx_lo = _mm512_loadu_epi32(permute_table); | |||||
__m512i idx_hi = _mm512_loadu_epi32(permute_table + 16); | |||||
__m512i idx_lo = _mm512_loadu_si512(permute_table); | |||||
__m512i idx_hi = _mm512_loadu_si512(permute_table + 16); | |||||
__mmask16 kc = 0xcccc; | __mmask16 kc = 0xcccc; | ||||
__mmask16 k3 = 0x3333; | __mmask16 k3 = 0x3333; | ||||
__mmask8 mask8 = 0xff; // force use AVX128 instead of SSE | __mmask8 mask8 = 0xff; // force use AVX128 instead of SSE | ||||
@@ -311,7 +311,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp | |||||
for (int ii = 0; ii < 16; ii++) { | for (int ii = 0; ii < 16; ii++) { | ||||
index_n[ii] = ii * ldc; | index_n[ii] = ii * ldc; | ||||
} | } | ||||
__m512i vindex_n = _mm512_loadu_epi32(index_n); | |||||
__m512i vindex_n = _mm512_loadu_si512(index_n); | |||||
#if !defined(B0) | #if !defined(B0) | ||||
__m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); | __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); | ||||
#endif | #endif | ||||