@@ -263,7 +263,7 @@ if (DEFINED TARGET) | |||||
endif() | endif() | ||||
if (${TARGET} STREQUAL POWER10) | if (${TARGET} STREQUAL POWER10) | ||||
if (CMAKE_C_COMPILER VERSION VERSION_GREATER 10.2 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.2) | |||||
if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.2 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.2) | |||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") | set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") | ||||
else () | else () | ||||
message(FATAL_ERROR "Compiler GCC ${CMAKE_C_COMPILER_VERSION} does not support Power10.") | message(FATAL_ERROR "Compiler GCC ${CMAKE_C_COMPILER_VERSION} does not support Power10.") | ||||
@@ -26,7 +26,7 @@ endif | |||||
override CFLAGS += -DADD$(BU) -DCBLAS | override CFLAGS += -DADD$(BU) -DCBLAS | ||||
ifeq ($(F_COMPILER),GFORTRAN) | ifeq ($(F_COMPILER),GFORTRAN) | ||||
ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) | ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) | ||||
override FFLAGS = $(filter_out(-O2 -O3,$(FFLAGS))) -O0 | |||||
override FFLAGS := $(filter_out(-O2 -O3,$(FFLAGS))) -O0 | |||||
endif | endif | ||||
override FFLAGS += -fno-tree-vectorize | override FFLAGS += -fno-tree-vectorize | ||||
endif | endif | ||||
@@ -2680,7 +2680,7 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b | |||||
BLASLONG tag_n_32x = n & (~31); | BLASLONG tag_n_32x = n & (~31); | ||||
BLASLONG tag_n_128x = n & (~127); | BLASLONG tag_n_128x = n & (~127); | ||||
__m512 accum512_bridge[8]; | |||||
__m512 accum512_bridge[16]; | |||||
__m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3; | __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3; | ||||
__m256 accum256_0; | __m256 accum256_0; | ||||
__m128 accum128; | __m128 accum128; | ||||
@@ -5,5 +5,5 @@ Data file for testing REAL LAPACK linear equation routines RFP format | |||||
1 2 15 Values of NRHS (number of right hand sides) | 1 2 15 Values of NRHS (number of right hand sides) | ||||
9 Number of matrix types (list types on next line if 0 < NTYPES < 9) | 9 Number of matrix types (list types on next line if 0 < NTYPES < 9) | ||||
1 2 3 4 5 6 7 8 9 Matrix Types | 1 2 3 4 5 6 7 8 9 Matrix Types | ||||
42.0 Threshold value of test ratio | |||||
45.0 Threshold value of test ratio | |||||
T Put T to test the error exits | T Put T to test the error exits |
@@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||||
HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); | HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); | ||||
#else | #else | ||||
syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, | syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, | ||||
&newarg, NULL, NULL, (int (*)(void))HERK_LN, sa, sb, args -> nthreads); | |||||
&newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_LN, sa, sb, args -> nthreads); | |||||
#endif | #endif | ||||
} | } | ||||
} | } | ||||
@@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||||
HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); | HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); | ||||
#else | #else | ||||
syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, | syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, | ||||
&newarg, NULL, NULL, (int (*)(void))HERK_UC, sa, sb, args -> nthreads); | |||||
&newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_UC, sa, sb, args -> nthreads); | |||||
#endif | #endif | ||||
} | } | ||||
} | } | ||||
@@ -2637,8 +2637,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#undef SBGEMM_DEFAULT_Q | #undef SBGEMM_DEFAULT_Q | ||||
#define SBGEMM_DEFAULT_UNROLL_M 16 | #define SBGEMM_DEFAULT_UNROLL_M 16 | ||||
#define SBGEMM_DEFAULT_UNROLL_N 8 | #define SBGEMM_DEFAULT_UNROLL_N 8 | ||||
#define SBGEMM_DEFAULT_P 832 | |||||
#define SBGEMM_DEFAULT_Q 1026 | |||||
#define SBGEMM_DEFAULT_P 512 | |||||
#define SBGEMM_DEFAULT_Q 1024 | |||||
#define SBGEMM_DEFAULT_R 4096 | #define SBGEMM_DEFAULT_R 4096 | ||||
#endif | #endif | ||||
@@ -2,7 +2,7 @@ TOPDIR = .. | |||||
include ../Makefile.system | include ../Makefile.system | ||||
ifeq ($(F_COMPILER),GFORTRAN) | ifeq ($(F_COMPILER),GFORTRAN) | ||||
ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) | ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) | ||||
override FFLAGS = $(filter_out(-O2 -O3,$(FFLAGS))) -O0 | |||||
override FFLAGS := $(filter_out(-O2 -O3,$(FFLAGS))) -O0 | |||||
endif | endif | ||||
override FFLAGS += -fno-tree-vectorize | override FFLAGS += -fno-tree-vectorize | ||||
endif | endif | ||||
@@ -81,6 +81,8 @@ float16to32 (bfloat16_bits f16) | |||||
return f32.v; | return f32.v; | ||||
} | } | ||||
#define SBGEMM_LARGEST 256 | |||||
int | int | ||||
main (int argc, char *argv[]) | main (int argc, char *argv[]) | ||||
{ | { | ||||
@@ -88,12 +90,45 @@ main (int argc, char *argv[]) | |||||
int i, j, l; | int i, j, l; | ||||
blasint x, y; | blasint x, y; | ||||
int ret = 0; | int ret = 0; | ||||
int loop = 100; | |||||
int loop = SBGEMM_LARGEST; | |||||
char transA = 'N', transB = 'N'; | char transA = 'N', transB = 'N'; | ||||
float alpha = 1.0, beta = 0.0; | float alpha = 1.0, beta = 0.0; | ||||
for (x = 0; x <= loop; x++) | for (x = 0; x <= loop; x++) | ||||
{ | { | ||||
if ((x > 100) && (x != SBGEMM_LARGEST)) continue; | |||||
m = k = n = x; | |||||
float *A = (float *)malloc(m * k * sizeof(FLOAT)); | |||||
float *B = (float *)malloc(k * n * sizeof(FLOAT)); | |||||
float *C = (float *)malloc(m * n * sizeof(FLOAT)); | |||||
bfloat16_bits *AA = (bfloat16_bits *)malloc(m * k * sizeof(bfloat16_bits)); | |||||
bfloat16_bits *BB = (bfloat16_bits *)malloc(k * n * sizeof(bfloat16_bits)); | |||||
float *DD = (float *)malloc(m * n * sizeof(FLOAT)); | |||||
float *CC = (float *)malloc(m * n * sizeof(FLOAT)); | |||||
if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || | |||||
(DD == NULL) || (CC == NULL)) | |||||
return 1; | |||||
bfloat16 atmp,btmp; | |||||
blasint one=1; | |||||
for (j = 0; j < m; j++) | |||||
{ | |||||
for (i = 0; i < k; i++) | |||||
{ | |||||
A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; | |||||
sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); | |||||
AA[j * k + i].v = atmp; | |||||
} | |||||
} | |||||
for (j = 0; j < n; j++) | |||||
{ | |||||
for (i = 0; i < k; i++) | |||||
{ | |||||
B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; | |||||
sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); | |||||
BB[j * k + i].v = btmp; | |||||
} | |||||
} | |||||
for (y = 0; y < 4; y++) | for (y = 0; y < 4; y++) | ||||
{ | { | ||||
if ((y == 0) || (y == 2)) { | if ((y == 0) || (y == 2)) { | ||||
@@ -106,40 +141,19 @@ main (int argc, char *argv[]) | |||||
} else { | } else { | ||||
transB = 'T'; | transB = 'T'; | ||||
} | } | ||||
m = k = n = x; | |||||
float A[m * k]; | |||||
float B[k * n]; | |||||
float C[m * n]; | |||||
bfloat16_bits AA[m * k], BB[k * n]; | |||||
float DD[m * n], CC[m * n]; | |||||
bfloat16 atmp,btmp; | |||||
blasint one=1; | |||||
for (j = 0; j < m; j++) | |||||
{ | |||||
for (i = 0; i < m; i++) | |||||
{ | |||||
A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; | |||||
B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; | |||||
C[j * k + i] = 0; | |||||
sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); | |||||
sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); | |||||
AA[j * k + i].v = atmp; | |||||
BB[j * k + i].v = btmp; | |||||
CC[j * k + i] = 0; | |||||
DD[j * k + i] = 0; | |||||
} | |||||
} | |||||
memset(CC, 0, m * n * sizeof(FLOAT)); | |||||
memset(DD, 0, m * n * sizeof(FLOAT)); | |||||
memset(C, 0, m * n * sizeof(FLOAT)); | |||||
SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, | SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, | ||||
&m, B, &k, &beta, C, &m); | &m, B, &k, &beta, C, &m); | ||||
SBGEMM (&transA, &transB, &m, &n, &k, &alpha, (bfloat16*) AA, | SBGEMM (&transA, &transB, &m, &n, &k, &alpha, (bfloat16*) AA, | ||||
&m, (bfloat16*)BB, &k, &beta, CC, &m); | &m, (bfloat16*)BB, &k, &beta, CC, &m); | ||||
for (i = 0; i < n; i++) | for (i = 0; i < n; i++) | ||||
for (j = 0; j < m; j++) | for (j = 0; j < m; j++) | ||||
if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) | |||||
ret++; | |||||
for (i = 0; i < n; i++) | |||||
for (j = 0; j < m; j++) | |||||
{ | |||||
for (l = 0; l < k; l++) | for (l = 0; l < k; l++) | ||||
if (transA == 'N' && transB == 'N') | if (transA == 'N' && transB == 'N') | ||||
{ | { | ||||
@@ -158,11 +172,19 @@ main (int argc, char *argv[]) | |||||
DD[i * m + j] += | DD[i * m + j] += | ||||
float16to32 (AA[k * j + l]) * float16to32 (BB[i + l * n]); | float16to32 (AA[k * j + l]) * float16to32 (BB[i + l * n]); | ||||
} | } | ||||
for (i = 0; i < n; i++) | |||||
for (j = 0; j < m; j++) | |||||
if (CC[i * m + j] != DD[i * m + j]) | |||||
if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) | |||||
ret++; | |||||
if (fabs (CC[i * m + j] - DD[i * m + j]) > 1.0) | |||||
ret++; | ret++; | ||||
} | |||||
} | } | ||||
free(A); | |||||
free(B); | |||||
free(C); | |||||
free(AA); | |||||
free(BB); | |||||
free(DD); | |||||
free(CC); | |||||
} | } | ||||
if (ret != 0) | if (ret != 0) | ||||