A recent compiler change to __builtin_mma_disassemble_acc() affects the order in which the result is stored on POWER10; the kernels' save macros are updated to match. Also remove the newly added linker flag -Wl,-no-power10-stubs from LDFLAGS, since the linker handles this automatically. (tags/v0.3.11^2)
@@ -617,7 +617,6 @@ DYNAMIC_CORE += POWER8 | |||||
ifneq ($(C_COMPILER), GCC) | ifneq ($(C_COMPILER), GCC) | ||||
DYNAMIC_CORE += POWER9 | DYNAMIC_CORE += POWER9 | ||||
DYNAMIC_CORE += POWER10 | DYNAMIC_CORE += POWER10 | ||||
override LDFLAGS += -Wl,-no-power10-stubs | |||||
endif | endif | ||||
ifeq ($(C_COMPILER), GCC) | ifeq ($(C_COMPILER), GCC) | ||||
ifeq ($(GCCVERSIONGT5), 1) | ifeq ($(GCCVERSIONGT5), 1) | ||||
@@ -627,11 +626,9 @@ $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) | |||||
endif | endif | ||||
ifeq ($(GCCVERSIONGTEQ11), 1) | ifeq ($(GCCVERSIONGTEQ11), 1) | ||||
DYNAMIC_CORE += POWER10 | DYNAMIC_CORE += POWER10 | ||||
override LDFLAGS += -Wl,-no-power10-stubs | |||||
else ifeq ($(GCCVERSIONGTEQ10), 1) | else ifeq ($(GCCVERSIONGTEQ10), 1) | ||||
ifeq ($(GCCMINORVERSIONGTEQ2), 1) | ifeq ($(GCCMINORVERSIONGTEQ2), 1) | ||||
DYNAMIC_CORE += POWER10 | DYNAMIC_CORE += POWER10 | ||||
override LDFLAGS += -Wl,-no-power10-stubs | |||||
endif | endif | ||||
else | else | ||||
$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) | $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) | ||||
@@ -27,64 +27,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#include <altivec.h> | #include <altivec.h> | ||||
typedef unsigned char vec_t __attribute__ ((vector_size (16))); | |||||
typedef __vector unsigned char vec_t; | |||||
typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | ||||
typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | ||||
#ifdef TRMMKERNEL | #ifdef TRMMKERNEL | ||||
#define SAVE_ACC(ACC, J) \ | #define SAVE_ACC(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
rowC[0] = result[3] * alpha; \ | |||||
rowC[0] = result[0] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \ | rowC = (v4sf_t *) &CO[1*ldc+J]; \ | ||||
rowC[0] = result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
rowC[0] = result[1] * alpha; \ | rowC[0] = result[1] * alpha; \ | ||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
rowC[0] = result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \ | rowC = (v4sf_t *) &CO[3*ldc+J]; \ | ||||
rowC[0] = result[0] * alpha; | |||||
rowC[0] = result[3] * alpha; | |||||
#define SAVE_ACC1(ACC, J) \ | #define SAVE_ACC1(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \ | rowC = (v4sf_t *) &CO[4* ldc+J]; \ | ||||
rowC[0] = result[3] * alpha; \ | |||||
rowC[0] = result[0] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \ | rowC = (v4sf_t *) &CO[5*ldc+J]; \ | ||||
rowC[0] = result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
rowC[0] = result[1] * alpha; \ | rowC[0] = result[1] * alpha; \ | ||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
rowC[0] = result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \ | rowC = (v4sf_t *) &CO[7*ldc+J]; \ | ||||
rowC[0] = result[0] * alpha; | |||||
rowC[0] = result[3] * alpha; | |||||
#define SAVE2x4_ACC(ACC, J) \ | #define SAVE2x4_ACC(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
rowC[0] = result[3] * alpha; \ | |||||
rowC[0] = result[0] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \ | rowC = (v4sf_t *) &CO[1* ldc+J]; \ | ||||
rowC[0] = result[2] * alpha; | |||||
rowC[0] = result[1] * alpha; | |||||
#else | #else | ||||
#define SAVE_ACC(ACC, J) \ | #define SAVE_ACC(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
rowC[0] += result[3] * alpha; \ | |||||
rowC[0] += result[0] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \ | rowC = (v4sf_t *) &CO[1*ldc+J]; \ | ||||
rowC[0] += result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
rowC[0] += result[1] * alpha; \ | rowC[0] += result[1] * alpha; \ | ||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
rowC[0] += result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \ | rowC = (v4sf_t *) &CO[3*ldc+J]; \ | ||||
rowC[0] += result[0] * alpha; | |||||
rowC[0] += result[3] * alpha; | |||||
#define SAVE_ACC1(ACC, J) \ | #define SAVE_ACC1(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \ | rowC = (v4sf_t *) &CO[4* ldc+J]; \ | ||||
rowC[0] += result[3] * alpha; \ | |||||
rowC[0] += result[0] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \ | rowC = (v4sf_t *) &CO[5*ldc+J]; \ | ||||
rowC[0] += result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
rowC[0] += result[1] * alpha; \ | rowC[0] += result[1] * alpha; \ | ||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
rowC[0] += result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \ | rowC = (v4sf_t *) &CO[7*ldc+J]; \ | ||||
rowC[0] += result[0] * alpha; | |||||
rowC[0] += result[3] * alpha; | |||||
#define SAVE2x4_ACC(ACC, J) \ | #define SAVE2x4_ACC(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
rowC[0] += result[3] * alpha; \ | |||||
rowC[0] += result[0] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \ | rowC = (v4sf_t *) &CO[1* ldc+J]; \ | ||||
rowC[0] += result[2] * alpha; | |||||
rowC[0] += result[1] * alpha; | |||||
#endif | #endif | ||||
#define SET_ACC_ZERO4() \ | #define SET_ACC_ZERO4() \ | ||||
@@ -27,103 +27,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#include <altivec.h> | #include <altivec.h> | ||||
typedef unsigned char vec_t __attribute__ ((vector_size (16))); | |||||
typedef __vector unsigned char vec_t; | |||||
typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | ||||
typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | ||||
#if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
#define SAVE_ACC(ACC, J) \ | #define SAVE_ACC(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
rowC[0] = result[3] * alpha; \ | |||||
rowC[0] = result[0] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \ | rowC = (v4sf_t *) &CO[1*ldc+J]; \ | ||||
rowC[0] = result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
rowC[0] = result[1] * alpha; \ | rowC[0] = result[1] * alpha; \ | ||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
rowC[0] = result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \ | rowC = (v4sf_t *) &CO[3*ldc+J]; \ | ||||
rowC[0] = result[0] * alpha; | |||||
rowC[0] = result[3] * alpha; | |||||
#define SAVE_ACC1(ACC, J) \ | #define SAVE_ACC1(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \ | rowC = (v4sf_t *) &CO[4* ldc+J]; \ | ||||
rowC[0] = result[3] * alpha; \ | |||||
rowC[0] = result[0] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \ | rowC = (v4sf_t *) &CO[5*ldc+J]; \ | ||||
rowC[0] = result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
rowC[0] = result[1] * alpha; \ | rowC[0] = result[1] * alpha; \ | ||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
rowC[0] = result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \ | rowC = (v4sf_t *) &CO[7*ldc+J]; \ | ||||
rowC[0] = result[0] * alpha; | |||||
rowC[0] = result[3] * alpha; | |||||
#define SAVE4x2_ACC(ACC, J) \ | #define SAVE4x2_ACC(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v2sf_t *) &CO[0* ldc+J]; \ | rowC = (v2sf_t *) &CO[0* ldc+J]; \ | ||||
rowC[0] = result[6] * alpha; \ | |||||
rowC[0] = result[0] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[1* ldc+J]; \ | rowC = (v2sf_t *) &CO[1* ldc+J]; \ | ||||
rowC[0] = result[4] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||||
rowC[0] = result[2] * alpha; \ | rowC[0] = result[2] * alpha; \ | ||||
rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||||
rowC[0] = result[4] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[3* ldc+J]; \ | rowC = (v2sf_t *) &CO[3* ldc+J]; \ | ||||
rowC[0] = result[0] * alpha; | |||||
rowC[0] = result[6] * alpha; | |||||
#define SAVE4x2_ACC1(ACC, J) \ | #define SAVE4x2_ACC1(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v2sf_t *) &CO[4* ldc+J]; \ | rowC = (v2sf_t *) &CO[4* ldc+J]; \ | ||||
rowC[0] = result[6] * alpha; \ | |||||
rowC[0] = result[0] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[5* ldc+J]; \ | rowC = (v2sf_t *) &CO[5* ldc+J]; \ | ||||
rowC[0] = result[4] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||||
rowC[0] = result[2] * alpha; \ | rowC[0] = result[2] * alpha; \ | ||||
rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||||
rowC[0] = result[4] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[7* ldc+J]; \ | rowC = (v2sf_t *) &CO[7* ldc+J]; \ | ||||
rowC[0] = result[0] * alpha; | |||||
rowC[0] = result[6] * alpha; | |||||
#define SAVE2x4_ACC(ACC, J) \ | #define SAVE2x4_ACC(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
rowC[0] = result[3] * alpha; \ | |||||
rowC[0] = result[0] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \ | rowC = (v4sf_t *) &CO[1* ldc+J]; \ | ||||
rowC[0] = result[2] * alpha; | |||||
rowC[0] = result[1] * alpha; | |||||
#else | #else | ||||
#define SAVE_ACC(ACC, J) \ | #define SAVE_ACC(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
rowC[0] += result[3] * alpha; \ | |||||
rowC[0] += result[0] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \ | rowC = (v4sf_t *) &CO[1*ldc+J]; \ | ||||
rowC[0] += result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
rowC[0] += result[1] * alpha; \ | rowC[0] += result[1] * alpha; \ | ||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
rowC[0] += result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \ | rowC = (v4sf_t *) &CO[3*ldc+J]; \ | ||||
rowC[0] += result[0] * alpha; | |||||
rowC[0] += result[3] * alpha; | |||||
#define SAVE_ACC1(ACC, J) \ | #define SAVE_ACC1(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \ | rowC = (v4sf_t *) &CO[4* ldc+J]; \ | ||||
rowC[0] += result[3] * alpha; \ | |||||
rowC[0] += result[0] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \ | rowC = (v4sf_t *) &CO[5*ldc+J]; \ | ||||
rowC[0] += result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
rowC[0] += result[1] * alpha; \ | rowC[0] += result[1] * alpha; \ | ||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
rowC[0] += result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \ | rowC = (v4sf_t *) &CO[7*ldc+J]; \ | ||||
rowC[0] += result[0] * alpha; | |||||
rowC[0] += result[3] * alpha; | |||||
#define SAVE4x2_ACC(ACC, J) \ | #define SAVE4x2_ACC(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v2sf_t *) &CO[0* ldc+J]; \ | rowC = (v2sf_t *) &CO[0* ldc+J]; \ | ||||
rowC[0] += result[6] * alpha; \ | |||||
rowC[0] += result[0] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[1* ldc+J]; \ | rowC = (v2sf_t *) &CO[1* ldc+J]; \ | ||||
rowC[0] += result[4] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||||
rowC[0] += result[2] * alpha; \ | rowC[0] += result[2] * alpha; \ | ||||
rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||||
rowC[0] += result[4] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[3* ldc+J]; \ | rowC = (v2sf_t *) &CO[3* ldc+J]; \ | ||||
rowC[0] += result[0] * alpha; | |||||
rowC[0] += result[6] * alpha; | |||||
#define SAVE4x2_ACC1(ACC, J) \ | #define SAVE4x2_ACC1(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v2sf_t *) &CO[4* ldc+J]; \ | rowC = (v2sf_t *) &CO[4* ldc+J]; \ | ||||
rowC[0] += result[6] * alpha; \ | |||||
rowC[0] += result[0] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[5* ldc+J]; \ | rowC = (v2sf_t *) &CO[5* ldc+J]; \ | ||||
rowC[0] += result[4] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||||
rowC[0] += result[2] * alpha; \ | rowC[0] += result[2] * alpha; \ | ||||
rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||||
rowC[0] += result[4] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[7* ldc+J]; \ | rowC = (v2sf_t *) &CO[7* ldc+J]; \ | ||||
rowC[0] += result[0] * alpha; | |||||
rowC[0] += result[6] * alpha; | |||||
#define SAVE2x4_ACC(ACC, J) \ | #define SAVE2x4_ACC(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
rowC[0] += result[3] * alpha; \ | |||||
rowC[0] += result[0] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \ | rowC = (v4sf_t *) &CO[1* ldc+J]; \ | ||||
rowC[0] += result[2] * alpha; | |||||
rowC[0] += result[1] * alpha; | |||||
#endif | #endif | ||||
#define KERNEL(i, j) \ | #define KERNEL(i, j) \ | ||||
__builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \ | __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \ | ||||
@@ -45,7 +45,7 @@ bfloat16tof32 (bfloat16 f16) | |||||
#define BF16TOF32(x) x | #define BF16TOF32(x) x | ||||
#endif | #endif | ||||
typedef unsigned char vec_t __attribute__ ((vector_size (16))); | |||||
typedef __vector unsigned char vec_t; | |||||
typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | ||||
typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | ||||
@@ -64,54 +64,54 @@ vector char mask = | |||||
#define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) | #define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) | ||||
#define SAVE_ACC(ACC, J) \ | #define SAVE_ACC(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
rowC[0] += result[3] * alpha; \ | |||||
rowC[0] += result[0] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \ | rowC = (v4sf_t *) &CO[1*ldc+J]; \ | ||||
rowC[0] += result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
rowC[0] += result[1] * alpha; \ | rowC[0] += result[1] * alpha; \ | ||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
rowC[0] += result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \ | rowC = (v4sf_t *) &CO[3*ldc+J]; \ | ||||
rowC[0] += result[0] * alpha; | |||||
rowC[0] += result[3] * alpha; | |||||
#define SAVE_ACC1(ACC, J) \ | #define SAVE_ACC1(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \ | rowC = (v4sf_t *) &CO[4* ldc+J]; \ | ||||
rowC[0] += result[3] * alpha; \ | |||||
rowC[0] += result[0] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \ | rowC = (v4sf_t *) &CO[5*ldc+J]; \ | ||||
rowC[0] += result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
rowC[0] += result[1] * alpha; \ | rowC[0] += result[1] * alpha; \ | ||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
rowC[0] += result[2] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \ | rowC = (v4sf_t *) &CO[7*ldc+J]; \ | ||||
rowC[0] += result[0] * alpha; | |||||
rowC[0] += result[3] * alpha; | |||||
#define SAVE4x2_ACC(ACC, J) \ | #define SAVE4x2_ACC(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v2sf_t *) &CO[0* ldc+J]; \ | rowC = (v2sf_t *) &CO[0* ldc+J]; \ | ||||
rowC[0] += result[6] * alpha; \ | |||||
rowC[0] += result[0] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[1* ldc+J]; \ | rowC = (v2sf_t *) &CO[1* ldc+J]; \ | ||||
rowC[0] += result[4] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||||
rowC[0] += result[2] * alpha; \ | rowC[0] += result[2] * alpha; \ | ||||
rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||||
rowC[0] += result[4] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[3* ldc+J]; \ | rowC = (v2sf_t *) &CO[3* ldc+J]; \ | ||||
rowC[0] += result[0] * alpha; | |||||
rowC[0] += result[6] * alpha; | |||||
#define SAVE4x2_ACC1(ACC, J) \ | #define SAVE4x2_ACC1(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v2sf_t *) &CO[4* ldc+J]; \ | rowC = (v2sf_t *) &CO[4* ldc+J]; \ | ||||
rowC[0] += result[6] * alpha; \ | |||||
rowC[0] += result[0] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[5* ldc+J]; \ | rowC = (v2sf_t *) &CO[5* ldc+J]; \ | ||||
rowC[0] += result[4] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||||
rowC[0] += result[2] * alpha; \ | rowC[0] += result[2] * alpha; \ | ||||
rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||||
rowC[0] += result[4] * alpha; \ | |||||
rowC = (v2sf_t *) &CO[7* ldc+J]; \ | rowC = (v2sf_t *) &CO[7* ldc+J]; \ | ||||
rowC[0] += result[0] * alpha; | |||||
rowC[0] += result[6] * alpha; | |||||
#define MMA __builtin_mma_xvbf16ger2pp | #define MMA __builtin_mma_xvbf16ger2pp | ||||
#define SAVE2x4_ACC(ACC, J) \ | #define SAVE2x4_ACC(ACC, J) \ | ||||
__builtin_mma_disassemble_acc (result, ACC); \ | |||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
rowC[0] += result[3] * alpha; \ | |||||
rowC[0] += result[0] * alpha; \ | |||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \ | rowC = (v4sf_t *) &CO[1* ldc+J]; \ | ||||
rowC[0] += result[2] * alpha; | |||||
rowC[0] += result[1] * alpha; | |||||
#define SET_ACC_ZERO4() \ | #define SET_ACC_ZERO4() \ | ||||
__builtin_mma_xxsetaccz (&acc0); \ | __builtin_mma_xxsetaccz (&acc0); \ | ||||