@@ -89,20 +89,21 @@ task: | |||
type: text/plain | |||
macos_instance: | |||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest | |||
image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest | |||
task: | |||
name: AppleM1/LLVM armv7-androidndk xbuild | |||
compile_script: | |||
- brew install android-ndk | |||
- brew install --cask android-ndk | |||
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
- ls /System/Volumes/Data/opt/homebrew | |||
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
- export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk" | |||
- ls /opt/homebrew | |||
- ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk | |||
- find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib" | |||
- find /opt/homebrew -name "armv7a-linux-androideabi*-ranlib" | |||
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26d/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang | |||
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/27/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang | |||
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" | |||
always: | |||
config_artifacts: | |||
@@ -85,6 +85,8 @@ Examples: | |||
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 | |||
``` | |||
When compiling for a more modern CPU TARGET of the same architecture (e.g. `TARGET=SKYLAKEX` on a HASWELL host), the option `CROSS=1` can be used to suppress the automatic invocation of the tests at the end of the build, since the build host may not be able to execute code generated for the newer target. | |||
### Debug version | |||
A debug version can be built using `make DEBUG=1`. | |||
@@ -1527,6 +1527,19 @@ int get_cpuname(void){ | |||
break; | |||
case 10: //family 6 exmodel 10 | |||
switch (model) { | |||
case 13: // Granite Rapids | |||
if(support_amx_bf16()) | |||
return CPUTYPE_SAPPHIRERAPIDS; | |||
if(support_avx512_bf16()) | |||
return CPUTYPE_COOPERLAKE; | |||
if(support_avx512()) | |||
return CPUTYPE_SKYLAKEX; | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
else | |||
return CPUTYPE_NEHALEM; | |||
case 5: // Comet Lake H and S | |||
case 6: // Comet Lake U | |||
case 10: // Meteor Lake | |||
@@ -2352,8 +2365,22 @@ int get_coretype(void){ | |||
case 10: | |||
switch (model) { | |||
case 13: // Granite Rapids | |||
if(support_amx_bf16()) | |||
return CORE_SAPPHIRERAPIDS; | |||
if(support_avx512_bf16()) | |||
return CORE_COOPERLAKE; | |||
if(support_avx512()) | |||
return CORE_SKYLAKEX; | |||
if(support_avx2()) | |||
return CORE_HASWELL; | |||
if(support_avx()) | |||
return CORE_SANDYBRIDGE; | |||
else | |||
return CORE_NEHALEM; | |||
case 5: // Comet Lake H and S | |||
case 6: // Comet Lake U | |||
case 10: // Meteor Lake | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
return CORE_HASWELL; | |||
@@ -2362,6 +2389,7 @@ int get_coretype(void){ | |||
#endif | |||
else | |||
return CORE_NEHALEM; | |||
case 0: // Meteor Lake | |||
case 7:// Rocket Lake | |||
#ifndef NO_AVX512 | |||
if(support_avx512()) | |||
@@ -1076,6 +1076,8 @@ fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3l | |||
main_status[cpu] = MAIN_RUNNING1; | |||
#endif | |||
if (buffer == NULL) blas_thread_buffer[cpu] = blas_memory_alloc(2); | |||
//For target LOONGSON3R5, applying an offset to the buffer is essential | |||
//for minimizing cache conflicts and optimizing performance. | |||
#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) | |||
@@ -880,10 +880,8 @@ lapackobjs2c="$lapackobjs2c | |||
# clatrs3 | |||
lapackobjs2d="$lapackobjs2d | |||
dgelqs | |||
dgelst | |||
dgeqp3rk | |||
dgeqrs | |||
dlaqp2rk | |||
dlaqp3rk | |||
dlarmm | |||
@@ -897,10 +895,8 @@ lapackobjs2d="$lapackobjs2d | |||
# dlaqz4 | |||
lapackobjs2z="$lapackobjs2z | |||
zgelqs | |||
zgelst | |||
zgeqp3rk | |||
zgeqrs | |||
zlaqp2rk | |||
zlaqp3rk | |||
zlatrs3 | |||
@@ -918,6 +914,7 @@ lapack_extendedprecision_objs=" | |||
" | |||
lapack_deprecated_objsc=" | |||
cgelqs cgeqrs | |||
cgegs cggsvd | |||
cgegv cggsvp | |||
cgelsx clahrd | |||
@@ -926,6 +923,7 @@ lapack_deprecated_objsc=" | |||
" | |||
lapack_deprecated_objsd=" | |||
dgelqs dgeqrs | |||
dgegs dgeqpf | |||
dgegv dggsvd | |||
dgelsx dggsvp | |||
@@ -933,6 +931,8 @@ lapack_deprecated_objsd=" | |||
dlatzm dtzrqf" | |||
lapack_deprecated_objss=" | |||
sgelqs | |||
sgeqrs | |||
sgelsx | |||
sgegs | |||
sgegv | |||
@@ -945,6 +945,8 @@ lapack_deprecated_objss=" | |||
" | |||
lapack_deprecated_objsz=" | |||
zgelqs | |||
zgeqrs | |||
zgegs | |||
zgegv | |||
zgelsx | |||
@@ -131,11 +131,11 @@ | |||
sd $21, 40($sp) | |||
sd $22, 48($sp) | |||
ST $f24, 56($sp) | |||
ST $f25, 64($sp) | |||
ST $f26, 72($sp) | |||
ST $f27, 80($sp) | |||
ST $f28, 88($sp) | |||
sdc1 $f24, 56($sp) | |||
sdc1 $f25, 64($sp) | |||
sdc1 $f26, 72($sp) | |||
sdc1 $f27, 80($sp) | |||
sdc1 $f28, 88($sp) | |||
#if defined(TRMMKERNEL) | |||
sd $23, 96($sp) | |||
@@ -146,10 +146,10 @@ | |||
#endif | |||
#ifndef __64BIT__ | |||
ST $f20,120($sp) | |||
ST $f21,128($sp) | |||
ST $f22,136($sp) | |||
ST $f23,144($sp) | |||
sdc1 $f20,120($sp) | |||
sdc1 $f21,128($sp) | |||
sdc1 $f22,136($sp) | |||
sdc1 $f23,144($sp) | |||
#endif | |||
.align 4 | |||
@@ -4000,11 +4000,11 @@ | |||
ld $21, 40($sp) | |||
ld $22, 48($sp) | |||
LD $f24, 56($sp) | |||
LD $f25, 64($sp) | |||
LD $f26, 72($sp) | |||
LD $f27, 80($sp) | |||
LD $f28, 88($sp) | |||
ldc1 $f24, 56($sp) | |||
ldc1 $f25, 64($sp) | |||
ldc1 $f26, 72($sp) | |||
ldc1 $f27, 80($sp) | |||
ldc1 $f28, 88($sp) | |||
#if defined(TRMMKERNEL) | |||
ld $23, 96($sp) | |||
@@ -4013,10 +4013,10 @@ | |||
#endif | |||
#ifndef __64BIT__ | |||
LD $f20,120($sp) | |||
LD $f21,128($sp) | |||
LD $f22,136($sp) | |||
LD $f23,144($sp) | |||
ldc1 $f20,120($sp) | |||
ldc1 $f21,128($sp) | |||
ldc1 $f22,136($sp) | |||
ldc1 $f23,144($sp) | |||
#endif | |||
daddiu $sp,$sp,STACKSIZE | |||
@@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#include "../common.h" | |||
#define SGEMM BLASFUNC(sgemm) | |||
#define SBGEMM BLASFUNC(sbgemm) | |||
#define SGEMV BLASFUNC(sgemv) | |||
#define SBGEMV BLASFUNC(sbgemv) | |||
typedef union | |||
{ | |||
unsigned short v; | |||
@@ -187,7 +189,79 @@ main (int argc, char *argv[]) | |||
free(CC); | |||
} | |||
if (ret != 0) | |||
if (ret != 0) { | |||
fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret); | |||
return ret; | |||
} | |||
k = 1; | |||
for (x = 1; x <= loop; x++) | |||
{ | |||
float *A = (float *)malloc(x * x * sizeof(FLOAT)); | |||
float *B = (float *)malloc(x * sizeof(FLOAT)); | |||
float *C = (float *)malloc(x * sizeof(FLOAT)); | |||
bfloat16_bits *AA = (bfloat16_bits *)malloc(x * x * sizeof(bfloat16_bits)); | |||
bfloat16_bits *BB = (bfloat16_bits *)malloc(x * sizeof(bfloat16_bits)); | |||
float *DD = (float *)malloc(x * sizeof(FLOAT)); | |||
float *CC = (float *)malloc(x * sizeof(FLOAT)); | |||
if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || | |||
(DD == NULL) || (CC == NULL)) | |||
return 1; | |||
bfloat16 atmp, btmp; | |||
blasint one = 1; | |||
for (j = 0; j < x; j++) | |||
{ | |||
for (i = 0; i < x; i++) | |||
{ | |||
A[j * x + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; | |||
sbstobf16_(&one, &A[j*x+i], &one, &atmp, &one); | |||
AA[j * x + i].v = atmp; | |||
} | |||
B[j] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; | |||
sbstobf16_(&one, &B[j], &one, &btmp, &one); | |||
BB[j].v = btmp; | |||
} | |||
for (y = 0; y < 2; y++) | |||
{ | |||
if (y == 0) { | |||
transA = 'N'; | |||
} else { | |||
transA = 'T'; | |||
} | |||
memset(CC, 0, x * sizeof(FLOAT)); | |||
memset(DD, 0, x * sizeof(FLOAT)); | |||
memset(C, 0, x * sizeof(FLOAT)); | |||
SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k); | |||
SBGEMV (&transA, &x, &x, &alpha, (bfloat16*) AA, &x, (bfloat16*) BB, &k, &beta, CC, &k); | |||
for (j = 0; j < x; j++) | |||
for (i = 0; i < x; i++) | |||
if (transA == 'N') { | |||
DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j]); | |||
} else if (transA == 'T') { | |||
DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i]); | |||
} | |||
for (j = 0; j < x; j++) { | |||
if (fabs (CC[j] - C[j]) > 1.0) | |||
ret++; | |||
if (fabs (CC[j] - DD[j]) > 1.0) | |||
ret++; | |||
} | |||
} | |||
free(A); | |||
free(B); | |||
free(C); | |||
free(AA); | |||
free(BB); | |||
free(DD); | |||
free(CC); | |||
} | |||
if (ret != 0) | |||
fprintf (stderr, "FATAL ERROR SBGEMV - Return code: %d\n", ret); | |||
return ret; | |||
} |