SVE zgemm & cgemm (and other BLAS 3 complex) (tags/v0.3.20)
@@ -201,3 +201,6 @@ In chronological order:
* Bine Brank <https://github.com/binebrank>
  * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE
  * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM
  * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions
  * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions
  * [2022-01-18] SVE kernels and copy functions for TRSM
@@ -323,55 +323,93 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
#hemm
GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "hemm_iutcopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "hemm_iltcopy" false "" "" false ${float_type})
if (NOT DEFINED ${float_char}HEMMUTCOPY_M)
  set(HEMMUTCOPY_M "generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c")
  set(HEMMLTCOPY_M "generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c")
else ()
  set(HEMMUTCOPY_M "${KERNELDIR}/${${float_char}HEMMUTCOPY_M}")
  set(HEMMLTCOPY_M "${KERNELDIR}/${${float_char}HEMMLTCOPY_M}")
endif()
GenerateNamedObjects(${HEMMUTCOPY_M} "" "hemm_iutcopy" false "" "" false ${float_type})
GenerateNamedObjects(${HEMMLTCOPY_M} "LOWER" "hemm_iltcopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "hemm_outcopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "hemm_oltcopy" false "" "" false ${float_type})
# symm for c and z
if (NOT DEFINED ${float_char}SYMMUCOPY_M)
  set(SYMMUCOPY_M "generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c")
  set(SYMMLCOPY_M "generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c")
else ()
  set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}")
  set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}")
endif()
GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type})
GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type})
GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type})
if (NOT DEFINED ${float_char}TRMMUNCOPY_M)
  set(TRMMUNCOPY_M "generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c")
  set(TRMMLNCOPY_M "generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c")
  set(TRMMUTCOPY_M "generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c")
  set(TRMMLTCOPY_M "generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c")
else ()
  set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}")
  set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}")
  set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}")
  set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}")
endif ()
GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type})
if (NOT DEFINED ZTRSMCOPYLN_M)
  set(ZTRSMUNCOPY_M "generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c")
  set(ZTRSMLNCOPY_M "generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c")
  set(ZTRSMUTCOPY_M "generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c")
  set(ZTRSMLTCOPY_M "generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c")
else ()
  set(ZTRSMUNCOPY_M "${KERNELDIR}/${ZTRSMCOPYUN_M}")
  set(ZTRSMLNCOPY_M "${KERNELDIR}/${ZTRSMCOPYLN_M}")
  set(ZTRSMUTCOPY_M "${KERNELDIR}/${ZTRSMCOPYUT_M}")
  set(ZTRSMLTCOPY_M "${KERNELDIR}/${ZTRSMCOPYLT_M}")
endif ()
GenerateNamedObjects(${ZTRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type})
GenerateNamedObjects(${ZTRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type})
GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type})
GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type})
GenerateNamedObjects(${ZTRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type})
GenerateNamedObjects(${ZTRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type})
GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type})
GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type})
@@ -465,23 +503,35 @@ endif ()
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type})
if (NOT DEFINED TRSMCOPYLN_M)
  set(TRSMUNCOPY_M "generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c")
  set(TRSMLNCOPY_M "generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c")
  set(TRSMUTCOPY_M "generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c")
  set(TRSMLTCOPY_M "generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c")
else ()
  set(TRSMUNCOPY_M "${KERNELDIR}/${TRSMCOPYUN_M}")
  set(TRSMLNCOPY_M "${KERNELDIR}/${TRSMCOPYLN_M}")
  set(TRSMUTCOPY_M "${KERNELDIR}/${TRSMCOPYUT_M}")
  set(TRSMLTCOPY_M "${KERNELDIR}/${TRSMCOPYLT_M}")
endif ()
GenerateNamedObjects(${TRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type})
@@ -1691,29 +1691,61 @@ $(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N
$(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
ifdef CTRMMUNCOPY_M
$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
ifdef CTRMMLNCOPY_M
$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef CTRMMUTCOPY_M
$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
ifdef CTRMMLTCOPY_M
$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
$(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
@@ -1739,29 +1771,61 @@ $(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_
$(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
ifdef ZTRMMUNCOPY_M
$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef ZTRMMLNCOPY_M
$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
ifdef ZTRMMUTCOPY_M
$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef ZTRMMLTCOPY_M
$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
$(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
@@ -1897,11 +1961,21 @@ $(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N)
$(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@
ifdef CSYMMUCOPY_M
$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMUCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
else
$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
endif
ifdef CSYMMLCOPY_M
$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMLCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
else
$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
endif
$(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@
@@ -1909,11 +1983,21 @@ $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N)
$(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@
ifdef ZSYMMUCOPY_M
$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMUCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
else
$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
endif
ifdef ZSYMMLCOPY_M
$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMLCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
else
$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
endif
$(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@
@@ -1933,11 +2017,21 @@ $(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N
$(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@
ifdef CHEMMUTCOPY_M
$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMUTCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
else
$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
endif
ifdef CHEMMLTCOPY_M
$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMLTCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
else
$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
endif
$(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@
@@ -1945,11 +2039,21 @@ $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N
$(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@
ifdef ZHEMMUTCOPY_M
$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMUTCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
else
$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
endif
ifdef ZHEMMLTCOPY_M
$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMLTCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
else
$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
endif
$(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@
@@ -2287,29 +2391,61 @@ $(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNR
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
ifdef TRSMCOPYUN_M
$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef TRSMCOPYLN_M
$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
ifdef TRSMCOPYUT_M
$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef TRSMCOPYLT_M
$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
$(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
@@ -2335,29 +2471,61 @@ $(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N
$(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
ifdef TRSMCOPYUN_M
$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef TRSMCOPYLN_M
$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
ifdef TRSMCOPYUT_M
$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef TRSMCOPYLT_M
$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
$(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
@@ -2431,29 +2599,61 @@ $(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N
$(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
ifdef ZTRSMCOPYUN_M
$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef ZTRSMCOPYLN_M
$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
ifdef ZTRSMCOPYUT_M
$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef ZTRSMCOPYLT_M
$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
$(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
@@ -2479,29 +2679,61 @@ $(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_
$(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
ifdef ZTRSMCOPYUN_M
$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef ZTRSMCOPYLN_M
$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
ifdef ZTRSMCOPYUT_M
$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef ZTRSMCOPYLT_M
$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
$(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
@@ -20,25 +20,36 @@ IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
STRSMKERNEL_LN = trsm_kernel_LN_sve.c
STRSMKERNEL_LT = trsm_kernel_LT_sve.c
STRSMKERNEL_RN = trsm_kernel_RN_sve.c
STRSMKERNEL_RT = trsm_kernel_RT_sve.c
DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
TRSMCOPYLN_M = trsm_lncopy_sve.c
TRSMCOPYLT_M = trsm_ltcopy_sve.c
TRSMCOPYUN_M = trsm_uncopy_sve.c
TRSMCOPYUT_M = trsm_utcopy_sve.c
CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
@@ -156,28 +167,50 @@ DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
DSYMMUCOPY_M = symm_ucopy_sve.c
DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
CHEMMUTCOPY_M = zhemm_utcopy_sve.c
CSYMMUCOPY_M = zsymm_ucopy_sve.c
CSYMMLCOPY_M = zsymm_lcopy_sve.c
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
ZSYMMUCOPY_M = zsymm_ucopy_sve.c
ZSYMMLCOPY_M = zsymm_lcopy_sve.c
@@ -20,25 +20,36 @@ IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
STRSMKERNEL_LN = trsm_kernel_LN_sve.c
STRSMKERNEL_LT = trsm_kernel_LT_sve.c
STRSMKERNEL_RN = trsm_kernel_RN_sve.c
STRSMKERNEL_RT = trsm_kernel_RT_sve.c
DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
TRSMCOPYLN_M = trsm_lncopy_sve.c
TRSMCOPYLT_M = trsm_ltcopy_sve.c
TRSMCOPYUN_M = trsm_uncopy_sve.c
TRSMCOPYUT_M = trsm_utcopy_sve.c
CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
@@ -140,8 +151,8 @@ DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
@@ -156,28 +167,50 @@ DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
DSYMMUCOPY_M = symm_ucopy_sve.c
DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
CHEMMUTCOPY_M = zhemm_utcopy_sve.c
CSYMMUCOPY_M = zsymm_ucopy_sve.c
CSYMMLCOPY_M = zsymm_lcopy_sve.c
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
ZSYMMUCOPY_M = zsymm_ucopy_sve.c
ZSYMMLCOPY_M = zsymm_lcopy_sve.c
@@ -0,0 +1,874 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* X0 X1 X2 s0 X3 x4 x5 x6 */
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
#define origM x0
#define origN x1
#define origK x2
#define origPA x3
#define origPB x4
#define pC x5
#define LDC x6
#define temp x7
#define counterL x8
#define counterI x9
#define counterJ x10
#define pB x11
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pCRow3 x15
#define pA x16
#define lanes x17
#define alphaR w19
#define alphaI w20
#define alphaz_R z6.s
#define alphaz_I z7.s
#define alpha0_R s4
#define alpha0_I s5
#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define OP_rr fmla
#define OP_ii fmls
#define OP_ri fmla
#define OP_ir fmla
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define OP_rr fmla
#define OP_ii fmla
#define OP_ri fmls
#define OP_ir fmla
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define OP_rr fmla
#define OP_ii fmla
#define OP_ri fmla
#define OP_ir fmls
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define OP_rr fmla
#define OP_ii fmls
#define OP_ri fmls
#define OP_ir fmls
#endif
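
For reference, the OP_* selections above encode the sign pattern of a complex multiply-accumulate under the four possible conjugation combinations of A and B. A minimal C sketch of the same sign bookkeeping follows; the names (cmla, conjA, conjB) are illustrative and not part of the kernel:

#include <stdio.h>

/* Scalar model of the OP_rr/OP_ii/OP_ri/OP_ir pattern: accumulate
 * c += opA(a) * opB(b), where opA/opB optionally conjugate an operand.
 * conjA = conjB = 0 is the NN/NT/TN/TT case, conjB = 1 the NR/NC/TR/TC
 * case, conjA = 1 the RN/RT/CN/CT case, and both set the RR/RC/CR/CC case. */
static void cmla(float *cr, float *ci,
                 float ar, float ai, float br, float bi,
                 int conjA, int conjB)
{
    float s_ii = (conjA != conjB) ? 1.0f : -1.0f; /* OP_ii: fmla vs. fmls */
    float s_ri = conjB ? -1.0f : 1.0f;            /* OP_ri: sign of ar*bi */
    float s_ir = conjA ? -1.0f : 1.0f;            /* OP_ir: sign of ai*br */
    *cr += ar * br;         /* OP_rr is fmla in all four cases */
    *cr += s_ii * ai * bi;  /* OP_ii */
    *ci += s_ri * ar * bi;  /* OP_ri */
    *ci += s_ir * ai * br;  /* OP_ir */
}

int main(void)
{
    float cr = 0.0f, ci = 0.0f;
    /* (1+2i) * conj(3+4i) = 11 + 2i, matching the NR/NC/TR/TC signs */
    cmla(&cr, &ci, 1.0f, 2.0f, 3.0f, 4.0f, 0, 1);
    printf("%g%+gi\n", cr, ci);
    return 0;
}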
// 00 origM
// 01 origN
// 02 origK
// 03 origPA
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 offset -> temp
// 08 counterL
// 09 counterI
// 10 counterJ
// 11 pB
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pCRow3
// 16 pA
// 17 alpha_save_R
// 18 must save alpha_save_I
// 19 must save
// 20 must save
// 21 must save
// 22 must save
// 23 must save
// 24 must save
// 25 must save
// 26 must save
// 27 must save
// 28 must save
// 29 frame
// 30 link
// 31 sp
//v00 ALPHA_R -> pA00_R, pA01_R
//v01 ALPHA_I -> pA00_I, pA01_I
//v02 pA02_R, pA03_R
//v03 pA02_I, pA03_I
//v04 pA10_R, pA11_R
//v05 pA10_I, pA11_I
//v06 pA12_R, pA13_R
//v07 pA12_I, pA13_I
//v08 must save pB00_R, pB01_R
//v09 must save pB00_I, pB01_I
//v10 must save pB02_R, pB03_R OR ALPHA0_R
//v11 must save pB02_I, pB03_I OR ALPHA0_I
//v12 must save pB10_R, pB11_R
//v13 must save pB10_I, pB11_I
//v14 must save pB12_R, pB13_R OR ALPHA1_R
//v15 must save pB12_I, pB13_I OR ALPHA1_I
//v16 pC0R
//v17 pC0I
//v18 pC1R
//v19 pC1I
//v20 pC2R
//v21 pC2I
//v22 pC3R
//v23 pC3I
//v24 pC3R
//v25 pC3I
//v26 pC22_R, pC23_R
//v27 pC22_I, pC23_I
//v28 pC30_R, pC31_R
//v29 pC30_I, pC31_I
//v30 pC32_R, pC33_R
//v31 pC32_I, pC33_I
/******************************************************************************* | |||
* Macro definitions | |||
*******************************************************************************/ | |||
.macro INITv1x4 | |||
dup z16.s, #0 | |||
dup z17.s, #0 | |||
dup z18.s, #0 | |||
dup z19.s, #0 | |||
dup z20.s, #0 | |||
dup z21.s, #0 | |||
dup z22.s, #0 | |||
dup z23.s, #0 | |||
.endm | |||
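/* KERNELv1x4_{I,M1,M2,E} form a software-pipelined inner loop: _I preloads the
 * first A/B operands and starts the accumulation, _M1 and _M2 alternate between
 * the two A register pairs (z0/z1 and z2/z3) so that each iteration's loads
 * overlap the previous iteration's FMAs, and _E drains the pipeline without
 * issuing further loads. KERNELv1x4_SUB below is the plain, non-pipelined body
 * used for the K remainder. */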
.macro KERNELv1x4_I | |||
ld2w {z0.s, z1.s}, p1/z, [pA] | |||
add pA, pA, lanes, lsl #3 // pA += lanes*2*4 | |||
ld2w {z2.s, z3.s}, p1/z, [pA] // next one | |||
add pA, pA, lanes, lsl #3 // pA += lanes*2*4 | |||
ld1rw z8.s, p0/z, [pB] | |||
ld1rw z9.s, p0/z, [pB, 4] | |||
ld1rw z10.s, p0/z, [pB, 8] | |||
ld1rw z11.s, p0/z, [pB, 12] | |||
ld1rw z12.s, p0/z, [pB, 16] | |||
ld1rw z13.s, p0/z, [pB, 20] | |||
ld1rw z14.s, p0/z, [pB, 24] | |||
ld1rw z15.s, p0/z, [pB, 28] | |||
add pB, pB, 32 | |||
fmla z16.s, p1/m, z0.s, z8.s | |||
OP_ir z17.s, p1/m, z1.s, z8.s | |||
ld1rw z8.s, p0/z, [pB] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
// eor z17.16b, z17.16b, z17.16b
fmls z17.s, p1/m, z0.s, z9.s | |||
#else | |||
fmla z17.s, p1/m, z0.s, z9.s | |||
#endif | |||
OP_ii z16.s, p1/m, z1.s, z9.s | |||
ld1rw z9.s, p0/z, [pB, 4] | |||
fmla z18.s, p1/m, z0.s, z10.s | |||
OP_ir z19.s, p1/m, z1.s, z10.s | |||
ld1rw z10.s, p0/z, [pB, 8] | |||
OP_ii z18.s, p1/m, z1.s, z11.s | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
// eor z19.16b, z19.16b, z19.16b
fmls z19.s, p1/m, z0.s, z11.s | |||
#else | |||
fmla z19.s, p1/m, z0.s, z11.s | |||
#endif | |||
ld1rw z11.s, p0/z, [pB, 12] | |||
fmla z20.s, p1/m, z0.s, z12.s | |||
OP_ir z21.s, p1/m, z1.s, z12.s | |||
ld1rw z12.s, p0/z, [pB, 16] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
// eor z21.16b, z21.16b, z21.16b
fmls z21.s, p1/m, z0.s, z13.s | |||
#else | |||
fmla z21.s, p1/m, z0.s, z13.s | |||
#endif | |||
OP_ii z20.s, p1/m, z1.s, z13.s | |||
ld1rw z13.s, p0/z, [pB, 20] | |||
fmla z22.s, p1/m, z0.s, z14.s | |||
OP_ir z23.s, p1/m, z1.s, z14.s | |||
ld1rw z14.s, p0/z, [pB, 24] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
// eor z23.16b, z23.16b, z23.16b
fmls z23.s, p1/m, z0.s, z15.s | |||
#else | |||
fmla z23.s, p1/m, z0.s, z15.s | |||
#endif | |||
OP_ii z22.s, p1/m, z1.s, z15.s | |||
ld1rw z15.s, p0/z, [pB, 28] | |||
add pB, pB, 32 | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
.endm | |||
.macro KERNELv1x4_M1 | |||
ld2w {z2.s, z3.s}, p1/z, [pA] | |||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 | |||
OP_rr z16.s, p1/m, z0.s, z8.s | |||
OP_ir z17.s, p1/m, z1.s, z8.s | |||
ld1rw z8.s, p0/z, [pB] | |||
OP_ii z16.s, p1/m, z1.s, z9.s | |||
OP_ri z17.s, p1/m, z0.s, z9.s | |||
ld1rw z9.s, p0/z, [pB, 4] | |||
OP_rr z18.s, p1/m, z0.s, z10.s | |||
OP_ir z19.s, p1/m, z1.s, z10.s | |||
ld1rw z10.s, p0/z, [pB, 8] | |||
OP_ii z18.s, p1/m, z1.s, z11.s | |||
OP_ri z19.s, p1/m, z0.s, z11.s | |||
ld1rw z11.s, p0/z, [pB, 12] | |||
OP_rr z20.s, p1/m, z0.s, z12.s | |||
OP_ir z21.s, p1/m, z1.s, z12.s | |||
ld1rw z12.s, p0/z, [pB, 16] | |||
OP_ii z20.s, p1/m, z1.s, z13.s | |||
OP_ri z21.s, p1/m, z0.s, z13.s | |||
ld1rw z13.s, p0/z, [pB, 20] | |||
OP_rr z22.s, p1/m, z0.s, z14.s | |||
OP_ir z23.s, p1/m, z1.s, z14.s | |||
ld1rw z14.s, p0/z, [pB, 24] | |||
OP_ii z22.s, p1/m, z1.s, z15.s | |||
OP_ri z23.s, p1/m, z0.s, z15.s | |||
ld1rw z15.s, p0/z, [pB, 28] | |||
add pB, pB, 32 | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
.endm | |||
.macro KERNELv1x4_M2 | |||
ld2w {z0.s, z1.s}, p1/z, [pA] | |||
add pA, pA, lanes, lsl #3 // pA = pA + lanes *2 * 4 | |||
OP_rr z16.s, p1/m, z2.s, z8.s | |||
OP_ir z17.s, p1/m, z3.s, z8.s | |||
ld1rw z8.s, p0/z, [pB] | |||
OP_ii z16.s, p1/m, z3.s, z9.s | |||
OP_ri z17.s, p1/m, z2.s, z9.s | |||
ld1rw z9.s, p0/z, [pB, 4] | |||
OP_rr z18.s, p1/m, z2.s, z10.s | |||
OP_ir z19.s, p1/m, z3.s, z10.s | |||
ld1rw z10.s, p0/z, [pB, 8] | |||
OP_ii z18.s, p1/m, z3.s, z11.s | |||
OP_ri z19.s, p1/m, z2.s, z11.s | |||
ld1rw z11.s, p0/z, [pB, 12] | |||
OP_rr z20.s, p1/m, z2.s, z12.s | |||
OP_ir z21.s, p1/m, z3.s, z12.s | |||
ld1rw z12.s, p0/z, [pB, 16] | |||
OP_ii z20.s, p1/m, z3.s, z13.s | |||
OP_ri z21.s, p1/m, z2.s, z13.s | |||
ld1rw z13.s, p0/z, [pB, 20] | |||
OP_rr z22.s, p1/m, z2.s, z14.s | |||
OP_ir z23.s, p1/m, z3.s, z14.s | |||
ld1rw z14.s, p0/z, [pB, 24] | |||
OP_ii z22.s, p1/m, z3.s, z15.s | |||
OP_ri z23.s, p1/m, z2.s, z15.s | |||
ld1rw z15.s, p0/z, [pB, 28] | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
add pB, pB, 32 | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
.endm | |||
.macro KERNELv1x4_E | |||
OP_rr z16.s, p1/m, z2.s, z8.s | |||
OP_ir z17.s, p1/m, z3.s, z8.s | |||
OP_ii z16.s, p1/m, z3.s, z9.s | |||
OP_ri z17.s, p1/m, z2.s, z9.s | |||
OP_rr z18.s, p1/m, z2.s, z10.s | |||
OP_ir z19.s, p1/m, z3.s, z10.s | |||
OP_ii z18.s, p1/m, z3.s, z11.s | |||
OP_ri z19.s, p1/m, z2.s, z11.s | |||
OP_rr z20.s, p1/m, z2.s, z12.s | |||
OP_ir z21.s, p1/m, z3.s, z12.s | |||
OP_ii z20.s, p1/m, z3.s, z13.s | |||
OP_ri z21.s, p1/m, z2.s, z13.s | |||
OP_rr z22.s, p1/m, z2.s, z14.s | |||
OP_ir z23.s, p1/m, z3.s, z14.s | |||
OP_ii z22.s, p1/m, z3.s, z15.s | |||
OP_ri z23.s, p1/m, z2.s, z15.s | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
.endm | |||
.macro KERNELv1x4_SUB | |||
ld2w {z0.s, z1.s}, p1/z, [pA] | |||
add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 | |||
ld1rw z8.s, p0/z, [pB] | |||
ld1rw z9.s, p0/z, [pB, 4] | |||
ld1rw z10.s, p0/z, [pB, 8] | |||
ld1rw z11.s, p0/z, [pB, 12] | |||
OP_rr z16.s, p1/m, z0.s, z8.s | |||
OP_ir z17.s, p1/m, z1.s, z8.s | |||
OP_ii z16.s, p1/m, z1.s, z9.s | |||
OP_ri z17.s, p1/m, z0.s, z9.s | |||
ld1rw z12.s, p0/z, [pB, 16] | |||
ld1rw z13.s, p0/z, [pB, 20] | |||
ld1rw z14.s, p0/z, [pB, 24] | |||
ld1rw z15.s, p0/z, [pB, 28] | |||
OP_rr z18.s, p1/m, z0.s, z10.s | |||
OP_ir z19.s, p1/m, z1.s, z10.s | |||
OP_ii z18.s, p1/m, z1.s, z11.s | |||
OP_ri z19.s, p1/m, z0.s, z11.s | |||
add pB, pB, 32 | |||
OP_rr z20.s, p1/m, z0.s, z12.s | |||
OP_ir z21.s, p1/m, z1.s, z12.s | |||
OP_ii z20.s, p1/m, z1.s, z13.s | |||
OP_ri z21.s, p1/m, z0.s, z13.s | |||
OP_rr z22.s, p1/m, z0.s, z14.s | |||
OP_ir z23.s, p1/m, z1.s, z14.s | |||
OP_ii z22.s, p1/m, z1.s, z15.s | |||
OP_ri z23.s, p1/m, z0.s, z15.s | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
.endm | |||
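/* The SAVE macros merge the accumulators into C, i.e. C += alpha * acc:
 *   C_R += alpha_R*acc_R - alpha_I*acc_I
 *   C_I += alpha_I*acc_R + alpha_R*acc_I
 * ld2w/st2w keep the real and imaginary parts de-interleaved in registers
 * while C itself stays interleaved in memory. */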
.macro SAVEv1x4 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
fmla z24.s, p1/m, z16.s, alphaz_R | |||
fmls z24.s, p1/m, z17.s, alphaz_I | |||
fmla z25.s, p1/m, z16.s, alphaz_I | |||
fmla z25.s, p1/m, z17.s, alphaz_R | |||
st2w {z24.s, z25.s}, p1, [pCRow0] | |||
add pCRow0, pCRow0, lanes, lsl #3 | |||
ld2w {z26.s, z27.s}, p1/z, [pCRow1] | |||
fmla z26.s, p1/m, z18.s, alphaz_R | |||
fmls z26.s, p1/m, z19.s, alphaz_I | |||
fmla z27.s, p1/m, z18.s, alphaz_I | |||
fmla z27.s, p1/m, z19.s, alphaz_R | |||
st2w {z26.s, z27.s}, p1, [pCRow1] | |||
add pCRow1, pCRow1, lanes, lsl #3 | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
ld2w {z28.s, z29.s}, p1/z, [pCRow2] | |||
fmla z28.s, p1/m, z20.s, alphaz_R | |||
fmls z28.s, p1/m, z21.s, alphaz_I | |||
fmla z29.s, p1/m, z20.s, alphaz_I | |||
fmla z29.s, p1/m, z21.s, alphaz_R | |||
st2w {z28.s, z29.s}, p1, [pCRow2] | |||
add pCRow2, pCRow2, lanes, lsl #3 | |||
ld2w {z30.s, z31.s}, p1/z, [pCRow3] | |||
fmla z30.s, p1/m, z22.s, alphaz_R | |||
fmls z30.s, p1/m, z23.s, alphaz_I | |||
fmla z31.s, p1/m, z22.s, alphaz_I | |||
fmla z31.s, p1/m, z23.s, alphaz_R | |||
st2w {z30.s, z31.s}, p1, [pCRow3] | |||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 | |||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
.endm | |||
/******************************************************************************/ | |||
.macro INITv1x2 | |||
dup z16.s, #0 | |||
dup z17.s, #0 | |||
dup z18.s, #0 | |||
dup z19.s, #0 | |||
.endm | |||
.macro KERNELv1x2_SUB | |||
ld2w {z0.s, z1.s}, p1/z, [pA] | |||
add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 | |||
ld1rw z8.s, p0/z, [pB] | |||
ld1rw z9.s, p0/z, [pB, 4] | |||
ld1rw z10.s, p0/z, [pB, 8] | |||
ld1rw z11.s, p0/z, [pB, 12] | |||
OP_rr z16.s, p1/m, z0.s, z8.s | |||
OP_ir z17.s, p1/m, z1.s, z8.s | |||
OP_ii z16.s, p1/m, z1.s, z9.s | |||
OP_ri z17.s, p1/m, z0.s, z9.s | |||
OP_rr z18.s, p1/m, z0.s, z10.s | |||
OP_ir z19.s, p1/m, z1.s, z10.s | |||
OP_ii z18.s, p1/m, z1.s, z11.s | |||
OP_ri z19.s, p1/m, z0.s, z11.s | |||
add pB, pB, 16 | |||
.endm | |||
.macro SAVEv1x2 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
fmla z24.s, p1/m, z16.s, alphaz_R | |||
fmls z24.s, p1/m, z17.s, alphaz_I | |||
fmla z25.s, p1/m, z16.s, alphaz_I | |||
fmla z25.s, p1/m, z17.s, alphaz_R | |||
st2w {z24.s, z25.s}, p1, [pCRow0] | |||
add pCRow0, pCRow0, lanes, lsl #3 | |||
ld2w {z26.s, z27.s}, p1/z, [pCRow1] | |||
fmla z26.s, p1/m, z18.s, alphaz_R | |||
fmls z26.s, p1/m, z19.s, alphaz_I | |||
fmla z27.s, p1/m, z18.s, alphaz_I | |||
fmla z27.s, p1/m, z19.s, alphaz_R | |||
st2w {z26.s, z27.s}, p1, [pCRow1] | |||
add pCRow1, pCRow1, lanes, lsl #3 | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
.endm | |||
/******************************************************************************/ | |||
.macro INITv1x1 | |||
dup z16.s, #0 | |||
dup z17.s, #0 | |||
.endm | |||
.macro KERNELv1x1_SUB | |||
ld2w {z0.s, z1.s}, p1/z, [pA] | |||
add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 | |||
ld1rw z8.s, p0/z, [pB] | |||
ld1rw z9.s, p0/z, [pB, 4] | |||
add pB, pB, 8 | |||
OP_rr z16.s, p1/m, z0.s, z8.s | |||
OP_ir z17.s, p1/m, z1.s, z8.s | |||
OP_ii z16.s, p1/m, z1.s, z9.s | |||
OP_ri z17.s, p1/m, z0.s, z9.s | |||
.endm | |||
.macro SAVEv1x1 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
fmla z24.s, p1/m, z16.s, alphaz_R | |||
fmls z24.s, p1/m, z17.s, alphaz_I | |||
fmla z25.s, p1/m, z16.s, alphaz_I | |||
fmla z25.s, p1/m, z17.s, alphaz_R | |||
st2w {z24.s, z25.s}, p1, [pCRow0] | |||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 | |||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
.endm | |||
/******************************************************************************/ | |||
/******************************************************************************* | |||
* End of macro definitions | |||
*******************************************************************************/ | |||
PROLOGUE | |||
.align 5 | |||
add sp, sp, #-(11 * 16) | |||
stp d8, d9, [sp, #(0 * 16)] | |||
stp d10, d11, [sp, #(1 * 16)] | |||
stp d12, d13, [sp, #(2 * 16)] | |||
stp d14, d15, [sp, #(3 * 16)] | |||
stp d16, d17, [sp, #(4 * 16)] | |||
stp x18, x19, [sp, #(5 * 16)] | |||
stp x20, x21, [sp, #(6 * 16)] | |||
stp x22, x23, [sp, #(7 * 16)] | |||
stp x24, x25, [sp, #(8 * 16)] | |||
stp x26, x27, [sp, #(9 * 16)] | |||
str x28, [sp, #(10 * 16)] | |||
prfm PLDL1KEEP, [origPB] | |||
prfm PLDL1KEEP, [origPA] | |||
fmov alphaR, s0 | |||
dup alphaz_R, alphaR | |||
fmov alphaI, s1 | |||
dup alphaz_I, alphaI | |||
lsl LDC, LDC, #3 // ldc = ldc * 2 * 4 | |||
ptrue p0.s // create true predicate | |||
mov pB, origPB | |||
// Loop over N | |||
mov counterJ, origN | |||
asr counterJ, counterJ, #2 // J = J / 4 | |||
cmp counterJ, #0 | |||
ble .Lcgemm_kernel_L2_BEGIN | |||
/******************************************************************************/ | |||
.Lcgemm_kernel_L4_BEGIN: | |||
mov pCRow0, pC | |||
add pCRow1, pCRow0, LDC | |||
add pCRow2, pCRow1, LDC | |||
add pCRow3, pCRow2, LDC | |||
add pC, pCRow3, LDC | |||
mov pA, origPA // pA = start of A array | |||
.Lcgemm_kernel_L4_Mv1_BEGIN: | |||
/* The loop over M is done in an SVE fashion: the last M % SVE_LEN iterations are handled in a single predicated sweep */
mov counterI, #0 | |||
whilelt p1.s, counterI, origM | |||
	cntp	lanes, p0, p1.s				// lanes contains the number of active SVE lanes in the M dimension
.align 5 | |||
.Lcgemm_kernel_L4_Mv1_20: | |||
mov pB, origPB | |||
INITv1x4 // fill with zeros | |||
asr counterL , origK, #3 | |||
cmp counterL , #2 | |||
blt .Lcgemm_kernel_L4_Mv1_32 | |||
KERNELv1x4_I | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
subs counterL, counterL, #2 // subtract 2 | |||
ble .Lcgemm_kernel_L4_Mv1_22a | |||
.align 5 | |||
.Lcgemm_kernel_L4_Mv1_22: | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
subs counterL, counterL, #1 | |||
bgt .Lcgemm_kernel_L4_Mv1_22 | |||
.align 5 | |||
.Lcgemm_kernel_L4_Mv1_22a: | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_E | |||
b .Lcgemm_kernel_L4_Mv1_44 | |||
.align 5 | |||
.Lcgemm_kernel_L4_Mv1_32: | |||
tst counterL, #1 | |||
ble .Lcgemm_kernel_L4_Mv1_40 | |||
KERNELv1x4_I | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_E | |||
b .Lcgemm_kernel_L4_Mv1_44 | |||
.Lcgemm_kernel_L4_Mv1_40: | |||
INITv1x4 | |||
.Lcgemm_kernel_L4_Mv1_44: | |||
ands counterL , origK, #7 | |||
ble .Lcgemm_kernel_L4_Mv1_100 | |||
.align 5 | |||
.Lcgemm_kernel_L4_Mv1_46: | |||
KERNELv1x4_SUB | |||
subs counterL, counterL, #1 | |||
bne .Lcgemm_kernel_L4_Mv1_46 | |||
.Lcgemm_kernel_L4_Mv1_100: | |||
prfm PLDL1KEEP, [pA] | |||
prfm PLDL1KEEP, [pA, #64] | |||
prfm PLDL1KEEP, [origPB] | |||
SAVEv1x4 | |||
.Lcgemm_kernel_L4_Mv1_END: | |||
incw counterI | |||
whilelt p1.s, counterI, origM //SVE instruction | |||
	cntp	lanes, p0, p1.s				// lanes contains the number of active SVE lanes in the M dimension
b.any .Lcgemm_kernel_L4_Mv1_20 | |||
.Lcgemm_kernel_L4_END: | |||
lsl temp, origK, #5 | |||
add origPB, origPB, temp // B = B + K * 4 * 4 * 2 | |||
subs counterJ, counterJ , #1 // j-- | |||
bgt .Lcgemm_kernel_L4_BEGIN | |||
/******************************************************************************/ | |||
.Lcgemm_kernel_L2_BEGIN:   // remaining N < 4; handle the 2-column case first
mov counterJ , origN | |||
tst counterJ , #3 | |||
ble .Lcgemm_kernel_L999 | |||
tst counterJ , #2 | |||
ble .Lcgemm_kernel_L1_BEGIN | |||
mov pCRow0, pC // pCRow0 = pC | |||
add pCRow1, pCRow0, LDC | |||
	add	pC, pC, LDC, lsl #1
mov pA, origPA // pA = A | |||
.Lcgemm_kernel_L2_Mv1_BEGIN: | |||
mov counterI, #0 | |||
whilelt p1.s, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.s | |||
.Lcgemm_kernel_L2_Mv1_20: | |||
INITv1x2 | |||
mov pB, origPB | |||
asr counterL , origK, #3 // counterL = counterL / 8 | |||
cmp counterL,#0 | |||
ble .Lcgemm_kernel_L2_Mv1_40 | |||
.align 5 | |||
.Lcgemm_kernel_L2_Mv1_22: | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
subs counterL, counterL, #1 | |||
bgt .Lcgemm_kernel_L2_Mv1_22 | |||
.Lcgemm_kernel_L2_Mv1_40: | |||
ands counterL , origK, #7 // counterL = counterL % 8 | |||
ble .Lcgemm_kernel_L2_Mv1_100 | |||
.Lcgemm_kernel_L2_Mv1_42: | |||
KERNELv1x2_SUB | |||
subs counterL, counterL, #1 | |||
bgt .Lcgemm_kernel_L2_Mv1_42 | |||
.Lcgemm_kernel_L2_Mv1_100: | |||
SAVEv1x2 | |||
.Lcgemm_kernel_L2_Mv1_END: | |||
incw counterI | |||
whilelt p1.s, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.s | |||
b.any .Lcgemm_kernel_L2_Mv1_20 | |||
.Lcgemm_kernel_L2_END: | |||
lsl temp, origK, #4 | |||
add origPB, origPB, temp // B = B + K * 2 * 4 * 2 | |||
/******************************************************************************/ | |||
.Lcgemm_kernel_L1_BEGIN: | |||
mov counterJ , origN | |||
tst counterJ , #1 | |||
ble .Lcgemm_kernel_L999 // done | |||
mov pCRow0, pC // pCRow0 = C | |||
	add	pC, pC, LDC				// Update pC to point to the next column
mov pA, origPA // pA = A | |||
.Lcgemm_kernel_L1_Mv1_BEGIN: | |||
mov counterI, #0 | |||
whilelt p1.s, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.s | |||
.Lcgemm_kernel_L1_Mv1_20: | |||
INITv1x1 | |||
mov pB, origPB | |||
asr counterL , origK, #3 // counterL = counterL / 8 | |||
cmp counterL , #0 | |||
ble .Lcgemm_kernel_L1_Mv1_40 | |||
.align 5 | |||
.Lcgemm_kernel_L1_Mv1_22: | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
subs counterL, counterL, #1 | |||
bgt .Lcgemm_kernel_L1_Mv1_22 | |||
.Lcgemm_kernel_L1_Mv1_40: | |||
ands counterL , origK, #7 // counterL = counterL % 8 | |||
ble .Lcgemm_kernel_L1_Mv1_100 | |||
.Lcgemm_kernel_L1_Mv1_42: | |||
KERNELv1x1_SUB | |||
subs counterL, counterL, #1 | |||
bgt .Lcgemm_kernel_L1_Mv1_42 | |||
.Lcgemm_kernel_L1_Mv1_100: | |||
SAVEv1x1 | |||
.Lcgemm_kernel_L1_Mv1_END: | |||
incw counterI | |||
whilelt p1.s, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.s | |||
b.any .Lcgemm_kernel_L1_Mv1_20 | |||
.Lcgemm_kernel_L1_END: | |||
/******************************************************************************/ | |||
.Lcgemm_kernel_L999: | |||
mov x0, #0 // set return value | |||
ldp d8, d9, [sp, #(0 * 16)] | |||
ldp d10, d11, [sp, #(1 * 16)] | |||
ldp d12, d13, [sp, #(2 * 16)] | |||
ldp d14, d15, [sp, #(3 * 16)] | |||
ldp d16, d17, [sp, #(4 * 16)] | |||
ldp x18, x19, [sp, #(5 * 16)] | |||
ldp x20, x21, [sp, #(6 * 16)] | |||
ldp x22, x23, [sp, #(7 * 16)] | |||
ldp x24, x25, [sp, #(8 * 16)] | |||
ldp x26, x27, [sp, #(9 * 16)] | |||
ldr x28, [sp, #(10 * 16)] | |||
add sp, sp, #(11*16) | |||
ret | |||
EPILOGUE | |||
@@ -0,0 +1,79 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
// TODO: write in assembly with proper unrolling of inner loop | |||
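// Packs a panel of the source matrix into the GEMM buffer b. The vectorized
// dimension is n: the index vector lda_vec strides by lda complex elements, so
// one gather collects the real parts of the same row position across `active`
// consecutive columns and a second gather the imaginary parts; st2 stores them
// interleaved. The n tail is handled by the whilelt predicate, so no scalar
// cleanup loop is needed.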
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
BLASLONG j; | |||
IFLOAT *aoffset, *aoffset1, *boffset; | |||
svint32_t lda_vec = svindex_s32(0, lda * 2); | |||
aoffset = a; | |||
boffset = b; | |||
j = 0; | |||
svbool_t pg = svwhilelt_b32(j, n); | |||
uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
do { | |||
aoffset1 = aoffset; | |||
uint32_t i_cnt = m; | |||
while (i_cnt--) { | |||
svfloat32_t a_vec_real = svld1_gather_index(pg, (float *) aoffset1, lda_vec); | |||
svfloat32_t a_vec_imag = svld1_gather_index(pg, ((float *) aoffset1) + 1, lda_vec); | |||
svst2_f32(pg, (float *) boffset, svcreate2(a_vec_real, a_vec_imag)); | |||
aoffset1 += 2; | |||
boffset += active * 2; | |||
} | |||
aoffset += active * lda * 2; | |||
j += svcntw(); | |||
pg = svwhilelt_b32(j, n); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
return 0; | |||
} |
@@ -0,0 +1,75 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
// TODO: write in assembly with proper unrolling of inner loop | |||
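// Contiguous variant of the packing routine: here the vectorized n dimension
// is unit-stride in memory, so a single ld2 loads `active` complex elements
// and de-interleaves the real/imag parts, and st2 re-interleaves them into the
// buffer. The inner loop walks the other dimension with stride lda.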
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
BLASLONG j; | |||
IFLOAT *aoffset, *aoffset1, *boffset; | |||
aoffset = a; | |||
boffset = b; | |||
j = 0; | |||
svbool_t pg = svwhilelt_b32(j, n); | |||
uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
do { | |||
aoffset1 = aoffset; | |||
uint32_t i_cnt = m; | |||
while (i_cnt--) { | |||
svfloat32x2_t a_vec = svld2(pg, (float *)aoffset1); | |||
svst2_f32(pg, (float *) boffset, a_vec); | |||
aoffset1 += lda * 2; | |||
boffset += active * 2; | |||
} | |||
aoffset += active * 2; | |||
j += svcntw(); | |||
pg = svwhilelt_b32(j, n); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
return 0; | |||
} |
@@ -0,0 +1,320 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include "common.h" | |||
#include "arm_sve.h" | |||
static FLOAT dm1 = -1.; | |||
#ifdef CONJ | |||
#define GEMM_KERNEL GEMM_KERNEL_L | |||
#else | |||
#define GEMM_KERNEL GEMM_KERNEL_N | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 1 | |||
#define GEMM_UNROLL_N_SHIFT 0 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 2 | |||
#define GEMM_UNROLL_N_SHIFT 1 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 4 | |||
#define GEMM_UNROLL_N_SHIFT 2 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 8 | |||
#define GEMM_UNROLL_N_SHIFT 3 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 16 | |||
#define GEMM_UNROLL_N_SHIFT 4 | |||
#endif | |||
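/* GEMM_UNROLL_N_SHIFT is log2(GEMM_DEFAULT_UNROLL_N); n is peeled into full
   GEMM_UNROLL_N-wide column blocks with this shift, and the remainder is
   handled by the bit tests further down. */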
#ifndef COMPLEX | |||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
FLOAT aa, bb; | |||
int i, j, k; | |||
a += (m - 1) * m; | |||
b += (m - 1) * n; | |||
for (i = m - 1; i >= 0; i--) { | |||
aa = *(a + i); | |||
for (j = 0; j < n; j ++) { | |||
bb = *(c + i + j * ldc); | |||
bb *= aa; | |||
*b = bb; | |||
*(c + i + j * ldc) = bb; | |||
b ++; | |||
for (k = 0; k < i; k ++){ | |||
*(c + k + j * ldc) -= bb * *(a + k); | |||
} | |||
} | |||
a -= m; | |||
b -= 2 * n; | |||
} | |||
} | |||
#else | |||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
FLOAT aa1, aa2; | |||
FLOAT bb1, bb2; | |||
FLOAT cc1, cc2; | |||
int i, j, k; | |||
ldc *= 2; | |||
a += (m - 1) * m * 2; | |||
b += (m - 1) * n * 2; | |||
for (i = m - 1; i >= 0; i--) { | |||
aa1 = *(a + i * 2 + 0); | |||
aa2 = *(a + i * 2 + 1); | |||
for (j = 0; j < n; j ++) { | |||
bb1 = *(c + i * 2 + 0 + j * ldc); | |||
bb2 = *(c + i * 2 + 1 + j * ldc); | |||
#ifndef CONJ | |||
cc1 = aa1 * bb1 - aa2 * bb2; | |||
cc2 = aa1 * bb2 + aa2 * bb1; | |||
#else | |||
cc1 = aa1 * bb1 + aa2 * bb2; | |||
cc2 = aa1 * bb2 - aa2 * bb1; | |||
#endif | |||
*(b + 0) = cc1; | |||
*(b + 1) = cc2; | |||
*(c + i * 2 + 0 + j * ldc) = cc1; | |||
*(c + i * 2 + 1 + j * ldc) = cc2; | |||
b += 2; | |||
for (k = 0; k < i; k ++){ | |||
#ifndef CONJ | |||
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); | |||
*(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); | |||
#else | |||
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); | |||
*(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); | |||
#endif | |||
} | |||
} | |||
a -= m * 2; | |||
b -= 4 * n; | |||
} | |||
} | |||
#endif | |||
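/* Driver for the LN case: the M dimension is blocked by the runtime SVE vector
   length (svcntw/svcntd) rather than a fixed unroll. Elimination proceeds from
   the bottom block of M upward, so the m % sve_size remainder is handled first
   and full vector-width blocks follow; each block first receives a GEMM update
   with the k - kk already-solved part, then solve() processes its diagonal
   block. */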
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
#ifdef COMPLEX | |||
FLOAT dummy2, | |||
#endif | |||
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | |||
BLASLONG i, j; | |||
FLOAT *aa, *cc; | |||
BLASLONG kk; | |||
#ifdef DOUBLE | |||
int sve_size = svcntd(); | |||
#else | |||
int sve_size = svcntw(); | |||
#endif | |||
#if 0 | |||
fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", | |||
m, n, k, offset); | |||
#endif | |||
j = (n >> GEMM_UNROLL_N_SHIFT); | |||
while (j > 0) { | |||
kk = m + offset; | |||
i = m % sve_size; | |||
if (i) { | |||
aa = a + (m - i) * k * COMPSIZE; | |||
cc = c + (m - i) * COMPSIZE; | |||
if (k - kk > 0) { | |||
GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa + i * kk * COMPSIZE, | |||
b + GEMM_UNROLL_N * kk * COMPSIZE, | |||
cc, | |||
ldc); | |||
} | |||
solve(i, GEMM_UNROLL_N, | |||
aa + (kk - i) * i * COMPSIZE, | |||
b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, | |||
cc, ldc); | |||
kk -= i; | |||
} | |||
int mod = i; | |||
i = sve_size; | |||
if (i <= m) { | |||
aa = a + (m - mod - sve_size) * k * COMPSIZE; | |||
cc = c + (m - mod - sve_size) * COMPSIZE; | |||
do { | |||
if (k - kk > 0) { | |||
GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa + sve_size * kk * COMPSIZE, | |||
b + GEMM_UNROLL_N * kk * COMPSIZE, | |||
cc, | |||
ldc); | |||
} | |||
solve(sve_size, GEMM_UNROLL_N, | |||
aa + (kk - sve_size) * sve_size * COMPSIZE, | |||
b + (kk - sve_size) * GEMM_UNROLL_N * COMPSIZE, | |||
cc, ldc); | |||
aa -= sve_size * k * COMPSIZE; | |||
cc -= sve_size * COMPSIZE; | |||
kk -= sve_size; | |||
i += sve_size; | |||
} while (i <= m); | |||
} | |||
b += GEMM_UNROLL_N * k * COMPSIZE; | |||
c += GEMM_UNROLL_N * ldc * COMPSIZE; | |||
j --; | |||
} | |||
if (n & (GEMM_UNROLL_N - 1)) { | |||
j = (GEMM_UNROLL_N >> 1); | |||
while (j > 0) { | |||
if (n & j) { | |||
kk = m + offset; | |||
i = m % sve_size; | |||
if (i) { | |||
aa = a + (m - i) * k * COMPSIZE; | |||
cc = c + (m - i) * COMPSIZE; | |||
if (k - kk > 0) { | |||
GEMM_KERNEL(i, j, k - kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa + i * kk * COMPSIZE, | |||
b + j * kk * COMPSIZE, | |||
cc, ldc); | |||
} | |||
solve(i, j, | |||
aa + (kk - i) * i * COMPSIZE, | |||
b + (kk - i) * j * COMPSIZE, | |||
cc, ldc); | |||
kk -= i; | |||
} | |||
int mod = i; | |||
i = sve_size; | |||
if (i <= m) { | |||
aa = a + (m - mod - sve_size) * k * COMPSIZE; | |||
cc = c + (m - mod - sve_size) * COMPSIZE; | |||
do { | |||
if (k - kk > 0) { | |||
GEMM_KERNEL(sve_size, j, k - kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa + sve_size * kk * COMPSIZE, | |||
b + j * kk * COMPSIZE, | |||
cc, | |||
ldc); | |||
} | |||
solve(sve_size, j, | |||
aa + (kk - sve_size) * sve_size * COMPSIZE, | |||
b + (kk - sve_size) * j * COMPSIZE, | |||
cc, ldc); | |||
aa -= sve_size * k * COMPSIZE; | |||
cc -= sve_size * COMPSIZE; | |||
kk -= sve_size; | |||
i += sve_size; | |||
} while (i <= m); | |||
} | |||
b += j * k * COMPSIZE; | |||
c += j * ldc * COMPSIZE; | |||
} | |||
j >>= 1; | |||
} | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,295 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include "common.h" | |||
#include "arm_sve.h" | |||
static FLOAT dm1 = -1.; | |||
#ifdef CONJ | |||
#define GEMM_KERNEL GEMM_KERNEL_L | |||
#else | |||
#define GEMM_KERNEL GEMM_KERNEL_N | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 1 | |||
#define GEMM_UNROLL_N_SHIFT 0 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 2 | |||
#define GEMM_UNROLL_N_SHIFT 1 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 4 | |||
#define GEMM_UNROLL_N_SHIFT 2 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 8 | |||
#define GEMM_UNROLL_N_SHIFT 3 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 16 | |||
#define GEMM_UNROLL_N_SHIFT 4 | |||
#endif | |||
#ifndef COMPLEX | |||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
FLOAT aa, bb; | |||
int i, j, k; | |||
for (i = 0; i < m; i++) { | |||
aa = *(a + i); | |||
for (j = 0; j < n; j ++) { | |||
bb = *(c + i + j * ldc); | |||
bb *= aa; | |||
*b = bb; | |||
*(c + i + j * ldc) = bb; | |||
b ++; | |||
for (k = i + 1; k < m; k ++){ | |||
*(c + k + j * ldc) -= bb * *(a + k); | |||
} | |||
} | |||
a += m; | |||
} | |||
} | |||
#else | |||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
FLOAT aa1, aa2; | |||
FLOAT bb1, bb2; | |||
FLOAT cc1, cc2; | |||
int i, j, k; | |||
ldc *= 2; | |||
for (i = 0; i < m; i++) { | |||
aa1 = *(a + i * 2 + 0); | |||
aa2 = *(a + i * 2 + 1); | |||
for (j = 0; j < n; j ++) { | |||
bb1 = *(c + i * 2 + 0 + j * ldc); | |||
bb2 = *(c + i * 2 + 1 + j * ldc); | |||
#ifndef CONJ | |||
cc1 = aa1 * bb1 - aa2 * bb2; | |||
cc2 = aa1 * bb2 + aa2 * bb1; | |||
#else | |||
cc1 = aa1 * bb1 + aa2 * bb2; | |||
cc2 = aa1 * bb2 - aa2 * bb1; | |||
#endif | |||
*(b + 0) = cc1; | |||
*(b + 1) = cc2; | |||
*(c + i * 2 + 0 + j * ldc) = cc1; | |||
*(c + i * 2 + 1 + j * ldc) = cc2; | |||
b += 2; | |||
for (k = i + 1; k < m; k ++){ | |||
#ifndef CONJ | |||
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); | |||
*(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); | |||
#else | |||
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); | |||
*(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); | |||
#endif | |||
} | |||
} | |||
a += m * 2; | |||
} | |||
} | |||
#endif | |||
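/* Driver for the LT case: elimination proceeds from the top block of M
   downward, so full sve_size blocks come first and the m % sve_size remainder
   last; kk grows with each solved block and bounds the GEMM update that
   applies the previously solved part. */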
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
#ifdef COMPLEX | |||
FLOAT dummy2, | |||
#endif | |||
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | |||
FLOAT *aa, *cc; | |||
BLASLONG kk; | |||
BLASLONG i, j, jj; | |||
#ifdef DOUBLE | |||
int sve_size = svcntd(); | |||
#else | |||
int sve_size = svcntw(); | |||
#endif | |||
#if 0 | |||
fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", | |||
m, n, k, offset); | |||
#endif | |||
jj = 0; | |||
j = (n >> GEMM_UNROLL_N_SHIFT); | |||
while (j > 0) { | |||
kk = offset; | |||
aa = a; | |||
cc = c; | |||
i = sve_size; | |||
while (i <= m) { | |||
if (kk > 0) { | |||
GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa, b, cc, ldc); | |||
} | |||
solve(sve_size, GEMM_UNROLL_N, | |||
aa + kk * sve_size * COMPSIZE, | |||
b + kk * GEMM_UNROLL_N * COMPSIZE, | |||
cc, ldc); | |||
aa += sve_size * k * COMPSIZE; | |||
cc += sve_size * COMPSIZE; | |||
kk += sve_size; | |||
i += sve_size; | |||
} | |||
i = m % sve_size; | |||
if (i) { | |||
if (kk > 0) { | |||
GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa, b, cc, ldc); | |||
} | |||
solve(i, GEMM_UNROLL_N, | |||
aa + kk * i * COMPSIZE, | |||
b + kk * GEMM_UNROLL_N * COMPSIZE, | |||
cc, ldc); | |||
aa += i * k * COMPSIZE; | |||
cc += i * COMPSIZE; | |||
kk += i; | |||
} | |||
b += GEMM_UNROLL_N * k * COMPSIZE; | |||
c += GEMM_UNROLL_N * ldc * COMPSIZE; | |||
j --; | |||
jj += sve_size; | |||
} | |||
if (n & (GEMM_UNROLL_N - 1)) { | |||
j = (GEMM_UNROLL_N >> 1); | |||
while (j > 0) { | |||
if (n & j) { | |||
kk = offset; | |||
aa = a; | |||
cc = c; | |||
i = sve_size; | |||
while (i <= m) { | |||
if (kk > 0) { | |||
GEMM_KERNEL(sve_size, j, kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa, | |||
b, | |||
cc, | |||
ldc); | |||
} | |||
solve(sve_size, j, | |||
aa + kk * sve_size * COMPSIZE, | |||
b + kk * j * COMPSIZE, cc, ldc); | |||
aa += sve_size * k * COMPSIZE; | |||
cc += sve_size * COMPSIZE; | |||
kk += sve_size; | |||
i += sve_size; | |||
} | |||
i = m % sve_size; | |||
if (i) { | |||
if (kk > 0) { | |||
GEMM_KERNEL(i, j, kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa, | |||
b, | |||
cc, | |||
ldc); | |||
} | |||
solve(i, j, | |||
aa + kk * i * COMPSIZE, | |||
b + kk * j * COMPSIZE, cc, ldc); | |||
aa += i * k * COMPSIZE; | |||
cc += i * COMPSIZE; | |||
kk += i; | |||
} | |||
b += j * k * COMPSIZE; | |||
c += j * ldc * COMPSIZE; | |||
} | |||
j >>= 1; | |||
} | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,293 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include "common.h" | |||
#include "arm_sve.h" | |||
static FLOAT dm1 = -1.; | |||
#ifdef CONJ | |||
#define GEMM_KERNEL GEMM_KERNEL_R | |||
#else | |||
#define GEMM_KERNEL GEMM_KERNEL_N | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 1 | |||
#define GEMM_UNROLL_N_SHIFT 0 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 2 | |||
#define GEMM_UNROLL_N_SHIFT 1 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 4 | |||
#define GEMM_UNROLL_N_SHIFT 2 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 8 | |||
#define GEMM_UNROLL_N_SHIFT 3 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 16 | |||
#define GEMM_UNROLL_N_SHIFT 4 | |||
#endif | |||
#ifndef COMPLEX | |||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
FLOAT aa, bb; | |||
int i, j, k; | |||
for (i = 0; i < n; i++) { | |||
bb = *(b + i); | |||
for (j = 0; j < m; j ++) { | |||
aa = *(c + j + i * ldc); | |||
aa *= bb; | |||
*a = aa; | |||
*(c + j + i * ldc) = aa; | |||
a ++; | |||
for (k = i + 1; k < n; k ++){ | |||
*(c + j + k * ldc) -= aa * *(b + k); | |||
} | |||
} | |||
b += n; | |||
} | |||
} | |||
#else | |||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
FLOAT aa1, aa2; | |||
FLOAT bb1, bb2; | |||
FLOAT cc1, cc2; | |||
int i, j, k; | |||
ldc *= 2; | |||
for (i = 0; i < n; i++) { | |||
bb1 = *(b + i * 2 + 0); | |||
bb2 = *(b + i * 2 + 1); | |||
for (j = 0; j < m; j ++) { | |||
aa1 = *(c + j * 2 + 0 + i * ldc); | |||
aa2 = *(c + j * 2 + 1 + i * ldc); | |||
#ifndef CONJ | |||
cc1 = aa1 * bb1 - aa2 * bb2; | |||
cc2 = aa1 * bb2 + aa2 * bb1; | |||
#else | |||
cc1 = aa1 * bb1 + aa2 * bb2; | |||
cc2 = -aa1 * bb2 + aa2 * bb1; | |||
#endif | |||
*(a + 0) = cc1; | |||
*(a + 1) = cc2; | |||
*(c + j * 2 + 0 + i * ldc) = cc1; | |||
*(c + j * 2 + 1 + i * ldc) = cc2; | |||
a += 2; | |||
for (k = i + 1; k < n; k ++){ | |||
#ifndef CONJ | |||
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); | |||
*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); | |||
#else | |||
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); | |||
*(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); | |||
#endif | |||
} | |||
} | |||
b += n * 2; | |||
} | |||
} | |||
#endif | |||
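/* Driver for the RN case: column blocks of B are solved left to right; kk
   starts at -offset and grows by the block width, so the GEMM update (issued
   once kk > 0) applies only the already-solved columns. */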
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
#ifdef COMPLEX | |||
FLOAT dummy2, | |||
#endif | |||
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | |||
FLOAT *aa, *cc; | |||
BLASLONG kk; | |||
BLASLONG i, j, jj; | |||
#ifdef DOUBLE | |||
int sve_size = svcntd(); | |||
#else | |||
int sve_size = svcntw(); | |||
#endif | |||
#if 0 | |||
fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", | |||
m, n, k, offset); | |||
#endif | |||
jj = 0; | |||
j = (n >> GEMM_UNROLL_N_SHIFT); | |||
kk = -offset; | |||
while (j > 0) { | |||
aa = a; | |||
cc = c; | |||
i = sve_size; | |||
if (i <= m) { | |||
do { | |||
if (kk > 0) { | |||
GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa, b, cc, ldc); | |||
} | |||
solve(sve_size, GEMM_UNROLL_N, | |||
aa + kk * sve_size * COMPSIZE, | |||
b + kk * GEMM_UNROLL_N * COMPSIZE, | |||
cc, ldc); | |||
aa += sve_size * k * COMPSIZE; | |||
cc += sve_size * COMPSIZE; | |||
i += sve_size; | |||
} while (i <= m); | |||
} | |||
i = m % sve_size; | |||
if (i) { | |||
if (kk > 0) { | |||
GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa, b, cc, ldc); | |||
} | |||
solve(i, GEMM_UNROLL_N, | |||
aa + kk * i * COMPSIZE, | |||
b + kk * GEMM_UNROLL_N * COMPSIZE, | |||
cc, ldc); | |||
aa += i * k * COMPSIZE; | |||
cc += i * COMPSIZE; | |||
} | |||
kk += GEMM_UNROLL_N; | |||
b += GEMM_UNROLL_N * k * COMPSIZE; | |||
c += GEMM_UNROLL_N * ldc * COMPSIZE; | |||
j --; | |||
jj += sve_size; | |||
} | |||
if (n & (GEMM_UNROLL_N - 1)) { | |||
j = (GEMM_UNROLL_N >> 1); | |||
while (j > 0) { | |||
if (n & j) { | |||
aa = a; | |||
cc = c; | |||
i = sve_size; | |||
while (i <= m) { | |||
if (kk > 0) { | |||
GEMM_KERNEL(sve_size, j, kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa, | |||
b, | |||
cc, | |||
ldc); | |||
} | |||
solve(sve_size, j, | |||
aa + kk * sve_size * COMPSIZE, | |||
b + kk * j * COMPSIZE, cc, ldc); | |||
aa += sve_size * k * COMPSIZE; | |||
cc += sve_size * COMPSIZE; | |||
i += sve_size; | |||
} | |||
i = m % sve_size; | |||
if (i) { | |||
if (kk > 0) { | |||
GEMM_KERNEL(i, j, kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa, | |||
b, | |||
cc, | |||
ldc); | |||
} | |||
solve(i, j, | |||
aa + kk * i * COMPSIZE, | |||
b + kk * j * COMPSIZE, cc, ldc); | |||
aa += i * k * COMPSIZE; | |||
cc += i * COMPSIZE; | |||
} | |||
b += j * k * COMPSIZE; | |||
c += j * ldc * COMPSIZE; | |||
kk += j; | |||
} | |||
j >>= 1; | |||
} | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,317 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include "common.h" | |||
#include "arm_sve.h" | |||
static FLOAT dm1 = -1.; | |||
#ifdef CONJ | |||
#define GEMM_KERNEL GEMM_KERNEL_R | |||
#else | |||
#define GEMM_KERNEL GEMM_KERNEL_N | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 1 | |||
#define GEMM_UNROLL_N_SHIFT 0 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 2 | |||
#define GEMM_UNROLL_N_SHIFT 1 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 4 | |||
#define GEMM_UNROLL_N_SHIFT 2 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 8 | |||
#define GEMM_UNROLL_N_SHIFT 3 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 16 | |||
#define GEMM_UNROLL_N_SHIFT 4 | |||
#endif | |||
#ifndef COMPLEX | |||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
FLOAT aa, bb; | |||
int i, j, k; | |||
a += (n - 1) * m; | |||
b += (n - 1) * n; | |||
for (i = n - 1; i >= 0; i--) { | |||
bb = *(b + i); | |||
for (j = 0; j < m; j ++) { | |||
aa = *(c + j + i * ldc); | |||
aa *= bb; | |||
*a = aa; | |||
*(c + j + i * ldc) = aa; | |||
a ++; | |||
for (k = 0; k < i; k ++){ | |||
*(c + j + k * ldc) -= aa * *(b + k); | |||
} | |||
} | |||
b -= n; | |||
a -= 2 * m; | |||
} | |||
} | |||
#else | |||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
FLOAT aa1, aa2; | |||
FLOAT bb1, bb2; | |||
FLOAT cc1, cc2; | |||
int i, j, k; | |||
ldc *= 2; | |||
a += (n - 1) * m * 2; | |||
b += (n - 1) * n * 2; | |||
for (i = n - 1; i >= 0; i--) { | |||
bb1 = *(b + i * 2 + 0); | |||
bb2 = *(b + i * 2 + 1); | |||
for (j = 0; j < m; j ++) { | |||
aa1 = *(c + j * 2 + 0 + i * ldc); | |||
aa2 = *(c + j * 2 + 1 + i * ldc); | |||
#ifndef CONJ | |||
cc1 = aa1 * bb1 - aa2 * bb2; | |||
cc2 = aa1 * bb2 + aa2 * bb1; | |||
#else | |||
cc1 = aa1 * bb1 + aa2 * bb2; | |||
cc2 = - aa1 * bb2 + aa2 * bb1; | |||
#endif | |||
*(a + 0) = cc1; | |||
*(a + 1) = cc2; | |||
*(c + j * 2 + 0 + i * ldc) = cc1; | |||
*(c + j * 2 + 1 + i * ldc) = cc2; | |||
a += 2; | |||
for (k = 0; k < i; k ++){ | |||
#ifndef CONJ | |||
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); | |||
*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); | |||
#else | |||
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); | |||
*(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); | |||
#endif | |||
} | |||
} | |||
b -= n * 2; | |||
a -= 4 * m; | |||
} | |||
} | |||
#endif | |||
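/* Driver for the RT case: column blocks are solved right to left, with b and c
   first advanced past the end and then stepped backwards; kk starts at
   n - offset and shrinks, and the GEMM update uses the trailing k - kk
   already-solved columns. */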
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
#ifdef COMPLEX | |||
FLOAT dummy2, | |||
#endif | |||
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | |||
BLASLONG i, j; | |||
FLOAT *aa, *cc; | |||
BLASLONG kk; | |||
#ifdef DOUBLE | |||
int sve_size = svcntd(); | |||
#else | |||
int sve_size = svcntw(); | |||
#endif | |||
#if 0 | |||
fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", | |||
m, n, k, offset); | |||
#endif | |||
kk = n - offset; | |||
c += n * ldc * COMPSIZE; | |||
b += n * k * COMPSIZE; | |||
if (n & (GEMM_UNROLL_N - 1)) { | |||
j = 1; | |||
while (j < GEMM_UNROLL_N) { | |||
if (n & j) { | |||
aa = a; | |||
b -= j * k * COMPSIZE; | |||
	c -= j * ldc * COMPSIZE;
cc = c; | |||
i = sve_size; | |||
if (i <= m) { | |||
do { | |||
if (k - kk > 0) { | |||
GEMM_KERNEL(sve_size, j, k - kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa + sve_size * kk * COMPSIZE, | |||
b + j * kk * COMPSIZE, | |||
cc, | |||
ldc); | |||
} | |||
solve(sve_size, j, | |||
aa + (kk - j) * sve_size * COMPSIZE, | |||
b + (kk - j) * j * COMPSIZE, | |||
cc, ldc); | |||
aa += sve_size * k * COMPSIZE; | |||
cc += sve_size * COMPSIZE; | |||
i += sve_size; | |||
} while (i <= m); | |||
} | |||
i = m % sve_size; | |||
if (i) { | |||
if (k - kk > 0) { | |||
GEMM_KERNEL(i, j, k - kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa + i * kk * COMPSIZE, | |||
b + j * kk * COMPSIZE, | |||
cc, ldc); | |||
} | |||
solve(i, j, | |||
aa + (kk - j) * i * COMPSIZE, | |||
b + (kk - j) * j * COMPSIZE, | |||
cc, ldc); | |||
aa += i * k * COMPSIZE; | |||
cc += i * COMPSIZE; | |||
} | |||
kk -= j; | |||
} | |||
j <<= 1; | |||
} | |||
} | |||
j = (n >> GEMM_UNROLL_N_SHIFT); | |||
if (j > 0) { | |||
do { | |||
aa = a; | |||
b -= GEMM_UNROLL_N * k * COMPSIZE; | |||
c -= GEMM_UNROLL_N * ldc * COMPSIZE; | |||
cc = c; | |||
i = sve_size; | |||
if (i <= m) { | |||
do { | |||
if (k - kk > 0) { | |||
GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa + sve_size * kk * COMPSIZE, | |||
b + GEMM_UNROLL_N * kk * COMPSIZE, | |||
cc, | |||
ldc); | |||
} | |||
solve(sve_size, GEMM_UNROLL_N, | |||
aa + (kk - GEMM_UNROLL_N) * sve_size * COMPSIZE, | |||
b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, | |||
cc, ldc); | |||
aa += sve_size * k * COMPSIZE; | |||
cc += sve_size * COMPSIZE; | |||
i += sve_size; | |||
} while (i <= m); | |||
} | |||
i = m % sve_size; | |||
if (i) { | |||
if (k - kk > 0) { | |||
GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa + i * kk * COMPSIZE, | |||
b + GEMM_UNROLL_N * kk * COMPSIZE, | |||
cc, | |||
ldc); | |||
} | |||
solve(i, GEMM_UNROLL_N, | |||
aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, | |||
b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, | |||
cc, ldc); | |||
aa += i * k * COMPSIZE; | |||
cc += i * COMPSIZE; | |||
} | |||
kk -= GEMM_UNROLL_N; | |||
j --; | |||
} while (j > 0); | |||
} | |||
return 0; | |||
} | |||
@@ -0,0 +1,119 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include "arm_sve.h" | |||
#ifndef UNIT | |||
#define INV(a) (ONE / (a)) | |||
#else | |||
#define INV(a) (ONE) | |||
#endif | |||
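/* Packs one triangular panel for the TRSM kernels. Diagonal n_active x
   n_active blocks are stored explicitly with INV() applied to the diagonal
   entries (INV honours the UNIT flag); rows in the stored triangle are copied
   with a single gather (stride lda, one element per column) per row. Rows in
   the zero triangle only advance b without writing; the consuming kernel is
   expected never to read that region. */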
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
BLASLONG i, ii, jj; | |||
FLOAT *ao; | |||
jj = offset; | |||
#ifdef DOUBLE | |||
int64_t js = 0; | |||
svint64_t index = svindex_s64(0LL, lda); | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
int32_t N = n; | |||
int32_t js = 0; | |||
svint32_t index = svindex_s32(0, lda); | |||
svbool_t pn = svwhilelt_b32(js, N); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do { | |||
ao = a; | |||
i = 0; | |||
ii = 0; | |||
do { | |||
if (ii == jj) { | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0; k < j; k++) { | |||
*(b + j * n_active + k) = *(ao + k * lda + j); | |||
} | |||
*(b + j * n_active + j) = INV(*(ao + j * lda + j)); | |||
} | |||
ao += n_active; | |||
b += n_active * n_active; | |||
i += n_active; | |||
ii += n_active; | |||
} else { | |||
if (ii > jj) { | |||
#ifdef DOUBLE | |||
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | |||
#else | |||
svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); | |||
#endif | |||
svst1(pn, b, aj_vec); | |||
} | |||
ao++; | |||
b += n_active; | |||
i++; | |||
ii++; | |||
} | |||
} while (i < m); | |||
a += n_active * lda; | |||
jj += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, N); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
return 0; | |||
} |
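All four SVE TRSM copy routines share this vector-length-agnostic driver shape: svwhilelt builds a predicate for the columns that remain, svcntp reports how many lanes are active, and the loop terminates when no lane is active. A stripped-down sketch of just that driver (double precision; nothing is assumed about the hardware vector length):

#include <stdint.h>
#include <arm_sve.h>

/* Sketch: walk n items in vector-length-agnostic chunks, mirroring the
 * svwhilelt/svcntp/svptest_any pattern used by the copy routines above. */
static void vla_walk(int64_t n) {
    int64_t js = 0;
    svbool_t pn = svwhilelt_b64(js, n);           /* lane active iff js+lane < n */
    do {
        int64_t n_active = svcntp_b64(svptrue_b64(), pn); /* active lane count */
        /* ... process n_active columns starting at js ... */
        js += n_active;
        pn = svwhilelt_b64(js, n);
    } while (svptest_any(svptrue_b64(), pn));     /* stop when no lane is active */
}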
@@ -0,0 +1,117 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include "arm_sve.h" | |||
#ifndef UNIT | |||
#define INV(a) (ONE / (a)) | |||
#else | |||
#define INV(a) (ONE) | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
BLASLONG i, ii, jj; | |||
FLOAT *ao; | |||
jj = offset; | |||
#ifdef DOUBLE | |||
int64_t js = 0; | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
int32_t N = n; | |||
int32_t js = 0; | |||
svbool_t pn = svwhilelt_b32(js, N); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do { | |||
ao = a; | |||
i = 0; | |||
ii = 0; | |||
do { | |||
if (ii == jj) { | |||
for (int j = 0; j < n_active; j++) { | |||
*(b + j * n_active + j) = INV(*(ao + j * lda + j)); | |||
for (int k = j+1; k < n_active; k++) { | |||
*(b + j * n_active + k) = *(ao + j * lda + k); | |||
} | |||
} | |||
b += n_active * n_active; | |||
ao += lda * n_active; | |||
i += n_active; | |||
ii += n_active; | |||
} else { | |||
if (ii < jj) { | |||
#ifdef DOUBLE | |||
svfloat64_t aj_vec = svld1(pn, ao); | |||
#else | |||
svfloat32_t aj_vec = svld1(pn, ao); | |||
#endif | |||
svst1(pn, b, aj_vec); | |||
} | |||
ao += lda; | |||
b += n_active; | |||
i ++; | |||
ii ++; | |||
} | |||
} while (i < m); | |||
a += n_active; | |||
jj += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, N); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,119 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include "arm_sve.h" | |||
#ifndef UNIT | |||
#define INV(a) (ONE / (a)) | |||
#else | |||
#define INV(a) (ONE) | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
BLASLONG i, ii, jj; | |||
FLOAT *ao; | |||
jj = offset; | |||
#ifdef DOUBLE | |||
int64_t js = 0; | |||
svint64_t index = svindex_s64(0LL, lda); | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
int32_t N = n; | |||
int32_t js = 0; | |||
svint32_t index = svindex_s32(0, lda); | |||
svbool_t pn = svwhilelt_b32(js, N); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do { | |||
ao = a; | |||
i = 0; | |||
ii = 0; | |||
do { | |||
if (ii == jj) { | |||
for (int j = 0; j < n_active; j++) { | |||
*(b + j * n_active + j) = INV(*(ao + j * lda + j)); | |||
for (int k = j+1; k < n_active; k++) { | |||
*(b + j * n_active + k) = *(ao + k * lda + j); | |||
} | |||
} | |||
ao += n_active; | |||
b += n_active * n_active; | |||
i += n_active; | |||
ii += n_active; | |||
} else { | |||
if (ii < jj) { | |||
#ifdef DOUBLE | |||
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | |||
#else | |||
svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); | |||
#endif | |||
svst1(pn, b, aj_vec); | |||
} | |||
ao++; | |||
b += n_active; | |||
i++; | |||
ii++; | |||
} | |||
} while (i < m); | |||
a += n_active * lda; | |||
jj += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, N); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,117 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include "arm_sve.h" | |||
#ifndef UNIT | |||
#define INV(a) (ONE / (a)) | |||
#else | |||
#define INV(a) (ONE) | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
BLASLONG i, ii, jj; | |||
FLOAT *ao; | |||
jj = offset; | |||
#ifdef DOUBLE | |||
int64_t js = 0; | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
int32_t N = n; | |||
int32_t js = 0; | |||
svbool_t pn = svwhilelt_b32(js, N); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do { | |||
ao = a; | |||
i = 0; | |||
ii = 0; | |||
do { | |||
if (ii == jj) { | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0; k < j; k++) { | |||
*(b + j * n_active + k) = *(ao + j * lda + k); | |||
} | |||
*(b + j * n_active + j) = INV(*(ao + j * lda + j)); | |||
} | |||
ao += lda * n_active; | |||
b += n_active * n_active; | |||
i += n_active; | |||
ii += n_active; | |||
} else { | |||
if (ii > jj) { | |||
#ifdef DOUBLE | |||
svfloat64_t aj_vec = svld1(pn, ao); | |||
#else | |||
svfloat32_t aj_vec = svld1(pn, ao); | |||
#endif | |||
svst1(pn, b, aj_vec); | |||
} | |||
ao += lda; | |||
b += n_active; | |||
i ++; | |||
ii ++; | |||
} | |||
} while (i < m); | |||
a += n_active; | |||
jj += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, N); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,874 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2015, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
/* X0 X1 X2 s0 X3 x4 x5 x6 */ | |||
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ | |||
#define origM x0 | |||
#define origN x1 | |||
#define origK x2 | |||
#define origPA x3 | |||
#define origPB x4 | |||
#define pC x5 | |||
#define LDC x6 | |||
#define temp x7 | |||
#define counterL x8 | |||
#define counterI x9 | |||
#define counterJ x10 | |||
#define pB x11 | |||
#define pCRow0 x12 | |||
#define pCRow1 x13 | |||
#define pCRow2 x14 | |||
#define pCRow3 x15 | |||
#define pA x16 | |||
#define lanes x17 | |||
#define alphaR x19 | |||
#define alphaI x20 | |||
#define alphaz_R z6.d | |||
#define alphaz_I z7.d | |||
#define alpha0_R d6 | |||
#define alpha0_I d7 | |||
#define A_PRE_SIZE 2560 | |||
#define B_PRE_SIZE 448 | |||
#define C_PRE_SIZE 128 | |||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
#define OP_rr fmla | |||
#define OP_ii fmls | |||
#define OP_ri fmla | |||
#define OP_ir fmla | |||
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
#define OP_rr fmla | |||
#define OP_ii fmla | |||
#define OP_ri fmls | |||
#define OP_ir fmla | |||
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
#define OP_rr fmla | |||
#define OP_ii fmla | |||
#define OP_ri fmla | |||
#define OP_ir fmls | |||
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
#define OP_rr fmla | |||
#define OP_ii fmls | |||
#define OP_ri fmls | |||
#define OP_ir fmls | |||
#endif | |||
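// The OP_xx table above encodes the sign pattern of a complex multiply-accumulate: | |||
//   c_r += a_r*b_r - (s_a*a_i)*(s_b*b_i) | |||
//   c_i += a_r*(s_b*b_i) + (s_a*a_i)*b_r | |||
// where s_a = -1 for the variants that conjugate A (R/C in the first letter) and | |||
// s_b = -1 for those that conjugate B (R/C in the second letter); choosing fmla | |||
// or fmls per slot implements exactly these four sign combinations. | |||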
// 00 origM | |||
// 01 origN | |||
// 02 origK | |||
// 03 origPA | |||
// 04 origPB | |||
// 05 pC | |||
// 06 origLDC -> LDC | |||
// 07 offset -> temp | |||
// 08 counterL | |||
// 09 counterI | |||
// 10 counterJ | |||
// 11 pB | |||
// 12 pCRow0 | |||
// 13 pCRow1 | |||
// 14 pCRow2 | |||
// 15 pCRow3 | |||
// 16 pA | |||
// 17 lanes | |||
// 18 must save | |||
// 19 must save alphaR | |||
// 20 must save alphaI | |||
// 21 must save | |||
// 22 must save | |||
// 23 must save | |||
// 24 must save | |||
// 25 must save | |||
// 26 must save | |||
// 27 must save | |||
// 28 must save | |||
// 29 frame | |||
// 30 link | |||
// 31 sp | |||
//v00 ALPHA_R -> pA00_R, pA01_R | |||
//v01 ALPHA_I -> pA00_I, pA01_I | |||
//v02 pA02_R, pA03_R | |||
//v03 pA02_I, pA03_I | |||
//v04 pA10_R, pA11_R | |||
//v05 pA10_I, pA11_I | |||
//v06 pA12_R, pA13_R | |||
//v07 pA12_I, pA13_I | |||
//v08 must save pB00_R, pB01_R | |||
//v09 must save pB00_I, pB01_I | |||
//v10 must save pB02_R, pB03_R OR ALPHA0_R | |||
//v11 must save pB02_I, pB03_I OR ALPHA0_I | |||
//v12 must save pB10_R, pB11_R | |||
//v13 must save pB10_I, pB11_I | |||
//v14 must save pB12_R, pB13_R OR ALPHA1_R | |||
//v15 must save pB12_I, pB13_I OR ALPHA1_I | |||
//v16 pC0R | |||
//v17 pC0I | |||
//v18 pC1R | |||
//v19 pC1I | |||
//v20 pC2R | |||
//v21 pC2I | |||
//v22 pC3R | |||
//v23 pC3I | |||
//v24 C row0 R (SAVE temp) | |||
//v25 C row0 I (SAVE temp) | |||
//v26 C row1 R (SAVE temp) | |||
//v27 C row1 I (SAVE temp) | |||
//v28 C row2 R (SAVE temp) | |||
//v29 C row2 I (SAVE temp) | |||
//v30 C row3 R (SAVE temp) | |||
//v31 C row3 I (SAVE temp) | |||
/******************************************************************************* | |||
* Macro definitions | |||
*******************************************************************************/ | |||
.macro INITv1x4 | |||
dup z16.d, #0 | |||
dup z17.d, #0 | |||
dup z18.d, #0 | |||
dup z19.d, #0 | |||
dup z20.d, #0 | |||
dup z21.d, #0 | |||
dup z22.d, #0 | |||
dup z23.d, #0 | |||
.endm | |||
.macro KERNELv1x4_I | |||
ld2d {z0.d, z1.d}, p1/z, [pA] | |||
add pA, pA, lanes, lsl #4 // pA += lanes*2*8 | |||
ld2d {z2.d, z3.d}, p1/z, [pA] // next one | |||
add pA, pA, lanes, lsl #4 // pA += lanes*2*8 | |||
ld1rd z8.d, p0/z, [pB] | |||
ld1rd z9.d, p0/z, [pB, 8] | |||
ld1rd z10.d, p0/z, [pB, 16] | |||
ld1rd z11.d, p0/z, [pB, 24] | |||
ld1rd z12.d, p0/z, [pB, 32] | |||
ld1rd z13.d, p0/z, [pB, 40] | |||
ld1rd z14.d, p0/z, [pB, 48] | |||
ld1rd z15.d, p0/z, [pB, 56] | |||
add pB, pB, 64 | |||
fmla z16.d, p1/m, z0.d, z8.d | |||
OP_ir z17.d, p1/m, z1.d, z8.d | |||
ld1rd z8.d, p0/z, [pB] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
#eor z17.16b, z17.16b, z17.16b | |||
fmls z17.d, p1/m, z0.d, z9.d | |||
#else | |||
fmla z17.d, p1/m, z0.d, z9.d | |||
#endif | |||
OP_ii z16.d, p1/m, z1.d, z9.d | |||
ld1rd z9.d, p0/z, [pB, 8] | |||
fmla z18.d, p1/m, z0.d, z10.d | |||
OP_ir z19.d, p1/m, z1.d, z10.d | |||
ld1rd z10.d, p0/z, [pB, 16] | |||
OP_ii z18.d, p1/m, z1.d, z11.d | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
#eor z19.16b, z19.16b, z19.16b | |||
fmls z19.d, p1/m, z0.d, z11.d | |||
#else | |||
fmla z19.d, p1/m, z0.d, z11.d | |||
#endif | |||
ld1rd z11.d, p0/z, [pB, 24] | |||
fmla z20.d, p1/m, z0.d, z12.d | |||
OP_ir z21.d, p1/m, z1.d, z12.d | |||
ld1rd z12.d, p0/z, [pB, 32] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
#eor z21.16b, z21.16b, z21.16b | |||
fmls z21.d, p1/m, z0.d, z13.d | |||
#else | |||
fmla z21.d, p1/m, z0.d, z13.d | |||
#endif | |||
OP_ii z20.d, p1/m, z1.d, z13.d | |||
ld1rd z13.d, p0/z, [pB, 40] | |||
fmla z22.d, p1/m, z0.d, z14.d | |||
OP_ir z23.d, p1/m, z1.d, z14.d | |||
ld1rd z14.d, p0/z, [pB, 48] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
#eor z23.16b, z23.16b, z23.16b | |||
fmls z23.d, p1/m, z0.d, z15.d | |||
#else | |||
fmla z23.d, p1/m, z0.d, z15.d | |||
#endif | |||
OP_ii z22.d, p1/m, z1.d, z15.d | |||
ld1rd z15.d, p0/z, [pB, 56] | |||
add pB, pB, 64 | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
.endm | |||
.macro KERNELv1x4_M1 | |||
ld2d {z2.d, z3.d}, p1/z, [pA] | |||
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 | |||
OP_rr z16.d, p1/m, z0.d, z8.d | |||
OP_ir z17.d, p1/m, z1.d, z8.d | |||
ld1rd z8.d, p0/z, [pB] | |||
OP_ii z16.d, p1/m, z1.d, z9.d | |||
OP_ri z17.d, p1/m, z0.d, z9.d | |||
ld1rd z9.d, p0/z, [pB, 8] | |||
OP_rr z18.d, p1/m, z0.d, z10.d | |||
OP_ir z19.d, p1/m, z1.d, z10.d | |||
ld1rd z10.d, p0/z, [pB, 16] | |||
OP_ii z18.d, p1/m, z1.d, z11.d | |||
OP_ri z19.d, p1/m, z0.d, z11.d | |||
ld1rd z11.d, p0/z, [pB, 24] | |||
OP_rr z20.d, p1/m, z0.d, z12.d | |||
OP_ir z21.d, p1/m, z1.d, z12.d | |||
ld1rd z12.d, p0/z, [pB, 32] | |||
OP_ii z20.d, p1/m, z1.d, z13.d | |||
OP_ri z21.d, p1/m, z0.d, z13.d | |||
ld1rd z13.d, p0/z, [pB, 40] | |||
OP_rr z22.d, p1/m, z0.d, z14.d | |||
OP_ir z23.d, p1/m, z1.d, z14.d | |||
ld1rd z14.d, p0/z, [pB, 48] | |||
OP_ii z22.d, p1/m, z1.d, z15.d | |||
OP_ri z23.d, p1/m, z0.d, z15.d | |||
ld1rd z15.d, p0/z, [pB, 56] | |||
add pB, pB, 64 | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
.endm | |||
.macro KERNELv1x4_M2 | |||
ld2d {z0.d, z1.d}, p1/z, [pA] | |||
add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8 | |||
OP_rr z16.d, p1/m, z2.d, z8.d | |||
OP_ir z17.d, p1/m, z3.d, z8.d | |||
ld1rd z8.d, p0/z, [pB] | |||
OP_ii z16.d, p1/m, z3.d, z9.d | |||
OP_ri z17.d, p1/m, z2.d, z9.d | |||
ld1rd z9.d, p0/z, [pB, 8] | |||
OP_rr z18.d, p1/m, z2.d, z10.d | |||
OP_ir z19.d, p1/m, z3.d, z10.d | |||
ld1rd z10.d, p0/z, [pB, 16] | |||
OP_ii z18.d, p1/m, z3.d, z11.d | |||
OP_ri z19.d, p1/m, z2.d, z11.d | |||
ld1rd z11.d, p0/z, [pB, 24] | |||
OP_rr z20.d, p1/m, z2.d, z12.d | |||
OP_ir z21.d, p1/m, z3.d, z12.d | |||
ld1rd z12.d, p0/z, [pB, 32] | |||
OP_ii z20.d, p1/m, z3.d, z13.d | |||
OP_ri z21.d, p1/m, z2.d, z13.d | |||
ld1rd z13.d, p0/z, [pB, 40] | |||
OP_rr z22.d, p1/m, z2.d, z14.d | |||
OP_ir z23.d, p1/m, z3.d, z14.d | |||
ld1rd z14.d, p0/z, [pB, 48] | |||
OP_ii z22.d, p1/m, z3.d, z15.d | |||
OP_ri z23.d, p1/m, z2.d, z15.d | |||
ld1rd z15.d, p0/z, [pB, 56] | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
add pB, pB, 64 | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
.endm | |||
.macro KERNELv1x4_E | |||
OP_rr z16.d, p1/m, z2.d, z8.d | |||
OP_ir z17.d, p1/m, z3.d, z8.d | |||
OP_ii z16.d, p1/m, z3.d, z9.d | |||
OP_ri z17.d, p1/m, z2.d, z9.d | |||
OP_rr z18.d, p1/m, z2.d, z10.d | |||
OP_ir z19.d, p1/m, z3.d, z10.d | |||
OP_ii z18.d, p1/m, z3.d, z11.d | |||
OP_ri z19.d, p1/m, z2.d, z11.d | |||
OP_rr z20.d, p1/m, z2.d, z12.d | |||
OP_ir z21.d, p1/m, z3.d, z12.d | |||
OP_ii z20.d, p1/m, z3.d, z13.d | |||
OP_ri z21.d, p1/m, z2.d, z13.d | |||
OP_rr z22.d, p1/m, z2.d, z14.d | |||
OP_ir z23.d, p1/m, z3.d, z14.d | |||
OP_ii z22.d, p1/m, z3.d, z15.d | |||
OP_ri z23.d, p1/m, z2.d, z15.d | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
.endm | |||
.macro KERNELv1x4_SUB | |||
ld2d {z0.d, z1.d}, p1/z, [pA] | |||
add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 | |||
ld1rd z8.d, p0/z, [pB] | |||
ld1rd z9.d, p0/z, [pB, 8] | |||
ld1rd z10.d, p0/z, [pB, 16] | |||
ld1rd z11.d, p0/z, [pB, 24] | |||
OP_rr z16.d, p1/m, z0.d, z8.d | |||
OP_ir z17.d, p1/m, z1.d, z8.d | |||
OP_ii z16.d, p1/m, z1.d, z9.d | |||
OP_ri z17.d, p1/m, z0.d, z9.d | |||
ld1rd z12.d, p0/z, [pB, 32] | |||
ld1rd z13.d, p0/z, [pB, 40] | |||
ld1rd z14.d, p0/z, [pB, 48] | |||
ld1rd z15.d, p0/z, [pB, 56] | |||
OP_rr z18.d, p1/m, z0.d, z10.d | |||
OP_ir z19.d, p1/m, z1.d, z10.d | |||
OP_ii z18.d, p1/m, z1.d, z11.d | |||
OP_ri z19.d, p1/m, z0.d, z11.d | |||
add pB, pB, 64 | |||
OP_rr z20.d, p1/m, z0.d, z12.d | |||
OP_ir z21.d, p1/m, z1.d, z12.d | |||
OP_ii z20.d, p1/m, z1.d, z13.d | |||
OP_ri z21.d, p1/m, z0.d, z13.d | |||
OP_rr z22.d, p1/m, z0.d, z14.d | |||
OP_ir z23.d, p1/m, z1.d, z14.d | |||
OP_ii z22.d, p1/m, z1.d, z15.d | |||
OP_ri z23.d, p1/m, z0.d, z15.d | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
.endm | |||
.macro SAVEv1x4 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
ld2d {z24.d, z25.d}, p1/z, [pCRow0] | |||
fmla z24.d, p1/m, z16.d, alphaz_R | |||
fmls z24.d, p1/m, z17.d, alphaz_I | |||
fmla z25.d, p1/m, z16.d, alphaz_I | |||
fmla z25.d, p1/m, z17.d, alphaz_R | |||
st2d {z24.d, z25.d}, p1, [pCRow0] | |||
add pCRow0, pCRow0, lanes, lsl #4 | |||
ld2d {z26.d, z27.d}, p1/z, [pCRow1] | |||
fmla z26.d, p1/m, z18.d, alphaz_R | |||
fmls z26.d, p1/m, z19.d, alphaz_I | |||
fmla z27.d, p1/m, z18.d, alphaz_I | |||
fmla z27.d, p1/m, z19.d, alphaz_R | |||
st2d {z26.d, z27.d}, p1, [pCRow1] | |||
add pCRow1, pCRow1, lanes, lsl #4 | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
ld2d {z28.d, z29.d}, p1/z, [pCRow2] | |||
fmla z28.d, p1/m, z20.d, alphaz_R | |||
fmls z28.d, p1/m, z21.d, alphaz_I | |||
fmla z29.d, p1/m, z20.d, alphaz_I | |||
fmla z29.d, p1/m, z21.d, alphaz_R | |||
st2d {z28.d, z29.d}, p1, [pCRow2] | |||
add pCRow2, pCRow2, lanes, lsl #4 | |||
ld2d {z30.d, z31.d}, p1/z, [pCRow3] | |||
fmla z30.d, p1/m, z22.d, alphaz_R | |||
fmls z30.d, p1/m, z23.d, alphaz_I | |||
fmla z31.d, p1/m, z22.d, alphaz_I | |||
fmla z31.d, p1/m, z23.d, alphaz_R | |||
st2d {z30.d, z31.d}, p1, [pCRow3] | |||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 | |||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
.endm | |||
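// SAVEv1x4 merges the accumulators into C with a complex alpha scale, one | |||
// fmla/fmls pair per component: | |||
//   C_r += acc_r*alpha_r - acc_i*alpha_i | |||
//   C_i += acc_r*alpha_i + acc_i*alpha_r | |||
// i.e. C += alpha * acc; SAVEv1x2 and SAVEv1x1 below follow the same pattern. | |||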
/******************************************************************************/ | |||
.macro INITv1x2 | |||
dup z16.d, #0 | |||
dup z17.d, #0 | |||
dup z18.d, #0 | |||
dup z19.d, #0 | |||
.endm | |||
.macro KERNELv1x2_SUB | |||
ld2d {z0.d, z1.d}, p1/z, [pA] | |||
add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 | |||
ld1rd z8.d, p0/z, [pB] | |||
ld1rd z9.d, p0/z, [pB, 8] | |||
ld1rd z10.d, p0/z, [pB, 16] | |||
ld1rd z11.d, p0/z, [pB, 24] | |||
OP_rr z16.d, p1/m, z0.d, z8.d | |||
OP_ir z17.d, p1/m, z1.d, z8.d | |||
OP_ii z16.d, p1/m, z1.d, z9.d | |||
OP_ri z17.d, p1/m, z0.d, z9.d | |||
OP_rr z18.d, p1/m, z0.d, z10.d | |||
OP_ir z19.d, p1/m, z1.d, z10.d | |||
OP_ii z18.d, p1/m, z1.d, z11.d | |||
OP_ri z19.d, p1/m, z0.d, z11.d | |||
add pB, pB, 32 | |||
.endm | |||
.macro SAVEv1x2 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
ld2d {z24.d, z25.d}, p1/z, [pCRow0] | |||
fmla z24.d, p1/m, z16.d, alphaz_R | |||
fmls z24.d, p1/m, z17.d, alphaz_I | |||
fmla z25.d, p1/m, z16.d, alphaz_I | |||
fmla z25.d, p1/m, z17.d, alphaz_R | |||
st2d {z24.d, z25.d}, p1, [pCRow0] | |||
add pCRow0, pCRow0, lanes, lsl #4 | |||
ld2d {z26.d, z27.d}, p1/z, [pCRow1] | |||
fmla z26.d, p1/m, z18.d, alphaz_R | |||
fmls z26.d, p1/m, z19.d, alphaz_I | |||
fmla z27.d, p1/m, z18.d, alphaz_I | |||
fmla z27.d, p1/m, z19.d, alphaz_R | |||
st2d {z26.d, z27.d}, p1, [pCRow1] | |||
add pCRow1, pCRow1, lanes, lsl #4 | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
.endm | |||
/******************************************************************************/ | |||
.macro INITv1x1 | |||
dup z16.d, #0 | |||
dup z17.d, #0 | |||
.endm | |||
.macro KERNELv1x1_SUB | |||
ld2d {z0.d, z1.d}, p1/z, [pA] | |||
add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 | |||
ld1rd z8.d, p0/z, [pB] | |||
ld1rd z9.d, p0/z, [pB, 8] | |||
add pB, pB, 16 | |||
OP_rr z16.d, p1/m, z0.d, z8.d | |||
OP_ir z17.d, p1/m, z1.d, z8.d | |||
OP_ii z16.d, p1/m, z1.d, z9.d | |||
OP_ri z17.d, p1/m, z0.d, z9.d | |||
.endm | |||
.macro SAVEv1x1 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
ld2d {z24.d, z25.d}, p1/z, [pCRow0] | |||
fmla z24.d, p1/m, z16.d, alphaz_R | |||
fmls z24.d, p1/m, z17.d, alphaz_I | |||
fmla z25.d, p1/m, z16.d, alphaz_I | |||
fmla z25.d, p1/m, z17.d, alphaz_R | |||
st2d {z24.d, z25.d}, p1, [pCRow0] | |||
add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 | |||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
.endm | |||
/******************************************************************************/ | |||
/******************************************************************************* | |||
* End of macro definitions | |||
*******************************************************************************/ | |||
PROLOGUE | |||
.align 5 | |||
add sp, sp, #-(11 * 16) | |||
stp d8, d9, [sp, #(0 * 16)] | |||
stp d10, d11, [sp, #(1 * 16)] | |||
stp d12, d13, [sp, #(2 * 16)] | |||
stp d14, d15, [sp, #(3 * 16)] | |||
stp d16, d17, [sp, #(4 * 16)] | |||
stp x18, x19, [sp, #(5 * 16)] | |||
stp x20, x21, [sp, #(6 * 16)] | |||
stp x22, x23, [sp, #(7 * 16)] | |||
stp x24, x25, [sp, #(8 * 16)] | |||
stp x26, x27, [sp, #(9 * 16)] | |||
str x28, [sp, #(10 * 16)] | |||
prfm PLDL1KEEP, [origPB] | |||
prfm PLDL1KEEP, [origPA] | |||
fmov alphaR, d0 | |||
dup alphaz_R, alphaR | |||
fmov alphaI, d1 | |||
dup alphaz_I, alphaI | |||
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 | |||
ptrue p0.d // create true predicate | |||
mov pB, origPB | |||
// Loop over N | |||
mov counterJ, origN | |||
asr counterJ, counterJ, #2 // J = J / 4 | |||
cmp counterJ, #0 | |||
ble .Lzgemm_kernel_L2_BEGIN | |||
/******************************************************************************/ | |||
.Lzgemm_kernel_L4_BEGIN: | |||
mov pCRow0, pC | |||
add pCRow1, pCRow0, LDC | |||
add pCRow2, pCRow1, LDC | |||
add pCRow3, pCRow2, LDC | |||
add pC, pCRow3, LDC | |||
mov pA, origPA // pA = start of A array | |||
.Lzgemm_kernel_L4_Mv1_BEGIN: | |||
/* The loop over M is done in an SVE fashion, so the last M % SVE_LEN iterations are handled in a single predicated sweep */ | |||
mov counterI, #0 | |||
whilelt p1.d, counterI, origM | |||
cntp lanes, p0, p1.d // lanes contains the number of active SVE lanes in the M dimension | |||
.align 5 | |||
.Lzgemm_kernel_L4_Mv1_20: | |||
mov pB, origPB | |||
INITv1x4 // fill with zeros | |||
asr counterL , origK, #3 | |||
cmp counterL , #2 | |||
blt .Lzgemm_kernel_L4_Mv1_32 | |||
KERNELv1x4_I | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
subs counterL, counterL, #2 // subtract 2 | |||
ble .Lzgemm_kernel_L4_Mv1_22a | |||
.align 5 | |||
.Lzgemm_kernel_L4_Mv1_22: | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
subs counterL, counterL, #1 | |||
bgt .Lzgemm_kernel_L4_Mv1_22 | |||
.align 5 | |||
.Lzgemm_kernel_L4_Mv1_22a: | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_E | |||
b .Lzgemm_kernel_L4_Mv1_44 | |||
.align 5 | |||
.Lzgemm_kernel_L4_Mv1_32: | |||
tst counterL, #1 | |||
ble .Lzgemm_kernel_L4_Mv1_40 | |||
KERNELv1x4_I | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_M2 | |||
KERNELv1x4_M1 | |||
KERNELv1x4_E | |||
b .Lzgemm_kernel_L4_Mv1_44 | |||
.Lzgemm_kernel_L4_Mv1_40: | |||
INITv1x4 | |||
.Lzgemm_kernel_L4_Mv1_44: | |||
ands counterL , origK, #7 | |||
ble .Lzgemm_kernel_L4_Mv1_100 | |||
.align 5 | |||
.Lzgemm_kernel_L4_Mv1_46: | |||
KERNELv1x4_SUB | |||
subs counterL, counterL, #1 | |||
bne .Lzgemm_kernel_L4_Mv1_46 | |||
.Lzgemm_kernel_L4_Mv1_100: | |||
prfm PLDL1KEEP, [pA] | |||
prfm PLDL1KEEP, [pA, #64] | |||
prfm PLDL1KEEP, [origPB] | |||
SAVEv1x4 | |||
.Lzgemm_kernel_L4_Mv1_END: | |||
incd counterI | |||
whilelt p1.d, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.d // lanes contains the number of active SVE lanes in the M dimension | |||
b.any .Lzgemm_kernel_L4_Mv1_20 | |||
.Lzgemm_kernel_L4_END: | |||
lsl temp, origK, #6 | |||
add origPB, origPB, temp // B = B + K * 4 * 8 * 2 | |||
subs counterJ, counterJ , #1 // j-- | |||
bgt .Lzgemm_kernel_L4_BEGIN | |||
/******************************************************************************/ | |||
.Lzgemm_kernel_L2_BEGIN: // fewer than 4 columns left in the N direction | |||
mov counterJ , origN | |||
tst counterJ , #3 | |||
ble .Lzgemm_kernel_L999 | |||
tst counterJ , #2 | |||
ble .Lzgemm_kernel_L1_BEGIN | |||
mov pCRow0, pC // pCRow0 = pC | |||
add pCRow1, pCRow0, LDC | |||
add pC, pC, LDC, lsl #1 | |||
mov pA, origPA // pA = A | |||
.Lzgemm_kernel_L2_Mv1_BEGIN: | |||
mov counterI, #0 | |||
whilelt p1.d, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.d | |||
.Lzgemm_kernel_L2_Mv1_20: | |||
INITv1x2 | |||
mov pB, origPB | |||
asr counterL , origK, #3 // counterL = counterL / 8 | |||
cmp counterL,#0 | |||
ble .Lzgemm_kernel_L2_Mv1_40 | |||
.align 5 | |||
.Lzgemm_kernel_L2_Mv1_22: | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
subs counterL, counterL, #1 | |||
bgt .Lzgemm_kernel_L2_Mv1_22 | |||
.Lzgemm_kernel_L2_Mv1_40: | |||
ands counterL , origK, #7 // counterL = counterL % 8 | |||
ble .Lzgemm_kernel_L2_Mv1_100 | |||
.Lzgemm_kernel_L2_Mv1_42: | |||
KERNELv1x2_SUB | |||
subs counterL, counterL, #1 | |||
bgt .Lzgemm_kernel_L2_Mv1_42 | |||
.Lzgemm_kernel_L2_Mv1_100: | |||
SAVEv1x2 | |||
.Lzgemm_kernel_L2_Mv1_END: | |||
incd counterI | |||
whilelt p1.d, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.d | |||
b.any .Lzgemm_kernel_L2_Mv1_20 | |||
.Lzgemm_kernel_L2_END: | |||
lsl temp, origK, #5 | |||
add origPB, origPB, temp // B = B + K * 2 * 8 * 2 | |||
/******************************************************************************/ | |||
.Lzgemm_kernel_L1_BEGIN: | |||
mov counterJ , origN | |||
tst counterJ , #1 | |||
ble .Lzgemm_kernel_L999 // done | |||
mov pCRow0, pC // pCRow0 = C | |||
add pC, pC, LDC // update pC to point to the next column | |||
mov pA, origPA // pA = A | |||
.Lzgemm_kernel_L1_Mv1_BEGIN: | |||
mov counterI, #0 | |||
whilelt p1.d, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.d | |||
.Lzgemm_kernel_L1_Mv1_20: | |||
INITv1x1 | |||
mov pB, origPB | |||
asr counterL , origK, #3 // counterL = counterL / 8 | |||
cmp counterL , #0 | |||
ble .Lzgemm_kernel_L1_Mv1_40 | |||
.align 5 | |||
.Lzgemm_kernel_L1_Mv1_22: | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
subs counterL, counterL, #1 | |||
bgt .Lzgemm_kernel_L1_Mv1_22 | |||
.Lzgemm_kernel_L1_Mv1_40: | |||
ands counterL , origK, #7 // counterL = counterL % 8 | |||
ble .Lzgemm_kernel_L1_Mv1_100 | |||
.Lzgemm_kernel_L1_Mv1_42: | |||
KERNELv1x1_SUB | |||
subs counterL, counterL, #1 | |||
bgt .Lzgemm_kernel_L1_Mv1_42 | |||
.Lzgemm_kernel_L1_Mv1_100: | |||
SAVEv1x1 | |||
.Lzgemm_kernel_L1_Mv1_END: | |||
incd counterI | |||
whilelt p1.d, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.d | |||
b.any .Lzgemm_kernel_L1_Mv1_20 | |||
.Lzgemm_kernel_L1_END: | |||
/******************************************************************************/ | |||
.Lzgemm_kernel_L999: | |||
mov x0, #0 // set return value | |||
ldp d8, d9, [sp, #(0 * 16)] | |||
ldp d10, d11, [sp, #(1 * 16)] | |||
ldp d12, d13, [sp, #(2 * 16)] | |||
ldp d14, d15, [sp, #(3 * 16)] | |||
ldp d16, d17, [sp, #(4 * 16)] | |||
ldp x18, x19, [sp, #(5 * 16)] | |||
ldp x20, x21, [sp, #(6 * 16)] | |||
ldp x22, x23, [sp, #(7 * 16)] | |||
ldp x24, x25, [sp, #(8 * 16)] | |||
ldp x26, x27, [sp, #(9 * 16)] | |||
ldr x28, [sp, #(10 * 16)] | |||
add sp, sp, #(11*16) | |||
ret | |||
EPILOGUE | |||
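The M loop in this kernel never needs a scalar tail: whilelt builds a partial predicate for the final M % VL rows, cntp feeds the active-lane count into the pointer arithmetic, and incd/b.any close the loop. A C-intrinsics sketch of the same control flow (the micro-kernel body is elided):

#include <stdint.h>
#include <arm_sve.h>

/* Sketch: the whilelt/cntp/incd loop over M from the assembly above. */
static void m_loop(int64_t m) {
    int64_t counterI = 0;
    svbool_t p1 = svwhilelt_b64(counterI, m);
    while (svptest_any(svptrue_b64(), p1)) {
        uint64_t lanes = svcntp_b64(svptrue_b64(), p1); /* active rows this sweep */
        /* ... run the v1x4/v1x2/v1x1 micro-kernels on `lanes` rows ... */
        counterI += svcntd();       /* incd: advance by the full vector length */
        p1 = svwhilelt_b64(counterI, m);
    }
}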
@@ -0,0 +1,79 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
// TODO: write in assembly with proper unrolling of inner loop | |||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
BLASLONG j; | |||
IFLOAT *aoffset, *aoffset1, *boffset; | |||
svint64_t lda_vec = svindex_s64(0LL, lda * 2); | |||
aoffset = a; | |||
boffset = b; | |||
j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
do { | |||
aoffset1 = aoffset; | |||
uint64_t i_cnt = m; | |||
while (i_cnt--) { | |||
svfloat64_t a_vec_real = svld1_gather_index(pg, (double *) aoffset1, lda_vec); | |||
svfloat64_t a_vec_imag = svld1_gather_index(pg, ((double *) aoffset1) + 1, lda_vec); | |||
svst2_f64(pg, (double *) boffset, svcreate2(a_vec_real, a_vec_imag)); | |||
aoffset1 += 2; | |||
boffset += active * 2; | |||
} | |||
aoffset += active * lda * 2; | |||
j += svcntd(); | |||
pg = svwhilelt_b64(j, n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
return 0; | |||
} |
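The gather-based copy above packs along the strided dimension: the index vector 0, 2*lda, 4*lda, ... makes the two svld1_gather_index calls fetch the real and imaginary parts of one element from each of `active` consecutive columns, and svst2 interleaves them into b. A scalar sketch of one inner-loop step, with the same variable roles as above:

#include <stdint.h>

/* Sketch: scalar equivalent of one gather + st2 step in the copy above. */
static void ncopy_step(const double *aoffset1, double *boffset,
                       int64_t lda, uint64_t active) {
    for (uint64_t lane = 0; lane < active; lane++) {
        boffset[2 * lane]     = aoffset1[2 * lane * lda];      /* real part */
        boffset[2 * lane + 1] = aoffset1[2 * lane * lda + 1];  /* imag part */
    }
}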
@@ -0,0 +1,75 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
// TODO: write in assembly with proper unrolling of inner loop | |||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
BLASLONG j; | |||
IFLOAT *aoffset, *aoffset1, *boffset; | |||
aoffset = a; | |||
boffset = b; | |||
j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
do { | |||
aoffset1 = aoffset; | |||
uint64_t i_cnt = m; | |||
while (i_cnt--) { | |||
svfloat64x2_t a_vec = svld2(pg, (double *)aoffset1); | |||
svst2_f64(pg, (double *) boffset, a_vec); | |||
aoffset1 += lda * 2; | |||
boffset += active * 2; | |||
} | |||
aoffset += active * 2; | |||
j += svcntd(); | |||
pg = svwhilelt_b64(j, n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
return 0; | |||
} |
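This second copy routine walks the contiguous dimension instead, so no gather is needed: svld2 splits `active` consecutive complex pairs into separate real and imaginary vectors, and svst2 immediately re-interleaves them into b. Since load and store layouts match, one step reduces to a contiguous copy, as this sketch shows:

#include <stdint.h>
#include <string.h>

/* Sketch: one svld2 + svst2 step above just moves 2*active contiguous doubles. */
static void tcopy_step(const double *aoffset1, double *boffset, uint64_t active) {
    memcpy(boffset, aoffset1, 2 * active * sizeof(double));
}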
@@ -0,0 +1,172 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
#if defined(DOUBLE) | |||
BLASLONG offset, i; | |||
lda *= 2; | |||
uint64_t sve_size = svcntd(); | |||
svint64_t posY_vec = svdup_s64(posY); | |||
svint64_t posX_vec = svdup_s64(posX); | |||
svint64_t lda_vec = svdup_s64(lda); | |||
svint64_t one_vec = svdup_s64(1LL); | |||
int64_t j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
svint64_t index = svindex_s64(0LL, 1LL); | |||
do { | |||
offset = posX - posY; | |||
svint64_t vec_off = svdup_s64(offset); | |||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
svint64_t temp = svadd_z(pg, posX_vec, index); | |||
svint64_t temp1 = svmul_z(pg, temp, 2); | |||
temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); | |||
svint64_t temp2 = svmul_z(pg, temp, lda_vec); | |||
temp2 = svmla_z(pg, temp2, posY_vec, 2); | |||
svint64_t gat_ind = svsel(cmp, temp1, temp2); | |||
i = m; | |||
while (i>0) { | |||
svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); | |||
svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); | |||
gat_ind = svadd_m(cmp, gat_ind, lda_vec); | |||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); | |||
if (offset <= 0) { | |||
svbool_t off_g = svwhilelt_b64(offset, 0LL); | |||
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | |||
} | |||
svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); | |||
// force the imaginary part of the diagonal element to zero (handled separately) | |||
if (offset > -active && offset < 1) | |||
b[ -2*offset + 1 ] = ZERO; | |||
b += active * 2; | |||
offset --; | |||
vec_off = svsub_z(pg, vec_off, one_vec); | |||
cmp = svcmpgt(pg, vec_off, index_neg); | |||
i--; | |||
} | |||
posX += sve_size; | |||
posX_vec = svdup_s64(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b64(j, n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
#else | |||
int offset, i; | |||
lda *= 2; | |||
uint32_t sve_size = svcntw(); | |||
svint32_t posY_vec = svdup_s32(posY); | |||
svint32_t posX_vec = svdup_s32(posX); | |||
svint32_t lda_vec = svdup_s32(lda); | |||
svint32_t one_vec = svdup_s32(1); | |||
int32_t j = 0; | |||
int32_t N = n; | |||
svbool_t pg = svwhilelt_b32(j, N); | |||
int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
svint32_t index_neg = svindex_s32(0, -1); | |||
svint32_t index = svindex_s32(0, 1); | |||
do { | |||
offset = posX - posY; | |||
svint32_t vec_off = svdup_s32(offset); | |||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
svint32_t temp = svadd_z(pg, posX_vec, index); | |||
svint32_t temp1 = svmul_z(pg, temp, 2); | |||
temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); | |||
svint32_t temp2 = svmul_z(pg, temp, lda_vec); | |||
temp2 = svmla_z(pg, temp2, posY_vec, 2); | |||
svint32_t gat_ind = svsel(cmp, temp1, temp2); | |||
i = m; | |||
while (i>0) { | |||
svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); | |||
svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); | |||
gat_ind = svadd_m(cmp, gat_ind, lda_vec); | |||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); | |||
if (offset <= 0) { | |||
svbool_t off_g = svwhilelt_b32(offset, 0); | |||
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | |||
} | |||
svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); | |||
// force the imaginary part of the diagonal element to zero (handled separately) | |||
if (offset > -active && offset < 1) | |||
b[ -2*offset + 1 ] = ZERO; | |||
b += active * 2; | |||
offset --; | |||
vec_off = svsub_z(pg, vec_off, one_vec); | |||
cmp = svcmpgt(pg, vec_off, index_neg); | |||
i--; | |||
} | |||
posX += sve_size; | |||
posX_vec = svdup_s32(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b32(j, N); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
#endif | |||
return 0; | |||
} |
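This Hermitian copy rebuilds the full panel from one stored triangle while packing: the cmp predicate selects, lane by lane, whether the next element comes from the stored triangle or must be mirrored from the transposed position, svneg conjugates the mirrored lanes, and the b[-2*offset+1] = ZERO store forces the imaginary part of the diagonal element to zero. A scalar sketch of the per-element rule, assuming column-major storage with the lower triangle stored (the helper is illustrative):

#include <stdint.h>

/* Sketch: packing rule for one element of a Hermitian matrix whose lower
 * triangle is stored; lda2 = 2*lda counts doubles per column. */
static void hemm_pack_elem(const double *a, int64_t lda2,
                           int64_t row, int64_t col, double *b) {
    if (row == col) {                     /* diagonal: imag forced to zero */
        b[0] = a[col * lda2 + 2 * row];
        b[1] = 0.0;
    } else if (row > col) {               /* stored triangle: copy as-is */
        b[0] = a[col * lda2 + 2 * row];
        b[1] = a[col * lda2 + 2 * row + 1];
    } else {                              /* mirrored: conjugate the transpose */
        b[0] = a[row * lda2 + 2 * col];
        b[1] = -a[row * lda2 + 2 * col + 1];
    }
}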
@@ -0,0 +1,172 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
#if defined(DOUBLE) | |||
BLASLONG offset, i; | |||
lda *= 2; | |||
uint64_t sve_size = svcntd(); | |||
svint64_t posY_vec = svdup_s64(posY); | |||
svint64_t posX_vec = svdup_s64(posX); | |||
svint64_t lda_vec = svdup_s64(lda); | |||
svint64_t one_vec = svdup_s64(1LL); | |||
int64_t j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
svint64_t index = svindex_s64(0LL, 1LL); | |||
do { | |||
offset = posX - posY; | |||
svint64_t vec_off = svdup_s64(offset); | |||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
svint64_t temp = svadd_z(pg, posX_vec, index); | |||
svint64_t temp1 = svmul_z(pg, temp, lda); | |||
temp1 = svmla_z(pg, temp1, posY_vec, 2); | |||
svint64_t temp2 = svmul_z(pg, temp, 2); | |||
temp2 = svmla_z(pg, temp2, posY_vec, lda); | |||
svint64_t gat_ind = svsel(cmp, temp1, temp2); | |||
i = m; | |||
while (i>0) { | |||
svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); | |||
svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); | |||
gat_ind = svadd_m(cmp, gat_ind, 2); | |||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | |||
data_vec_imag = svneg_z(pg, data_vec_imag); | |||
if (offset <= 0) { | |||
svbool_t off_g = svwhilelt_b64(offset, 0LL); | |||
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | |||
} | |||
svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); | |||
// force the imaginary part of the diagonal element to zero (handled separately) | |||
if (offset > -active && offset < 1) | |||
b[ -2*offset + 1 ] = ZERO; | |||
b += active * 2; | |||
offset --; | |||
vec_off = svsub_z(pg, vec_off, one_vec); | |||
cmp = svcmpgt(pg, vec_off, index_neg); | |||
i--; | |||
} | |||
posX += sve_size; | |||
posX_vec = svdup_s64(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b64(j, n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
#else | |||
int offset, i; | |||
lda *= 2; | |||
uint32_t sve_size = svcntw(); | |||
svint32_t posY_vec = svdup_s32(posY); | |||
svint32_t posX_vec = svdup_s32(posX); | |||
svint32_t lda_vec = svdup_s32(lda); | |||
svint32_t one_vec = svdup_s32(1); | |||
int32_t j = 0; | |||
int32_t N = n; | |||
svbool_t pg = svwhilelt_b32(j, N); | |||
int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
svint32_t index_neg = svindex_s32(0, -1); | |||
svint32_t index = svindex_s32(0, 1); | |||
do { | |||
offset = posX - posY; | |||
svint32_t vec_off = svdup_s32(offset); | |||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
svint32_t temp = svadd_z(pg, posX_vec, index); | |||
svint32_t temp1 = svmul_z(pg, temp, lda); | |||
temp1 = svmla_z(pg, temp1, posY_vec, 2); | |||
svint32_t temp2 = svmul_z(pg, temp, 2); | |||
temp2 = svmla_z(pg, temp2, posY_vec, lda); | |||
svint32_t gat_ind = svsel(cmp, temp1, temp2); | |||
i = m; | |||
while (i>0) { | |||
svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); | |||
svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); | |||
gat_ind = svadd_m(cmp, gat_ind, 2); | |||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | |||
data_vec_imag = svneg_z(pg, data_vec_imag); | |||
if (offset <= 0) { | |||
svbool_t off_g = svwhilelt_b32(offset, 0); | |||
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | |||
} | |||
svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); | |||
// force the imaginary part of the diagonal element to zero (handled separately) | |||
if (offset > -active && offset < 1) | |||
b[ -2*offset + 1 ] = ZERO; | |||
b += active * 2; | |||
offset --; | |||
vec_off = svsub_z(pg, vec_off, one_vec); | |||
cmp = svcmpgt(pg, vec_off, index_neg); | |||
i--; | |||
} | |||
posX += sve_size; | |||
posX_vec = svdup_s32(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b32(j, N); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,150 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
BLASLONG i, offset; | |||
lda *= 2; | |||
#if defined(DOUBLE) | |||
uint64_t sve_size = svcntd(); | |||
svint64_t posY_vec = svdup_s64(posY); | |||
svint64_t posX_vec = svdup_s64(posX); | |||
svint64_t lda_vec = svdup_s64(lda); | |||
svint64_t one_vec = svdup_s64(1LL); | |||
int64_t j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
svint64_t index = svindex_s64(0LL, 1LL); | |||
do { | |||
offset = posX - posY; | |||
svint64_t vec_off = svdup_s64(offset); | |||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
svint64_t temp = svadd_z(pg, posX_vec, index); | |||
svint64_t temp1 = svmul_z(pg, temp, 2); | |||
temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); | |||
svint64_t temp2 = svmul_z(pg, temp, lda_vec); | |||
temp2 = svmla_z(pg, temp2, posY_vec, 2); | |||
svint64_t gat_ind = svsel(cmp, temp1, temp2); | |||
i = m; | |||
while (i>0) { | |||
svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); | |||
svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); | |||
gat_ind = svadd_m(cmp, gat_ind, lda_vec); | |||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); | |||
svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); | |||
b += active * 2; | |||
offset --; | |||
vec_off = svsub_z(pg, vec_off, one_vec); | |||
cmp = svcmpgt(pg, vec_off, index_neg); | |||
i--; | |||
} | |||
posX += sve_size; | |||
posX_vec = svdup_s64(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b64(j, n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
#else | |||
uint32_t sve_size = svcntw(); | |||
svint32_t posY_vec = svdup_s32(posY); | |||
svint32_t posX_vec = svdup_s32(posX); | |||
svint32_t lda_vec = svdup_s32(lda); | |||
svint32_t one_vec = svdup_s32(1); | |||
int32_t N = n; | |||
int32_t j = 0; | |||
svbool_t pg = svwhilelt_b32(j, N); | |||
int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
svint32_t index_neg = svindex_s32(0, -1); | |||
svint32_t index = svindex_s32(0, 1); | |||
do { | |||
offset = posX - posY; | |||
svint32_t vec_off = svdup_s32(offset); | |||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
svint32_t temp = svadd_z(pg, posX_vec, index); | |||
svint32_t temp1 = svmul_z(pg, temp, 2); | |||
temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); | |||
svint32_t temp2 = svmul_z(pg, temp, lda_vec); | |||
temp2 = svmla_z(pg, temp2, posY_vec, 2); | |||
svint32_t gat_ind = svsel(cmp, temp1, temp2); | |||
i = m; | |||
while (i>0) { | |||
svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); | |||
svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); | |||
gat_ind = svadd_m(cmp, gat_ind, lda_vec); | |||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); | |||
svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); | |||
b += active * 2; | |||
offset --; | |||
vec_off = svsub_z(pg, vec_off, one_vec); | |||
cmp = svcmpgt(pg, vec_off, index_neg); | |||
i--; | |||
} | |||
posX += sve_size; | |||
posX_vec = svdup_s32(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b32(j, N); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,150 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
BLASLONG i, offset; | |||
lda *= 2; | |||
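/* | |||
 * Mirror image of the previous routine: here the cmp lanes gather the | |||
 * transposed element (posY, posX+l) at (posX+l)*lda + 2*posY, and the | |||
 * remaining lanes read (posX+l, posY) directly, so this variant packs | |||
 * from the opposite stored triangle. | |||
 */ | |||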
#if defined(DOUBLE) | |||
uint64_t sve_size = svcntd(); | |||
svint64_t posY_vec = svdup_s64(posY); | |||
svint64_t posX_vec = svdup_s64(posX); | |||
svint64_t lda_vec = svdup_s64(lda); | |||
svint64_t one_vec = svdup_s64(1LL); | |||
int64_t j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
svint64_t index = svindex_s64(0LL, 1LL); | |||
do { | |||
offset = posX - posY; | |||
svint64_t vec_off = svdup_s64(offset); | |||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
svint64_t temp = svadd_z(pg, posX_vec, index); | |||
svint64_t temp1 = svmul_z(pg, temp, lda_vec); | |||
temp1 = svmla_z(pg, temp1, posY_vec, 2); | |||
svint64_t temp2 = svmul_z(pg, temp, 2); | |||
temp2 = svmla_z(pg, temp2, posY_vec, lda); | |||
svint64_t gat_ind = svsel(cmp, temp1, temp2); | |||
i = m; | |||
while (i>0) { | |||
svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); | |||
svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); | |||
gat_ind = svadd_m(cmp, gat_ind, 2); | |||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | |||
svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); | |||
b += active * 2; | |||
offset --; | |||
vec_off = svsub_z(pg, vec_off, one_vec); | |||
cmp = svcmpgt(pg, vec_off, index_neg); | |||
i--; | |||
} | |||
posX += sve_size; | |||
posX_vec = svdup_s64(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b64(j, n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
#else | |||
uint32_t sve_size = svcntw(); | |||
svint32_t posY_vec = svdup_s32(posY); | |||
svint32_t posX_vec = svdup_s32(posX); | |||
svint32_t lda_vec = svdup_s32(lda); | |||
svint32_t one_vec = svdup_s32(1); | |||
int32_t N = n; | |||
int32_t j = 0; | |||
svbool_t pg = svwhilelt_b32(j, N); | |||
int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
svint32_t index_neg = svindex_s32(0, -1); | |||
svint32_t index = svindex_s32(0, 1); | |||
do { | |||
offset = posX - posY; | |||
svint32_t vec_off = svdup_s32(offset); | |||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
svint32_t temp = svadd_z(pg, posX_vec, index); | |||
svint32_t temp1 = svmul_z(pg, temp, lda_vec); | |||
temp1 = svmla_z(pg, temp1, posY_vec, 2); | |||
svint32_t temp2 = svmul_z(pg, temp, 2); | |||
temp2 = svmla_z(pg, temp2, posY_vec, lda); | |||
svint32_t gat_ind = svsel(cmp, temp1, temp2); | |||
i = m; | |||
while (i>0) { | |||
svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); | |||
svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); | |||
gat_ind = svadd_m(cmp, gat_ind, 2); | |||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | |||
svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); | |||
b += active * 2; | |||
offset --; | |||
vec_off = svsub_z(pg, vec_off, one_vec); | |||
cmp = svcmpgt(pg, vec_off, index_neg); | |||
i--; | |||
} | |||
posX += sve_size; | |||
posX_vec = svdup_s32(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b32(j, N); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,145 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#ifdef __ARM_FEATURE_SVE | |||
#include <arm_sve.h> | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
BLASLONG i, js; | |||
BLASLONG X; | |||
lda += lda; | |||
js = 0; | |||
FLOAT *ao; | |||
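/* | |||
 * TRMM packing: walk the panel in strips of n_active columns.  Rows with | |||
 * X > posY are packed transposed via gathers (per-lane stride lda); rows | |||
 * with X < posY advance b without writing, since that part of the packed | |||
 * buffer is presumably never read by the TRMM kernel.  The diagonal block | |||
 * is filled element-wise below. | |||
 */ | |||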
#ifdef DOUBLE | |||
svint64_t index = svindex_s64(0LL, lda); | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
svint32_t index = svindex_s32(0, lda); | |||
svbool_t pn = svwhilelt_b32(js, n); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do | |||
{ | |||
X = posX; | |||
if (posX <= posY) { | |||
ao = a + posY * 2 + posX * lda; | |||
} else { | |||
ao = a + posX * 2 + posY * lda; | |||
} | |||
i = 0; | |||
do | |||
{ | |||
if (X > posY) { | |||
#ifdef DOUBLE | |||
svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||
svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||
#else | |||
svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||
svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||
#endif | |||
svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); | |||
ao += 2; | |||
b += n_active * 2; | |||
X ++; | |||
i ++; | |||
} else | |||
if (X < posY) { | |||
ao += lda; | |||
b += n_active * 2; | |||
X ++; | |||
i ++; | |||
} else { | |||
/* Diagonal block: there is no obvious way to vectorize this while keeping the code vector-length-agnostic, so it is filled with scalar loops. */ | |||
#ifdef UNIT | |||
int temp = 0; | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0 ; k < j; k++) { | |||
b[temp++] = *(ao+k*lda+j*2); | |||
b[temp++] = *(ao+k*lda+j*2+1); | |||
} | |||
b[temp++] = ONE; | |||
b[temp++] = ZERO; | |||
for (int k = j+1; k < n_active; k++) { | |||
b[temp++] = ZERO; | |||
b[temp++] = ZERO; | |||
} | |||
} | |||
#else | |||
int temp = 0; | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0 ; k <= j; k++) { | |||
b[temp++] = *(ao+k*lda+j*2); | |||
b[temp++] = *(ao+k*lda+j*2+1); | |||
} | |||
for (int k = j+1; k < n_active; k++) { | |||
b[temp++] = ZERO; | |||
b[temp++] = ZERO; | |||
} | |||
} | |||
#endif | |||
ao += n_active * 2; | |||
b += n_active*n_active * 2; | |||
X += n_active; | |||
i += n_active; | |||
} | |||
} while (i < m); | |||
posY += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, n); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,143 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#ifdef __ARM_FEATURE_SVE | |||
#include <arm_sve.h> | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
BLASLONG i, js; | |||
BLASLONG X; | |||
lda += lda; | |||
FLOAT *ao; | |||
js = 0; | |||
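/* | |||
 * Same strip-mined walk, but this variant copies rows with X < posY | |||
 * contiguously via svld2/svst2 (de-interleaving and re-interleaving | |||
 * real/imag pairs) and skips rows with X > posY. | |||
 */ | |||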
#ifdef DOUBLE | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
svbool_t pn = svwhilelt_b32(js, n); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do | |||
{ | |||
X = posX; | |||
if (posX <= posY) { | |||
ao = a + posY * 2 + posX * lda; | |||
} else { | |||
ao = a + posX * 2 + posY * lda; | |||
} | |||
i = 0; | |||
do | |||
{ | |||
if (X > posY) { | |||
ao += 2; | |||
b += n_active * 2; | |||
X ++; | |||
i ++; | |||
} else | |||
if (X < posY) { | |||
#ifdef DOUBLE | |||
svfloat64x2_t aj_vec = svld2(pn, ao); | |||
#else | |||
svfloat32x2_t aj_vec = svld2(pn, ao); | |||
#endif | |||
svst2(pn, b, aj_vec); | |||
ao += lda; | |||
b += n_active * 2; | |||
X ++; | |||
i ++; | |||
} else { | |||
/* Diagonal block: there is no obvious way to vectorize this while keeping the code vector-length-agnostic, so it is filled with scalar loops. */ | |||
#ifdef UNIT | |||
int temp = 0; | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0 ; k < j; k++) { | |||
b[temp++] = ZERO; | |||
b[temp++] = ZERO; | |||
} | |||
b[temp++] = ONE; | |||
b[temp++] = ZERO; | |||
for (int k = j+1; k < n_active; k++) { | |||
b[temp++] = *(ao+j*lda+k*2); | |||
b[temp++] = *(ao+j*lda+k*2+1); | |||
} | |||
} | |||
#else | |||
int temp = 0; | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0 ; k < j; k++) { | |||
b[temp++] = ZERO; | |||
b[temp++] = ZERO; | |||
} | |||
for (int k = j; k < n_active; k++) { | |||
b[temp++] = *(ao+j*lda+k*2); | |||
b[temp++] = *(ao+j*lda+k*2+1); | |||
} | |||
} | |||
#endif | |||
ao += n_active * lda; | |||
b += n_active*n_active * 2; | |||
X += n_active; | |||
i += n_active; | |||
} | |||
} while (i < m); | |||
posY += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, n); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,145 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#ifdef __ARM_FEATURE_SVE | |||
#include <arm_sve.h> | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
BLASLONG i, js; | |||
BLASLONG X; | |||
lda += lda; | |||
js = 0; | |||
FLOAT *ao; | |||
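/* | |||
 * Transposed-gather variant for the opposite triangle: rows with | |||
 * X < posY are gathered with per-lane stride lda, rows with X > posY | |||
 * are skipped. | |||
 */ | |||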
#ifdef DOUBLE | |||
svint64_t index = svindex_s64(0LL, lda); | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
svint32_t index = svindex_s32(0, lda); | |||
svbool_t pn = svwhilelt_b32(js, n); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do | |||
{ | |||
X = posX; | |||
if (posX <= posY) { | |||
ao = a + posX * 2 + posY * lda; | |||
} else { | |||
ao = a + posY * 2 + posX * lda; | |||
} | |||
i = 0; | |||
do | |||
{ | |||
if (X < posY) { | |||
#ifdef DOUBLE | |||
svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||
svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||
#else | |||
svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||
svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||
#endif | |||
svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); | |||
ao += 2; | |||
b += n_active * 2; | |||
X ++; | |||
i ++; | |||
} else | |||
if (X > posY) { | |||
ao += lda; | |||
b += n_active * 2; | |||
X ++; | |||
i ++; | |||
} else { | |||
/* Diagonal block: there is no obvious way to vectorize this while keeping the code vector-length-agnostic, so it is filled with scalar loops. */ | |||
#ifdef UNIT | |||
int temp = 0; | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0 ; k < j; k++) { | |||
b[temp++] = ZERO; | |||
b[temp++] = ZERO; | |||
} | |||
b[temp++] = ONE; | |||
b[temp++] = ZERO; | |||
for (int k = j+1; k < n_active; k++) { | |||
b[temp++] = *(ao+k*lda+j*2); | |||
b[temp++] = *(ao+k*lda+j*2+1); | |||
} | |||
} | |||
#else | |||
int temp = 0; | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0 ; k < j; k++) { | |||
b[temp++] = ZERO; | |||
b[temp++] = ZERO; | |||
} | |||
for (int k = j; k < n_active; k++) { | |||
b[temp++] = *(ao+k*lda+j*2); | |||
b[temp++] = *(ao+k*lda+j*2+1); | |||
} | |||
} | |||
#endif | |||
ao += n_active * 2; | |||
b += n_active*n_active * 2; | |||
X += n_active; | |||
i += n_active; | |||
} | |||
} while (i < m); | |||
posY += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, n); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,141 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#ifdef __ARM_FEATURE_SVE | |||
#include <arm_sve.h> | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
BLASLONG i, js; | |||
BLASLONG X; | |||
lda += lda; | |||
FLOAT *ao; | |||
js = 0; | |||
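/* | |||
 * Contiguous svld2 variant for the opposite triangle: rows with | |||
 * X > posY are copied, rows with X < posY are skipped. | |||
 */ | |||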
#ifdef DOUBLE | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
svbool_t pn = svwhilelt_b32(js, n); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do | |||
{ | |||
X = posX; | |||
if (posX <= posY) { | |||
ao = a + posX * 2 + posY * lda; | |||
} else { | |||
ao = a + posY * 2 + posX * lda; | |||
} | |||
i = 0; | |||
do | |||
{ | |||
if (X < posY) { | |||
ao += 2; | |||
b += n_active * 2; | |||
X ++; | |||
i ++; | |||
} else | |||
if (X > posY) { | |||
#ifdef DOUBLE | |||
svfloat64x2_t aj_vec = svld2(pn, ao); | |||
#else | |||
svfloat32x2_t aj_vec = svld2(pn, ao); | |||
#endif | |||
svst2(pn, b, aj_vec); | |||
ao += lda; | |||
b += n_active * 2; | |||
X ++; | |||
i ++; | |||
} else { | |||
/* Diagonal block: there is no obvious way to vectorize this while keeping the code vector-length-agnostic, so it is filled with scalar loops. */ | |||
#ifdef UNIT | |||
int temp = 0; | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0 ; k < j; k++) { | |||
b[temp++] = *(ao+j*lda+k*2); | |||
b[temp++] = *(ao+j*lda+k*2+1); | |||
} | |||
b[temp++] = ONE; | |||
b[temp++] = ZERO; | |||
for (int k = j+1; k < n_active; k++) { | |||
b[temp++] = ZERO; | |||
b[temp++] = ZERO; | |||
} | |||
} | |||
#else | |||
int temp = 0; | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0 ; k <= j; k++) { | |||
b[temp++] = *(ao+j*lda+k*2); | |||
b[temp++] = *(ao+j*lda+k*2+1); | |||
} | |||
for (int k = j+1; k < n_active; k++) { | |||
b[temp++] = ZERO; | |||
b[temp++] = ZERO; | |||
} | |||
} | |||
#endif | |||
ao += n_active * lda; | |||
b += n_active*n_active * 2; | |||
X += n_active; | |||
i += n_active; | |||
} | |||
} while (i < m); | |||
posY += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, n); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,119 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include "arm_sve.h" | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
BLASLONG i, ii, jj; | |||
FLOAT *ao; | |||
lda *= 2; | |||
jj = offset; | |||
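/* | |||
 * TRSM packing: same strip walk as the TRMM copies, except that the | |||
 * diagonal block is always written, with compinv() storing the complex | |||
 * reciprocal of each diagonal entry so the solver kernel can multiply | |||
 * rather than divide.  Off-diagonal rows are gathered (transposed) or | |||
 * skipped. | |||
 */ | |||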
#ifdef DOUBLE | |||
int64_t js = 0; | |||
svint64_t index = svindex_s64(0LL, lda); | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
int32_t N = n; | |||
int32_t js = 0; | |||
svint32_t index = svindex_s32(0, lda); | |||
svbool_t pn = svwhilelt_b32(js, N); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do { | |||
ao = a; | |||
i = 0; | |||
ii = 0; | |||
do { | |||
if (ii == jj) { | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0; k < j; k++) { | |||
*(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); | |||
*(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); | |||
} | |||
compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); | |||
/* complex analogue of the real kernels' *(b + j * n_active + j) = INV(*(ao + j * lda + j)) */ | |||
} | |||
ao += n_active * 2; | |||
b += n_active * n_active * 2; | |||
i += n_active; | |||
ii += n_active; | |||
} else { | |||
if (ii > jj) { | |||
#ifdef DOUBLE | |||
svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||
svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||
#else | |||
svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||
svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||
#endif | |||
svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); | |||
} | |||
ao += 2; | |||
b += n_active * 2; | |||
i++; | |||
ii++; | |||
} | |||
} while (i < m); | |||
a += n_active * lda; | |||
jj += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, N); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,115 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include "arm_sve.h" | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
BLASLONG i, ii, jj; | |||
FLOAT *ao; | |||
lda *= 2; | |||
jj = offset; | |||
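/* | |||
 * svld2 variant of the TRSM copy: rows with ii < jj are copied | |||
 * contiguously, rows with ii > jj are skipped; diagonal entries are | |||
 * again inverted via compinv(). | |||
 */ | |||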
#ifdef DOUBLE | |||
int64_t js = 0; | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
int32_t N = n; | |||
int32_t js = 0; | |||
svbool_t pn = svwhilelt_b32(js, N); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do { | |||
ao = a; | |||
i = 0; | |||
ii = 0; | |||
do { | |||
if (ii == jj) { | |||
for (int j = 0; j < n_active; j++) { | |||
compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); | |||
/* complex analogue of the real kernels' *(b + j * n_active + j) = INV(*(ao + j * lda + j)) */ | |||
for (int k = j+1; k < n_active; k++) { | |||
*(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); | |||
*(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); | |||
} | |||
} | |||
b += n_active * n_active * 2; | |||
ao += lda * n_active; | |||
i += n_active; | |||
ii += n_active; | |||
} else { | |||
if (ii < jj) { | |||
#ifdef DOUBLE | |||
svfloat64x2_t aj_vec = svld2(pn, ao); | |||
#else | |||
svfloat32x2_t aj_vec = svld2(pn, ao); | |||
#endif | |||
svst2(pn, b, aj_vec); | |||
} | |||
ao += lda; | |||
b += n_active * 2; | |||
i ++; | |||
ii ++; | |||
} | |||
} while (i < m); | |||
a += n_active * 2; | |||
jj += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, N); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,119 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include "arm_sve.h" | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
BLASLONG i, ii, jj; | |||
FLOAT *ao; | |||
lda *= 2; | |||
jj = offset; | |||
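/* | |||
 * Gather (transposed) TRSM copy for the opposite triangle: rows with | |||
 * ii < jj are gathered with per-lane stride lda, rows with ii > jj are | |||
 * skipped. | |||
 */ | |||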
#ifdef DOUBLE | |||
int64_t js = 0; | |||
svint64_t index = svindex_s64(0LL, lda); | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
int32_t N = n; | |||
int32_t js = 0; | |||
svint32_t index = svindex_s32(0, lda); | |||
svbool_t pn = svwhilelt_b32(js, N); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do { | |||
ao = a; | |||
i = 0; | |||
ii = 0; | |||
do { | |||
if (ii == jj) { | |||
for (int j = 0; j < n_active; j++) { | |||
compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); | |||
/* complex analogue of the real kernels' *(b + j * n_active + j) = INV(*(ao + j * lda + j)) */ | |||
for (int k = j+1; k < n_active; k++) { | |||
*(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); | |||
*(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); | |||
} | |||
} | |||
ao += n_active * 2; | |||
b += n_active * n_active * 2; | |||
i += n_active; | |||
ii += n_active; | |||
} else { | |||
if (ii < jj) { | |||
#ifdef DOUBLE | |||
svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||
svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||
#else | |||
svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||
svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||
#endif | |||
svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); | |||
} | |||
ao += 2; | |||
b += n_active * 2; | |||
i++; | |||
ii++; | |||
} | |||
} while (i < m); | |||
a += n_active * lda; | |||
jj += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, N); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,115 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include "arm_sve.h" | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
BLASLONG i, ii, jj; | |||
FLOAT *ao; | |||
lda *= 2; | |||
jj = offset; | |||
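/* | |||
 * Contiguous svld2 TRSM copy for the remaining case: rows with ii > jj | |||
 * are copied, rows with ii < jj are skipped. | |||
 */ | |||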
#ifdef DOUBLE | |||
int64_t js = 0; | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
int32_t N = n; | |||
int32_t js = 0; | |||
svbool_t pn = svwhilelt_b32(js, N); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do { | |||
ao = a; | |||
i = 0; | |||
ii = 0; | |||
do { | |||
if (ii == jj) { | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0; k < j; k++) { | |||
*(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); | |||
*(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); | |||
} | |||
compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); | |||
/* complex analogue of the real kernels' *(b + j * n_active + j) = INV(*(ao + j * lda + j)) */ | |||
} | |||
ao += lda * n_active; | |||
b += n_active * n_active * 2; | |||
i += n_active; | |||
ii += n_active; | |||
} else { | |||
if (ii > jj) { | |||
#ifdef DOUBLE | |||
svfloat64x2_t aj_vec = svld2(pn, ao); | |||
#else | |||
svfloat32x2_t aj_vec = svld2(pn, ao); | |||
#endif | |||
svst2(pn, b, aj_vec); | |||
} | |||
ao += lda; | |||
b += n_active * 2; | |||
i ++; | |||
ii ++; | |||
} | |||
} while (i < m); | |||
a += n_active * 2; | |||
jj += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, N); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
return 0; | |||
} |
@@ -3395,11 +3395,13 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout | |||
#define DGEMM_DEFAULT_UNROLL_MN 32 | |||
-#define CGEMM_DEFAULT_UNROLL_M 8 | |||
+#define CGEMM_DEFAULT_UNROLL_M 2 | |||
 #define CGEMM_DEFAULT_UNROLL_N 4 | |||
+#define CGEMM_DEFAULT_UNROLL_MN 16 | |||
-#define ZGEMM_DEFAULT_UNROLL_M 4 | |||
+#define ZGEMM_DEFAULT_UNROLL_M 2 | |||
 #define ZGEMM_DEFAULT_UNROLL_N 4 | |||
+#define ZGEMM_DEFAULT_UNROLL_MN 16 | |||
#define SGEMM_DEFAULT_P 128 | |||
#define DGEMM_DEFAULT_P 160 | |||