Browse Source

Use AVX512 also for DGEMM

This required switching to the generic gemm_beta code (which is faster anyway on SKX)
for both DGEMM and SGEMM.

Performance for the version that has not been retuned is in the 30% range.
tags/v0.3.1
Arjan van de Ven 7 years ago
parent
commit
89372e0993
3 changed files with 5154 additions and 2 deletions
  1. +15
    -0
      kernel/x86_64/KERNEL.SKYLAKEX
  2. +5138
    -0
      kernel/x86_64/dgemm_kernel_16x2_skylakex.S
  3. +1
    -2
      kernel/x86_64/sgemm_kernel_16x4_skylakex.S

+ 15
- 0
kernel/x86_64/KERNEL.SKYLAKEX View File

@@ -2,3 +2,18 @@ include $(KERNELDIR)/KERNEL.HASWELL


SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S



DTRMMKERNEL = ../generic/trmmkernel_16x2.c
DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
DGEMMITCOPY = ../generic/gemm_tcopy_16.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)


SGEMM_BETA = ../generic/gemm_beta.c
DGEMM_BETA = ../generic/gemm_beta.c

+ 5138
- 0
kernel/x86_64/dgemm_kernel_16x2_skylakex.S
File diff suppressed because it is too large
View File


+ 1
- 2
kernel/x86_64/sgemm_kernel_16x4_skylakex.S View File

@@ -159,7 +159,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups -16 * SIZE(AO), %zmm0 vmovups -16 * SIZE(AO), %zmm0
vbroadcastss -4 * SIZE(BO), %zmm2 vbroadcastss -4 * SIZE(BO), %zmm2
vbroadcastss -3 * SIZE(BO), %zmm3 vbroadcastss -3 * SIZE(BO), %zmm3
prefetcht0 A_PR1(AO)
# prefetcht0 A_PR1(AO)


VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) VFMADD231PS_( %zmm4,%zmm2,%zmm0 )
VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) VFMADD231PS_( %zmm6,%zmm3,%zmm0 )
@@ -183,7 +183,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups -16 * SIZE(AO), %zmm0 vmovups -16 * SIZE(AO), %zmm0
vbroadcastss -4 * SIZE(BO), %zmm2 vbroadcastss -4 * SIZE(BO), %zmm2
vbroadcastss -3 * SIZE(BO), %zmm3 vbroadcastss -3 * SIZE(BO), %zmm3
prefetcht0 A_PR1(AO)


VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) VFMADD231PS_( %zmm4,%zmm2,%zmm0 )
VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) VFMADD231PS_( %zmm6,%zmm3,%zmm0 )


Loading…
Cancel
Save