| @@ -77,10 +77,10 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| STRSMKERNEL_LN = strsm_kernel_8x4_haswell_LN.c | |||
| STRSMKERNEL_LT = strsm_kernel_8x4_haswell_LT.c | |||
| STRSMKERNEL_RN = strsm_kernel_8x4_haswell_RN.c | |||
| STRSMKERNEL_RT = strsm_kernel_8x4_haswell_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| @@ -0,0 +1,240 @@ | |||
| #include "common.h" | |||
| #include <stdint.h> | |||
| #include "strsm_kernel_8x4_haswell_L_common.h" | |||
| #define SOLVE_LN_m1n4 \ | |||
| "subq $4,%2; movq %2,%3;" GEMM_SUM_REORDER_1x4(4)\ | |||
| SOLVE_m1n4(-4,4) SAVE_b_m1n4(-16,4)\ | |||
| "movq %2,%3;" save_c_m1n4(4) | |||
| #define SOLVE_LN_m1n8 \ | |||
| "subq $4,%2; movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5)\ | |||
| SOLVE_m1n8(-4,4,5) SAVE_b_m1n8(-16,4,5)\ | |||
| "movq %2,%3;" save_c_m1n4(4) save_c_m1n4(5) | |||
| #define SOLVE_LN_m1n12 \ | |||
| "subq $4,%2; movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6)\ | |||
| SOLVE_m1n12(-4,4,5,6) SAVE_b_m1n12(-16,4,5,6)\ | |||
| "movq %2,%3;" save_c_m1n4(4) save_c_m1n4(5) save_c_m1n4(6) | |||
| #define SOLVE_LN_m2n4 \ | |||
| "subq $8,%2; movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4)\ | |||
| SOLVE_loup_m2n4(-8,4)\ | |||
| SOLVE_up_m2n4(-16,4) SAVE_b_m2n4(-32,4)\ | |||
| "movq %2,%3;" save_c_m2n4(4) | |||
| #define SOLVE_LN_m2n8 \ | |||
| "subq $8,%2; movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5)\ | |||
| SOLVE_loup_m2n8(-8,4,5)\ | |||
| SOLVE_up_m2n8(-16,4,5) SAVE_b_m2n8(-32,4,5)\ | |||
| "movq %2,%3;" save_c_m2n4(4) save_c_m2n4(5) | |||
| #define SOLVE_LN_m2n12 \ | |||
| "subq $8,%2; movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5) GEMM_SUM_REORDER_2x4(8,9,6)\ | |||
| SOLVE_loup_m2n12(-8,4,5,6)\ | |||
| SOLVE_up_m2n12(-16,4,5,6) SAVE_b_m2n12(-32,4,5,6)\ | |||
| "movq %2,%3;" save_c_m2n4(4) save_c_m2n4(5) save_c_m2n4(6) | |||
| #define SOLVE_LN_m4n4 \ | |||
| "subq $16,%2; movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5)\ | |||
| \ | |||
| SOLVE_loup_m2n4(-8,5) SUBTRACT_m2n4(-16,4)\ | |||
| SOLVE_up_m2n4(-24,5) SUBTRACT_m2n4(-32,4) SAVE_b_m2n4(-32,5)\ | |||
| \ | |||
| SOLVE_loup_m2n4(-48,4)\ | |||
| SOLVE_up_m2n4(-64,4) SAVE_b_m2n4(-64,4)\ | |||
| \ | |||
| "movq %2,%3;" save_c_m4n4(4,5) | |||
| #define SOLVE_LN_m4n8 \ | |||
| "subq $16,%2; movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7)\ | |||
| \ | |||
| SOLVE_loup_m2n8(-8,5,7) SUBTRACT_m2n8(-16,4,6)\ | |||
| SOLVE_up_m2n8(-24,5,7) SUBTRACT_m2n8(-32,4,6) SAVE_b_m2n8(-32,5,7)\ | |||
| \ | |||
| SOLVE_loup_m2n8(-48,4,6)\ | |||
| SOLVE_up_m2n8(-64,4,6) SAVE_b_m2n8(-64,4,6)\ | |||
| \ | |||
| "movq %2,%3;" save_c_m4n4(4,5) save_c_m4n4(6,7) | |||
| #define SOLVE_LN_m4n12 \ | |||
| "subq $16,%2; movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9)\ | |||
| \ | |||
| SOLVE_loup_m2n12(-8,5,7,9) SUBTRACT_m2n12(-16,4,6,8)\ | |||
| SOLVE_up_m2n12(-24,5,7,9) SUBTRACT_m2n12(-32,4,6,8) SAVE_b_m2n12(-32,5,7,9)\ | |||
| \ | |||
| SOLVE_loup_m2n12(-48,4,6,8)\ | |||
| SOLVE_up_m2n12(-64,4,6,8) SAVE_b_m2n12(-64,4,6,8)\ | |||
| \ | |||
| "movq %2,%3;" save_c_m4n4(4,5) save_c_m4n4(6,7) save_c_m4n4(8,9) | |||
| #define SOLVE_LN_m8n4 \ | |||
| "subq $32,%2; movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,-32)\ | |||
| \ | |||
| SOLVE_loup_m2n4(-8,7) SUBTRACT_m2n4(-16,6) SUBTRACT_m2n4(-24,5) SUBTRACT_m2n4(-32,4)\ | |||
| SOLVE_up_m2n4(-40,7) SUBTRACT_m2n4(-48,6) SUBTRACT_m2n4(-56,5) SUBTRACT_m2n4(-64,4) SAVE_b_m2n4(-32,7)\ | |||
| \ | |||
| SOLVE_loup_m2n4(-80,6) SUBTRACT_m2n4(-88,5) SUBTRACT_m2n4(-96,4)\ | |||
| SOLVE_up_m2n4(-112,6) SUBTRACT_m2n4(-120,5) SUBTRACT_m2n4(-128,4) SAVE_b_m2n4(-64,6)\ | |||
| \ | |||
| SOLVE_loup_m2n4(-152,5) SUBTRACT_m2n4(-160,4)\ | |||
| SOLVE_up_m2n4(-184,5) SUBTRACT_m2n4(-192,4) SAVE_b_m2n4(-96,5)\ | |||
| \ | |||
| SOLVE_loup_m2n4(-224,4)\ | |||
| SOLVE_up_m2n4(-256,4) SAVE_b_m2n4(-128,4)\ | |||
| \ | |||
| "movq %2,%3;" save_c_m8n4(4,5,6,7) | |||
| #define SOLVE_LN_m8n8 \ | |||
| "subq $32,%2; movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,-32) GEMM_SUM_REORDER_8x4(8,9,10,11,-32)\ | |||
| \ | |||
| SOLVE_loup_m2n8(-8,7,11) SUBTRACT_m2n8(-16,6,10) SUBTRACT_m2n8(-24,5,9) SUBTRACT_m2n8(-32,4,8)\ | |||
| SOLVE_up_m2n8(-40,7,11) SUBTRACT_m2n8(-48,6,10) SUBTRACT_m2n8(-56,5,9) SUBTRACT_m2n8(-64,4,8) SAVE_b_m2n8(-32,7,11)\ | |||
| \ | |||
| SOLVE_loup_m2n8(-80,6,10) SUBTRACT_m2n8(-88,5,9) SUBTRACT_m2n8(-96,4,8)\ | |||
| SOLVE_up_m2n8(-112,6,10) SUBTRACT_m2n8(-120,5,9) SUBTRACT_m2n8(-128,4,8) SAVE_b_m2n8(-64,6,10)\ | |||
| \ | |||
| SOLVE_loup_m2n8(-152,5,9) SUBTRACT_m2n8(-160,4,8)\ | |||
| SOLVE_up_m2n8(-184,5,9) SUBTRACT_m2n8(-192,4,8) SAVE_b_m2n8(-96,5,9)\ | |||
| \ | |||
| SOLVE_loup_m2n8(-224,4,8)\ | |||
| SOLVE_up_m2n8(-256,4,8) SAVE_b_m2n8(-128,4,8)\ | |||
| \ | |||
| "movq %2,%3;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11) | |||
| #define SOLVE_LN_m8n12 \ | |||
| "subq $32,%2; movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,-32) GEMM_SUM_REORDER_8x4(8,9,10,11,-32) GEMM_SUM_REORDER_8x4(12,13,14,15,-32)\ | |||
| \ | |||
| SOLVE_loup_m2n12(-8,7,11,15) SUBTRACT_m2n12(-16,6,10,14) SUBTRACT_m2n12(-24,5,9,13) SUBTRACT_m2n12(-32,4,8,12)\ | |||
| SOLVE_up_m2n12(-40,7,11,15) SUBTRACT_m2n12(-48,6,10,14) SUBTRACT_m2n12(-56,5,9,13) SUBTRACT_m2n12(-64,4,8,12) SAVE_b_m2n12(-32,7,11,15)\ | |||
| \ | |||
| SOLVE_loup_m2n12(-80,6,10,14) SUBTRACT_m2n12(-88,5,9,13) SUBTRACT_m2n12(-96,4,8,12)\ | |||
| SOLVE_up_m2n12(-112,6,10,14) SUBTRACT_m2n12(-120,5,9,13) SUBTRACT_m2n12(-128,4,8,12) SAVE_b_m2n12(-64,6,10,14)\ | |||
| \ | |||
| SOLVE_loup_m2n12(-152,5,9,13) SUBTRACT_m2n12(-160,4,8,12)\ | |||
| SOLVE_up_m2n12(-184,5,9,13) SUBTRACT_m2n12(-192,4,8,12) SAVE_b_m2n12(-96,5,9,13)\ | |||
| \ | |||
| SOLVE_loup_m2n12(-224,4,8,12)\ | |||
| SOLVE_up_m2n12(-256,4,8,12) SAVE_b_m2n12(-128,4,8,12)\ | |||
| \ | |||
| "movq %2,%3;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11) save_c_m8n4(12,13,14,15) | |||
| /* r13 = k-kk (depth of the trailing GEMM update), r14 = b_tail (end of the first 4-wide B panel), r15 = a_tail (end of the packed A panel) */ | |||
| #define GEMM_LN_SIMPLE(mdim,ndim) \ | |||
| "movq %%r15,%0; negq %%r12; leaq (%%r15,%%r12,"#mdim"),%%r15; negq %%r12;"\ | |||
| "movq %%r13,%5; addq $"#mdim",%%r13; movq %%r14,%1;" INIT_m##mdim##n##ndim\ | |||
| "testq %5,%5; jz 2"#mdim""#ndim"2f;"\ | |||
| "2"#mdim""#ndim"1:\n\t"\ | |||
| "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 2"#mdim""#ndim"1b;"\ | |||
| "2"#mdim""#ndim"2:\n\t" | |||
| #define GEMM_LN_m8n4 GEMM_LN_SIMPLE(8,4) | |||
| #define GEMM_LN_m8n8 GEMM_LN_SIMPLE(8,8) | |||
| #define GEMM_LN_m8n12 \ | |||
| "movq %%r15,%0; negq %%r12; leaq (%%r15,%%r12,8),%%r15; negq %%r12; movq %%r13,%5; addq $8,%%r13; movq %%r14,%1;" INIT_m8n12\ | |||
| "cmpq $8,%5; jb 28122f;"\ | |||
| "28121:\n\t"\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $8,%5; cmpq $8,%5; jnb 28121b;"\ | |||
| "28122:\n\t"\ | |||
| "testq %5,%5; jz 28124f;"\ | |||
| "28123:\n\t"\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 28123b;"\ | |||
| "28124:\n\t" | |||
| #define GEMM_LN_m4n4 GEMM_LN_SIMPLE(4,4) | |||
| #define GEMM_LN_m4n8 GEMM_LN_SIMPLE(4,8) | |||
| #define GEMM_LN_m4n12 GEMM_LN_SIMPLE(4,12) | |||
| #define GEMM_LN_m2n4 GEMM_LN_SIMPLE(2,4) | |||
| #define GEMM_LN_m2n8 GEMM_LN_SIMPLE(2,8) | |||
| #define GEMM_LN_m2n12 GEMM_LN_SIMPLE(2,12) | |||
| #define GEMM_LN_m1n4 GEMM_LN_SIMPLE(1,4) | |||
| #define GEMM_LN_m1n8 GEMM_LN_SIMPLE(1,8) | |||
| #define GEMM_LN_m1n12 GEMM_LN_SIMPLE(1,12) | |||
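| /* COMPUTE(ndim) processes one ndim-wide block of columns: each row tile (1/2/4/8 rows, taken from | |||
| the bottom of the matrix upwards) first accumulates the -A*B update of depth k-kk with GEMM_LN_*, | |||
| then is solved by SOLVE_LN_* and written back to both the packed B panel and C. */ | |||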
| #define COMPUTE(ndim) {\ | |||
| c_ptr += M;\ | |||
| __asm__ __volatile__(\ | |||
| "movq %0,%%r15; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; leaq (%1,%%r12,4),%%r14; movq %10,%%r11;"\ | |||
| "testq $1,%%r11; jz "#ndim"772f;"\ | |||
| #ndim"771:\n\t"\ | |||
| GEMM_LN_m1n##ndim SOLVE_LN_m1n##ndim "subq $1,%%r11;"\ | |||
| #ndim"772:\n\t"\ | |||
| "testq $2,%%r11; jz "#ndim"773f;"\ | |||
| GEMM_LN_m2n##ndim SOLVE_LN_m2n##ndim "subq $2,%%r11;"\ | |||
| #ndim"773:\n\t"\ | |||
| "testq $4,%%r11; jz "#ndim"774f;"\ | |||
| GEMM_LN_m4n##ndim SOLVE_LN_m4n##ndim "subq $4,%%r11;"\ | |||
| #ndim"774:\n\t"\ | |||
| "testq %%r11,%%r11; jz "#ndim"776f;"\ | |||
| #ndim"775:\n\t"\ | |||
| GEMM_LN_m8n##ndim SOLVE_LN_m8n##ndim "subq $8,%%r11; jnz "#ndim"775b;"\ | |||
| #ndim"776:\n\t"\ | |||
| "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ | |||
| :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(kmkkinp),"m"(one[0]),"m"(zero[0]),"m"(M)\ | |||
| :"r11","r12","r13","r14","r15","cc","memory",\ | |||
| "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ | |||
| a_ptr += M * K; b_ptr += (ndim-4) * K; c_ptr += ldc * ndim;\ | |||
| } | |||
| static void solve_LN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT a0, b0; | |||
| int i, j, k; | |||
| for (i=m-1;i>=0;i--) { | |||
| a0 = a[i*m+i]; // the packed diagonal element already stores the reciprocal of the original value | |||
| for (j=0;j<n;j++) { | |||
| b0 = c[j*ldc+i]*a0; | |||
| c[j*ldc+i] = b[i*n+j] = b0; | |||
| for (k=0;k<i;k++) c[j*ldc+k] -= b0*a[i*m+k]; | |||
| } | |||
| } | |||
| } | |||
| static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) { | |||
| BLASLONG m_count = m, kk = m+offset; FLOAT *a_ptr = sa+m*k, *c_ptr = C+m; | |||
| if(m_count&1){ | |||
| a_ptr-=k; c_ptr--; | |||
| if(k-kk>0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); | |||
| solve_LN(1,n,a_ptr+(kk-1)*1,sb+(kk-1)*n,c_ptr,ldc); | |||
| kk -= 1; | |||
| m_count--; | |||
| } | |||
| if(m_count&2){ | |||
| a_ptr-=k*2; c_ptr-=2; | |||
| if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); | |||
| solve_LN(2,n,a_ptr+(kk-2)*2,sb+(kk-2)*n,c_ptr,ldc); | |||
| kk -= 2; | |||
| m_count-=2; | |||
| } | |||
| if(m_count&4){ | |||
| a_ptr-=k*4; c_ptr-=4; | |||
| if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); | |||
| solve_LN(4,n,a_ptr+(kk-4)*4,sb+(kk-4)*n,c_ptr,ldc); | |||
| kk -= 4; | |||
| m_count-=4; | |||
| } | |||
| for(;m_count>7;m_count-=8){ | |||
| a_ptr-=k*8; c_ptr-=8; | |||
| if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); | |||
| solve_LN(8,n,a_ptr+(kk-8)*8,sb+(kk-8)*n,c_ptr,ldc); | |||
| kk -= 8; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ | |||
| float *a_ptr = sa+m*k, *b_ptr = sb, *c_ptr = C, *c_tmp = C; | |||
| float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; | |||
| float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; | |||
| uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, kmkkinp = (uint64_t)(k-m-offset), k_cnt = 0; | |||
| BLASLONG n_count = n; | |||
| for(;n_count>11;n_count-=12) COMPUTE(12) | |||
| for(;n_count>7;n_count-=8) COMPUTE(8) | |||
| for(;n_count>3;n_count-=4) COMPUTE(4) | |||
| for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,sa,b_ptr,c_ptr,ldc,k,offset); b_ptr += 2*k; c_ptr += ldc*2;} | |||
| if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,sa,b_ptr,c_ptr,ldc,k,offset); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,228 @@ | |||
| #include "common.h" | |||
| #include <stdint.h> | |||
| #include "strsm_kernel_8x4_haswell_L_common.h" | |||
| #define SOLVE_LT_m1n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4)\ | |||
| SOLVE_m1n4(0,4) SAVE_b_m1n4(0,4)\ | |||
| "movq %2,%3; addq $4,%2;" save_c_m1n4(4) | |||
| #define SOLVE_LT_m1n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5)\ | |||
| SOLVE_m1n8(0,4,5) SAVE_b_m1n8(0,4,5)\ | |||
| "movq %2,%3; addq $4,%2;" save_c_m1n4(4) save_c_m1n4(5) | |||
| #define SOLVE_LT_m1n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6)\ | |||
| SOLVE_m1n12(0,4,5,6) SAVE_b_m1n12(0,4,5,6)\ | |||
| "movq %2,%3; addq $4,%2;" save_c_m1n4(4) save_c_m1n4(5) save_c_m1n4(6) | |||
| #define SOLVE_LT_m2n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4)\ | |||
| SOLVE_uplo_m2n4(0,4)\ | |||
| SOLVE_lo_m2n4(8,4) SAVE_b_m2n4(0,4)\ | |||
| "movq %2,%3; addq $8,%2;" save_c_m2n4(4) | |||
| #define SOLVE_LT_m2n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5)\ | |||
| SOLVE_uplo_m2n8(0,4,5)\ | |||
| SOLVE_lo_m2n8(8,4,5) SAVE_b_m2n8(0,4,5)\ | |||
| "movq %2,%3; addq $8,%2;" save_c_m2n4(4) save_c_m2n4(5) | |||
| #define SOLVE_LT_m2n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5) GEMM_SUM_REORDER_2x4(8,9,6)\ | |||
| SOLVE_uplo_m2n12(0,4,5,6)\ | |||
| SOLVE_lo_m2n12(8,4,5,6) SAVE_b_m2n12(0,4,5,6)\ | |||
| "movq %2,%3; addq $8,%2;" save_c_m2n4(4) save_c_m2n4(5) save_c_m2n4(6) | |||
| #define SOLVE_LT_m4n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5)\ | |||
| \ | |||
| SOLVE_uplo_m2n4(0,4) SUBTRACT_m2n4(8,5)\ | |||
| SOLVE_lo_m2n4(16,4) SUBTRACT_m2n4(24,5) SAVE_b_m2n4(0,4)\ | |||
| \ | |||
| SOLVE_uplo_m2n4(40,5)\ | |||
| SOLVE_lo_m2n4(56,5) SAVE_b_m2n4(32,5)\ | |||
| \ | |||
| "movq %2,%3; addq $16,%2;" save_c_m4n4(4,5) | |||
| #define SOLVE_LT_m4n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7)\ | |||
| \ | |||
| SOLVE_uplo_m2n8(0,4,6) SUBTRACT_m2n8(8,5,7)\ | |||
| SOLVE_lo_m2n8(16,4,6) SUBTRACT_m2n8(24,5,7) SAVE_b_m2n8(0,4,6)\ | |||
| \ | |||
| SOLVE_uplo_m2n8(40,5,7)\ | |||
| SOLVE_lo_m2n8(56,5,7) SAVE_b_m2n8(32,5,7)\ | |||
| \ | |||
| "movq %2,%3; addq $16,%2;" save_c_m4n4(4,5) save_c_m4n4(6,7) | |||
| #define SOLVE_LT_m4n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9)\ | |||
| \ | |||
| SOLVE_uplo_m2n12(0,4,6,8) SUBTRACT_m2n12(8,5,7,9)\ | |||
| SOLVE_lo_m2n12(16,4,6,8) SUBTRACT_m2n12(24,5,7,9) SAVE_b_m2n12(0,4,6,8)\ | |||
| \ | |||
| SOLVE_uplo_m2n12(40,5,7,9)\ | |||
| SOLVE_lo_m2n12(56,5,7,9) SAVE_b_m2n12(32,5,7,9)\ | |||
| \ | |||
| "movq %2,%3; addq $16,%2;" save_c_m4n4(4,5) save_c_m4n4(6,7) save_c_m4n4(8,9) | |||
| #define SOLVE_LT_m8n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63)\ | |||
| \ | |||
| SOLVE_uplo_m2n4(0,4) SUBTRACT_m2n4(8,5) SUBTRACT_m2n4(16,6) SUBTRACT_m2n4(24,7)\ | |||
| SOLVE_lo_m2n4(32,4) SUBTRACT_m2n4(40,5) SUBTRACT_m2n4(48,6) SUBTRACT_m2n4(56,7) SAVE_b_m2n4(0,4)\ | |||
| \ | |||
| SOLVE_uplo_m2n4(72,5) SUBTRACT_m2n4(80,6) SUBTRACT_m2n4(88,7)\ | |||
| SOLVE_lo_m2n4(104,5) SUBTRACT_m2n4(112,6) SUBTRACT_m2n4(120,7) SAVE_b_m2n4(32,5)\ | |||
| \ | |||
| SOLVE_uplo_m2n4(144,6) SUBTRACT_m2n4(152,7)\ | |||
| SOLVE_lo_m2n4(176,6) SUBTRACT_m2n4(184,7) SAVE_b_m2n4(64,6)\ | |||
| \ | |||
| SOLVE_uplo_m2n4(216,7)\ | |||
| SOLVE_lo_m2n4(248,7) SAVE_b_m2n4(96,7)\ | |||
| \ | |||
| "movq %2,%3; addq $32,%2;" save_c_m8n4(4,5,6,7) | |||
| #define SOLVE_LT_m8n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63)\ | |||
| \ | |||
| SOLVE_uplo_m2n8(0,4,8) SUBTRACT_m2n8(8,5,9) SUBTRACT_m2n8(16,6,10) SUBTRACT_m2n8(24,7,11)\ | |||
| SOLVE_lo_m2n8(32,4,8) SUBTRACT_m2n8(40,5,9) SUBTRACT_m2n8(48,6,10) SUBTRACT_m2n8(56,7,11) SAVE_b_m2n8(0,4,8)\ | |||
| \ | |||
| SOLVE_uplo_m2n8(72,5,9) SUBTRACT_m2n8(80,6,10) SUBTRACT_m2n8(88,7,11)\ | |||
| SOLVE_lo_m2n8(104,5,9) SUBTRACT_m2n8(112,6,10) SUBTRACT_m2n8(120,7,11) SAVE_b_m2n8(32,5,9)\ | |||
| \ | |||
| SOLVE_uplo_m2n8(144,6,10) SUBTRACT_m2n8(152,7,11)\ | |||
| SOLVE_lo_m2n8(176,6,10) SUBTRACT_m2n8(184,7,11) SAVE_b_m2n8(64,6,10)\ | |||
| \ | |||
| SOLVE_uplo_m2n8(216,7,11)\ | |||
| SOLVE_lo_m2n8(248,7,11) SAVE_b_m2n8(96,7,11)\ | |||
| \ | |||
| "movq %2,%3; addq $32,%2;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11) | |||
| #define SOLVE_LT_m8n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63)\ | |||
| \ | |||
| SOLVE_uplo_m2n12(0,4,8,12) SUBTRACT_m2n12(8,5,9,13) SUBTRACT_m2n12(16,6,10,14) SUBTRACT_m2n12(24,7,11,15)\ | |||
| SOLVE_lo_m2n12(32,4,8,12) SUBTRACT_m2n12(40,5,9,13) SUBTRACT_m2n12(48,6,10,14) SUBTRACT_m2n12(56,7,11,15) SAVE_b_m2n12(0,4,8,12)\ | |||
| \ | |||
| SOLVE_uplo_m2n12(72,5,9,13) SUBTRACT_m2n12(80,6,10,14) SUBTRACT_m2n12(88,7,11,15)\ | |||
| SOLVE_lo_m2n12(104,5,9,13) SUBTRACT_m2n12(112,6,10,14) SUBTRACT_m2n12(120,7,11,15) SAVE_b_m2n12(32,5,9,13)\ | |||
| \ | |||
| SOLVE_uplo_m2n12(144,6,10,14) SUBTRACT_m2n12(152,7,11,15)\ | |||
| SOLVE_lo_m2n12(176,6,10,14) SUBTRACT_m2n12(184,7,11,15) SAVE_b_m2n12(64,6,10,14)\ | |||
| \ | |||
| SOLVE_uplo_m2n12(216,7,11,15)\ | |||
| SOLVE_lo_m2n12(248,7,11,15) SAVE_b_m2n12(96,7,11,15)\ | |||
| \ | |||
| "movq %2,%3; addq $32,%2;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11) save_c_m8n4(12,13,14,15) | |||
| #define GEMM_LT_SIMPLE(mdim,ndim) \ | |||
| "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; addq $"#mdim",%%r13; movq %%r14,%1;" INIT_m##mdim##n##ndim\ | |||
| "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ | |||
| "1"#mdim""#ndim"1:\n\t"\ | |||
| GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ | |||
| "1"#mdim""#ndim"2:\n\t" | |||
| #define GEMM_LT_m8n4 GEMM_LT_SIMPLE(8,4) | |||
| #define GEMM_LT_m8n8 GEMM_LT_SIMPLE(8,8) | |||
| #define GEMM_LT_m8n12 \ | |||
| "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; addq $8,%%r13; movq %%r14,%1;" INIT_m8n12\ | |||
| "cmpq $8,%5; jb 18122f;"\ | |||
| "18121:\n\t"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ | |||
| "18122:\n\t"\ | |||
| "testq %5,%5; jz 18124f;"\ | |||
| "18123:\n\t"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ | |||
| "18124:\n\t" | |||
| #define GEMM_LT_m4n4 GEMM_LT_SIMPLE(4,4) | |||
| #define GEMM_LT_m4n8 GEMM_LT_SIMPLE(4,8) | |||
| #define GEMM_LT_m4n12 GEMM_LT_SIMPLE(4,12) | |||
| #define GEMM_LT_m2n4 GEMM_LT_SIMPLE(2,4) | |||
| #define GEMM_LT_m2n8 GEMM_LT_SIMPLE(2,8) | |||
| #define GEMM_LT_m2n12 GEMM_LT_SIMPLE(2,12) | |||
| #define GEMM_LT_m1n4 GEMM_LT_SIMPLE(1,4) | |||
| #define GEMM_LT_m1n8 GEMM_LT_SIMPLE(1,8) | |||
| #define GEMM_LT_m1n12 GEMM_LT_SIMPLE(1,12) | |||
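| /* COMPUTE(ndim) processes one ndim-wide column block: row tiles of 8/4/2/1 are taken from the top | |||
| of the matrix; each tile first accumulates the -A*B update (its depth, kept in r13, grows by the | |||
| tile height), then is solved and stored to the packed B panel and C. */ | |||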
| #define COMPUTE(ndim) {\ | |||
| __asm__ __volatile__(\ | |||
| "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ | |||
| "cmpq $8,%%r11; jb "#ndim"772f;"\ | |||
| #ndim"771:\n\t"\ | |||
| GEMM_LT_m8n##ndim SOLVE_LT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ | |||
| #ndim"772:\n\t"\ | |||
| "testq $4,%%r11; jz "#ndim"773f;"\ | |||
| GEMM_LT_m4n##ndim SOLVE_LT_m4n##ndim "subq $4,%%r11;"\ | |||
| #ndim"773:\n\t"\ | |||
| "testq $2,%%r11; jz "#ndim"774f;"\ | |||
| GEMM_LT_m2n##ndim SOLVE_LT_m2n##ndim "subq $2,%%r11;"\ | |||
| #ndim"774:\n\t"\ | |||
| "testq $1,%%r11; jz "#ndim"775f;"\ | |||
| GEMM_LT_m1n##ndim SOLVE_LT_m1n##ndim "subq $1,%%r11;"\ | |||
| #ndim"775:\n\t"\ | |||
| "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ | |||
| :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ | |||
| :"r11","r12","r13","r14","r15","cc","memory",\ | |||
| "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ | |||
| a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M;\ | |||
| } | |||
| static void solve_LT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT a0, b0; | |||
| int i, j, k; | |||
| for (i=0;i<m;i++) { | |||
| a0 = a[i*m+i]; // reciprocal of the original diagonal element (pre-inverted during packing) | |||
| for (j=0;j<n;j++) { | |||
| b0 = c[j*ldc+i] * a0; | |||
| b[i*n+j] = c[j*ldc+i] = b0; | |||
| for (k=i+1;k<m;k++) c[j*ldc+k] -= b0 * a[i*m+k]; | |||
| } | |||
| } | |||
| } | |||
| static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) { | |||
| BLASLONG m_count = m, kk = offset; FLOAT *a_ptr = sa, *c_ptr = C; | |||
| for(;m_count>7;m_count-=8){ | |||
| if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_LT(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); | |||
| kk += 8; a_ptr += k * 8; c_ptr += 8; | |||
| } | |||
| for(;m_count>3;m_count-=4){ | |||
| if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_LT(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); | |||
| kk += 4; a_ptr += k * 4; c_ptr += 4; | |||
| } | |||
| for(;m_count>1;m_count-=2){ | |||
| if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_LT(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); | |||
| kk += 2; a_ptr += k * 2; c_ptr += 2; | |||
| } | |||
| if(m_count>0){ | |||
| if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_LT(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); | |||
| kk += 1; a_ptr += k * 1; c_ptr += 1; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ | |||
| float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; | |||
| float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; | |||
| float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; | |||
| uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)offset, k_cnt = 0; | |||
| BLASLONG n_count = n; | |||
| for(;n_count>11;n_count-=12) COMPUTE(12) | |||
| for(;n_count>7;n_count-=8) COMPUTE(8) | |||
| for(;n_count>3;n_count-=4) COMPUTE(4) | |||
| for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,offset); b_ptr += 2*k; c_ptr += ldc*2;} | |||
| if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,offset); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,187 @@ | |||
| /* r11 = m_counter, r12 = k * sizeof(float) (size of k elements in bytes), r13 = kk, r14 = b_head, r15 = a_head */ | |||
| /* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ | |||
| /* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ | |||
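| /* %8 and %9 are memory operands holding eight 1.0f and eight 0.0f constants; the SOLVE_* macros | |||
| blend them into the broadcast A pair to build per-row multipliers. */ | |||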
| #define init_m8n4(c1,c2,c3,c4)\ | |||
| "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2"; vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" | |||
| #define INIT_m8n4 init_m8n4(4,5,6,7) | |||
| #define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) | |||
| #define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) | |||
| #define init_m4n4(c1,c2,c3,c4)\ | |||
| "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2"; vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" | |||
| #define INIT_m4n4 init_m4n4(4,5,6,7) | |||
| #define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) | |||
| #define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) | |||
| #define init_m2n4(c1,c2)\ | |||
| "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" | |||
| #define INIT_m2n4 init_m2n4(4,5) | |||
| #define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) | |||
| #define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) | |||
| #define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" | |||
| #define INIT_m1n4 init_m1n4(4) | |||
| #define INIT_m1n8 INIT_m1n4 init_m1n4(5) | |||
| #define INIT_m1n12 INIT_m1n8 init_m1n4(6) | |||
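| /* k=1 GEMM micro-kernels: the 8- and 4-row variants load A with duplicated even/odd lanes | |||
| (vmovsldup / vmovshdup) and broadcast B in pairs; all of them accumulate with vfnmadd231ps | |||
| because the tile update required before the solve is C - A*B, i.e. the product carries a | |||
| negative sign. */ | |||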
| #define GEMM_KERNEL_k1m8n4 \ | |||
| "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ | |||
| "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ | |||
| "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" | |||
| #define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ | |||
| "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ | |||
| "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" | |||
| #define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ | |||
| "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ | |||
| "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" | |||
| #define GEMM_KERNEL_k1m4n4 \ | |||
| "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ | |||
| "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ | |||
| "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" | |||
| #define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ | |||
| "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ | |||
| "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" | |||
| #define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ | |||
| "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ | |||
| "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" | |||
| #define GEMM_KERNEL_k1m2n4 \ | |||
| "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ | |||
| "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" | |||
| #define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ | |||
| "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" | |||
| #define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ | |||
| "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" | |||
| #define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" | |||
| #define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" | |||
| #define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" | |||
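| /* The GEMM_SUM_REORDER_* macros below add the current C tile (loaded through %3) to the negated | |||
| products accumulated above and shuffle the sums into the lane order the SOLVE_* macros expect. */ | |||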
| #define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ | |||
| "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ | |||
| "vunpcklps %%ymm"#c4",%%ymm"#c3",%%ymm2; vunpckhps %%ymm"#c4",%%ymm"#c3",%%ymm3;"\ | |||
| "vmovups (%3),%%ymm"#c1"; vmovups (%3,%4,1),%%ymm"#c2"; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1);"\ | |||
| "vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c3"; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c4";"\ | |||
| "vaddps %%ymm0,%%ymm"#c3",%%ymm0; vaddps %%ymm1,%%ymm"#c4",%%ymm1;"\ | |||
| "leaq (%3,%4,2),%3;"\ | |||
| "vmovups (%3),%%ymm"#c1"; vmovups (%3,%4,1),%%ymm"#c2"; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1);"\ | |||
| "vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c3"; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c4";"\ | |||
| "vaddps %%ymm2,%%ymm"#c3",%%ymm2; vaddps %%ymm3,%%ymm"#c4",%%ymm3;"\ | |||
| "leaq (%3,%4,2),%3;"\ | |||
| "vperm2f128 $2,%%ymm0,%%ymm2,%%ymm"#c1"; vperm2f128 $2,%%ymm1,%%ymm3,%%ymm"#c2";"\ | |||
| "vperm2f128 $19,%%ymm0,%%ymm2,%%ymm"#c3"; vperm2f128 $19,%%ymm1,%%ymm3,%%ymm"#c4";" | |||
| #define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ | |||
| "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ | |||
| "vunpcklps %%xmm"#c4",%%xmm"#c3",%%xmm2; vunpckhps %%xmm"#c4",%%xmm"#c3",%%xmm3;"\ | |||
| "vmovups (%3),%%xmm"#c1"; vmovups (%3,%4,1),%%xmm"#c2";"\ | |||
| "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c3"; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c4";"\ | |||
| "vaddps %%xmm0,%%xmm"#c3",%%xmm0; vaddps %%xmm1,%%xmm"#c4",%%xmm1;"\ | |||
| "leaq (%3,%4,2),%3;"\ | |||
| "vmovups (%3),%%xmm"#c1"; vmovups (%3,%4,1),%%xmm"#c2";"\ | |||
| "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c3"; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c4";"\ | |||
| "vaddps %%xmm2,%%xmm"#c3",%%xmm2; vaddps %%xmm3,%%xmm"#c4",%%xmm3;"\ | |||
| "leaq (%3,%4,2),%3;"\ | |||
| "vperm2f128 $2,%%ymm0,%%ymm2,%%ymm"#co1"; vperm2f128 $2,%%ymm1,%%ymm3,%%ymm"#co2";" | |||
| #define GEMM_SUM_REORDER_2x4(c1,c2,co1)\ | |||
| "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ | |||
| "vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2; vaddps %%xmm0,%%xmm2,%%xmm0; leaq (%3,%4,2),%3;"\ | |||
| "vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2; vaddps %%xmm1,%%xmm2,%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vperm2f128 $2,%%ymm0,%%ymm1,%%ymm"#co1";" | |||
| #define GEMM_SUM_REORDER_1x4(c1)\ | |||
| "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" | |||
| #define save_c_m8n4(c1,c2,c3,c4)\ | |||
| "vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ | |||
| "vunpcklpd %%ymm"#c4",%%ymm"#c3",%%ymm2; vunpckhpd %%ymm"#c4",%%ymm"#c3",%%ymm3;"\ | |||
| "vperm2f128 $2,%%ymm0,%%ymm2,%%ymm"#c1"; vperm2f128 $2,%%ymm1,%%ymm3,%%ymm"#c2";"\ | |||
| "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vperm2f128 $19,%%ymm0,%%ymm2,%%ymm"#c3"; vperm2f128 $19,%%ymm1,%%ymm3,%%ymm"#c4";"\ | |||
| "vmovups %%ymm"#c3",(%3); vmovups %%ymm"#c4",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define save_c_m4n4(c1,c2)\ | |||
| "vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ | |||
| "vmovups %%xmm0,(%3); vmovups %%xmm1,(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vextractf128 $1,%%ymm0,(%3); vextractf128 $1,%%ymm1,(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define save_c_m2n4(c1)\ | |||
| "vextractf128 $1,%%ymm"#c1",%%xmm1; vmovsd %%xmm"#c1",(%3); vmovhpd %%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vmovsd %%xmm1,(%3); vmovhpd %%xmm1,(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define save_c_m1n4(c1)\ | |||
| "vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define SOLVE_up_m2n4(a_off,c1)\ | |||
| "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovsldup %%ymm"#c1",%%ymm1;" | |||
| #define SOLVE_up_m2n8(a_off,c1,c2)\ | |||
| "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" | |||
| #define SOLVE_up_m2n12(a_off,c1,c2,c3)\ | |||
| "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2"; vmulps %%ymm2,%%ymm"#c3",%%ymm"#c3";"\ | |||
| "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2; vmovsldup %%ymm"#c3",%%ymm3;" | |||
| #define SOLVE_uplo_m2n4(a_off,c1) SOLVE_up_m2n4(a_off,c1)\ | |||
| "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SOLVE_uplo_m2n8(a_off,c1,c2) SOLVE_up_m2n8(a_off,c1,c2)\ | |||
| "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SOLVE_uplo_m2n12(a_off,c1,c2,c3) SOLVE_up_m2n12(a_off,c1,c2,c3)\ | |||
| "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2"; vfnmadd231ps %%ymm0,%%ymm3,%%ymm"#c3";" | |||
| #define SOLVE_lo_m2n4(a_off,c1)\ | |||
| "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovshdup %%ymm"#c1",%%ymm1;" | |||
| #define SOLVE_lo_m2n8(a_off,c1,c2)\ | |||
| "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" | |||
| #define SOLVE_lo_m2n12(a_off,c1,c2,c3)\ | |||
| "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2"; vmulps %%ymm2,%%ymm"#c3",%%ymm"#c3";"\ | |||
| "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2; vmovshdup %%ymm"#c3",%%ymm3;" | |||
| #define SOLVE_loup_m2n4(a_off,c1) SOLVE_lo_m2n4(a_off,c1)\ | |||
| "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SOLVE_loup_m2n8(a_off,c1,c2) SOLVE_lo_m2n8(a_off,c1,c2)\ | |||
| "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SOLVE_loup_m2n12(a_off,c1,c2,c3) SOLVE_lo_m2n12(a_off,c1,c2,c3)\ | |||
| "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2"; vfnmadd231ps %%ymm0,%%ymm3,%%ymm"#c3";" | |||
| #define SOLVE_m1n4(a_off,c1) "vbroadcastss "#a_off"(%0),%%xmm0; vmulps %%xmm0,%%xmm"#c1",%%xmm"#c1";" | |||
| #define SOLVE_m1n8(a_off,c1,c2) SOLVE_m1n4(a_off,c1) "vmulps %%xmm0,%%xmm"#c2",%%xmm"#c2";" | |||
| #define SOLVE_m1n12(a_off,c1,c2,c3) SOLVE_m1n8(a_off,c1,c2) "vmulps %%xmm0,%%xmm"#c3",%%xmm"#c3";" | |||
| #define SUBTRACT_m2n4(a_off,c1) "vbroadcastsd "#a_off"(%0),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SUBTRACT_m2n8(a_off,c1,c2) SUBTRACT_m2n4(a_off,c1) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SUBTRACT_m2n12(a_off,c1,c2,c3) SUBTRACT_m2n8(a_off,c1,c2) "vfnmadd231ps %%ymm0,%%ymm3,%%ymm"#c3";" | |||
| #define save_b_m2n4(c1,tmp,b_off,...)\ | |||
| "vpermilps $216,%%ymm"#c1",%%ymm"#tmp"; vpermpd $216,%%ymm"#tmp",%%ymm"#tmp"; vmovups %%ymm"#tmp","#b_off"("#__VA_ARGS__");" | |||
| #define SAVE_b_m2n4(b_off,c1) save_b_m2n4(c1,1,b_off,%1) | |||
| #define SAVE_b_m2n8(b_off,c1,c2) SAVE_b_m2n4(b_off,c1) save_b_m2n4(c2,2,b_off,%1,%%r12,4) | |||
| #define SAVE_b_m2n12(b_off,c1,c2,c3) SAVE_b_m2n8(b_off,c1,c2) save_b_m2n4(c3,3,b_off,%1,%%r12,8) | |||
| #define SAVE_b_m1n4(b_off,c1) "vmovups %%xmm"#c1","#b_off"(%1);" | |||
| #define SAVE_b_m1n8(b_off,c1,c2) SAVE_b_m1n4(b_off,c1) "vmovups %%xmm"#c2","#b_off"(%1,%%r12,4);" | |||
| #define SAVE_b_m1n12(b_off,c1,c2,c3) SAVE_b_m1n8(b_off,c1,c2) "vmovups %%xmm"#c3","#b_off"(%1,%%r12,8);" | |||
| @@ -0,0 +1,279 @@ | |||
| #include "common.h" | |||
| #include <stdint.h> | |||
| #include "strsm_kernel_8x4_haswell_R_common.h" | |||
| #define SOLVE_RN_m8n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\ | |||
| SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\ | |||
| SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,0)\ | |||
| SOLVE_leri_m8n2(40,6,7,%1)\ | |||
| SOLVE_ri_m8n2(56,6,7,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,64) | |||
| #define SOLVE_RN_m8n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\ | |||
| SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(4,5,0)\ | |||
| SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(6,7,64)\ | |||
| SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(8,9,128)\ | |||
| SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(10,11,192) | |||
| #define SOLVE_RN_m8n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\ | |||
| SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(4,5,0)\ | |||
| SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(6,7,64)\ | |||
| SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(8,9,128)\ | |||
| SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(10,11,192)\ | |||
| SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(12,13,256)\ | |||
| SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(14,15,320) | |||
| #define SOLVE_RN_m4n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\ | |||
| SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\ | |||
| SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,0)\ | |||
| SOLVE_leri_m4n2(40,5,%1)\ | |||
| SOLVE_ri_m4n2(56,5,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,32) | |||
| #define SOLVE_RN_m4n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\ | |||
| SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(4,0)\ | |||
| SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(5,32)\ | |||
| SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(6,64)\ | |||
| SOLVE_leri_m4n2(104,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(120,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(7,96) | |||
| #define SOLVE_RN_m4n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\ | |||
| SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(4,0)\ | |||
| SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(5,32)\ | |||
| SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(6,64)\ | |||
| SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(7,96)\ | |||
| SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(8,128)\ | |||
| SOLVE_leri_m4n2(168,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(184,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(9,160) | |||
| #define SOLVE_RN_m2n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\ | |||
| SOLVE_col1_ltor_m2n4(0,4,5,%1)\ | |||
| SOLVE_col2_ltor_m2n4(16,4,5,%1)\ | |||
| SOLVE_col3_ltor_m2n4(32,4,5,%1)\ | |||
| SOLVE_col4_ltor_m2n4(48,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,0) | |||
| #define SOLVE_RN_m2n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\ | |||
| SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m2n4(4,5,0)\ | |||
| SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m2n4(6,7,32) | |||
| #define SOLVE_RN_m2n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\ | |||
| SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m2n4(4,5,0)\ | |||
| SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m2n4(6,7,32)\ | |||
| SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m2n4(8,9,64) | |||
| #define SOLVE_RN_m1n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\ | |||
| SOLVE_col1_ltor_m1n4(0,4,%1)\ | |||
| SOLVE_col2_ltor_m1n4(16,4,%1)\ | |||
| SOLVE_col3_ltor_m1n4(32,4,%1)\ | |||
| SOLVE_col4_ltor_m1n4(48,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,0) | |||
| #define SOLVE_RN_m1n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\ | |||
| SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m1n4(4,0)\ | |||
| SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m1n4(5,16) | |||
| #define SOLVE_RN_m1n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\ | |||
| SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m1n4(4,0)\ | |||
| SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m1n4(5,16)\ | |||
| SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m1n4(6,32) | |||
| #define GEMM_RN_SIMPLE(mdim,ndim) \ | |||
| "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ | |||
| "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ | |||
| "1"#mdim""#ndim"1:\n\t"\ | |||
| GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ | |||
| "1"#mdim""#ndim"2:\n\t" | |||
| #define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4) | |||
| #define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8) | |||
| #define GEMM_RN_m8n12 \ | |||
| "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ | |||
| "cmpq $8,%5; jb 18122f;"\ | |||
| "18121:\n\t"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ | |||
| "18122:\n\t"\ | |||
| "testq %5,%5; jz 18124f;"\ | |||
| "18123:\n\t"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ | |||
| "18124:\n\t" | |||
| #define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4) | |||
| #define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8) | |||
| #define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12) | |||
| #define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4) | |||
| #define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8) | |||
| #define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12) | |||
| #define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4) | |||
| #define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8) | |||
| #define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12) | |||
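| /* Same driver structure as the LT kernel, except that the GEMM depth kept in r13 is the number of | |||
| already-solved columns (OFF), which the C wrapper advances by ndim after each column block. */ | |||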
| #define COMPUTE(ndim) {\ | |||
| __asm__ __volatile__(\ | |||
| "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ | |||
| "cmpq $8,%%r11; jb "#ndim"772f;"\ | |||
| #ndim"771:\n\t"\ | |||
| GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ | |||
| #ndim"772:\n\t"\ | |||
| "testq $4,%%r11; jz "#ndim"773f;"\ | |||
| GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\ | |||
| #ndim"773:\n\t"\ | |||
| "testq $2,%%r11; jz "#ndim"774f;"\ | |||
| GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\ | |||
| #ndim"774:\n\t"\ | |||
| "testq $1,%%r11; jz "#ndim"775f;"\ | |||
| GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\ | |||
| #ndim"775:\n\t"\ | |||
| "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ | |||
| :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ | |||
| :"r11","r12","r13","r14","r15","cc","memory",\ | |||
| "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ | |||
| a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\ | |||
| } | |||
| static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT a0, b0; | |||
| int i, j, k; | |||
| for (i=0; i<n; i++) { | |||
| b0 = b[i*n+i]; // reciprocal of the original diagonal element (pre-inverted during packing) | |||
| for (j=0; j<m; j++) { | |||
| a0 = c[i*ldc+j] * b0; | |||
| a[i*m+j] = c[i*ldc+j] = a0; | |||
| for (k=i+1; k<n; k++) c[k*ldc+j] -= a0 * b[i*n+k]; | |||
| } | |||
| } | |||
| } | |||
| static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) { | |||
| BLASLONG m_count = m, kk = offset; FLOAT *a_ptr = sa, *c_ptr = C; | |||
| for(;m_count>7;m_count-=8){ | |||
| if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 8; c_ptr += 8; | |||
| } | |||
| for(;m_count>3;m_count-=4){ | |||
| if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 4; c_ptr += 4; | |||
| } | |||
| for(;m_count>1;m_count-=2){ | |||
| if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 2; c_ptr += 2; | |||
| } | |||
| if(m_count>0){ | |||
| if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 1; c_ptr += 1; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ | |||
| float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; | |||
| float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; | |||
| float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; | |||
| uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0; | |||
| BLASLONG n_count = n; | |||
| for(;n_count>11;n_count-=12) COMPUTE(12) | |||
| for(;n_count>7;n_count-=8) COMPUTE(8) | |||
| for(;n_count>3;n_count-=4) COMPUTE(4) | |||
| for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;} | |||
| if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,281 @@ | |||
| #include "common.h" | |||
| #include <stdint.h> | |||
| #include "strsm_kernel_8x4_haswell_R_common.h" | |||
| #define SOLVE_RT_m8n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ | |||
| SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ | |||
| SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-48,4,5,%1)\ | |||
| SOLVE_le_m8n2(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,-128) | |||
| #define SOLVE_RT_m8n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ | |||
| SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ | |||
| SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ | |||
| SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ | |||
| SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-112,4,5,%1)\ | |||
| SOLVE_le_m8n2(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,-256) | |||
| #define SOLVE_RT_m8n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ | |||
| SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ | |||
| SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ | |||
| SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ | |||
| SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\ | |||
| SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\ | |||
| SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-176,4,5,%1)\ | |||
| SOLVE_le_m8n2(-192,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,-384) | |||
| #define SOLVE_RT_m4n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ | |||
| SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ | |||
| SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-48,4,%1)\ | |||
| SOLVE_le_m4n2(-64,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,-64) | |||
| #define SOLVE_RT_m4n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ | |||
| SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ | |||
| SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ | |||
| SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ | |||
| SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-112,4,%1)\ | |||
| SOLVE_le_m4n2(-128,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,-128) | |||
| #define SOLVE_RT_m4n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ | |||
| SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ | |||
| SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ | |||
| SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ | |||
| SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\ | |||
| SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\ | |||
| SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-176,4,%1)\ | |||
| SOLVE_le_m4n2(-192,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,-192) | |||
| #define SOLVE_RT_m2n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ | |||
| SOLVE_col4_rtol_m2n4(-16,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-32,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-48,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,-32) | |||
| #define SOLVE_RT_m2n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ | |||
| SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m2n4(-80,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-96,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-112,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,-64) | |||
| #define SOLVE_RT_m2n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ | |||
| SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m2n4(-144,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-160,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-176,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-192,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,-96) | |||
| #define SOLVE_RT_m1n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ | |||
| SOLVE_col4_rtol_m1n4(-16,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-32,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-48,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-64,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,-16) | |||
| #define SOLVE_RT_m1n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ | |||
| SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m1n4(-80,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-96,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-112,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-128,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,-32) | |||
| #define SOLVE_RT_m1n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ | |||
| SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m1n4(-144,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-160,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-176,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-192,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,-48) | |||
| /* r14 = b_tail, r15 = a_tail, r13 = k-kk */ | |||
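| /* GEMM_RT walks the packed A and B blocks backwards: %0 and %1 start past the end of the current block and are decremented before every rank-1 update, so the accumulators end up holding -A*B for the trailing k-kk updates. */ | |||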
| #define GEMM_RT_SIMPLE(mdim,ndim) \ | |||
| "leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ | |||
| "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ | |||
| "1"#mdim""#ndim"1:\n\t"\ | |||
| "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\ | |||
| "1"#mdim""#ndim"2:\n\t" | |||
| #define GEMM_RT_m8n4 GEMM_RT_SIMPLE(8,4) | |||
| #define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8) | |||
| #define GEMM_RT_m8n12 \ | |||
| "leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ | |||
| "cmpq $8,%5; jb 18122f;"\ | |||
| "18121:\n\t"\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ | |||
| "18122:\n\t"\ | |||
| "testq %5,%5; jz 18124f;"\ | |||
| "18123:\n\t"\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\ | |||
| "18124:\n\t" | |||
| #define GEMM_RT_m4n4 GEMM_RT_SIMPLE(4,4) | |||
| #define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8) | |||
| #define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12) | |||
| #define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4) | |||
| #define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8) | |||
| #define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12) | |||
| #define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4) | |||
| #define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8) | |||
| #define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12) | |||
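| /* COMPUTE(ndim) handles one ndim-wide column panel: it repositions b_ptr/c_ptr for the panel, walks the rows in blocks of 8/4/2/1 (backward GEMM update followed by the register-resident solve for each block), then restores the pointers and decreases OFF by ndim. */ | |||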
| #define COMPUTE(ndim) {\ | |||
| b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\ | |||
| __asm__ __volatile__(\ | |||
| "movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\ | |||
| "cmpq $8,%%r11; jb "#ndim"772f;"\ | |||
| #ndim"771:\n\t"\ | |||
| GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ | |||
| #ndim"772:\n\t"\ | |||
| "testq $4,%%r11; jz "#ndim"773f;"\ | |||
| GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\ | |||
| #ndim"773:\n\t"\ | |||
| "testq $2,%%r11; jz "#ndim"774f;"\ | |||
| GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\ | |||
| #ndim"774:\n\t"\ | |||
| "testq $1,%%r11; jz "#ndim"775f;"\ | |||
| GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\ | |||
| #ndim"775:\n\t"\ | |||
| "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ | |||
| :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ | |||
| :"r11","r12","r13","r14","r15","cc","memory",\ | |||
| "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ | |||
| a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\ | |||
| } | |||
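| /* Scalar fallback used for the narrow edge panels: the columns of the packed n x n B tile are solved from the last to the first, and every solved column is written to both the packed A buffer and C. The diagonal entries of the packed tile are assumed to be pre-inverted by the packing routine, so the update is x_i = c_i * b_ii_inv followed by c_k -= x_i * b_ik for k < i. */ | |||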
| static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){ | |||
| FLOAT a0, b0; | |||
| int i, j, k; | |||
| for (i=n-1;i>=0;i--) { | |||
| b0 = b[i*n+i]; | |||
| for (j=0;j<m;j++) { | |||
| a0 = c[i*ldc+j] * b0; | |||
| a[i*m+j] = c[i*ldc+j] = a0; | |||
| for (k=0;k<i;k++) c[k*ldc+j] -= a0 * b[i*n+k]; | |||
| } | |||
| } | |||
| } | |||
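| /* Edge handler for an n-chunk (n is 1 or 2 here) that the assembly kernels do not cover: for each row block of 8/4/2/1 it applies the remaining rank-(k-kk) GEMM update with alpha = -1, then calls the scalar solve on the matching packed A / packed B tiles. */ | |||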
| static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) { | |||
| BLASLONG m_count = m, kk = offset; FLOAT *a_ptr = sa, *c_ptr = C; | |||
| for(;m_count>7;m_count-=8){ | |||
| if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); | |||
| solve_RT(8,n,a_ptr+(kk-n)*8,sb+(kk-n)*n,c_ptr,ldc); | |||
| a_ptr += k * 8; c_ptr += 8; | |||
| } | |||
| for(;m_count>3;m_count-=4){ | |||
| if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); | |||
| solve_RT(4,n,a_ptr+(kk-n)*4,sb+(kk-n)*n,c_ptr,ldc); | |||
| a_ptr += k * 4; c_ptr += 4; | |||
| } | |||
| for(;m_count>1;m_count-=2){ | |||
| if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); | |||
| solve_RT(2,n,a_ptr+(kk-n)*2,sb+(kk-n)*n,c_ptr,ldc); | |||
| a_ptr += k * 2; c_ptr += 2; | |||
| } | |||
| if(m_count>0){ | |||
| if(k-kk>0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); | |||
| solve_RT(1,n,a_ptr+(kk-n)*1,sb+(kk-n)*n,c_ptr,ldc); | |||
| a_ptr += k * 1; c_ptr += 1; | |||
| } | |||
| } | |||
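| /* Driver: b_ptr and c_ptr start one full panel past the end and move from right to left. The n%4 remainder (1- and 2-column panels) is handled first by the C edge routine, after which 12-, 8- and 4-column panels are processed with the assembly COMPUTE macro. */ | |||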
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ | |||
| float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C; | |||
| float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; | |||
| float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; | |||
| uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0; | |||
| BLASLONG n_count = n; | |||
| if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;} | |||
| if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;} | |||
| for(;n_count>11;n_count-=12) COMPUTE(12) | |||
| for(;n_count>7;n_count-=8) COMPUTE(8) | |||
| for(;n_count>3;n_count-=4) COMPUTE(4) | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,226 @@ | |||
| /* r11 = m_counter, r12 = k * sizeof(float) (so offsets r12*4 / r12*8 select the next 4-column packed B panels), r13 = kk, r14 = b_head, r15 = a_head */ | |||
| /* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc in bytes, %5 = k_counter */ | |||
| /* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ | |||
| #define init_m8n4(c1,c2,c3,c4)\ | |||
| "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" | |||
| #define INIT_m8n4 init_m8n4(4,5,6,7) | |||
| #define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) | |||
| #define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) | |||
| #define init_m4n4(c1,c2,c3,c4)\ | |||
| "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" | |||
| #define INIT_m4n4 init_m4n4(4,5,6,7) | |||
| #define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) | |||
| #define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) | |||
| #define init_m2n4(c1,c2)\ | |||
| "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" | |||
| #define INIT_m2n4 init_m2n4(4,5) | |||
| #define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) | |||
| #define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) | |||
| #define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" | |||
| #define INIT_m1n4 init_m1n4(4) | |||
| #define INIT_m1n8 INIT_m1n4 init_m1n4(5) | |||
| #define INIT_m1n12 INIT_m1n8 init_m1n4(6) | |||
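| /* Each GEMM_KERNEL_k1* performs one rank-1 update: in the m8/m4 cases the A column is loaded with even and odd lanes duplicated (vmovsldup/vmovshdup) while B is broadcast in pairs, so every accumulator holds two B columns interleaved lane by lane; the m2/m1 cases broadcast A instead. All products are accumulated with vfnmadd, i.e. negated. */ | |||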
| #define GEMM_KERNEL_k1m8n4 \ | |||
| "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ | |||
| "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ | |||
| "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" | |||
| #define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ | |||
| "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ | |||
| "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" | |||
| #define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ | |||
| "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ | |||
| "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" | |||
| #define GEMM_KERNEL_k1m4n4 \ | |||
| "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ | |||
| "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ | |||
| "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" | |||
| #define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ | |||
| "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ | |||
| "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" | |||
| #define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ | |||
| "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ | |||
| "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" | |||
| #define GEMM_KERNEL_k1m2n4 \ | |||
| "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ | |||
| "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" | |||
| #define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ | |||
| "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" | |||
| #define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ | |||
| "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" | |||
| #define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" | |||
| #define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" | |||
| #define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" | |||
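| /* GEMM_SUM_REORDER_*: load the corresponding C elements (two columns per step in the wide cases, with prefetcht1 for m8), rearrange them to match the interleaved accumulator layout, and add them in, so the registers hold C - A*B ready for the solve. */ | |||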
| #define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ | |||
| "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ | |||
| "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ | |||
| "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" | |||
| #define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ | |||
| "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ | |||
| "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ | |||
| "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ | |||
| "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ | |||
| "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ | |||
| "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ | |||
| "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" | |||
| #define GEMM_SUM_REORDER_2x4(c1,c2)\ | |||
| "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ | |||
| "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ | |||
| "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ | |||
| #define GEMM_SUM_REORDER_1x4(c1)\ | |||
| "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_le_m4n2(b_off,c1,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovsldup %%ymm"#c1",%%ymm1;" | |||
| #define SOLVE_le_m8n2(b_off,c1,c2,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" | |||
| #define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SOLVE_ri_m4n2(b_off,c1,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovshdup %%ymm"#c1",%%ymm1;" | |||
| #define SOLVE_ri_m8n2(b_off,c1,c2,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" | |||
| #define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $0,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col2_mul_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $85,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col3_mul_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $170,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $255,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SUBTRACT_m2n4(b_off,c1,c2,...) SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
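| /* SAVE_SOLUTION_*: de-interleave the solved registers and store the block twice, once contiguously into the packed A buffer at a_off(%0) so that later GEMM updates reuse the solved values, and once into C through %3 with column stride %4 (ldc in bytes). */ | |||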
| #define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ | |||
| "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ | |||
| "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ | |||
| "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ | |||
| "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define SAVE_SOLUTION_m4n2(c1,a_off)\ | |||
| "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ | |||
| "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define SAVE_SOLUTION_m1n4(c1,a_off)\ | |||
| "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" | |||