#define MY_ALIGN .align 3 b L8 MY_ALIGN LSGEMM_L8x16_LMAIN_SUB: LOAD8x16_2 MY_ALIGN LSGEMM_L8x16_LOOP: KERNEL8x16_L2 128,64,0,0 LSGEMM_L8x16_K128: KERNEL8x16_L2 128,64,1,0 KERNEL8x16_I1_L4_2 128,64, 1,0 KERNEL8x16_I1_L4_2 128,64, 2,0 KERNEL8x16_I1_L4_2 128,64, 3,0 KERNEL8x16_I1_L4_2 128,64, 4,0 KERNEL8x16_I1_L4_2 128,64, 5,0 KERNEL8x16_I1_L4_2 128,64, 6,0 KERNEL8x16_I1_L4_2 128,64, 7,0 KERNEL8x16_I1_L4_2 128,64, 8,0 KERNEL8x16_I1_L4_2 128,64, 9,0 KERNEL8x16_I1_L4_2 128,64, 10,0 KERNEL8x16_I1_L4_2 128,64, 11,0 KERNEL8x16_I1_L4_2 128,64, 12,0 KERNEL8x16_I1_L4_2 128,64, 13,0 KERNEL8x16_I1_L4_2 128,64, 14,0 KERNEL8x16_I1_L4_2 128,64, 15,0 KERNEL8x16_I1_L4_2 128,64, 16,0 KERNEL8x16_I1_L4_2 128,64, 17,0 KERNEL8x16_I1_L4_2 128,64, 18,0 KERNEL8x16_I1_L4_2 128,64, 19,0 KERNEL8x16_I1_L4_2 128,64, 20,0 KERNEL8x16_I1_L4_2 128,64, 21,0 KERNEL8x16_I1_L4_2 128,64, 22,0 KERNEL8x16_I1_L4_2 128,64, 23,0 KERNEL8x16_I1_L4_2 128,64, 24,0 KERNEL8x16_I1_L4_2 128,64, 25,0 KERNEL8x16_I1_L4_2 128,64, 26,0 KERNEL8x16_I1_L4_2 128,64, 27,0 KERNEL8x16_I1_L4_2 128,64, 28,0 KERNEL8x16_I1_L4_2 128,64, 29,0 KERNEL8x16_I1_L4_2 128,64, 30,0 KERNEL8x16_I1_L4_2 128,64, 31,1 bdnz LSGEMM_L8x16_LOOP MY_ALIGN LSGEMM_L8x16_LOOP_END: END8x16_2 blr MY_ALIGN LSGEMM_L8x16_L64_SUB: LOAD8x16_2 KERNEL8x16_I1_L4_2 128,64, 0,0 KERNEL8x16_I1_L4_2 128,64, 1,0 KERNEL8x16_I1_L4_2 128,64, 2,0 KERNEL8x16_I1_L4_2 128,64,3,0 KERNEL8x16_I1_L4_2 128,64,4,0 KERNEL8x16_I1_L4_2 128,64,5,0 KERNEL8x16_I1_L4_2 128,64,6,0 KERNEL8x16_I1_L4_2 128,64,7,0 KERNEL8x16_I1_L4_2 128,64,8,0 KERNEL8x16_I1_L4_2 128,64,9,0 KERNEL8x16_I1_L4_2 128,64,10,0 KERNEL8x16_I1_L4_2 128,64,11,0 KERNEL8x16_I1_L4_2 128,64,12,0 KERNEL8x16_I1_L4_2 128,64,13,0 KERNEL8x16_I1_L4_2 128,64,14,0 KERNEL8x16_I1_L4_3 128,64,15,1 blr LSGEMM_L8x16_L32_SUB: LOAD8x16_2 KERNEL8x16_I1_L4_2 128,64,0,0 KERNEL8x16_I1_L4_2 128,64,1,0 KERNEL8x16_I1_L4_2 128,64,2,0 KERNEL8x16_I1_L4_2 128,64,3,0 KERNEL8x16_I1_L4_2 128,64,4,0 KERNEL8x16_I1_L4_2 128,64,5,0 KERNEL8x16_I1_L4_2 128,64,6,0 KERNEL8x16_I1_L4_3 128,64,7,1 blr LSGEMM_L8x16_L16_SUB: LOAD8x16_2 KERNEL8x16_I1_L4_2 128,64,0,0 KERNEL8x16_I1_L4_2 128,64,1,0 KERNEL8x16_I1_L4_2 128,64,2,0 KERNEL8x16_I1_L4_3 128,64,3,1 blr L8: #if defined(TRMMKERNEL) && !defined(LEFT) neg TEMP_REG, OFFSET #endif srawi. J, N, 3 ble LSGEMM_L8_END LSGEMM_L8_BEGIN: li T1, 128 li T2, 256 mr AO, A mr CO, C slwi T3, LDC , 3 add C, C, T3 dcbt A, T1 dcbt A, T2 #if defined(TRMMKERNEL) && defined(LEFT) mr TEMP_REG, OFFSET /*off = offset;*/ #endif srawi. I, M, 4 ble LSGEMM_L8x16_END MY_ALIGN LSGEMM_L8x16_BEGIN: #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 mr T12, T11 addi T12,T12, -2 srawi. L, T12, 7 /**(T11-2) % 128x */ #else mr T12, K addi T12,T12, -2 srawi. L, T12, 7 /**(K-2) % 128x */ #endif ZERO8x16 ble LSGEMM_L8x16_SUB0 mtctr L bl LSGEMM_L8x16_LMAIN_SUB andi. L, T12, 127 ble LSGEMM_L8x16_SAVE b LSGEMM_L8x16_SUB2 MY_ALIGN LSGEMM_L8x16_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 255 cmpwi T11,129 #else andi. L, K, 255 cmpwi K,129 #endif li T10,1 bne CMP8x16_128K addi BO,BO,-32 addi AO,AO,-64 LOAD8x16 64,32 END8x16_WITHOUT_ADD LOAD8x16_2O AO,BO, 128, 64 mtctr T10 bl LSGEMM_L8x16_K128 b LSGEMM_L8x16_SAVE CMP8x16_128K: /*----------------------------------------*/ #if defined(TRMMKERNEL) cmpwi T11,128 #else cmpwi K,128 #endif bne LSGEMM_L8x16_SUB2 MY_ALIGN mtctr T10 addi BO,BO,-64 addi AO,AO,-128 LOAD8x16_2O AO,BO, 128,64 bl LSGEMM_L8x16_K128 b LSGEMM_L8x16_SAVE MY_ALIGN LSGEMM_L8x16_SUB2: andi. T10,L,64 ble LSGEMM_L8x16_SUB2_32 bl LSGEMM_L8x16_L64_SUB MY_ALIGN LSGEMM_L8x16_SUB2_32: andi. T10,L, 32 ble LSGEMM_L8x16_SUB2_16 bl LSGEMM_L8x16_L32_SUB MY_ALIGN LSGEMM_L8x16_SUB2_16: andi. T10,L, 16 ble LSGEMM_L8x16_SUB2_8 bl LSGEMM_L8x16_L16_SUB MY_ALIGN LSGEMM_L8x16_SUB2_8: andi. T10,L, 8 ble LSGEMM_L8x16_SUB2_4 LOAD8x16_2 KERNEL8x16_I1_L4_2 128,64, 0,0 KERNEL8x16_I1_L4_3 128,64, 1,1 MY_ALIGN LSGEMM_L8x16_SUB2_4: andi. T10,L, 4 ble LSGEMM_L8x16_SUB2_2 LOAD8x16_2 KERNEL8x16_I1_L4_3 128,64, 0,1 MY_ALIGN LSGEMM_L8x16_SUB2_2: andi. T10,L, 2 ble LSGEMM_L8x16_SUB2_1 LOAD8x16_2 KERNEL8x16_E2 128,64, 0,1 MY_ALIGN LSGEMM_L8x16_SUB2_1: andi. T10,L, 1 ble LSGEMM_L8x16_SAVE KERNEL8x16 0 MY_ALIGN LSGEMM_L8x16_SAVE: SAVE8x16 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 #endif addic. I, I, -1 bgt+ LSGEMM_L8x16_BEGIN MY_ALIGN LSGEMM_L8x16_END: LSGEMM_L8x8_BEGIN: andi. T2, M, 15 ble LSGEMM_L8x1_END andi. T1, M, 8 ble LSGEMM_L8x8_END #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 mr T12, T11 addi T12,T12, -1 srawi. L, T12, 4 /**(T11-1) % 16x */ #else mr T12, K addi T12,T12, -1 srawi. L, T12, 4 /**(K-1) % 16x */ #endif ZERO8x8 ble LSGEMM_L8x8_SUB0 MY_ALIGN LSGEMM_L8x8_LOOP_START: LOAD8x8_0 /*we already zeroed */ mtctr L MY_ALIGN LSGEMM_L8x8_LOOP: KERNEL8x8_I1_L4_2 32,32, 0,0 KERNEL8x8_I1_L4_2 32,32, 1,0 KERNEL8x8_I1_L4_2 32,32, 2,0 KERNEL8x8_I1_L4_2 32,32, 3,1 bdnz LSGEMM_L8x8_LOOP MY_ALIGN LSGEMM_L8x8_LOOP_END: END8x8 0, AO, BO, 32, 32 b LSGEMM_L8x8_SUB1 MY_ALIGN LSGEMM_L8x8_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 31 #else andi. L, K, 31 #endif b LSGEMM_L8x8_SUB2 MY_ALIGN LSGEMM_L8x8_SUB1: #if defined(TRMMKERNEL) andi. L, T12, 15 #else andi. L, T12, 15 #endif ble LSGEMM_L8x8_SAVE MY_ALIGN LSGEMM_L8x8_SUB2: srawi. T1,L, 3 ble LSGEMM_L8x8_SUB2_4 mtctr T1 MY_ALIGN LSGEMM_L8x8_SUB2_LOOP: LOAD8x8_0 KERNEL8x8_I1_L4_2 32,32, 0,0 KERNEL8x8_I1_L4_3 32,32, 1,1 bdnz LSGEMM_L8x8_SUB2_LOOP MY_ALIGN LSGEMM_L8x8_SUB2_4: andi. T1,L, 4 ble LSGEMM_L8x8_SUB2_2 LOAD8x8_0 KERNEL8x8_I1_L4_3 32,32, 0,1 MY_ALIGN LSGEMM_L8x8_SUB2_2: andi. T1,L, 2 ble LSGEMM_L8x8_SUB2_1 LOAD8x8_0 KERNEL8x8_I1_L2_3 32,32, 0,1 MY_ALIGN LSGEMM_L8x8_SUB2_1: andi. T1,L, 1 ble LSGEMM_L8x8_SAVE KERNEL8x8 0 MY_ALIGN LSGEMM_L8x8_SAVE: SAVE8x8 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 #endif MY_ALIGN LSGEMM_L8x8_END: LSGEMM_L8x4_BEGIN: andi. T2, M, 15 ble LSGEMM_L8x1_END andi. T1, M, 4 ble LSGEMM_L8x4_END #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 mr T12, T11 addi T12,T12, -1 srawi. L, T12, 4 /**(T11-1) % 16x */ #else mr T12, K addi T12,T12, -1 srawi. L, T12, 4 /**(K-1) % 16x */ #endif ZERO8x4 ble LSGEMM_L8x4_SUB0 MY_ALIGN LSGEMM_L8x4_LOOP_START: LOAD8x4_0 /*we already zeroed */ mtctr L MY_ALIGN LSGEMM_L8x4_LOOP: KERNEL8x4_I1_L4_2 16,32, 0,0 KERNEL8x4_I1_L4_2 16,32, 1,0 KERNEL8x4_I1_L4_2 16,32, 2,0 KERNEL8x4_I1_L4_2 16,32, 3,1 bdnz LSGEMM_L8x4_LOOP MY_ALIGN LSGEMM_L8x4_LOOP_END: END8x4 0, AO, BO, 16, 32 b LSGEMM_L8x4_SUB1 MY_ALIGN LSGEMM_L8x4_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 31 #else andi. L, K, 31 #endif b LSGEMM_L8x4_SUB2 MY_ALIGN LSGEMM_L8x4_SUB1: #if defined(TRMMKERNEL) andi. L, T12, 15 #else andi. L, T12, 15 #endif ble LSGEMM_L8x4_SAVE MY_ALIGN LSGEMM_L8x4_SUB2: srawi. T1,L, 3 ble LSGEMM_L8x4_SUB2_4 mtctr T1 MY_ALIGN LSGEMM_L8x4_SUB2_LOOP: LOAD8x4_0 KERNEL8x4_I1_L4_2 16,32, 0,0 KERNEL8x4_I1_L4_3 16,32, 1,1 bdnz LSGEMM_L8x4_SUB2_LOOP MY_ALIGN LSGEMM_L8x4_SUB2_4: andi. T1,L, 4 ble LSGEMM_L8x4_SUB2_2 LOAD8x4_0 KERNEL8x4_I1_L4_3 16,32, 0,1 MY_ALIGN LSGEMM_L8x4_SUB2_2: andi. T1,L, 2 ble LSGEMM_L8x4_SUB2_1 LOAD8x4_0 KERNEL8x4_I1_L2_3 16,32, 0,1 MY_ALIGN LSGEMM_L8x4_SUB2_1: andi. T1,L, 1 ble LSGEMM_L8x4_SAVE KERNEL8x4 0 MY_ALIGN LSGEMM_L8x4_SAVE: SAVE8x4 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 #endif MY_ALIGN LSGEMM_L8x4_END: LSGEMM_L8x2_BEGIN: andi. T1, M, 2 ble LSGEMM_L8x2_END #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 srawi. L, T11, 3 /**(T11) % 8x */ #else srawi. L, K, 3 /**(K) % 8x */ #endif ZERO8x2 ble LSGEMM_L8x2_SUB0 MY_ALIGN LSGEMM_L8x2_LOOP_START: mtctr L MY_ALIGN LSGEMM_L8x2_LOOP: KERNEL8x2_2 0,0, 0,0 KERNEL8x2_2 0,0, 1,0 KERNEL8x2_2 0,0, 2,0 KERNEL8x2_2 0,0, 3,1 bdnz LSGEMM_L8x2_LOOP MY_ALIGN LSGEMM_L8x2_LOOP_END: LSGEMM_L8x2_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 7 #else andi. L, K, 7 #endif ble LSGEMM_L8x2_SAVE MY_ALIGN LSGEMM_L8x2_SUB2: andi. T1,L, 4 ble LSGEMM_L8x2_SUB2_2 KERNEL8x2_2 0,0, 0,0 KERNEL8x2_2 0,0, 1,1 MY_ALIGN LSGEMM_L8x2_SUB2_2: andi. T1,L, 2 ble LSGEMM_L8x2_SUB2_1 KERNEL8x2_2 0,0, 0,1 MY_ALIGN LSGEMM_L8x2_SUB2_1: andi. T1,L, 1 ble LSGEMM_L8x2_SAVE KERNEL8x2 MY_ALIGN LSGEMM_L8x2_SAVE: SAVE8x2 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 #endif MY_ALIGN LSGEMM_L8x2_END: LSGEMM_L8x1_BEGIN: andi. T1, M, 1 ble LSGEMM_L8x1_END #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 srawi. L, T11, 3 /**(T11) % 8x */ #else srawi. L, K, 3 /**(K) % 8x */ #endif ZERO8x1 ble LSGEMM_L8x1_SUB0 MY_ALIGN LSGEMM_L8x1_LOOP_START: mtctr L MY_ALIGN LSGEMM_L8x1_LOOP: KERNEL8x1_4 0,0, 0,0 KERNEL8x1_4 0,0, 1,1 bdnz LSGEMM_L8x1_LOOP MY_ALIGN LSGEMM_L8x1_LOOP_END: LSGEMM_L8x1_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 7 #else andi. L, K, 7 #endif ble LSGEMM_L8x1_SAVE MY_ALIGN LSGEMM_L8x1_SUB2: andi. T1,L, 4 ble LSGEMM_L8x1_SUB2_2 KERNEL8x1_4 0,0, 0,1 MY_ALIGN LSGEMM_L8x1_SUB2_2: andi. T1,L, 2 ble LSGEMM_L8x1_SUB2_1 KERNEL8x1_2 MY_ALIGN LSGEMM_L8x1_SUB2_1: andi. T1,L, 1 ble LSGEMM_L8x1_SAVE KERNEL8x1 MY_ALIGN LSGEMM_L8x1_SAVE: SAVE8x1 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 #endif MY_ALIGN LSGEMM_L8x1_END: slwi T1, K, 5 add B, B, T1 #if defined(TRMMKERNEL) && !defined(LEFT) addi TEMP_REG, TEMP_REG, 8 #endif addic. J, J, -1 bgt LSGEMM_L8_BEGIN LSGEMM_L8_END: /* b LSGEMM_L4_BEGIN*/ andi. T1, N, 4 ble LSGEMM_L4_END LSGEMM_L4_BEGIN: mr AO, A mr CO, C slwi T3, LDC , 2 add C, C, T3 #if defined(TRMMKERNEL) && defined(LEFT) mr TEMP_REG, OFFSET /*off = offset;*/ #endif srawi. I, M, 4 ble LSGEMM_L4x16_END MY_ALIGN LSGEMM_L4x16_BEGIN: #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 mr T12, T11 addi T12,T12, -1 srawi. L, T12, 6 /**(T11-1) % 64x */ #else mr T12, K addi T12,T12, -1 srawi. L, T12, 6 /**(K-1) % 64x */ #endif ZERO4x16 ble LSGEMM_L4x16_SUB0 MY_ALIGN LSGEMM_L4x16_LOOP_START: LOAD4x16_0 /*we already zeroed */ ##OffsetA=64 OffsetB=16 addi AO,AO,2112 addi BO,BO,16 mtctr L MY_ALIGN LSGEMM_L4x16_LOOP: KERNEL4x16_I1_L4_2 -2048,0, 0,0 KERNEL4x16_I1_L4_2 -2048,0, 1,0 KERNEL4x16_I1_L4_2 -2048,0, 2,0 KERNEL4x16_I1_L4_2 -2048,0, 3,0 KERNEL4x16_I1_L4_2 -2048,0, 4,0 KERNEL4x16_I1_L4_2 -2048,0, 5,0 KERNEL4x16_I1_L4_2 -2048,0, 6,0 KERNEL4x16_I1_L4_2 -2048,0, 7,0 KERNEL4x16_I1_L4_2 -2048,0, 8,0 KERNEL4x16_I1_L4_2 -2048,0, 9,0 KERNEL4x16_I1_L4_2 -2048,0, 10,0 KERNEL4x16_I1_L4_2 -2048,0, 11,0 KERNEL4x16_I1_L4_2 -2048,0, 12,0 KERNEL4x16_I1_L4_2 -2048,0, 13,0 KERNEL4x16_I1_L4_2 -2048,0, 14,0 KERNEL4x16_I1_L4_2 -2048,0, 15,1 bdnz LSGEMM_L4x16_LOOP MY_ALIGN LSGEMM_L4x16_LOOP_END: END4x16 0, AO, BO, -2048, 0 b LSGEMM_L4x16_SUB1 MY_ALIGN LSGEMM_L4x16_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 127 #else andi. L, K, 127 #endif b LSGEMM_L4x16_SUB2 MY_ALIGN LSGEMM_L4x16_SUB1: #if defined(TRMMKERNEL) andi. L, T12, 63 #else andi. L, T12, 63 #endif ble LSGEMM_L4x16_SAVE MY_ALIGN LSGEMM_L4x16_SUB2: srawi. T10,L, 5 ble LSGEMM_L4x16_SUB2_16 mtctr T10 MY_ALIGN LSGEMM_L4x16_SUB2_LOOP: LOAD4x16_0 KERNEL4x16_I1_L4_2 64,16, 0,0 KERNEL4x16_I1_L4_2 64,16, 1,0 KERNEL4x16_I1_L4_2 64,16, 2,0 KERNEL4x16_I1_L4_2 64,16, 3,0 KERNEL4x16_I1_L4_2 64,16, 4,0 KERNEL4x16_I1_L4_2 64,16, 5,0 KERNEL4x16_I1_L4_2 64,16, 6,0 KERNEL4x16_I1_L4_3 64,16, 7,1 bdnz LSGEMM_L4x16_SUB2_LOOP MY_ALIGN LSGEMM_L4x16_SUB2_16: andi. T10,L, 16 ble LSGEMM_L4x16_SUB2_8 LOAD4x16_0 KERNEL4x16_I1_L4_2 64,16, 0,0 KERNEL4x16_I1_L4_2 64,16, 1,0 KERNEL4x16_I1_L4_2 64,16, 2,0 KERNEL4x16_I1_L4_3 64,16, 3,1 MY_ALIGN LSGEMM_L4x16_SUB2_8: andi. T10,L, 8 ble LSGEMM_L4x16_SUB2_4 LOAD4x16_0 KERNEL4x16_I1_L4_2 64,16, 0,0 KERNEL4x16_I1_L4_3 64,16, 1,1 MY_ALIGN LSGEMM_L4x16_SUB2_4: andi. T10,L, 4 ble LSGEMM_L4x16_SUB2_2 LOAD4x16_0 KERNEL4x16_I1_L4_3 64,16, 0,1 MY_ALIGN LSGEMM_L4x16_SUB2_2: andi. T10,L, 2 ble LSGEMM_L4x16_SUB2_1 LOAD4x16_0 KERNEL4x16_I1_L2_3 64,16, 0,1 MY_ALIGN LSGEMM_L4x16_SUB2_1: andi. T10,L, 1 ble LSGEMM_L4x16_SAVE KERNEL4x16 0 # addic. L, L, -1 # bgt LSGEMM_L4x16_SUB2 MY_ALIGN LSGEMM_L4x16_SAVE: SAVE4x16 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 #endif addic. I, I, -1 bgt+ LSGEMM_L4x16_BEGIN MY_ALIGN LSGEMM_L4x16_END: LSGEMM_L4x8_BEGIN: andi. T2, M, 15 ble LSGEMM_L4x1_END andi. T1, M, 8 ble LSGEMM_L4x8_END #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 mr T12, T11 addi T12,T12, -1 srawi. L, T12, 4 /**(T11-1) % 16x */ #else mr T12, K addi T12,T12, -1 srawi. L, T12, 4 /**(K-1) % 16x */ #endif ZERO4x8 ble LSGEMM_L4x8_SUB0 MY_ALIGN LSGEMM_L4x8_LOOP_START: LOAD4x8_0 /*we already zeroed */ mtctr L MY_ALIGN LSGEMM_L4x8_LOOP: KERNEL4x8_I1_L4_2 32,16, 0,0 KERNEL4x8_I1_L4_2 32,16, 1,0 KERNEL4x8_I1_L4_2 32,16, 2,0 KERNEL4x8_I1_L4_2 32,16, 3,1 bdnz LSGEMM_L4x8_LOOP MY_ALIGN LSGEMM_L4x8_LOOP_END: END4x8 0, AO, BO, 32, 16 b LSGEMM_L4x8_SUB1 MY_ALIGN LSGEMM_L4x8_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 31 #else andi. L, K, 31 #endif b LSGEMM_L4x8_SUB2 MY_ALIGN LSGEMM_L4x8_SUB1: #if defined(TRMMKERNEL) andi. L, T12, 15 #else andi. L, T12, 15 #endif ble LSGEMM_L4x8_SAVE MY_ALIGN LSGEMM_L4x8_SUB2: srawi. T1,L, 3 ble LSGEMM_L4x8_SUB2_4 mtctr T1 MY_ALIGN LSGEMM_L4x8_SUB2_LOOP: LOAD4x8_0 KERNEL4x8_I1_L4_2 32,16, 0,0 KERNEL4x8_I1_L4_3 32,16, 1,1 bdnz LSGEMM_L4x8_SUB2_LOOP MY_ALIGN LSGEMM_L4x8_SUB2_4: andi. T1,L, 4 ble LSGEMM_L4x8_SUB2_2 LOAD4x8_0 KERNEL4x8_I1_L4_3 32,16, 0,1 MY_ALIGN LSGEMM_L4x8_SUB2_2: andi. T1,L, 2 ble LSGEMM_L4x8_SUB2_1 LOAD4x8_0 KERNEL4x8_I1_L2_3 32,16, 0,1 MY_ALIGN LSGEMM_L4x8_SUB2_1: andi. T1,L, 1 ble LSGEMM_L4x8_SAVE KERNEL4x8 0 MY_ALIGN LSGEMM_L4x8_SAVE: SAVE4x8 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 #endif MY_ALIGN LSGEMM_L4x8_END: LSGEMM_L4x4_BEGIN: andi. T2, M, 15 ble LSGEMM_L4x1_END andi. T1, M, 4 ble LSGEMM_L4x4_END #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 mr T12, T11 addi T12,T12, -1 srawi. L, T12, 4 /**(T11-1) % 16x */ #else mr T12, K addi T12,T12, -1 srawi. L, T12, 4 /**(K-1) % 16x */ #endif ZERO4x4 ble LSGEMM_L4x4_SUB0 MY_ALIGN LSGEMM_L4x4_LOOP_START: LOAD4x4_0 /*we already zeroed */ mtctr L MY_ALIGN LSGEMM_L4x4_LOOP: KERNEL4x4_I1_L4_2 16,16, 0,0 KERNEL4x4_I1_L4_2 16,16, 1,0 KERNEL4x4_I1_L4_2 16,16, 2,0 KERNEL4x4_I1_L4_2 16,16, 3,1 bdnz LSGEMM_L4x4_LOOP MY_ALIGN LSGEMM_L4x4_LOOP_END: END4x4 0, AO, BO, 16, 16 b LSGEMM_L4x4_SUB1 MY_ALIGN LSGEMM_L4x4_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 31 #else andi. L, K, 31 #endif b LSGEMM_L4x4_SUB2 MY_ALIGN LSGEMM_L4x4_SUB1: #if defined(TRMMKERNEL) andi. L, T12, 15 #else andi. L, T12, 15 #endif ble LSGEMM_L4x4_SAVE MY_ALIGN LSGEMM_L4x4_SUB2: srawi. T1,L, 3 ble LSGEMM_L4x4_SUB2_4 mtctr T1 MY_ALIGN LSGEMM_L4x4_SUB2_LOOP: LOAD4x4_0 KERNEL4x4_I1_L4_2 16,16, 0,0 KERNEL4x4_I1_L4_3 16,16, 1,1 bdnz LSGEMM_L4x4_SUB2_LOOP MY_ALIGN LSGEMM_L4x4_SUB2_4: andi. T1,L, 4 ble LSGEMM_L4x4_SUB2_2 LOAD4x4_0 KERNEL4x4_I1_L4_3 16,16, 0,1 MY_ALIGN LSGEMM_L4x4_SUB2_2: andi. T1,L, 2 ble LSGEMM_L4x4_SUB2_1 LOAD4x4_0 KERNEL4x4_I1_L2_3 16,16, 0,1 MY_ALIGN LSGEMM_L4x4_SUB2_1: andi. T1,L, 1 ble LSGEMM_L4x4_SAVE KERNEL4x4 0 MY_ALIGN LSGEMM_L4x4_SAVE: SAVE4x4 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 #endif MY_ALIGN LSGEMM_L4x4_END: LSGEMM_L4x2_BEGIN: andi. T1, M, 2 ble LSGEMM_L4x2_END #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 srawi. L, T11, 3 /**(T11) % 8x */ #else srawi. L, K, 3 /**(K) % 8x */ #endif ZERO4x2 ble LSGEMM_L4x2_SUB0 MY_ALIGN LSGEMM_L4x2_LOOP_START: mtctr L MY_ALIGN LSGEMM_L4x2_LOOP: KERNEL4x2_2 0,0, 0,0 KERNEL4x2_2 0,0, 1,0 KERNEL4x2_2 0,0, 2,0 KERNEL4x2_2 0,0, 3,1 bdnz LSGEMM_L4x2_LOOP MY_ALIGN LSGEMM_L4x2_LOOP_END: LSGEMM_L4x2_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 7 #else andi. L, K, 7 #endif ble LSGEMM_L4x2_SAVE MY_ALIGN LSGEMM_L4x2_SUB2: andi. T1,L, 4 ble LSGEMM_L4x2_SUB2_2 KERNEL4x2_2 0,0, 0,0 KERNEL4x2_2 0,0, 1,1 MY_ALIGN LSGEMM_L4x2_SUB2_2: andi. T1,L, 2 ble LSGEMM_L4x2_SUB2_1 KERNEL4x2_2 0,0, 0,1 MY_ALIGN LSGEMM_L4x2_SUB2_1: andi. T1,L, 1 ble LSGEMM_L4x2_SAVE KERNEL4x2 MY_ALIGN LSGEMM_L4x2_SAVE: SAVE4x2 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 #endif MY_ALIGN LSGEMM_L4x2_END: LSGEMM_L4x1_BEGIN: andi. T1, M, 1 ble LSGEMM_L4x1_END #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 srawi. L, T11, 3 /**(T11) % 8x */ #else srawi. L, K, 3 /**(K) % 8x */ #endif ZERO4x1 ble LSGEMM_L4x1_SUB0 MY_ALIGN LSGEMM_L4x1_LOOP_START: mtctr L MY_ALIGN LSGEMM_L4x1_LOOP: KERNEL4x1_4 0,0, 0,0 KERNEL4x1_4 0,0, 1,1 bdnz LSGEMM_L4x1_LOOP MY_ALIGN LSGEMM_L4x1_LOOP_END: LSGEMM_L4x1_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 7 #else andi. L, K, 7 #endif ble LSGEMM_L4x1_SAVE MY_ALIGN LSGEMM_L4x1_SUB2: andi. T1,L, 4 ble LSGEMM_L4x1_SUB2_2 KERNEL4x1_4 0,0, 0,1 MY_ALIGN LSGEMM_L4x1_SUB2_2: andi. T1,L, 2 ble LSGEMM_L4x1_SUB2_1 KERNEL4x1_2 MY_ALIGN LSGEMM_L4x1_SUB2_1: andi. T1,L, 1 ble LSGEMM_L4x1_SAVE KERNEL4x1 MY_ALIGN LSGEMM_L4x1_SAVE: SAVE4x1 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 #endif MY_ALIGN LSGEMM_L4x1_END: slwi T1, K, 4 add B, B, T1 #if defined(TRMMKERNEL) && !defined(LEFT) addi TEMP_REG, TEMP_REG, 4 #endif andi. T2, N, 3 ble .L999 LSGEMM_L4_END: andi. T1, N, 2 ble LSGEMM_L2_END LSGEMM_L2_BEGIN: mr AO, A mr CO, C slwi T3, LDC , 1 add C, C, T3 #if defined(TRMMKERNEL) && defined(LEFT) mr TEMP_REG, OFFSET /*off = offset;*/ #endif srawi. I, M, 4 ble LSGEMM_L2x16_END MY_ALIGN LSGEMM_L2x16_BEGIN: #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 srawi. L, T11, 6 /**(T11 ) % 64x */ #else srawi. L, K, 6 /**(K ) % 64x */ #endif ZERO2x16 ble LSGEMM_L2x16_SUB0 addi AO,AO,2048 mtctr L MY_ALIGN LSGEMM_L2x16_LOOP: KERNEL2x16_4 -2048,0, 0,0 KERNEL2x16_4 -2048,0, 1,0 KERNEL2x16_4 -2048,0, 2,0 KERNEL2x16_4 -2048,0, 3,0 KERNEL2x16_4 -2048,0, 4,0 KERNEL2x16_4 -2048,0, 5,0 KERNEL2x16_4 -2048,0, 6,0 KERNEL2x16_4 -2048,0, 7,0 KERNEL2x16_4 -2048,0, 8,0 KERNEL2x16_4 -2048,0, 9,0 KERNEL2x16_4 -2048,0, 10,0 KERNEL2x16_4 -2048,0, 11,0 KERNEL2x16_4 -2048,0, 12,0 KERNEL2x16_4 -2048,0, 13,0 KERNEL2x16_4 -2048,0, 14,0 KERNEL2x16_4 -2048,0, 15,1 bdnz LSGEMM_L2x16_LOOP MY_ALIGN addi AO,AO, -2048 MY_ALIGN LSGEMM_L2x16_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 63 #else andi. L, K, 63 #endif ble LSGEMM_L2x16_SAVE MY_ALIGN LSGEMM_L2x16_SUB2: andi. T10,L, 32 ble LSGEMM_L2x16_SUB2_16 KERNEL2x16_4 0,0, 0,0 KERNEL2x16_4 0,0, 1,0 KERNEL2x16_4 0,0, 2,0 KERNEL2x16_4 0,0, 3,0 KERNEL2x16_4 0,0, 4,0 KERNEL2x16_4 0,0, 5,0 KERNEL2x16_4 0,0, 6,0 KERNEL2x16_4 0,0, 7,1 MY_ALIGN LSGEMM_L2x16_SUB2_16: andi. T10,L, 16 ble LSGEMM_L2x16_SUB2_8 KERNEL2x16_4 0,0, 0,0 KERNEL2x16_4 0,0, 1,0 KERNEL2x16_4 0,0, 2,0 KERNEL2x16_4 0,0, 3,1 MY_ALIGN LSGEMM_L2x16_SUB2_8: andi. T10,L, 8 ble LSGEMM_L2x16_SUB2_4 KERNEL2x16_4 0,0, 0,0 KERNEL2x16_4 0,0, 1,1 MY_ALIGN LSGEMM_L2x16_SUB2_4: andi. T10,L, 4 ble LSGEMM_L2x16_SUB2_2 KERNEL2x16_4 0,0, 0,1 MY_ALIGN LSGEMM_L2x16_SUB2_2: andi. T10,L, 2 ble LSGEMM_L2x16_SUB2_1 KERNEL2x16_2 0,0, 0,1 MY_ALIGN LSGEMM_L2x16_SUB2_1: andi. T10,L, 1 ble LSGEMM_L2x16_SAVE KERNEL2x16 MY_ALIGN LSGEMM_L2x16_SAVE: SAVE2x16 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 #endif addic. I, I, -1 bgt+ LSGEMM_L2x16_BEGIN MY_ALIGN LSGEMM_L2x16_END: andi. I, M, 8 ble LSGEMM_L2x8_END MY_ALIGN LSGEMM_L2x8_BEGIN: #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 srawi. L, T11, 6 /**(T11 ) % 64x */ #else srawi. L, K, 6 /**(K ) % 64x */ #endif ZERO2x8 ble LSGEMM_L2x8_SUB0 addi AO,AO,2048 mtctr L MY_ALIGN LSGEMM_L2x8_LOOP: KERNEL2x8_4 -2048,0, 0,0 KERNEL2x8_4 -2048,0, 1,0 KERNEL2x8_4 -2048,0, 2,0 KERNEL2x8_4 -2048,0, 3,0 KERNEL2x8_4 -2048,0, 4,0 KERNEL2x8_4 -2048,0, 5,0 KERNEL2x8_4 -2048,0, 6,0 KERNEL2x8_4 -2048,0, 7,0 KERNEL2x8_4 -2048,0, 8,0 KERNEL2x8_4 -2048,0, 9,0 KERNEL2x8_4 -2048,0, 10,0 KERNEL2x8_4 -2048,0, 11,0 KERNEL2x8_4 -2048,0, 12,0 KERNEL2x8_4 -2048,0, 13,0 KERNEL2x8_4 -2048,0, 14,0 KERNEL2x8_4 -2048,0, 15,1 bdnz LSGEMM_L2x8_LOOP MY_ALIGN addi AO,AO, -2048 MY_ALIGN LSGEMM_L2x8_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 63 #else andi. L, K, 63 #endif ble LSGEMM_L2x8_SAVE MY_ALIGN LSGEMM_L2x8_SUB2: andi. T10,L, 32 ble LSGEMM_L2x8_SUB2_16 KERNEL2x8_4 0,0, 0,0 KERNEL2x8_4 0,0, 1,0 KERNEL2x8_4 0,0, 2,0 KERNEL2x8_4 0,0, 3,0 KERNEL2x8_4 0,0, 4,0 KERNEL2x8_4 0,0, 5,0 KERNEL2x8_4 0,0, 6,0 KERNEL2x8_4 0,0, 7,1 MY_ALIGN LSGEMM_L2x8_SUB2_16: andi. T10,L, 16 ble LSGEMM_L2x8_SUB2_8 KERNEL2x8_4 0,0, 0,0 KERNEL2x8_4 0,0, 1,0 KERNEL2x8_4 0,0, 2,0 KERNEL2x8_4 0,0, 3,1 MY_ALIGN LSGEMM_L2x8_SUB2_8: andi. T10,L, 8 ble LSGEMM_L2x8_SUB2_4 KERNEL2x8_4 0,0, 0,0 KERNEL2x8_4 0,0, 1,1 MY_ALIGN LSGEMM_L2x8_SUB2_4: andi. T10,L, 4 ble LSGEMM_L2x8_SUB2_2 KERNEL2x8_4 0,0, 0,1 MY_ALIGN LSGEMM_L2x8_SUB2_2: andi. T10,L, 2 ble LSGEMM_L2x8_SUB2_1 KERNEL2x8_2 0,0, 0,1 MY_ALIGN LSGEMM_L2x8_SUB2_1: andi. T10,L, 1 ble LSGEMM_L2x8_SAVE KERNEL2x8 MY_ALIGN LSGEMM_L2x8_SAVE: SAVE2x8 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 #endif MY_ALIGN LSGEMM_L2x8_END: andi. I, M, 4 ble LSGEMM_L2x4_END MY_ALIGN LSGEMM_L2x4_BEGIN: #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 srawi. L, T11, 6 /**(T11 ) % 64x */ #else srawi. L, K, 6 /**(K ) % 64x */ #endif ZERO2x4 ble LSGEMM_L2x4_SUB0 mtctr L MY_ALIGN LSGEMM_L2x4_LOOP: KERNEL2x4_4 0,0, 0,0 KERNEL2x4_4 0,0, 1,0 KERNEL2x4_4 0,0, 2,0 KERNEL2x4_4 0,0, 3,0 KERNEL2x4_4 0,0, 4,0 KERNEL2x4_4 0,0, 5,0 KERNEL2x4_4 0,0, 6,0 KERNEL2x4_4 0,0, 7,0 KERNEL2x4_4 0,0, 8,0 KERNEL2x4_4 0,0, 9,0 KERNEL2x4_4 0,0, 10,0 KERNEL2x4_4 0,0, 11,0 KERNEL2x4_4 0,0, 12,0 KERNEL2x4_4 0,0, 13,0 KERNEL2x4_4 0,0, 14,0 KERNEL2x4_4 0,0, 15,1 bdnz LSGEMM_L2x4_LOOP MY_ALIGN MY_ALIGN LSGEMM_L2x4_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 63 #else andi. L, K, 63 #endif ble LSGEMM_L2x4_SAVE MY_ALIGN LSGEMM_L2x4_SUB2: andi. T10,L, 32 ble LSGEMM_L2x4_SUB2_16 KERNEL2x4_4 0,0, 0,0 KERNEL2x4_4 0,0, 1,0 KERNEL2x4_4 0,0, 2,0 KERNEL2x4_4 0,0, 3,0 KERNEL2x4_4 0,0, 4,0 KERNEL2x4_4 0,0, 5,0 KERNEL2x4_4 0,0, 6,0 KERNEL2x4_4 0,0, 7,1 MY_ALIGN LSGEMM_L2x4_SUB2_16: andi. T10,L, 16 ble LSGEMM_L2x4_SUB2_8 KERNEL2x4_4 0,0, 0,0 KERNEL2x4_4 0,0, 1,0 KERNEL2x4_4 0,0, 2,0 KERNEL2x4_4 0,0, 3,1 MY_ALIGN LSGEMM_L2x4_SUB2_8: andi. T10,L, 8 ble LSGEMM_L2x4_SUB2_4 KERNEL2x4_4 0,0, 0,0 KERNEL2x4_4 0,0, 1,1 MY_ALIGN LSGEMM_L2x4_SUB2_4: andi. T10,L, 4 ble LSGEMM_L2x4_SUB2_2 KERNEL2x4_4 0,0, 0,1 MY_ALIGN LSGEMM_L2x4_SUB2_2: andi. T10,L, 2 ble LSGEMM_L2x4_SUB2_1 KERNEL2x4_2 0,0, 0,1 MY_ALIGN LSGEMM_L2x4_SUB2_1: andi. T10,L, 1 ble LSGEMM_L2x4_SAVE KERNEL2x4 MY_ALIGN LSGEMM_L2x4_SAVE: SAVE2x4 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 #endif MY_ALIGN LSGEMM_L2x4_END: andi. I, M, 2 ble LSGEMM_L2x2_END MY_ALIGN LSGEMM_L2x2_BEGIN: #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 srawi. L, T11, 6 /**(T11 ) % 64x */ #else srawi. L, K, 6 /**(K ) % 64x */ #endif ZERO2x2 ble LSGEMM_L2x2_SUB0 mtctr L MY_ALIGN LSGEMM_L2x2_LOOP: KERNEL2x2_4 0,0, 0,0 KERNEL2x2_4 0,0, 1,0 KERNEL2x2_4 0,0, 2,0 KERNEL2x2_4 0,0, 3,0 KERNEL2x2_4 0,0, 4,0 KERNEL2x2_4 0,0, 5,0 KERNEL2x2_4 0,0, 6,0 KERNEL2x2_4 0,0, 7,0 KERNEL2x2_4 0,0, 8,0 KERNEL2x2_4 0,0, 9,0 KERNEL2x2_4 0,0, 10,0 KERNEL2x2_4 0,0, 11,0 KERNEL2x2_4 0,0, 12,0 KERNEL2x2_4 0,0, 13,0 KERNEL2x2_4 0,0, 14,0 KERNEL2x2_4 0,0, 15,1 bdnz LSGEMM_L2x2_LOOP MY_ALIGN MY_ALIGN LSGEMM_L2x2_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 63 #else andi. L, K, 63 #endif ble LSGEMM_L2x2_SAVE MY_ALIGN LSGEMM_L2x2_SUB2: andi. T10,L, 32 ble LSGEMM_L2x2_SUB2_16 KERNEL2x2_4 0,0, 0,0 KERNEL2x2_4 0,0, 1,0 KERNEL2x2_4 0,0, 2,0 KERNEL2x2_4 0,0, 3,0 KERNEL2x2_4 0,0, 4,0 KERNEL2x2_4 0,0, 5,0 KERNEL2x2_4 0,0, 6,0 KERNEL2x2_4 0,0, 7,1 MY_ALIGN LSGEMM_L2x2_SUB2_16: andi. T10,L, 16 ble LSGEMM_L2x2_SUB2_8 KERNEL2x2_4 0,0, 0,0 KERNEL2x2_4 0,0, 1,0 KERNEL2x2_4 0,0, 2,0 KERNEL2x2_4 0,0, 3,1 MY_ALIGN LSGEMM_L2x2_SUB2_8: andi. T10,L, 8 ble LSGEMM_L2x2_SUB2_4 KERNEL2x2_4 0,0, 0,0 KERNEL2x2_4 0,0, 1,1 MY_ALIGN LSGEMM_L2x2_SUB2_4: andi. T10,L, 4 ble LSGEMM_L2x2_SUB2_2 KERNEL2x2_4 0,0, 0,1 MY_ALIGN LSGEMM_L2x2_SUB2_2: andi. T10,L, 2 ble LSGEMM_L2x2_SUB2_1 KERNEL2x2_2 0,0, 0,1 MY_ALIGN LSGEMM_L2x2_SUB2_1: andi. T10,L, 1 ble LSGEMM_L2x2_SAVE KERNEL2x2 MY_ALIGN LSGEMM_L2x2_SAVE: SAVE2x2 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 #endif MY_ALIGN LSGEMM_L2x2_END: andi. I, M, 1 ble LSGEMM_L2x1_END MY_ALIGN LSGEMM_L2x1_BEGIN: #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 srawi. L, T11, 6 /**(T11 ) % 64x */ #else srawi. L, K, 6 /**(K ) % 64x */ #endif ZERO2x1 ble LSGEMM_L2x1_SUB0 mtctr L MY_ALIGN LSGEMM_L2x1_LOOP: KERNEL2x1_4 0,0, 0,0 KERNEL2x1_4 0,0, 1,0 KERNEL2x1_4 0,0, 2,0 KERNEL2x1_4 0,0, 3,0 KERNEL2x1_4 0,0, 4,0 KERNEL2x1_4 0,0, 5,0 KERNEL2x1_4 0,0, 6,0 KERNEL2x1_4 0,0, 7,0 KERNEL2x1_4 0,0, 8,0 KERNEL2x1_4 0,0, 9,0 KERNEL2x1_4 0,0, 10,0 KERNEL2x1_4 0,0, 11,0 KERNEL2x1_4 0,0, 12,0 KERNEL2x1_4 0,0, 13,0 KERNEL2x1_4 0,0, 14,0 KERNEL2x1_4 0,0, 15,1 bdnz LSGEMM_L2x1_LOOP MY_ALIGN MY_ALIGN LSGEMM_L2x1_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 63 #else andi. L, K, 63 #endif ble LSGEMM_L2x1_SAVE MY_ALIGN LSGEMM_L2x1_SUB2: andi. T10,L, 32 ble LSGEMM_L2x1_SUB2_16 KERNEL2x1_4 0,0, 0,0 KERNEL2x1_4 0,0, 1,0 KERNEL2x1_4 0,0, 2,0 KERNEL2x1_4 0,0, 3,0 KERNEL2x1_4 0,0, 4,0 KERNEL2x1_4 0,0, 5,0 KERNEL2x1_4 0,0, 6,0 KERNEL2x1_4 0,0, 7,1 MY_ALIGN LSGEMM_L2x1_SUB2_16: andi. T10,L, 16 ble LSGEMM_L2x1_SUB2_8 KERNEL2x1_4 0,0, 0,0 KERNEL2x1_4 0,0, 1,0 KERNEL2x1_4 0,0, 2,0 KERNEL2x1_4 0,0, 3,1 MY_ALIGN LSGEMM_L2x1_SUB2_8: andi. T10,L, 8 ble LSGEMM_L2x1_SUB2_4 KERNEL2x1_4 0,0, 0,0 KERNEL2x1_4 0,0, 1,1 MY_ALIGN LSGEMM_L2x1_SUB2_4: andi. T10,L, 4 ble LSGEMM_L2x1_SUB2_2 KERNEL2x1_4 0,0, 0,1 MY_ALIGN LSGEMM_L2x1_SUB2_2: andi. T10,L, 2 ble LSGEMM_L2x1_SUB2_1 KERNEL2x1_2 0,0, 0,1 MY_ALIGN LSGEMM_L2x1_SUB2_1: andi. T10,L, 1 ble LSGEMM_L2x1_SAVE KERNEL2x1 MY_ALIGN LSGEMM_L2x1_SAVE: SAVE2x1 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 #endif MY_ALIGN LSGEMM_L2x1_END: slwi T1, K, 3 add B, B, T1 #if defined(TRMMKERNEL) && !defined(LEFT) addi TEMP_REG, TEMP_REG, 2 #endif LSGEMM_L2_END: andi. T1, N, 1 ble LSGEMM_END LSGEMM_1_BEGIN: mr AO, A mr CO, C add C, C, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr TEMP_REG, OFFSET /*off = offset;*/ #endif srawi. I, M, 4 ble LSGEMM_1x16_END MY_ALIGN LSGEMM_1x16_BEGIN: #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 srawi. L, T11, 6 /**(T11 ) % 64x */ #else srawi. L, K, 6 /**(K ) % 64x */ #endif ZERO1x16 ble LSGEMM_1x16_SUB0 addi AO,AO,2048 mtctr L MY_ALIGN LSGEMM_1x16_LOOP: KERNEL1x16_4 -2048,0, 0,0 KERNEL1x16_4 -2048,0, 1,0 KERNEL1x16_4 -2048,0, 2,0 KERNEL1x16_4 -2048,0, 3,0 KERNEL1x16_4 -2048,0, 4,0 KERNEL1x16_4 -2048,0, 5,0 KERNEL1x16_4 -2048,0, 6,0 KERNEL1x16_4 -2048,0, 7,0 KERNEL1x16_4 -2048,0, 8,0 KERNEL1x16_4 -2048,0, 9,0 KERNEL1x16_4 -2048,0, 10,0 KERNEL1x16_4 -2048,0, 11,0 KERNEL1x16_4 -2048,0, 12,0 KERNEL1x16_4 -2048,0, 13,0 KERNEL1x16_4 -2048,0, 14,0 KERNEL1x16_4 -2048,0, 15,1 bdnz LSGEMM_1x16_LOOP MY_ALIGN addi AO,AO, -2048 MY_ALIGN LSGEMM_1x16_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 63 #else andi. L, K, 63 #endif ble LSGEMM_1x16_SAVE MY_ALIGN LSGEMM_1x16_SUB2: andi. T10,L, 32 ble LSGEMM_1x16_SUB2_16 KERNEL1x16_4 0,0, 0,0 KERNEL1x16_4 0,0, 1,0 KERNEL1x16_4 0,0, 2,0 KERNEL1x16_4 0,0, 3,0 KERNEL1x16_4 0,0, 4,0 KERNEL1x16_4 0,0, 5,0 KERNEL1x16_4 0,0, 6,0 KERNEL1x16_4 0,0, 7,1 MY_ALIGN LSGEMM_1x16_SUB2_16: andi. T10,L, 16 ble LSGEMM_1x16_SUB2_8 KERNEL1x16_4 0,0, 0,0 KERNEL1x16_4 0,0, 1,0 KERNEL1x16_4 0,0, 2,0 KERNEL1x16_4 0,0, 3,1 MY_ALIGN LSGEMM_1x16_SUB2_8: andi. T10,L, 8 ble LSGEMM_1x16_SUB2_4 KERNEL1x16_4 0,0, 0,0 KERNEL1x16_4 0,0, 1,1 MY_ALIGN LSGEMM_1x16_SUB2_4: andi. T10,L, 4 ble LSGEMM_1x16_SUB2_2 KERNEL1x16_4 0,0, 0,1 MY_ALIGN LSGEMM_1x16_SUB2_2: andi. T10,L, 2 ble LSGEMM_1x16_SUB2_1 KERNEL1x16_2 0,0, 0,1 MY_ALIGN LSGEMM_1x16_SUB2_1: andi. T10,L, 1 ble LSGEMM_1x16_SAVE KERNEL1x16 MY_ALIGN LSGEMM_1x16_SAVE: SAVE1x16 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 #endif addic. I, I, -1 bgt+ LSGEMM_1x16_BEGIN MY_ALIGN LSGEMM_1x16_END: andi. I, M, 8 ble LSGEMM_1x8_END MY_ALIGN LSGEMM_1x8_BEGIN: #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 srawi. L, T11, 6 /**(T11 ) % 64x */ #else srawi. L, K, 6 /**(K ) % 64x */ #endif ZERO1x8 ble LSGEMM_1x8_SUB0 addi AO,AO,2048 mtctr L MY_ALIGN LSGEMM_1x8_LOOP: KERNEL1x8_4 -2048,0, 0,0 KERNEL1x8_4 -2048,0, 1,0 KERNEL1x8_4 -2048,0, 2,0 KERNEL1x8_4 -2048,0, 3,0 KERNEL1x8_4 -2048,0, 4,0 KERNEL1x8_4 -2048,0, 5,0 KERNEL1x8_4 -2048,0, 6,0 KERNEL1x8_4 -2048,0, 7,0 KERNEL1x8_4 -2048,0, 8,0 KERNEL1x8_4 -2048,0, 9,0 KERNEL1x8_4 -2048,0, 10,0 KERNEL1x8_4 -2048,0, 11,0 KERNEL1x8_4 -2048,0, 12,0 KERNEL1x8_4 -2048,0, 13,0 KERNEL1x8_4 -2048,0, 14,0 KERNEL1x8_4 -2048,0, 15,1 bdnz LSGEMM_1x8_LOOP MY_ALIGN addi AO,AO, -2048 MY_ALIGN LSGEMM_1x8_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 63 #else andi. L, K, 63 #endif ble LSGEMM_1x8_SAVE MY_ALIGN LSGEMM_1x8_SUB2: andi. T10,L, 32 ble LSGEMM_1x8_SUB2_16 KERNEL1x8_4 0,0, 0,0 KERNEL1x8_4 0,0, 1,0 KERNEL1x8_4 0,0, 2,0 KERNEL1x8_4 0,0, 3,0 KERNEL1x8_4 0,0, 4,0 KERNEL1x8_4 0,0, 5,0 KERNEL1x8_4 0,0, 6,0 KERNEL1x8_4 0,0, 7,1 MY_ALIGN LSGEMM_1x8_SUB2_16: andi. T10,L, 16 ble LSGEMM_1x8_SUB2_8 KERNEL1x8_4 0,0, 0,0 KERNEL1x8_4 0,0, 1,0 KERNEL1x8_4 0,0, 2,0 KERNEL1x8_4 0,0, 3,1 MY_ALIGN LSGEMM_1x8_SUB2_8: andi. T10,L, 8 ble LSGEMM_1x8_SUB2_4 KERNEL1x8_4 0,0, 0,0 KERNEL1x8_4 0,0, 1,1 MY_ALIGN LSGEMM_1x8_SUB2_4: andi. T10,L, 4 ble LSGEMM_1x8_SUB2_2 KERNEL1x8_4 0,0, 0,1 MY_ALIGN LSGEMM_1x8_SUB2_2: andi. T10,L, 2 ble LSGEMM_1x8_SUB2_1 KERNEL1x8_2 0,0, 0,1 MY_ALIGN LSGEMM_1x8_SUB2_1: andi. T10,L, 1 ble LSGEMM_1x8_SAVE KERNEL1x8 MY_ALIGN LSGEMM_1x8_SAVE: SAVE1x8 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 #endif MY_ALIGN LSGEMM_1x8_END: andi. I, M, 4 ble LSGEMM_1x4_END MY_ALIGN LSGEMM_1x4_BEGIN: #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 srawi. L, T11, 6 /**(T11 ) % 64x */ #else srawi. L, K, 6 /**(K ) % 64x */ #endif ZERO1x4 ble LSGEMM_1x4_SUB0 mtctr L MY_ALIGN LSGEMM_1x4_LOOP: KERNEL1x4_4 0,0, 0,0 KERNEL1x4_4 0,0, 1,0 KERNEL1x4_4 0,0, 2,0 KERNEL1x4_4 0,0, 3,0 KERNEL1x4_4 0,0, 4,0 KERNEL1x4_4 0,0, 5,0 KERNEL1x4_4 0,0, 6,0 KERNEL1x4_4 0,0, 7,0 KERNEL1x4_4 0,0, 8,0 KERNEL1x4_4 0,0, 9,0 KERNEL1x4_4 0,0, 10,0 KERNEL1x4_4 0,0, 11,0 KERNEL1x4_4 0,0, 12,0 KERNEL1x4_4 0,0, 13,0 KERNEL1x4_4 0,0, 14,0 KERNEL1x4_4 0,0, 15,1 bdnz LSGEMM_1x4_LOOP MY_ALIGN MY_ALIGN LSGEMM_1x4_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 63 #else andi. L, K, 63 #endif ble LSGEMM_1x4_SAVE MY_ALIGN LSGEMM_1x4_SUB2: andi. T10,L, 32 ble LSGEMM_1x4_SUB2_16 KERNEL1x4_4 0,0, 0,0 KERNEL1x4_4 0,0, 1,0 KERNEL1x4_4 0,0, 2,0 KERNEL1x4_4 0,0, 3,0 KERNEL1x4_4 0,0, 4,0 KERNEL1x4_4 0,0, 5,0 KERNEL1x4_4 0,0, 6,0 KERNEL1x4_4 0,0, 7,1 MY_ALIGN LSGEMM_1x4_SUB2_16: andi. T10,L, 16 ble LSGEMM_1x4_SUB2_8 KERNEL1x4_4 0,0, 0,0 KERNEL1x4_4 0,0, 1,0 KERNEL1x4_4 0,0, 2,0 KERNEL1x4_4 0,0, 3,1 MY_ALIGN LSGEMM_1x4_SUB2_8: andi. T10,L, 8 ble LSGEMM_1x4_SUB2_4 KERNEL1x4_4 0,0, 0,0 KERNEL1x4_4 0,0, 1,1 MY_ALIGN LSGEMM_1x4_SUB2_4: andi. T10,L, 4 ble LSGEMM_1x4_SUB2_2 KERNEL1x4_4 0,0, 0,1 MY_ALIGN LSGEMM_1x4_SUB2_2: andi. T10,L, 2 ble LSGEMM_1x4_SUB2_1 KERNEL1x4_2 0,0, 0,1 MY_ALIGN LSGEMM_1x4_SUB2_1: andi. T10,L, 1 ble LSGEMM_1x4_SAVE KERNEL1x4 MY_ALIGN LSGEMM_1x4_SAVE: SAVE1x4 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 #endif MY_ALIGN LSGEMM_1x4_END: andi. I, M, 2 ble LSGEMM_1x2_END MY_ALIGN LSGEMM_1x2_BEGIN: #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 srawi. L, T11, 6 /**(T11 ) % 64x */ #else srawi. L, K, 6 /**(K ) % 64x */ #endif ZERO1x2 ble LSGEMM_1x2_SUB0 mtctr L MY_ALIGN LSGEMM_1x2_LOOP: KERNEL1x2_4 0,0, 0,0 KERNEL1x2_4 0,0, 1,0 KERNEL1x2_4 0,0, 2,0 KERNEL1x2_4 0,0, 3,0 KERNEL1x2_4 0,0, 4,0 KERNEL1x2_4 0,0, 5,0 KERNEL1x2_4 0,0, 6,0 KERNEL1x2_4 0,0, 7,0 KERNEL1x2_4 0,0, 8,0 KERNEL1x2_4 0,0, 9,0 KERNEL1x2_4 0,0, 10,0 KERNEL1x2_4 0,0, 11,0 KERNEL1x2_4 0,0, 12,0 KERNEL1x2_4 0,0, 13,0 KERNEL1x2_4 0,0, 14,0 KERNEL1x2_4 0,0, 15,1 bdnz LSGEMM_1x2_LOOP MY_ALIGN MY_ALIGN LSGEMM_1x2_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 63 #else andi. L, K, 63 #endif ble LSGEMM_1x2_SAVE MY_ALIGN LSGEMM_1x2_SUB2: andi. T10,L, 32 ble LSGEMM_1x2_SUB2_16 KERNEL1x2_4 0,0, 0,0 KERNEL1x2_4 0,0, 1,0 KERNEL1x2_4 0,0, 2,0 KERNEL1x2_4 0,0, 3,0 KERNEL1x2_4 0,0, 4,0 KERNEL1x2_4 0,0, 5,0 KERNEL1x2_4 0,0, 6,0 KERNEL1x2_4 0,0, 7,1 MY_ALIGN LSGEMM_1x2_SUB2_16: andi. T10,L, 16 ble LSGEMM_1x2_SUB2_8 KERNEL1x2_4 0,0, 0,0 KERNEL1x2_4 0,0, 1,0 KERNEL1x2_4 0,0, 2,0 KERNEL1x2_4 0,0, 3,1 MY_ALIGN LSGEMM_1x2_SUB2_8: andi. T10,L, 8 ble LSGEMM_1x2_SUB2_4 KERNEL1x2_4 0,0, 0,0 KERNEL1x2_4 0,0, 1,1 MY_ALIGN LSGEMM_1x2_SUB2_4: andi. T10,L, 4 ble LSGEMM_1x2_SUB2_2 KERNEL1x2_4 0,0, 0,1 MY_ALIGN LSGEMM_1x2_SUB2_2: andi. T10,L, 2 ble LSGEMM_1x2_SUB2_1 KERNEL1x2_2 0,0, 0,1 MY_ALIGN LSGEMM_1x2_SUB2_1: andi. T10,L, 1 ble LSGEMM_1x2_SAVE KERNEL1x2 MY_ALIGN LSGEMM_1x2_SAVE: SAVE1x2 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 #endif MY_ALIGN LSGEMM_1x2_END: andi. I, M, 1 ble LSGEMM_1x1_END MY_ALIGN LSGEMM_1x1_BEGIN: #if defined(TRMMKERNEL) REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 #else mr BO, B #endif #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 srawi. L, T11, 6 /**(T11 ) % 64x */ #else srawi. L, K, 6 /**(K ) % 64x */ #endif ZERO1x1 ble LSGEMM_1x1_SUB0 mtctr L MY_ALIGN LSGEMM_1x1_LOOP: KERNEL1x1_16 0,0, 0,0 KERNEL1x1_16 0,0, 1,0 KERNEL1x1_16 0,0, 2,0 KERNEL1x1_16 0,0, 3,1 bdnz LSGEMM_1x1_LOOP MY_ALIGN MY_ALIGN LSGEMM_1x1_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 63 #else andi. L, K, 63 #endif ble LSGEMM_1x1_SAVE MY_ALIGN LSGEMM_1x1_SUB2: andi. T10,L, 32 ble LSGEMM_1x1_SUB2_16 KERNEL1x1_16 0,0, 0,0 KERNEL1x1_16 0,0, 1,1 MY_ALIGN LSGEMM_1x1_SUB2_16: andi. T10,L, 16 ble LSGEMM_1x1_SUB2_8 KERNEL1x1_16 0,0, 0,1 MY_ALIGN LSGEMM_1x1_SUB2_8: andi. T10,L, 8 ble LSGEMM_1x1_SUB2_4 KERNEL1x1_8 0,0, 0,1 MY_ALIGN LSGEMM_1x1_SUB2_4: andi. T10,L, 4 ble LSGEMM_1x1_SUB2_2 KERNEL1x1_4 0,0, 0,1 MY_ALIGN LSGEMM_1x1_SUB2_2: andi. T10,L, 2 ble LSGEMM_1x1_SUB2_1 KERNEL1x1_2 0,0, 0,1 MY_ALIGN LSGEMM_1x1_SUB2_1: andi. T10,L, 1 ble LSGEMM_1x1_SAVE KERNEL1x1 MY_ALIGN LSGEMM_1x1_SAVE: SAVE1x1 #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 #endif MY_ALIGN LSGEMM_1x1_END: slwi T1, K, 2 add B, B, T1 #if defined(TRMMKERNEL) && !defined(LEFT) addi TEMP_REG, TEMP_REG, 1 #endif LSGEMM_END: