| @@ -90,19 +90,17 @@ SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S | |||||
| SGEMMINCOPY = | SGEMMINCOPY = | ||||
| SGEMMITCOPY = | SGEMMITCOPY = | ||||
| SGEMMONCOPY = sgemm_ncopy_4_vfpv3.S | SGEMMONCOPY = sgemm_ncopy_4_vfpv3.S | ||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMMOTCOPY = sgemm_tcopy_4_vfpv3.S | |||||
| SGEMMINCOPYOBJ = | SGEMMINCOPYOBJ = | ||||
| SGEMMITCOPYOBJ = | SGEMMITCOPYOBJ = | ||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | SGEMMONCOPYOBJ = sgemm_oncopy.o | ||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | SGEMMOTCOPYOBJ = sgemm_otcopy.o | ||||
| #DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||||
| #DGEMMKERNEL = dgemm_kernel_4x2_vfpv2.S | |||||
| DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S | DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S | ||||
| DGEMMINCOPY = | DGEMMINCOPY = | ||||
| DGEMMITCOPY = | DGEMMITCOPY = | ||||
| DGEMMONCOPY = dgemm_ncopy_4_vfpv3.S | DGEMMONCOPY = dgemm_ncopy_4_vfpv3.S | ||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| DGEMMOTCOPY = dgemm_tcopy_4_vfpv3.S | |||||
| DGEMMINCOPYOBJ = | DGEMMINCOPYOBJ = | ||||
| DGEMMITCOPYOBJ = | DGEMMITCOPYOBJ = | ||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | DGEMMONCOPYOBJ = dgemm_oncopy.o | ||||
| @@ -77,7 +77,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define B [fp, #4 ] | #define B [fp, #4 ] | ||||
| #define C [fp, #8 ] | #define C [fp, #8 ] | ||||
| #define OLDdgemm_kernel_LDC [fp, #12 ] | |||||
| #define OLD_LDC [fp, #12 ] | |||||
| #define I r0 | #define I r0 | ||||
| #define J r1 | #define J r1 | ||||
| @@ -883,7 +883,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| sub r3, fp, #128 | sub r3, fp, #128 | ||||
| vstm r3, { d8 - d15} // store floating point registers | vstm r3, { d8 - d15} // store floating point registers | ||||
| ldr r3, OLDdgemm_kernel_LDC | |||||
| ldr r3, OLD_LDC | |||||
| lsl r3, r3, #3 // ldc = ldc * 8 | lsl r3, r3, #3 // ldc = ldc * 8 | ||||
| str r3, LDC | str r3, LDC | ||||
| @@ -0,0 +1,408 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/06 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 | |||||
| #define OLD_M r0 | |||||
| #define OLD_N r1 | |||||
| #define OLD_A r2 | |||||
| #define OLD_LDA r3 | |||||
| /****************************************************** | |||||
| * [fp, #-128] - [fp, #-64] is reserved | |||||
| * for store and restore of floating point | |||||
| * registers | |||||
| *******************************************************/ | |||||
| #define B [fp, #4 ] | |||||
| #define A [fp, #-248 ] | |||||
| #define M r0 | |||||
| #define N r1 | |||||
| #define M4 r2 | |||||
| #define LDA r5 | |||||
| #define AO1 r6 | |||||
| #define BO1 r7 | |||||
| #define BO2 r8 | |||||
| #define BO3 r9 | |||||
| #define I r4 | |||||
| #define J r12 | |||||
| #define A_PRE 256 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| .macro COPY4x4 | |||||
| pld [ AO1, #A_PRE ] | |||||
| fldmiad AO1, { d0 - d3 } | |||||
| add r3, AO1, LDA | |||||
| pld [ r3, #A_PRE ] | |||||
| fldmiad r3, { d4 - d7 } | |||||
| add r3, r3, LDA | |||||
| pld [ r3, #A_PRE ] | |||||
| fldmiad r3, { d8 - d11 } | |||||
| add r3, r3, LDA | |||||
| pld [ r3, #A_PRE ] | |||||
| fldmiad r3, { d12 - d15 } | |||||
| fstmiad BO1, { d0 - d15 } | |||||
| add AO1, AO1, #32 | |||||
| add BO1, BO1, M4 | |||||
| .endm | |||||
| .macro COPY2x4 | |||||
| fldmiad AO1, { d0 - d1 } | |||||
| add r3, AO1, LDA | |||||
| fldmiad r3, { d2 - d3 } | |||||
| add r3, r3, LDA | |||||
| fldmiad r3, { d4 - d5 } | |||||
| add r3, r3, LDA | |||||
| fldmiad r3, { d6 - d7 } | |||||
| fstmiad BO2, { d0 - d7 } | |||||
| add AO1, AO1, #16 | |||||
| add BO2, BO2, #64 | |||||
| .endm | |||||
| .macro COPY1x4 | |||||
| fldmiad AO1, { d0 } | |||||
| add r3, AO1, LDA | |||||
| fldmiad r3, { d1 } | |||||
| add r3, r3, LDA | |||||
| fldmiad r3, { d2 } | |||||
| add r3, r3, LDA | |||||
| fldmiad r3, { d3 } | |||||
| fstmiad BO3, { d0 - d3 } | |||||
| add AO1, AO1, #8 | |||||
| add BO3, BO3, #32 | |||||
| .endm | |||||
| /*************************************************************************************************************************/ | |||||
| .macro COPY4x2 | |||||
| pld [ AO1, #A_PRE ] | |||||
| fldmiad AO1, { d0 - d3 } | |||||
| add r3, AO1, LDA | |||||
| pld [ r3, #A_PRE ] | |||||
| fldmiad r3, { d4 - d7 } | |||||
| fstmiad BO1, { d0 - d7 } | |||||
| add AO1, AO1, #32 | |||||
| add BO1, BO1, M4 | |||||
| .endm | |||||
| .macro COPY2x2 | |||||
| fldmiad AO1, { d0 - d1 } | |||||
| add r3, AO1, LDA | |||||
| fldmiad r3, { d2 - d3 } | |||||
| fstmiad BO2, { d0 - d3 } | |||||
| add AO1, AO1, #16 | |||||
| add BO2, BO2, #32 | |||||
| .endm | |||||
| .macro COPY1x2 | |||||
| fldmiad AO1, { d0 } | |||||
| add r3, AO1, LDA | |||||
| fldmiad r3, { d1 } | |||||
| fstmiad BO3, { d0 - d1 } | |||||
| add AO1, AO1, #8 | |||||
| add BO3, BO3, #16 | |||||
| .endm | |||||
| /*************************************************************************************************************************/ | |||||
| .macro COPY4x1 | |||||
| pld [ AO1, #A_PRE ] | |||||
| fldmiad AO1, { d0 - d3 } | |||||
| fstmiad BO1, { d0 - d3 } | |||||
| add AO1, AO1, #32 | |||||
| add BO1, BO1, M4 | |||||
| .endm | |||||
| .macro COPY2x1 | |||||
| fldmiad AO1, { d0 - d1 } | |||||
| fstmiad BO2, { d0 - d1 } | |||||
| add AO1, AO1, #16 | |||||
| add BO2, BO2, #16 | |||||
| .endm | |||||
| .macro COPY1x1 | |||||
| fldmiad AO1, { d0 } | |||||
| fstmiad BO3, { d0 } | |||||
| add AO1, AO1, #8 | |||||
| add BO3, BO3, #8 | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| str OLD_A, A // store A | |||||
| lsl LDA, OLD_LDA, #3 // lda = lda * SIZE | |||||
| sub r4, fp, #128 | |||||
| vstm r4, { d8 - d15} // store floating point registers | |||||
| lsl r4 , M, #3 // M * SIZE | |||||
| ldr r3, B | |||||
| and BO2 , N , #-4 | |||||
| and BO3 , N , #-2 | |||||
| mul BO2, BO2, r4 | |||||
| mul BO3, BO3, r4 | |||||
| add BO2 , BO2, r3 | |||||
| add BO3 , BO3, r3 | |||||
| lsl M4, M, #5 // M4 = M * 4 * SIZE | |||||
| dgemm_tcopy_L4_BEGIN: | |||||
| asrs J, M, #2 // J = N / 4 | |||||
| ble dgemm_tcopy_L2_BEGIN | |||||
| dgemm_tcopy_L4_M4_BEGIN: | |||||
| ldr AO1, A // AO1 = A | |||||
| lsl r3, LDA, #2 // r3 = 4 * LDA | |||||
| add r3, r3 , AO1 // A = A + 4 * LDA | |||||
| str r3, A // store A | |||||
| ldr BO1, B | |||||
| add r3, BO1, #128 // B = B + 16 * SIZE | |||||
| str r3, B | |||||
| asrs I, N, #2 // I = M / 4 | |||||
| ble dgemm_tcopy_L4_M4_40 | |||||
| dgemm_tcopy_L4_M4_20: | |||||
| COPY4x4 | |||||
| subs I , I , #1 | |||||
| bne dgemm_tcopy_L4_M4_20 | |||||
| dgemm_tcopy_L4_M4_40: | |||||
| tst N , #2 | |||||
| ble dgemm_tcopy_L4_M4_60 | |||||
| COPY2x4 | |||||
| dgemm_tcopy_L4_M4_60: | |||||
| tst N, #1 | |||||
| ble dgemm_tcopy_L4_M4_END | |||||
| COPY1x4 | |||||
| dgemm_tcopy_L4_M4_END: | |||||
| subs J , J, #1 // j-- | |||||
| bne dgemm_tcopy_L4_M4_BEGIN | |||||
| /*********************************************************************************************/ | |||||
| dgemm_tcopy_L2_BEGIN: | |||||
| tst M, #3 | |||||
| ble dgemm_tcopy_L999 | |||||
| tst M, #2 | |||||
| ble dgemm_tcopy_L1_BEGIN | |||||
| dgemm_tcopy_L2_M4_BEGIN: | |||||
| ldr AO1, A // AO1 = A | |||||
| lsl r3, LDA, #1 // r3 = 2 * LDA | |||||
| add r3, r3 , AO1 // A = A + 2 * LDA | |||||
| str r3, A // store A | |||||
| ldr BO1, B | |||||
| add r3, BO1, #64 // B = B + 8 * SIZE | |||||
| str r3, B | |||||
| asrs I, N, #2 // I = M / 4 | |||||
| ble dgemm_tcopy_L2_M4_40 | |||||
| dgemm_tcopy_L2_M4_20: | |||||
| COPY4x2 | |||||
| subs I , I , #1 | |||||
| bne dgemm_tcopy_L2_M4_20 | |||||
| dgemm_tcopy_L2_M4_40: | |||||
| tst N , #2 | |||||
| ble dgemm_tcopy_L2_M4_60 | |||||
| COPY2x2 | |||||
| dgemm_tcopy_L2_M4_60: | |||||
| tst N , #1 | |||||
| ble dgemm_tcopy_L2_M4_END | |||||
| COPY1x2 | |||||
| dgemm_tcopy_L2_M4_END: | |||||
| /*********************************************************************************************/ | |||||
| dgemm_tcopy_L1_BEGIN: | |||||
| tst M, #1 | |||||
| ble dgemm_tcopy_L999 | |||||
| dgemm_tcopy_L1_M4_BEGIN: | |||||
| ldr AO1, A // AO1 = A | |||||
| add r3, LDA , AO1 // A = A + 1 * LDA | |||||
| str r3, A // store A | |||||
| ldr BO1, B | |||||
| add r3, BO1, #32 // B = B + 4 * SIZE | |||||
| str r3, B | |||||
| asrs I, N, #2 // I = M / 4 | |||||
| ble dgemm_tcopy_L1_M4_40 | |||||
| dgemm_tcopy_L1_M4_20: | |||||
| COPY4x1 | |||||
| subs I , I , #1 | |||||
| bne dgemm_tcopy_L1_M4_20 | |||||
| dgemm_tcopy_L1_M4_40: | |||||
| tst N , #2 | |||||
| ble dgemm_tcopy_L1_M4_60 | |||||
| COPY2x1 | |||||
| dgemm_tcopy_L1_M4_60: | |||||
| tst N , #1 | |||||
| ble dgemm_tcopy_L1_M4_END | |||||
| COPY1x1 | |||||
| dgemm_tcopy_L1_M4_END: | |||||
| dgemm_tcopy_L999: | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { d8 - d15} // restore floating point registers | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,430 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/06 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 | |||||
| #define OLD_M r0 | |||||
| #define OLD_N r1 | |||||
| #define OLD_A r2 | |||||
| #define OLD_LDA r3 | |||||
| /****************************************************** | |||||
| * [fp, #-128] - [fp, #-64] is reserved | |||||
| * for store and restore of floating point | |||||
| * registers | |||||
| *******************************************************/ | |||||
| #define B [fp, #4 ] | |||||
| #define A [fp, #-248 ] | |||||
| #define M r0 | |||||
| #define N r1 | |||||
| #define M4 r2 | |||||
| #define LDA r5 | |||||
| #define AO1 r6 | |||||
| #define BO1 r7 | |||||
| #define BO2 r8 | |||||
| #define BO3 r9 | |||||
| #define I r4 | |||||
| #define J r12 | |||||
| #define A_PRE 256 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| .macro COPY4x4_1 | |||||
| pld [ AO1, #A_PRE ] | |||||
| fldmias AO1, { s0 - s3 } | |||||
| add r3, AO1, LDA | |||||
| pld [ r3, #A_PRE ] | |||||
| fldmias r3, { s4 - s7 } | |||||
| add r3, r3, LDA | |||||
| pld [ r3, #A_PRE ] | |||||
| fldmias r3, { s8 - s11 } | |||||
| add r3, r3, LDA | |||||
| pld [ r3, #A_PRE ] | |||||
| fldmias r3, { s12 - s15 } | |||||
| fstmias BO1, { s0 - s15 } | |||||
| add AO1, AO1, #16 | |||||
| add BO1, BO1, M4 | |||||
| .endm | |||||
| .macro COPY4x4_2 | |||||
| fldmias AO1, { s0 - s3 } | |||||
| add r3, AO1, LDA | |||||
| fldmias r3, { s4 - s7 } | |||||
| add r3, r3, LDA | |||||
| fldmias r3, { s8 - s11 } | |||||
| add r3, r3, LDA | |||||
| fldmias r3, { s12 - s15 } | |||||
| fstmias BO1, { s0 - s15 } | |||||
| add AO1, AO1, #16 | |||||
| add BO1, BO1, M4 | |||||
| .endm | |||||
| .macro COPY2x4 | |||||
| fldmias AO1, { s0 - s1 } | |||||
| add r3, AO1, LDA | |||||
| fldmias r3, { s2 - s3 } | |||||
| add r3, r3, LDA | |||||
| fldmias r3, { s4 - s5 } | |||||
| add r3, r3, LDA | |||||
| fldmias r3, { s6 - s7 } | |||||
| fstmias BO2, { s0 - s7 } | |||||
| add AO1, AO1, #8 | |||||
| add BO2, BO2, #32 | |||||
| .endm | |||||
| .macro COPY1x4 | |||||
| fldmias AO1, { s0 } | |||||
| add r3, AO1, LDA | |||||
| fldmias r3, { s1 } | |||||
| add r3, r3, LDA | |||||
| fldmias r3, { s2 } | |||||
| add r3, r3, LDA | |||||
| fldmias r3, { s3 } | |||||
| fstmias BO3, { s0 - s3 } | |||||
| add AO1, AO1, #4 | |||||
| add BO3, BO3, #16 | |||||
| .endm | |||||
| /*************************************************************************************************************************/ | |||||
| .macro COPY4x2 | |||||
| fldmias AO1, { s0 - s3 } | |||||
| add r3, AO1, LDA | |||||
| fldmias r3, { s4 - s7 } | |||||
| fstmias BO1, { s0 - s7 } | |||||
| add AO1, AO1, #16 | |||||
| add BO1, BO1, M4 | |||||
| .endm | |||||
| .macro COPY2x2 | |||||
| fldmias AO1, { s0 - s1 } | |||||
| add r3, AO1, LDA | |||||
| fldmias r3, { s2 - s3 } | |||||
| fstmias BO2, { s0 - s3 } | |||||
| add AO1, AO1, #8 | |||||
| add BO2, BO2, #16 | |||||
| .endm | |||||
| .macro COPY1x2 | |||||
| fldmias AO1, { s0 } | |||||
| add r3, AO1, LDA | |||||
| fldmias r3, { s1 } | |||||
| fstmias BO3, { s0 - s1 } | |||||
| add AO1, AO1, #4 | |||||
| add BO3, BO3, #8 | |||||
| .endm | |||||
| /*************************************************************************************************************************/ | |||||
| .macro COPY4x1 | |||||
| fldmias AO1, { s0 - s3 } | |||||
| fstmias BO1, { s0 - s3 } | |||||
| add AO1, AO1, #16 | |||||
| add BO1, BO1, M4 | |||||
| .endm | |||||
| .macro COPY2x1 | |||||
| fldmias AO1, { s0 - s1 } | |||||
| fstmias BO2, { s0 - s1 } | |||||
| add AO1, AO1, #8 | |||||
| add BO2, BO2, #8 | |||||
| .endm | |||||
| .macro COPY1x1 | |||||
| fldmias AO1, { s0 } | |||||
| fstmias BO3, { s0 } | |||||
| add AO1, AO1, #4 | |||||
| add BO3, BO3, #4 | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| str OLD_A, A // store A | |||||
| lsl LDA, OLD_LDA, #2 // lda = lda * SIZE | |||||
| sub r4, fp, #128 | |||||
| vstm r4, { s8 - s15} // store floating point registers | |||||
| lsl r4 , M, #2 // M * SIZE | |||||
| ldr r3, B | |||||
| and BO2 , N , #-4 | |||||
| and BO3 , N , #-2 | |||||
| mul BO2, BO2, r4 | |||||
| mul BO3, BO3, r4 | |||||
| add BO2 , BO2, r3 | |||||
| add BO3 , BO3, r3 | |||||
| lsl M4, M, #4 // M4 = M * 4 * SIZE | |||||
| sgemm_tcopy_L4_BEGIN: | |||||
| asrs J, M, #2 // J = N / 4 | |||||
| ble sgemm_tcopy_L2_BEGIN | |||||
| sgemm_tcopy_L4_M4_BEGIN: | |||||
| ldr AO1, A // AO1 = A | |||||
| lsl r3, LDA, #2 // r3 = 4 * LDA | |||||
| add r3, r3 , AO1 // A = A + 4 * LDA | |||||
| str r3, A // store A | |||||
| ldr BO1, B | |||||
| add r3, BO1, #64 // B = B + 16 * SIZE | |||||
| str r3, B | |||||
| asrs I, N, #2 // I = M / 4 | |||||
| ble sgemm_tcopy_L4_M4_40 | |||||
| sgemm_tcopy_L4_M4_20: | |||||
| COPY4x4_1 | |||||
| subs I , I , #1 | |||||
| ble sgemm_tcopy_L4_M4_40 | |||||
| COPY4x4_2 | |||||
| subs I , I , #1 | |||||
| bne sgemm_tcopy_L4_M4_20 | |||||
| sgemm_tcopy_L4_M4_40: | |||||
| tst N , #2 | |||||
| ble sgemm_tcopy_L4_M4_60 | |||||
| COPY2x4 | |||||
| sgemm_tcopy_L4_M4_60: | |||||
| tst N, #1 | |||||
| ble sgemm_tcopy_L4_M4_END | |||||
| COPY1x4 | |||||
| sgemm_tcopy_L4_M4_END: | |||||
| subs J , J, #1 // j-- | |||||
| bne sgemm_tcopy_L4_M4_BEGIN | |||||
| /*********************************************************************************************/ | |||||
| sgemm_tcopy_L2_BEGIN: | |||||
| tst M, #3 | |||||
| ble sgemm_tcopy_L999 | |||||
| tst M, #2 | |||||
| ble sgemm_tcopy_L1_BEGIN | |||||
| sgemm_tcopy_L2_M4_BEGIN: | |||||
| ldr AO1, A // AO1 = A | |||||
| lsl r3, LDA, #1 // r3 = 2 * LDA | |||||
| add r3, r3 , AO1 // A = A + 2 * LDA | |||||
| str r3, A // store A | |||||
| ldr BO1, B | |||||
| add r3, BO1, #32 // B = B + 8 * SIZE | |||||
| str r3, B | |||||
| asrs I, N, #2 // I = M / 4 | |||||
| ble sgemm_tcopy_L2_M4_40 | |||||
| sgemm_tcopy_L2_M4_20: | |||||
| COPY4x2 | |||||
| subs I , I , #1 | |||||
| bne sgemm_tcopy_L2_M4_20 | |||||
| sgemm_tcopy_L2_M4_40: | |||||
| tst N , #2 | |||||
| ble sgemm_tcopy_L2_M4_60 | |||||
| COPY2x2 | |||||
| sgemm_tcopy_L2_M4_60: | |||||
| tst N , #1 | |||||
| ble sgemm_tcopy_L2_M4_END | |||||
| COPY1x2 | |||||
| sgemm_tcopy_L2_M4_END: | |||||
| /*********************************************************************************************/ | |||||
| sgemm_tcopy_L1_BEGIN: | |||||
| tst M, #1 | |||||
| ble sgemm_tcopy_L999 | |||||
| sgemm_tcopy_L1_M4_BEGIN: | |||||
| ldr AO1, A // AO1 = A | |||||
| add r3, LDA , AO1 // A = A + 1 * LDA | |||||
| str r3, A // store A | |||||
| ldr BO1, B | |||||
| add r3, BO1, #16 // B = B + 4 * SIZE | |||||
| str r3, B | |||||
| asrs I, N, #2 // I = M / 4 | |||||
| ble sgemm_tcopy_L1_M4_40 | |||||
| sgemm_tcopy_L1_M4_20: | |||||
| COPY4x1 | |||||
| subs I , I , #1 | |||||
| bne sgemm_tcopy_L1_M4_20 | |||||
| sgemm_tcopy_L1_M4_40: | |||||
| tst N , #2 | |||||
| ble sgemm_tcopy_L1_M4_60 | |||||
| COPY2x1 | |||||
| sgemm_tcopy_L1_M4_60: | |||||
| tst N , #1 | |||||
| ble sgemm_tcopy_L1_M4_END | |||||
| COPY1x1 | |||||
| sgemm_tcopy_L1_M4_END: | |||||
| sgemm_tcopy_L999: | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { s8 - s15} // restore floating point registers | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||