|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254 |
- /***************************************************************************
- Copyright (c) 2013, The OpenBLAS Project
- All rights reserved.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- 3. Neither the name of the OpenBLAS project nor the names of
- its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
- USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *****************************************************************************/
-
- /**************************************************************************************
- * 2013/11/05 Saar
- * BLASTEST : OK
- * CTEST : OK
- * TEST : OK
- *
- **************************************************************************************/
-
- #define ASSEMBLER
- #include "common.h"
-
- #define STACKSIZE 256 /* bytes subtracted from sp in the prologue */
-
- #define OLD_M r0 /* OLD_* alias r0-r3; the code visible here reads r0-r2 via M/N/A and r3 directly */
- #define OLD_N r1
- #define OLD_A r2
- #define OLD_LDA r3
-
-
- /******************************************************
- * [fp, #-128] - [fp, #-64] is reserved
- * for store and restore of floating point
- * registers
- *
- * Frame notes, derived from the prologue (it pushes
- * seven registers and then sets fp = sp + 24, so
- * fp + 4 equals the value of sp at function entry):
- *   [fp, #4]    first stack-passed word, loaded into BO
- *   [fp, #-260] slot holding lda after it has been
- *               shifted left by 4 (lda * 16 bytes)
- *
- * I uses r3, the same register as OLD_LDA; the prologue
- * stores the scaled lda to the LDA slot before I is
- * first written.
- *******************************************************/
-
- #define LDA [fp, #-260 ]
-
- #define B [fp, #4 ]
-
- #define M r0 /* element count along the contiguous direction */
- #define N r1 /* number of lda-strided vectors */
- #define A r2 /* source pointer, advanced by the outer loops */
-
- #define BO r5 /* destination pointer, post-incremented by the COPY macros */
-
- #define AO1 r6 /* read pointer for the current source vector */
- #define AO2 r7 /* read pointer for the next source vector (AO1 + LDA) */
-
- #define I r3 /* inner loop counter */
- #define J r12 /* outer loop counter */
-
- #define A_PRE 256 /* byte offset used by the pld prefetches in COPY2x2 */
-
- /**************************************************************************************
- * Macro definitions
- **************************************************************************************/
-
- .macro COPY2x2
- /*
-  * Copy two 16-byte elements from AO1 and two from AO2 into BO,
-  * interleaved as AO1[0], AO2[0], AO1[1], AO2[1].
-  * Advances AO1 and AO2 by 32 bytes and BO by 64 bytes.
-  * Clobbers d0-d7. Written with UAL mnemonics (vldr is the
-  * UAL spelling of fldd); instruction order is unchanged.
-  */
-
- pld [AO1, #A_PRE] // prefetch ahead on both source vectors
- pld [AO2, #A_PRE]
- vldr d0, [AO1] // AO1 element 0 -> d0:d1
- vldr d1, [AO1, #8]
- vldr d4, [AO1, #16] // AO1 element 1 -> d4:d5
- vldr d5, [AO1, #24]
-
- vldr d2, [AO2] // AO2 element 0 -> d2:d3
- vldr d3, [AO2, #8]
- add AO1, AO1, #32 // AO1 loads are done, step past both elements
- vldr d6, [AO2, #16] // AO2 element 1 -> d6:d7
- vldr d7, [AO2, #24]
-
- vstmia.f64 BO!, {d0 - d7} // store 64 bytes, BO += 64
- add AO2, AO2, #32
-
- .endm
-
-
- .macro COPY1x2
- /*
-  * Copy one 16-byte element from AO1 and one from AO2 into BO
-  * (AO1 element first). Advances AO1, AO2 by 16 bytes and BO by
-  * 32 bytes. Clobbers d0-d3.
-  */
-
- vldr d0, [AO1] // AO1 element -> d0:d1
- vldr d1, [AO1, #8]
- vldr d2, [AO2] // AO2 element -> d2:d3
- vldr d3, [AO2, #8]
-
- add AO1, AO1, #16
- vstmia.f64 BO!, {d0 - d3} // store 32 bytes, BO += 32
- add AO2, AO2, #16
-
- .endm
-
- .macro COPY2x1
- /*
-  * Copy two consecutive 16-byte elements from AO1 into BO.
-  * Advances AO1 and BO by 32 bytes. Clobbers d0-d3.
-  */
-
- vldr d0, [AO1] // element 0 -> d0:d1
- vldr d1, [AO1, #8]
- vldr d2, [AO1, #16] // element 1 -> d2:d3
- vldr d3, [AO1, #24]
-
- vstmia.f64 BO!, {d0 - d3} // store 32 bytes, BO += 32
- add AO1, AO1, #32
-
- .endm
-
-
- .macro COPY1x1
- /*
-  * Copy one 16-byte element from AO1 into BO.
-  * Advances AO1 and BO by 16 bytes. Clobbers d0-d1.
-  */
-
- vldr d0, [AO1] // element -> d0:d1
- vldr d1, [AO1, #8]
-
- vstmia.f64 BO!, {d0 - d1} // store 16 bytes, BO += 16
- add AO1, AO1, #16
-
- .endm
-
-
-
-
-
- /**************************************************************************************
- * End of macro definitions
- **************************************************************************************/
-
- /*
-  * Packing kernel: copies the source matrix at A into the buffer at B,
-  * two lda-strided vectors at a time, interleaving their 16-byte
-  * elements (see COPY2x2 / COPY1x2); a final odd vector is copied
-  * straight through (COPY2x1 / COPY1x1).
-  *
-  * In:    r0 = M   element count along the contiguous direction
-  *        r1 = N   number of lda-strided vectors
-  *        r2 = A   source pointer
-  *        r3 = lda stride between vectors, in 16-byte elements
-  *        [sp at entry] = B, destination pointer
-  * Out:   r0 = 0
-  * Uses:  r3-r7, r12, d0-d7, flags; r4-r9, fp and d8-d15 are saved
-  *        and restored here.
-  *
-  * Flag handling: every conditional branch below follows an
-  * instruction that defines all the flags it tests. Shifts and
-  * logical operations (asrs/ands/tst) leave V untouched, and V is
-  * undefined on entry, so "ble" (Z set or N != V) must not be used
-  * directly after them. Counts are therefore tested with an explicit
-  * cmp, and the odd-remainder tests use beq.
-  */
- PROLOGUE
-
- .align 5
-
- push {r4 - r9, fp}
- add fp, sp, #24
- sub sp, sp, #STACKSIZE // reserve stack
-
-
- lsl r3, r3, #4 // lda = lda * 8 * 2
- str r3, LDA // spill: r3 is reused as I below
-
- sub r4, fp, #128
- vstm r4, { d8 - d15} // store floating point registers
-
- ldr BO, B
-
- /*********************************************************************************************/
-
- zgemm_ncopy_L2_BEGIN:
-
- asr J, N, #1 // J = N / 2
- cmp J, #0 // cmp defines N, Z, C and V for ble
- ble zgemm_ncopy_L1_BEGIN
-
- zgemm_ncopy_L2_M2_BEGIN:
-
- mov AO1, A // AO1 = A
- ldr r4 , LDA
- add AO2, AO1, r4
- add A , AO2, r4 // A = A + 2 * LDA
-
- asr I, M, #1 // I = M / 2
- cmp I, #0
- ble zgemm_ncopy_L2_M2_40
-
- zgemm_ncopy_L2_M2_20:
-
- COPY2x2
-
- subs I , I , #1
- bne zgemm_ncopy_L2_M2_20
-
-
- zgemm_ncopy_L2_M2_40:
-
- ands I, M , #1 // I = M % 2 (0 or 1)
- beq zgemm_ncopy_L2_M2_END // only Z is meaningful after ands
-
- zgemm_ncopy_L2_M2_60:
-
- COPY1x2
-
- subs I , I , #1
- bne zgemm_ncopy_L2_M2_60
-
-
- zgemm_ncopy_L2_M2_END:
-
- subs J , J, #1 // j--
- bne zgemm_ncopy_L2_M2_BEGIN
-
-
- /*********************************************************************************************/
-
- zgemm_ncopy_L1_BEGIN:
-
- tst N, #1 // odd N leaves one vector to copy
- beq zgemm_ncopy_L999 // only Z is meaningful after tst
-
-
- zgemm_ncopy_L1_M2_BEGIN:
-
- mov AO1, A // AO1 = A
- ldr r4 , LDA
- add A , AO1, r4 // A = A + 1 * LDA
-
- asr I, M, #1 // I = M / 2
- cmp I, #0
- ble zgemm_ncopy_L1_M2_40
-
- zgemm_ncopy_L1_M2_20:
-
- COPY2x1
-
- subs I , I , #1
- bne zgemm_ncopy_L1_M2_20
-
-
- zgemm_ncopy_L1_M2_40:
-
- ands I, M , #1 // I = M % 2 (0 or 1)
- beq zgemm_ncopy_L1_M2_END
-
- zgemm_ncopy_L1_M2_60:
-
- COPY1x1
-
- subs I , I , #1
- bne zgemm_ncopy_L1_M2_60
-
-
- zgemm_ncopy_L1_M2_END:
-
-
-
- zgemm_ncopy_L999:
-
- sub r3, fp, #128
- vldm r3, { d8 - d15} // restore floating point registers
-
- movs r0, #0 // set return value
- sub sp, fp, #24 // drop the scratch area
- pop {r4 - r9, fp}
- bx lr
-
- EPILOGUE
-
|