You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_tcopy_8_lasx.S 7.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
  #define ASSEMBLER

  #include "common.h"

  /*
   * Panel copy routine (LoongArch64, LASX).
   *
   * Inputs (integer argument registers):
   *   M   ($r4)  number of source rows
   *   N   ($r5)  number of 16-byte columns per source row
   *   SRC ($r6)  source base address
   *   LDA ($r7)  source row stride, in 16-byte units
   *   DST ($r8)  destination base address
   *
   * NOTE(review): going by the zgemm file name, a 16-byte unit is presumably
   * one double-precision complex value - confirm against the caller.
   *
   * Output layout: the columns are split into panels of 8, then 4, 2 and 1
   * (taken from N >> 3 and the bits N & 4, N & 2, N & 1).  For every panel
   * the routine walks all M rows and appends the panel-wide slice of each
   * row (0x80 / 0x40 / 0x20 / 0x10 bytes) to DST contiguously.  Rows are
   * processed two per loop iteration, with a single-row tail when M is odd.
   *
   * Leaf routine: only argument and temporary registers are written
   * ($r7, $r9-$r14, $r20 and $xr0-$xr7), so no stack frame is created and
   * $sp is never moved.
   */

  /* Function parameters */
  #define M     $r4   // param 1: m
  #define N     $r5   // param 2: n
  #define SRC   $r6   // param 3: src
  #define LDA   $r7   // param 4: lda
  #define DST   $r8   // param 5: dst

  /* Scratch registers */
  #define I     $r9   // row-pair counter, also the bit-test temporary
  #define J     $r10  // remaining 8-column panels
  #define TS    $r11  // aoffset: start of the current panel in row 0
  #define S1    $r12  // aoffset1: current even row of the panel
  #define S2    $r13  // row following S1
  #define T0    $r14  // byte stride of two rows (2 * TL)
  #define TD    $r20  // boffset: destination write cursor
  #define TL    $r7   // byte stride of one row, overwrites LDA in place
  #define ZERO  $r0

  /* LASX vectors */
  #define U0    $xr0
  #define U1    $xr1
  #define U2    $xr2
  #define U3    $xr3
  #define U4    $xr4
  #define U5    $xr5
  #define U6    $xr6
  #define U7    $xr7

      PROLOGUE

      move    TS, SRC                 // aoffset
      move    TD, DST                 // boffset
      slli.d  TL, LDA, 0x04           // row stride in bytes = lda * 16
      slli.d  T0, TL, 0x01            // two-row stride, constant for the whole call

      srai.d  J, N, 0x03              // j = n / 8
      beq     J, ZERO, .L_N1

  /* ---- panels of 8 columns: 0x80 bytes per row ---- */
  .L_J1:
      move    S1, TS                  // aoffset1
      add.d   S2, TS, TL              // next row
      addi.d  TS, TS, 0x80            // advance to the next 8-column panel
      srai.d  I, M, 0x01              // i = m / 2 row pairs
      beq     ZERO, I, .L_J1M1

  .L_J1I1:                            // copy two rows per iteration
      xvld    U0, S1, 0x00
      xvld    U1, S1, 0x20
      xvld    U2, S1, 0x40
      xvld    U3, S1, 0x60
      xvld    U4, S2, 0x00
      xvld    U5, S2, 0x20
      xvld    U6, S2, 0x40
      xvld    U7, S2, 0x60

      xvst    U0, TD, 0x00
      xvst    U1, TD, 0x20
      xvst    U2, TD, 0x40
      xvst    U3, TD, 0x60
      xvst    U4, TD, 0x80
      xvst    U5, TD, 0xa0
      xvst    U6, TD, 0xc0
      xvst    U7, TD, 0xe0

      add.d   S1, S1, T0
      add.d   S2, S2, T0
      addi.d  TD, TD, 0x100
      addi.d  I, I, -1
      blt     ZERO, I, .L_J1I1

  .L_J1M1:                            // if (m & 1): one trailing row
      andi    I, M, 0x01
      beq     ZERO, I, .L_J0

      xvld    U0, S1, 0x00
      xvld    U1, S1, 0x20
      xvld    U2, S1, 0x40
      xvld    U3, S1, 0x60

      xvst    U0, TD, 0x00
      xvst    U1, TD, 0x20
      xvst    U2, TD, 0x40
      xvst    U3, TD, 0x60

      addi.d  TD, TD, 0x80

  .L_J0:
      addi.d  J, J, -1
      blt     ZERO, J, .L_J1

  /* ---- if (n & 4): one panel of 4 columns, 0x40 bytes per row ---- */
  .L_N1:
      andi    I, N, 0x04
      beq     ZERO, I, .L_N2

      move    S1, TS                  // aoffset1
      add.d   S2, TS, TL
      addi.d  TS, TS, 0x40
      srai.d  I, M, 0x01
      beq     ZERO, I, .L_N1M1

  .L_N1I1:                            // copy two rows per iteration
      xvld    U0, S1, 0x00
      xvld    U1, S1, 0x20
      xvld    U2, S2, 0x00
      xvld    U3, S2, 0x20

      xvst    U0, TD, 0x00
      xvst    U1, TD, 0x20
      xvst    U2, TD, 0x40
      xvst    U3, TD, 0x60

      add.d   S1, S1, T0
      add.d   S2, S2, T0
      addi.d  TD, TD, 0x80
      addi.d  I, I, -1
      blt     ZERO, I, .L_N1I1

  .L_N1M1:                            // if (m & 1): one trailing row
      andi    I, M, 0x01
      beq     ZERO, I, .L_N2

      xvld    U0, S1, 0x00
      xvld    U1, S1, 0x20

      xvst    U0, TD, 0x00
      xvst    U1, TD, 0x20

      addi.d  TD, TD, 0x40

  /* ---- if (n & 2): one panel of 2 columns, 0x20 bytes per row ---- */
  .L_N2:
      andi    I, N, 0x02
      beq     ZERO, I, .L_N3

      move    S1, TS                  // aoffset1
      add.d   S2, TS, TL
      addi.d  TS, TS, 0x20
      srai.d  I, M, 0x01
      beq     ZERO, I, .L_N2M1

  .L_N2I1:                            // copy two rows per iteration
      xvld    U0, S1, 0x00
      xvld    U1, S2, 0x00

      xvst    U0, TD, 0x00
      xvst    U1, TD, 0x20

      add.d   S1, S1, T0
      add.d   S2, S2, T0
      addi.d  TD, TD, 0x40
      addi.d  I, I, -1
      blt     ZERO, I, .L_N2I1

  .L_N2M1:                            // if (m & 1): one trailing row
      andi    I, M, 0x01
      beq     ZERO, I, .L_N3

      xvld    U0, S1, 0x00
      xvst    U0, TD, 0x00

      addi.d  TD, TD, 0x20

  /* ---- if (n & 1): last single column, 0x10 bytes per row ---- */
  .L_N3:
      andi    I, N, 0x01
      beq     ZERO, I, .L_N0

      move    S1, TS                  // aoffset1
      add.d   S2, TS, TL
      srai.d  I, M, 0x01
      beq     ZERO, I, .L_N3M1

  .L_N3I1:                            // 128-bit LSX moves: one 16-byte unit per row
      vld     $vr0, S1, 0x00
      vld     $vr1, S2, 0x00

      vst     $vr0, TD, 0x00
      vst     $vr1, TD, 0x10

      add.d   S1, S1, T0
      add.d   S2, S2, T0
      addi.d  TD, TD, 0x20
      addi.d  I, I, -1
      blt     ZERO, I, .L_N3I1

  .L_N3M1:                            // if (m & 1): one trailing row
      andi    I, M, 0x01
      beq     ZERO, I, .L_N0

      vld     $vr0, S1, 0x00
      vst     $vr0, TD, 0x00

  .L_N0:
      jirl    $r0, $r1, 0x00          // return to the address in $r1 (ra)

      EPILOGUE
  211. EPILOGUE