You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemv_t_4_lasx.S 9.2 kB


  1. /*******************************************************************************
  2. Copyright (c) 2024, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. #include "loongarch64_asm.S"
  30. /*********************************************************************
  31. * 2024/02/20 guxiwei
  32. * UTEST : OK
  33. * CTEST : OK
  34. * TEST : OK
  35. *
  36. *
  37. *********************************************************************/
  38. /* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
  39. * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
  40. */
  /*
   * General-purpose register roles.
   * m, n, a, lda, x, inc_x and y are read from $r4, $r5 and $r7-$r11.
   * $r6 (the dummy1 slot of the prototype) is reused for inc_y, which
   * PROLOGUE loads from the stack.
   * The remaining names are loop counters, scratch and column pointers.
   */
  41. #define M $r4
  42. #define N $r5
  43. #define ALPHA_R $f0
  44. #define ALPHA_I $f1
  45. #define A $r7
  46. #define LDA $r8
  47. #define X $r9
  48. #define INC_X $r10
  49. #define Y $r11
  50. #define INC_Y $r6
  /* J counts columns (or 4-column groups), I counts rows (or 4-row groups). */
  51. #define J $r12
  52. #define I $r13
  /* K and PY0 share $r14: K is only used as scratch in PROLOGUE, PY0 only
   * inside ZGEMV_T_LASX. */
  53. #define K $r14
  54. #define PY0 $r14
  55. #define X_ORG $r15 /* copy of the incoming X pointer, used to rewind X */
  56. #define PY1 $r16 /* not referenced elsewhere in this file */
  57. #define K_LDA $r17 /* byte step applied to the PAx pointers after a column pass */
  58. #define PY2 $r18 /* not referenced elsewhere in this file */
  59. #define T0 $r19 /* scratch pointer */
  /* Column pointers into A. Only PA0-PA3 are read by the loops; PA4 is
   * written in PROLOGUE and PA5-PA7 are not referenced elsewhere in this file. */
  60. #define PA0 $r20
  61. #define PA1 $r23
  62. #define PA2 $r24
  63. #define PA3 $r25
  64. #define PA4 $r26
  65. #define PA5 $r27
  66. #define PA6 $r28
  67. #define PA7 $r29
  68. #define M16 $r30 /* M << 4, set in PROLOGUE */
  /*
   * 256-bit LASX register roles.
   * VALPHA is $xr0, the same register number as ALPHA_R ($f0); PROLOGUE
   * builds it in place from $xr0/$xr1.
   */
  69. #define VALPHA $xr0
  /* X0/X1 hold the x elements loaded by ZLOAD_X4 / ZLOAD_X4_GAP. */
  70. #define X0 $xr1
  71. #define X1 $xr2
  /* A0-A11 receive matrix data (A8-A11 also receive y in the epilogue of a
   * column pass). A12-A15 are not referenced elsewhere in this file. */
  72. #define A0 $xr3
  73. #define A1 $xr4
  74. #define A2 $xr5
  75. #define A3 $xr6
  76. #define A4 $xr7
  77. #define A5 $xr8
  78. #define A6 $xr9
  79. #define A7 $xr10
  80. #define A8 $xr11
  81. #define A9 $xr12
  82. #define A10 $xr13
  83. #define A11 $xr14
  84. #define A12 $xr15
  85. #define A13 $xr16
  86. #define A14 $xr17
  87. #define A15 $xr18
  /* TP0-TP3 are the per-column accumulators; TP4-TP7 are not referenced
   * elsewhere in this file. */
  88. #define TP0 $xr19
  89. #define TP1 $xr20
  90. #define TP2 $xr21
  91. #define TP3 $xr22
  92. #define TP4 $xr23
  93. #define TP5 $xr24
  94. #define TP6 $xr25
  95. #define TP7 $xr26
  /* TMP0-TMP2 are passed as scratch operands to GCOMPLEXMADD. */
  96. #define TMP0 $xr27
  97. #define TMP1 $xr28
  98. #define TMP2 $xr29
  /* Y0-Y7 are aliases of A0-A7 ($xr3-$xr10): writing Yn overwrites An. */
  99. #define Y0 $xr3
  100. #define Y1 $xr4
  101. #define Y2 $xr5
  102. #define Y3 $xr6
  103. #define Y4 $xr7
  104. #define Y5 $xr8
  105. #define Y6 $xr9
  106. #define Y7 $xr10
  /*
   * Compile-time flags handed to GCOMPLEXMADD.
   * GXCONJ1/GCONJ1 are used for the matrix-times-x products and depend on
   * whether exactly one of CONJ / XCONJ is defined.
   * GXCONJ2/GCONJ2 are used for the final alpha scaling and depend on XCONJ
   * only.
   * GCONJ1 and GXCONJ2 are 0 in every branch.
   * The exact effect of each flag is defined by GCOMPLEXMADD in
   * loongarch64_asm.S, which is not shown here -- TODO confirm there.
   */
  107. #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
  108. #define GXCONJ1 0
  109. #define GCONJ1 0
  110. #else
  111. #define GXCONJ1 1
  112. #define GCONJ1 0
  113. #endif
  114. #if !defined(XCONJ)
  115. #define GXCONJ2 0
  116. #define GCONJ2 0
  117. #else
  118. #define GXCONJ2 0
  119. #define GCONJ2 1
  120. #endif
  /*
   * ZERO_Y4: clear the four accumulators TP0-TP3.
   * Each operand triple XORs a register with itself. GXOR comes from
   * loongarch64_asm.S and presumably emits one vector xor per triple.
   */
  121. .macro ZERO_Y4
  122. GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
  123. .endm
  /*
   * ZERO_Y1: clear the single accumulator TP0 (TP0 xor TP0), used by the
   * one-column-at-a-time path.
   */
  124. .macro ZERO_Y1
  125. GXOR xv, v, TP0, TP0, TP0
  126. .endm
  /*
   * ZLOAD_X4: load X0 from X + 0x00 and X1 from X + 0x20, i.e. 64 contiguous
   * bytes of x. This variant is selected for the inc_x == 1 path.
   * X itself is not modified here. GLD comes from loongarch64_asm.S.
   */
  127. .macro ZLOAD_X4
  128. GLD xv, , X0, X, 0x00, X1, X, 0x20
  129. .endm
  /*
   * ZLOAD_X4_GAP: gather four x elements that are INC_X bytes apart into
   * X0 and X1 (INC_X has already been shifted left by 4 in PROLOGUE).
   * Clobbers T0 and A0; X itself is not modified.
   * Each xvpermi.q with 0x02 is intended to keep the low 128 bits of the
   * destination and place the low 128 bits of A0 in its high half -- confirm
   * against the LASX manual.
   * NOTE(review): every xvld reads 32 bytes although only the low 16 bytes of
   * each load are kept.
   */
  130. .macro ZLOAD_X4_GAP
  131. xvld X0, X, 0 /* element at X */
  132. PTR_ADD T0, X, INC_X
  133. xvld A0, T0, 0 /* element at X + INC_X */
  134. xvpermi.q X0, A0, 0x02
  135. PTR_ADD T0, T0, INC_X
  136. xvld X1, T0, 0 /* element at X + 2*INC_X */
  137. PTR_ADD T0, T0, INC_X
  138. xvld A0, T0, 0 /* element at X + 3*INC_X */
  139. xvpermi.q X1, A0, 0x02
  140. .endm
  /*
   * ZGEMV_T_4x4: one step of the 4-column main loop.
   * Loads two vectors from each of PA0-PA3 into A0-A7, then issues eight
   * GCOMPLEXMADD operations: TPk is both destination and addend, with
   * A(2k) paired with X0 and A(2k+1) paired with X1.
   * TMP0-TMP2 are scratch.
   * GLD_INC and GCOMPLEXMADD come from loongarch64_asm.S. GLD_INC presumably
   * advances the pointer by 0x20 after each load, which the pointer
   * arithmetic in ZGEMV_T_LASX relies on -- TODO confirm.
   * Expects X0/X1 to have been filled by ZLOAD_X4 or ZLOAD_X4_GAP.
   */
  141. .macro ZGEMV_T_4x4
  142. GLD_INC xv, , 0x20, \
  143. A0, PA0, 0, A1, PA0, 0, \
  144. A2, PA1, 0, A3, PA1, 0, \
  145. A4, PA2, 0, A5, PA2, 0, \
  146. A6, PA3, 0, A7, PA3, 0
  147. GCOMPLEXMADD GXCONJ1, GCONJ1, \
  148. xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
  149. TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \
  150. TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \
  151. TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2
  152. .endm
  /*
   * ZGEMV_T_LASX XW, X4
   * Body of the kernel, expanded once per x-access mode.
   *   XW - text spliced into every local label so each expansion gets its
   *        own label set.
   *   X4 - suffix choosing the 4-element x loader (ZLOAD_X4 or ZLOAD_X4_GAP).
   * Columns are handled four at a time first, then one at a time for the
   * remaining N & 3. Every exit path branches to .L_END.
   * Relies on the setup done in PROLOGUE: PA0-PA3 on consecutive columns,
   * X_ORG holding the start of x, LDA / INC_X / INC_Y shifted left by 4,
   * M16 = M << 4 and VALPHA built from alpha_r / alpha_i.
   */
  153. .macro ZGEMV_T_LASX XW:req, X4:req
  154. PTR_SRLI J, N, 2 /* J = N >> 2: number of 4-column groups */
  155. beqz J, .L_\XW\()_N_3
  /* K_LDA = 4 * LDA - M16: added to each PAx after a 4-column pass */
  156. PTR_SLLI K_LDA, LDA, 2
  157. PTR_SUB K_LDA, K_LDA, M16
  /* ---- outer loop: four columns per iteration ---- */
  158. .L_\XW\()_N_L4:
  159. ZERO_Y4
  160. move X, X_ORG /* rewind x for this group of columns */
  161. PTR_SRLI I, M, 2 /* I = M >> 2: number of 4-row steps */
  162. beqz I, .L_\XW\()_M_3
  163. .align 5
  /* inner loop: four rows per iteration */
  164. .L_\XW\()_M_L4:
  165. ZLOAD_\X4
  166. ZGEMV_T_4x4
  167. PTR_ADDI I, I, -1
  168. PTR_ALSL X, INC_X, X, 2 /* X += 4 * INC_X */
  169. bnez I, .L_\XW\()_M_L4
  170. .L_\XW\()_M_3:
  /* Fold each TPk into Yk. Yk aliases Ak, so A0-A3 now hold the running
   * sums used by the row tail and by the y update below. GCOMPLEXACC is
   * defined in loongarch64_asm.S; presumably it combines the two 128-bit
   * halves of TPk -- TODO confirm. */
  171. // Accumulated
  172. GCOMPLEXACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
  173. andi I, M, 3 /* I = M & 3: leftover rows */
  174. beqz I, .L_\XW\()_M_END
  175. .align 5
  /* row tail: one row per iteration, accumulating into A0-A3.
   * NOTE(review): the loads are 256-bit while the pointers advance by only
   * 0x10 / INC_X per row -- confirm reading past the current element is
   * acceptable. */
  176. .L_\XW\()_M_L1:
  177. GLD xv, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00, A10, PA2, 0x00, A11, PA3, 0x00
  178. #if __loongarch_grlen == 64
  179. GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
  180. #elif __loongarch_grlen == 32
  181. GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
  182. #else
  183. GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
  184. #endif
  185. GCOMPLEXMADD GXCONJ1, GCONJ1, \
  186. xvf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \
  187. A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2
  188. PTR_ADDI I, I, -1
  189. PTR_ADD X, X, INC_X
  190. bnez I, .L_\XW\()_M_L1
  /* y update for the four columns: load y into A8-A11, apply
   * GCOMPLEXMADD with VALPHA and the sums in A0-A3, store back. */
  191. .L_\XW\()_M_END:
  192. xvld A8, Y, 0x00
  193. xvldx A9, Y, INC_Y
  194. PTR_ALSL PY0, INC_Y, Y, 1 /* PY0 = Y + 2 * INC_Y */
  195. xvld A10, PY0, 0x00
  196. xvldx A11, PY0, INC_Y
  197. GCOMPLEXMADD GXCONJ2, GCONJ2, \
  198. xvf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\
  199. A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2
  200. PTR_ADDI J, J, -1
  /* move PA0-PA3 to the next group of four columns */
  201. #if __loongarch_grlen == 64
  202. GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
  203. #elif __loongarch_grlen == 32
  204. GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
  205. #else
  206. GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
  207. #endif
  /* $vr11-$vr14 carry the same register numbers as A8-A11 ($xr11-$xr14);
   * vst / vstx write 128 bits per y element, whereas the loads above were
   * 256-bit. */
  208. vst $vr11, Y, 0x00
  209. vstx $vr12, Y, INC_Y
  210. vst $vr13, PY0, 0x00
  211. vstx $vr14, PY0, INC_Y
  212. PTR_ALSL Y, INC_Y, Y, 2 /* Y += 4 * INC_Y */
  213. bnez J, .L_\XW\()_N_L4
  /* ---- remaining N & 3 columns, one column per iteration ---- */
  214. .L_\XW\()_N_3:
  215. andi J, N, 3
  216. beqz J, .L_END
  217. PTR_SUB K_LDA, LDA, M16 /* K_LDA = LDA - M16: step to the next single column */
  218. .L_\XW\()_N_1:
  219. ZERO_Y1
  220. move X, X_ORG
  221. move I, M
  222. beqz I, .L_END /* M == 0: leave without touching y */
  223. .align 5
  /* one row per iteration, accumulating into TP0 */
  224. .L_\XW\()_N_1_M_L1:
  225. GLD xv, , A0, PA0, 0x00, X0, X, 0x00
  226. GCOMPLEXMADD GXCONJ1, GCONJ1, \
  227. xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
  228. PTR_ADDI I, I, -1
  229. PTR_ADD X, X, INC_X
  230. PTR_ADDI PA0, PA0, 0x10
  231. bnez I, .L_\XW\()_N_1_M_L1
  /* y update for this column; $vr3 carries the same register number as A0 */
  232. .L_\XW\()_N_1_M_END:
  233. PTR_ADDI J, J, -1
  234. xvld A0, Y, 0x00
  235. GCOMPLEXMADD GXCONJ2, GCONJ2, \
  236. xvf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
  237. vst $vr3, Y, 0x00
  238. PTR_ADD PA0, PA0, K_LDA
  239. PTR_ADD Y, Y, INC_Y
  240. bnez J, .L_\XW\()_N_1
  241. b .L_END
  242. .endm
  /*
   * Kernel entry point (the CNAME described in the prototype comment above).
   * Sets up scaled strides, VALPHA and the column pointers, then dispatches
   * through a two-entry offset table to the ZGEMV_T_LASX expansion for
   * inc_x == 1 (.L_GAP_0) or inc_x != 1 (.L_GAP_1). Both expansions finish
   * at .L_END.
   * PROLOGUE / EPILOGUE and push_if_used / pop_if_used come from the included
   * headers; presumably the latter pair saves and restores the callee-saved
   * registers used here -- TODO confirm.
   */
  243. PROLOGUE
  /* inc_y is read from [$sp + 0] before push_if_used runs */
  244. PTR_LD INC_Y, $sp, 0
  245. push_if_used 8, 6
  246. PTR_ADDI K, $r0, 0x01 /* K = 1 */
  247. PTR_SUB I, INC_X, K /* I = inc_x - 1 */
  248. maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
  /* LDA, INC_X, INC_Y <<= 4 and M16 = M << 4 (a shift by 4 is a multiply by 16) */
  249. GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
  /* Combine element 0 of $xr0 and $xr1 (same register numbers as ALPHA_R and
   * ALPHA_I) into the low 128 bits, then copy that 128-bit lane to both
   * halves of VALPHA. */
  250. // Init VALPHA
  251. xvpackev.d $xr0, $xr1, $xr0
  252. xvreplve0.q VALPHA, $xr0
  253. move X_ORG, X
  254. move PA0, A
  /* PA1-PA4 = PA0 + 1..4 * LDA. PA4 is not read by the loops in this file. */
  255. #if __loongarch_grlen == 64
  256. GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
  257. #elif __loongarch_grlen == 32
  258. GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
  259. #else
  260. GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
  261. #endif
  /* Dispatch: read the halfword at .L_GAP_TABLE + 2 * I, add it to the table
   * address and jump there. */
  262. la.local T0, .L_GAP_TABLE
  263. PTR_ALSL I, I, T0, 1
  264. ld.h K, I, 0
  265. PTR_ADD T0, T0, K
  266. jirl $r0, T0, 0
  /* 16-bit offsets of the two code paths, relative to .L_GAP_TABLE */
  267. .L_GAP_TABLE:
  268. .hword .L_GAP_0 - .L_GAP_TABLE
  269. .hword .L_GAP_1 - .L_GAP_TABLE
  270. .L_GAP_0: /* if (incx == 1) */
  271. ZGEMV_T_LASX GAP_0, X4
  272. .L_GAP_1: /* if (incx != 1) */
  273. ZGEMV_T_LASX GAP_1, X4_GAP
  /* common exit: undo push_if_used and jump to the address in $r1 */
  274. .L_END:
  275. pop_if_used 8, 6
  276. jirl $r0, $r1, 0x0
  277. EPILOGUE