You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

zgemv_n_4_lasx.S 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  1. /*******************************************************************************
  2. Copyright (c) 2024, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. #include "loongarch64_asm.S"
  30. /*********************************************************************
  31. * 2024/02/20 guxiwei
  32. * UTEST : OK
  33. * CTEST : OK
  34. * TEST : OK
  35. *
  36. *
  37. *********************************************************************/
  38. /* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
  39. * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
  40. */
  /* Register aliases used throughout this kernel.
   * The names mirror the CNAME signature given in the comment above.
   * NOTE(review): several aliases (OFFSET, PA4-PA7, X4-X7, A8-A15) are not
   * referenced in the code visible in this file view. */
  /* Problem dimensions m and n. */
  41. #define M $r4
  42. #define N $r5
  /* Complex scalar alpha (real / imaginary part). */
  43. #define ALPHA_R $f0
  44. #define ALPHA_I $f1
  /* Matrix, vectors and their strides. INC_Y is loaded from the stack at
   * function entry into $r6. */
  45. #define A $r7
  46. #define LDA $r8
  47. #define X $r9
  48. #define INC_X $r10
  49. #define Y $r11
  50. #define INC_Y $r6
  /* Loop counters and scratch general-purpose registers. */
  51. #define J $r12
  52. #define I $r13
  53. #define K $r14
  54. #define Y_ORG $r15
  55. #define OFFSET $r16
  56. #define K_LDA $r17
  57. #define M16 $r18
  58. #define T0 $r19
  /* Column pointers into A. */
  59. #define PA0 $r20
  60. #define PA1 $r23
  61. #define PA2 $r24
  62. #define PA3 $r25
  63. #define PA4 $r26
  64. #define PA5 $r27
  65. #define PA6 $r28
  66. #define PA7 $r29
  /* 256-bit vector registers: alpha, x operands, y accumulators. */
  67. #define VALPHA $xr1
  68. #define X0 $xr2
  69. #define X1 $xr3
  70. #define X2 $xr4
  71. #define X3 $xr5
  72. #define X4 $xr6
  73. #define X5 $xr7
  74. #define X6 $xr8
  75. #define X7 $xr9
  76. #define Y0 $xr10
  77. #define Y1 $xr11
  /* 256-bit vector registers holding data loaded from A. The macros below
   * also address some of these through their 128-bit $vrN names. */
  78. #define A0 $xr12
  79. #define A1 $xr13
  80. #define A2 $xr14
  81. #define A3 $xr15
  82. #define A4 $xr16
  83. #define A5 $xr17
  84. #define A6 $xr18
  85. #define A7 $xr19
  86. #define A8 $xr20
  87. #define A9 $xr21
  88. #define A10 $xr22
  89. #define A11 $xr23
  90. #define A12 $xr24
  91. #define A13 $xr25
  92. #define A14 $xr26
  93. #define A15 $xr27
  /* Scratch vector registers handed to GCOMPLEXMUL / GCOMPLEXMADD. */
  94. #define TMP0 $xr28
  95. #define TMP1 $xr29
  96. #define TMP2 $xr30
  /* Map the build-time CONJ / XCONJ switches onto the 0/1 selectors that are
   * passed to GCOMPLEXMUL (GXCONJ) and GCOMPLEXMADD (GXCONJ, GCONJ).
   * GXCONJ depends only on XCONJ and GCONJ depends only on CONJ. */
  #if defined(XCONJ)
  #define GXCONJ 1
  #else
  #define GXCONJ 0
  #endif

  #if defined(CONJ)
  #define GCONJ 1
  #else
  #define GCONJ 0
  #endif
  /*
   * ZLOAD_X_4 - load four contiguous complex x elements (16 bytes each, at
   * X + 0x00/0x10/0x20/0x30), replicate each one into both 128-bit halves
   * of X0..X3 and multiply every register by VALPHA via GCOMPLEXMUL.
   *
   * Each element is fetched with a 128-bit vld into $vr2..$vr5, the low
   * halves of X0..X3 ($xr2..$xr5). A 256-bit xvld would read 16 bytes past
   * the element, i.e. past the end of x for the last one, although only the
   * low 128 bits are consumed by the xvpermi.q broadcast (imm 0) below.
   *
   * Clobbers: X0-X3, TMP0-TMP2.
   */
  .macro ZLOAD_X_4
      vld         $vr2, X, 0x00
      vld         $vr3, X, 0x10
      vld         $vr4, X, 0x20
      vld         $vr5, X, 0x30
      GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0
      GCOMPLEXMUL GXCONJ, \
          xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
          X1, VALPHA, X1, TMP0, TMP1, TMP2, \
          X2, VALPHA, X2, TMP0, TMP1, TMP2, \
          X3, VALPHA, X3, TMP0, TMP1, TMP2
  .endm
  /*
   * ZLOAD_X_4_GAP - strided variant of ZLOAD_X_4: load four complex x
   * elements located at X, X + INC_X, X + 2*INC_X and X + 3*INC_X (INC_X is
   * a byte stride here), replicate each into both 128-bit halves of X0..X3
   * and multiply every register by VALPHA via GCOMPLEXMUL.
   *
   * Each element is fetched with a 128-bit vld into $vr2..$vr5, the low
   * halves of X0..X3 ($xr2..$xr5). A 256-bit xvld would read 16 bytes
   * beyond every element although xvpermi.q with imm 0 only consumes the
   * low 128 bits of its source.
   *
   * Clobbers: X0-X3, T0, TMP0-TMP2. X itself is not modified.
   */
  .macro ZLOAD_X_4_GAP
      vld         $vr2, X, 0
      xvpermi.q   X0, X0, 0
      PTR_ADD     T0, X, INC_X
      vld         $vr3, T0, 0
      xvpermi.q   X1, X1, 0
      PTR_ADD     T0, T0, INC_X
      vld         $vr4, T0, 0
      xvpermi.q   X2, X2, 0
      PTR_ADD     T0, T0, INC_X
      vld         $vr5, T0, 0
      xvpermi.q   X3, X3, 0
      GCOMPLEXMUL GXCONJ, \
          xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
          X1, VALPHA, X1, TMP0, TMP1, TMP2, \
          X2, VALPHA, X2, TMP0, TMP1, TMP2, \
          X3, VALPHA, X3, TMP0, TMP1, TMP2
  .endm
  /* ZLOAD_Y_4 - unit-stride load of the y accumulators: Y0 from Y + 0 and
   * Y1 from Y + 0x20.
   * NOTE(review): GLD is defined outside this file view; presumably it
   * expands to one xvld per (register, base, offset) triple - confirm. */
  141. .macro ZLOAD_Y_4
  142. GLD xv, , Y0, Y, 0, Y1, Y, 0x20
  143. .endm
  /* ZLOAD_Y_4_GAP - strided load of four y elements into Y0 / Y1.
   * Four 128-bit loads fetch the elements at Y, Y + INC_Y, Y + 2*INC_Y and
   * Y + 3*INC_Y into $vr10, $vr13, $vr11 and $vr14, i.e. the low halves of
   * Y0, A1, Y1 and A2. The final GPERMI then combines Y0 with A1 and Y1
   * with A2 using selector 0x02.
   * Clobbers: Y0, Y1, A1, A2, T0.
   * NOTE(review): GPERMI and PTR_ALSL are defined outside this file view;
   * presumably they map to xvpermi.q and alsl - confirm. */
  144. .macro ZLOAD_Y_4_GAP
  145. vld $vr10, Y, 0
  146. vldx $vr13, Y, INC_Y
  /* T0 = Y + 2 * INC_Y */
  147. PTR_ALSL T0, INC_Y, Y, 1
  148. vld $vr11, T0, 0
  149. vldx $vr14, T0, INC_Y
  150. GPERMI xv, q, Y0, A1, 0x02, Y1, A2, 0x02
  151. .endm
  /* ZGEMV_N_4x4 - one 4-row by 4-column update step.
   * Loads two vector registers from each of the column pointers PA0..PA3
   * (A0/A1, A2/A3, A4/A5, A6/A7) and accumulates into Y0 / Y1 using the
   * pre-scaled x operands X0..X3 through GCOMPLEXMADD.
   * Clobbers: A0-A7, TMP0-TMP2; updates Y0, Y1 and PA0-PA3.
   * NOTE(review): GLD_INC and GCOMPLEXMADD are defined outside this file
   * view; GLD_INC presumably advances the pointer by 0x20 after each load -
   * confirm. */
  152. .macro ZGEMV_N_4x4
  153. GLD_INC xv, , 0x20, \
  154. A0, PA0, 0, A1, PA0, 0, \
  155. A2, PA1, 0, A3, PA1, 0, \
  156. A4, PA2, 0, A5, PA2, 0, \
  157. A6, PA3, 0, A7, PA3, 0
  158. GCOMPLEXMADD GXCONJ, GCONJ, \
  159. xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
  160. Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \
  161. Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \
  162. Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2
  163. .endm
  /* ZSTORE_Y_4 - unit-stride store of the accumulators: Y0 to Y + 0 and Y1
   * to Y + 0x20 (counterpart of ZLOAD_Y_4).
   * NOTE(review): GST is defined outside this file view; presumably it
   * expands to one xvst per (register, base, offset) triple - confirm. */
  164. .macro ZSTORE_Y_4
  165. GST xv, , Y0, Y, 0, Y1, Y, 0x20
  166. .endm
  /* ZSTORE_Y_4_GAP - strided store of the accumulators (counterpart of
   * ZLOAD_Y_4_GAP). Each y element is written as two 64-bit lanes at
   * offsets 0 and 8: lanes 0-1 of Y0 go to Y, lanes 2-3 of Y0 to Y + INC_Y,
   * lanes 0-1 of Y1 to Y + 2*INC_Y and lanes 2-3 of Y1 to Y + 3*INC_Y.
   * Clobbers: T0. Y itself is not modified. */
  167. .macro ZSTORE_Y_4_GAP
  168. xvstelm.d Y0, Y, 0, 0
  169. xvstelm.d Y0, Y, 0x08, 1
  170. PTR_ADD T0, Y, INC_Y
  171. xvstelm.d Y0, T0, 0, 2
  172. xvstelm.d Y0, T0, 0x08, 3
  173. PTR_ADD T0, T0, INC_Y
  174. xvstelm.d Y1, T0, 0, 0
  175. xvstelm.d Y1, T0, 0x08, 1
  176. PTR_ADD T0, T0, INC_Y
  177. xvstelm.d Y1, T0, 0, 2
  178. xvstelm.d Y1, T0, 0x08, 3
  179. .endm
  /* ZLOAD_Y_1 - load a single 16-byte y element from Y into $vr10, the
   * 128-bit low half of Y0 ($xr10). */
  180. .macro ZLOAD_Y_1
  181. vld $vr10, Y, 0
  182. .endm
  /* ZGEMV_N_1x4 - one 1-row by 4-column update step.
   * Loads 128 bits from each of PA0..PA3 into $vr12/$vr14/$vr16/$vr18 (the
   * low halves of A0/A2/A4/A6) and accumulates into Y0 with X0..X3 through
   * GCOMPLEXMADD. The arithmetic is issued on full 256-bit registers, but
   * ZSTORE_Y_1 later writes back only the low 128 bits of Y0.
   * Clobbers: A0, A2, A4, A6, TMP0-TMP2; updates Y0 and PA0-PA3.
   * NOTE(review): GLD_INC is defined outside this file view; presumably it
   * advances each pointer by 0x10 after the load - confirm. */
  183. .macro ZGEMV_N_1x4
  184. GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0, $vr16, PA2, 0, $vr18, PA3, 0
  185. GCOMPLEXMADD GXCONJ, GCONJ, \
  186. xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
  187. Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \
  188. Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \
  189. Y0, X3, A6, Y0, TMP0, TMP1, TMP2
  190. .endm
  /* ZSTORE_Y_1 - store a single 16-byte y element from $vr10 (the 128-bit
   * low half of Y0) to Y. */
  191. .macro ZSTORE_Y_1
  192. vst $vr10, Y, 0
  193. .endm
  /*
   * ZLOAD_X_1 - load one complex x element (16 bytes at X), replicate it
   * into both 128-bit halves of X0 and multiply it by VALPHA via
   * GCOMPLEXMUL.
   *
   * The element is fetched with a 128-bit vld into $vr2, the low half of
   * X0 ($xr2). A 256-bit xvld would read 16 bytes past the element, which
   * is past the end of x when this is the last element, although only the
   * low 128 bits are consumed by the xvpermi.q broadcast (imm 0) below.
   *
   * Clobbers: X0, TMP0-TMP2.
   */
  .macro ZLOAD_X_1
      vld         $vr2, X, 0x00
      GPERMI xv, q, X0, X0, 0
      GCOMPLEXMUL GXCONJ, \
          xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2
  .endm
  /* ZGEMV_N_1x1 - one 1-row by 1-column update step.
   * Loads 128 bits from PA0 into $vr12 (the low half of A0) and accumulates
   * into Y0 with X0 through GCOMPLEXMADD.
   * Clobbers: A0, TMP0-TMP2; updates Y0 and PA0.
   * NOTE(review): GLD_INC is defined outside this file view; presumably it
   * advances PA0 by 0x10 after the load - confirm. */
  200. .macro ZGEMV_N_1x1
  201. GLD_INC v, , 0x10, $vr12, PA0, 0
  202. GCOMPLEXMADD GXCONJ, GCONJ, \
  203. xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
  204. .endm
  /*
   * ZGEMV_N_LASX XW, X_4, X_1, Y_4, Y_1
   *
   * Main loop nest of the kernel, instantiated once per stride combination.
   *   XW   - tag inserted into every local label so that several
   *          instantiations can coexist in one object file
   *   X_4  - suffix selecting the four-element x loader  (ZLOAD_<X_4>)
   *   X_1  - suffix selecting the one-element x loader   (ZLOAD_<X_1>)
   *   Y_4  - suffix selecting the four-element y load/store pair
   *          (ZLOAD_<Y_4> / ZSTORE_<Y_4>)
   *   Y_1  - suffix selecting the one-element y load/store pair
   *
   * Structure:
   *   - N / 4 passes over groups of four columns; each pass walks y in
   *     blocks of four rows (ZGEMV_N_4x4) and then M % 4 single rows
   *     (ZGEMV_N_1x4).
   *   - N % 4 passes over single columns, one row at a time (ZGEMV_N_1x1).
   * Control always ends at .L_END, which is defined by the enclosing
   * function.
   *
   * Expects LDA, INC_X and INC_Y to be byte strides, M16 to hold M << 4,
   * Y_ORG to hold the start of y and PA0..PA3 to point at four consecutive
   * columns of A, as set up by the function body.
   *
   * Register K is not touched here: the original code zeroed and
   * incremented it in every loop without ever reading it. The
   * loop-invariant K_LDA for the single-column loop is computed once
   * before that loop instead of once per column.
   */
  .macro ZGEMV_N_LASX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req
      PTR_SRLI    J, N, 2                     /* J = N / 4 column groups */
      beqz        J, .L_\XW\()_N_3
      PTR_SLLI    K_LDA, LDA, 2
      PTR_SUB     K_LDA, K_LDA, M16           /* K_LDA = 4 * LDA - M16 */
  .L_\XW\()_N_L4:
      ZLOAD_\X_4
      move        Y, Y_ORG
      PTR_SRLI    I, M, 2                     /* I = M / 4 row blocks */
      beqz        I, .L_\XW\()_M_3
  .align 5
  .L_\XW\()_M_L4:
      ZLOAD_\Y_4
      ZGEMV_N_4x4
      ZSTORE_\Y_4
      PTR_ADDI    I, I, -1
      PTR_ALSL    Y, INC_Y, Y, 2              /* Y += 4 * INC_Y */
      bnez        I, .L_\XW\()_M_L4
  .L_\XW\()_M_3:
      andi        I, M, 3                     /* I = M % 4 leftover rows */
      beqz        I, .L_\XW\()_M_END
  .align 5
  .L_\XW\()_M_L1:
      ZLOAD_\Y_1
      ZGEMV_N_1x4
      ZSTORE_\Y_1
      PTR_ADDI    I, I, -1
      PTR_ADD     Y, Y, INC_Y
      bnez        I, .L_\XW\()_M_L1
  .L_\XW\()_M_END:
      PTR_ADDI    J, J, -1
      /* Move every column pointer on by K_LDA to reach the next group. */
  #if __loongarch_grlen == 64
      GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
  #elif __loongarch_grlen == 32
      GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
  #else
      GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
  #endif
      PTR_ALSL    X, INC_X, X, 2              /* X += 4 * INC_X */
      bnez        J, .L_\XW\()_N_L4
  .L_\XW\()_N_3:
      andi        J, N, 3                     /* J = N % 4 leftover columns */
      beqz        J, .L_END
      PTR_SUB     K_LDA, LDA, M16             /* K_LDA = LDA - M16, loop-invariant */
  .L_\XW\()_N_L1:
      ZLOAD_\X_1
      move        Y, Y_ORG
      move        I, M
      beqz        I, .L_END
  .align 5
  .L_\XW\()_N_1_M_L1:
      ZLOAD_\Y_1
      ZGEMV_N_1x1
      ZSTORE_\Y_1
      PTR_ADDI    I, I, -1
      PTR_ADD     Y, Y, INC_Y
      bnez        I, .L_\XW\()_N_1_M_L1
  .L_\XW\()_N_1_M_END:
      PTR_ADDI    J, J, -1
      PTR_ADD     PA0, PA0, K_LDA
      PTR_ADD     X, X, INC_X
      bnez        J, .L_\XW\()_N_L1
      b           .L_END
  .endm
  /* Kernel entry point; arguments follow the CNAME signature comment near
   * the top of the file.
   * Steps: fetch inc_y from the stack, derive a 0..3 dispatch index from
   * (inc_x != 1, inc_y != 1), shift the strides and M left by 4, build
   * VALPHA, set up the column pointers PA0..PA3 and jump through a table of
   * 16-bit offsets to one of four ZGEMV_N_LASX instantiations. Every
   * instantiation finishes at .L_END.
   * NOTE(review): PROLOGUE, EPILOGUE, push_if_used and pop_if_used are
   * defined outside this file view; presumably the push/pop pair saves and
   * restores callee-saved registers - confirm. */
  274. PROLOGUE
  /* inc_y is read from the stack before push_if_used runs. */
  275. PTR_LD INC_Y, $sp, 0
  276. push_if_used 7, 7
  277. PTR_ADDI K, $r0, 0x01
  278. PTR_SUB I, INC_X, K
  279. PTR_SUB J, INC_Y, K
  280. maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
  281. maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */
  /* I = 2 * I + J: dispatch index in the range 0..3. */
  282. PTR_ALSL I, I, J, 1
  /* LDA, INC_X and INC_Y are shifted left by 4 in place; M16 = M << 4.
   * NOTE(review): presumably this converts element counts to byte counts at
   * 16 bytes per complex element - confirm. */
  283. GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
  284. // Init VALPHA
  /* Combine $xr0 and $xr1 (ALPHA_R is $f0, ALPHA_I is $f1) into $xr0, then
   * replicate the low 128 bits of $xr0 across VALPHA. */
  285. xvpackev.d $xr0, $xr1, $xr0
  286. xvreplve0.q VALPHA, $xr0
  287. move Y_ORG, Y
  288. move PA0, A
  /* PA1..PA3 = the three columns following PA0, LDA apart. */
  289. #if __loongarch_grlen == 64
  290. GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
  291. #elif __loongarch_grlen == 32
  292. GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
  293. #else
  294. GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
  295. #endif
  /* Table dispatch: entry I is a halfword offset relative to .L_GAP_TABLE. */
  296. la.local T0, .L_GAP_TABLE
  297. PTR_ALSL I, I, T0, 1
  298. ld.h K, I, 0 // Obtain the offset address
  299. PTR_ADD T0, T0, K
  300. jirl $r0, T0, 0
  301. .L_GAP_TABLE:
  302. .hword .L_GAP_0_0 - .L_GAP_TABLE
  303. .hword .L_GAP_0_1 - .L_GAP_TABLE
  304. .hword .L_GAP_1_0 - .L_GAP_TABLE
  305. .hword .L_GAP_1_1 - .L_GAP_TABLE
  306. .L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
  307. ZGEMV_N_LASX GAP_0_0, X_4, X_1, Y_4, Y_1
  308. .L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
  309. ZGEMV_N_LASX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1
  310. .L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
  311. ZGEMV_N_LASX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1
  312. .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
  313. ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
  /* Common exit reached from every ZGEMV_N_LASX instantiation. */
  314. .L_END:
  315. pop_if_used 7, 7
  316. jirl $r0, $r1, 0x0
  317. EPILOGUE