You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cswap_lsx.S 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421
  1. /*******************************************************************************
  2. Copyright (c) 2023, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /* Register roles, as used by the code in this file. */
  29. #define N $r4 /* element count: tested against 0, split into N >> 2 groups and N & 3 leftovers */
  30. #define X $r7 /* running pointer into X */
  31. #define INCX $r8 /* X stride; shifted left by ZBASE_SHIFT at entry */
  32. #define Y $r9 /* running pointer into Y */
  33. #define INCY $r10 /* Y stride; shifted left by ZBASE_SHIFT at entry */
  34. #define I $r17 /* loop counter for both the main loops and the tail loops */
  35. #define TEMP $r18 /* holds 1 << ZBASE_SHIFT, the scaled value of a stride of 1 */
  36. #define XX $r5 /* second X pointer, used for write-back in the .L22 path */
  37. #define YY $r6 /* not referenced by the code visible in this file */
  /* Integer scratch: values read with ld.w / ld.d before being inserted into a vector. */
  38. #define t1 $r14
  39. #define t2 $r15
  40. #define t3 $r16
  41. #define t4 $r19
  /* Scalar FP scratch used with the LD / ST macros (tail loops and the .L22 path). */
  /* NOTE(review): $f12..$f15 carry the same register numbers as $vr12..$vr15 below; */
  /* no path in this file uses both groups at the same time. */
  42. #define a1 $f12
  43. #define a2 $f13
  44. #define a3 $f14
  45. #define a4 $f15
  46. #define b1 $f16
  47. #define b2 $f17
  48. #define b3 $f18
  49. #define b4 $f19
  /* Vector registers used by the vld / vst / vstelm / vinsgr2vr sequences. */
  50. #define VX0 $vr12
  51. #define VX1 $vr13
  52. #define VX2 $vr14
  53. #define VX3 $vr15
  PROLOGUE

    // Entry / dispatch.
    // Leaves with: INCX and INCY scaled by 2^ZBASE_SHIFT, I = N >> 2,
    // TEMP = 1 << ZBASE_SHIFT (what a stride of 1 looks like after scaling).
    // ZBASE_SHIFT comes from common.h -- presumably log2 of the byte size of
    // one complex element; confirm there.

    bge     $r0, N, .L999               // N <= 0: nothing to swap

    slli.d  INCX, INCX, ZBASE_SHIFT
    slli.d  INCY, INCY, ZBASE_SHIFT
    srai.d  I, N, 2                     // number of 4-element groups
    li.d    TEMP, 1
    slli.d  TEMP, TEMP, ZBASE_SHIFT

    // Four-way dispatch on (INCX == 1, INCY == 1).
    bne     INCX, TEMP, .L20
    beq     INCY, TEMP, .L11            // INCX==1 and INCY==1
    b       .L12                        // INCX==1 and INCY!=1
.L20:
    beq     INCY, TEMP, .L21            // INCX!=1 and INCY==1
    b       .L22                        // INCX!=1 and INCY!=1
  // Path .L11: INCX==1 and INCY==1. Both pointers advance by fixed
  // 8 * SIZE steps in the main loop and 2 * SIZE steps in the tail loop.
  67. .L11:
  68. bge $r0, I, .L112 // I = N >> 2 is zero: skip the vector loop
  69. .align 3
  70. .L111:
  // One pass exchanges the 8 * SIZE span at X with the 8 * SIZE span at Y:
  // both sides are loaded into VX0..VX3 first, then stored back crosswise.
  71. #ifdef DOUBLE
  // DOUBLE build: two rounds of two vectors per side (offsets 0/2, then 4/6).
  72. vld VX0, X, 0 * SIZE
  73. vld VX1, X, 2 * SIZE
  74. vld VX2, Y, 0 * SIZE
  75. vld VX3, Y, 2 * SIZE
  76. vst VX2, X, 0 * SIZE
  77. vst VX3, X, 2 * SIZE
  78. vst VX0, Y, 0 * SIZE
  79. vst VX1, Y, 2 * SIZE
  80. vld VX0, X, 4 * SIZE
  81. vld VX1, X, 6 * SIZE
  82. vld VX2, Y, 4 * SIZE
  83. vld VX3, Y, 6 * SIZE
  84. vst VX2, X, 4 * SIZE
  85. vst VX3, X, 6 * SIZE
  86. vst VX0, Y, 4 * SIZE
  87. vst VX1, Y, 6 * SIZE
  88. #else
  // Non-DOUBLE build: a single round of two vectors per side (offsets 0/4).
  89. vld VX0, X, 0 * SIZE
  90. vld VX1, X, 4 * SIZE
  91. vld VX2, Y, 0 * SIZE
  92. vld VX3, Y, 4 * SIZE
  93. vst VX2, X, 0 * SIZE
  94. vst VX3, X, 4 * SIZE
  95. vst VX0, Y, 0 * SIZE
  96. vst VX1, Y, 4 * SIZE
  97. #endif
  98. addi.d I, I, -1
  99. addi.d X, X, 8 * SIZE
  100. addi.d Y, Y, 8 * SIZE
  101. blt $r0, I, .L111 // loop while groups remain
  102. .align 3
  103. .L112:
  104. andi I, N, 3 // leftover count: N & 3
  105. bge $r0, I, .L999 // no leftovers: done
  106. .align 3
  107. .L113:
  // Tail loop: one pass exchanges the two values at offsets 0 and 1 * SIZE
  // of X with the two at Y, going through the scalar registers a1..a4.
  108. LD a1, X, 0 * SIZE
  109. LD a2, X, 1 * SIZE
  110. LD a3, Y, 0 * SIZE
  111. LD a4, Y, 1 * SIZE
  112. ST a1, Y, 0 * SIZE
  113. ST a2, Y, 1 * SIZE
  114. ST a3, X, 0 * SIZE
  115. ST a4, X, 1 * SIZE
  116. addi.d I, I, -1
  117. addi.d X, X, 2 * SIZE
  118. addi.d Y, Y, 2 * SIZE
  119. blt $r0, I, .L113
  120. b .L999
  // Path .L12: X is read and written with whole-vector vld / vst at fixed
  // offsets, while Y is stepped by INCY and accessed one value at a time:
  // the old Y value is read into an integer register (ld.w / ld.d), the
  // matching lane of the X vector is stored over it (vstelm), and the saved
  // integers are assembled into a vector (vinsgr2vr) that is stored over X.
  121. .align 3
  122. .L12: // INCX==1 and INCY!=1
  123. bge $r0, I, .L122 // I = N >> 2 is zero: skip the main loop
  124. .align 3
  125. .L121:
  126. #ifdef DOUBLE
  // DOUBLE build: four rounds, each handling one vector of X (two lanes)
  // against one Y position, followed by Y += INCY.
  // Round 1: X offset 0, via VX0 -> Y and t1/t2 -> VX2 -> X.
  127. vld VX0, X, 0 * SIZE
  128. ld.d t1, Y, 0 * SIZE
  129. vstelm.d VX0, Y, 0 * SIZE, 0
  130. ld.d t2, Y, 1 * SIZE
  131. vstelm.d VX0, Y, 1 * SIZE, 1
  132. vinsgr2vr.d VX2, t1, 0
  133. vinsgr2vr.d VX2, t2, 1
  134. add.d Y, Y, INCY
  135. vst VX2, X, 0 * SIZE
  // Round 2: X offset 2 * SIZE, via VX1 -> Y and t3/t4 -> VX3 -> X.
  136. vld VX1, X, 2 * SIZE
  137. ld.d t3, Y, 0 * SIZE
  138. vstelm.d VX1, Y, 0 * SIZE, 0
  139. ld.d t4, Y, 1 * SIZE
  140. vstelm.d VX1, Y, 1 * SIZE, 1
  141. vinsgr2vr.d VX3, t3, 0
  142. vinsgr2vr.d VX3, t4, 1
  143. add.d Y, Y, INCY
  144. vst VX3, X, 2 * SIZE
  // Round 3: X offset 4 * SIZE (same register usage as round 1).
  145. vld VX0, X, 4 * SIZE
  146. ld.d t1, Y, 0 * SIZE
  147. vstelm.d VX0, Y, 0 * SIZE, 0
  148. ld.d t2, Y, 1 * SIZE
  149. vstelm.d VX0, Y, 1 * SIZE, 1
  150. vinsgr2vr.d VX2, t1, 0
  151. vinsgr2vr.d VX2, t2, 1
  152. add.d Y, Y, INCY
  153. vst VX2, X, 4 * SIZE
  // Round 4: X offset 6 * SIZE (same register usage as round 2).
  154. vld VX1, X, 6 * SIZE
  155. ld.d t3, Y, 0 * SIZE
  156. vstelm.d VX1, Y, 0 * SIZE, 0
  157. ld.d t4, Y, 1 * SIZE
  158. vstelm.d VX1, Y, 1 * SIZE, 1
  159. vinsgr2vr.d VX3, t3, 0
  160. vinsgr2vr.d VX3, t4, 1
  161. add.d Y, Y, INCY
  162. vst VX3, X, 6 * SIZE
  163. #else
  // Non-DOUBLE build: two rounds, each handling one vector of X (four
  // lanes) against two Y positions: lanes 0/1 go to the current Y position,
  // lanes 2/3 to the next one after Y += INCY.
  // Round 1: X offset 0, via VX0 -> Y and t1..t4 -> VX2 -> X.
  164. vld VX0, X, 0 * SIZE
  165. ld.w t1, Y, 0 * SIZE
  166. vstelm.w VX0, Y, 0 * SIZE, 0
  167. ld.w t2, Y, 1 * SIZE
  168. vstelm.w VX0, Y, 1 * SIZE, 1
  169. add.d Y, Y, INCY
  170. ld.w t3, Y, 0 * SIZE
  171. vstelm.w VX0, Y, 0 * SIZE, 2
  172. ld.w t4, Y, 1 * SIZE
  173. vstelm.w VX0, Y, 1 * SIZE, 3
  174. vinsgr2vr.w VX2, t1, 0
  175. vinsgr2vr.w VX2, t2, 1
  176. vinsgr2vr.w VX2, t3, 2
  177. vinsgr2vr.w VX2, t4, 3
  178. add.d Y, Y, INCY
  179. vst VX2, X, 0 * SIZE
  // Round 2: X offset 4 * SIZE, via VX1 -> Y and t1..t4 -> VX3 -> X.
  180. vld VX1, X, 4 * SIZE
  181. ld.w t1, Y, 0 * SIZE
  182. vstelm.w VX1, Y, 0 * SIZE, 0
  183. ld.w t2, Y, 1 * SIZE
  184. vstelm.w VX1, Y, 1 * SIZE, 1
  185. add.d Y, Y, INCY
  186. ld.w t3, Y, 0 * SIZE
  187. vstelm.w VX1, Y, 0 * SIZE, 2
  188. ld.w t4, Y, 1 * SIZE
  189. vstelm.w VX1, Y, 1 * SIZE, 3
  190. vinsgr2vr.w VX3, t1, 0
  191. vinsgr2vr.w VX3, t2, 1
  192. vinsgr2vr.w VX3, t3, 2
  193. vinsgr2vr.w VX3, t4, 3
  194. add.d Y, Y, INCY
  195. vst VX3, X, 4 * SIZE
  196. #endif
  // Y has already been advanced four times by INCY above; only X still
  // needs its fixed 8 * SIZE step.
  197. addi.d X, X, 8 * SIZE
  198. addi.d I, I, -1
  199. blt $r0, I, .L121
  200. .align 3
  201. .L122:
  202. andi I, N, 3 // leftover count: N & 3
  203. bge $r0, I, .L999 // no leftovers: done
  204. .align 3
  205. .L123:
  // Tail loop: one pass exchanges the two values at offsets 0 and 1 * SIZE
  // of X with the two at Y; X steps by 2 * SIZE, Y by INCY.
  206. LD a1, X, 0 * SIZE
  207. LD a2, X, 1 * SIZE
  208. LD a3, Y, 0 * SIZE
  209. LD a4, Y, 1 * SIZE
  210. ST a1, Y, 0 * SIZE
  211. ST a2, Y, 1 * SIZE
  212. ST a3, X, 0 * SIZE
  213. ST a4, X, 1 * SIZE
  214. addi.d I, I, -1
  215. addi.d X, X, 2 * SIZE
  216. add.d Y, Y, INCY
  217. blt $r0, I, .L123
  218. b .L999
  // Path .L21: mirror image of the .L12 path. Y is read and written with
  // whole-vector vld / vst at fixed offsets, while X is stepped by INCX and
  // accessed one value at a time: the old X value is read into an integer
  // register (ld.w / ld.d), the matching lane of the Y vector is stored over
  // it (vstelm), and the saved integers are assembled into a vector
  // (vinsgr2vr) that is stored over Y.
  219. .align 3
  220. .L21:// INCX!=1 and INCY==1
  221. bge $r0, I, .L212 // I = N >> 2 is zero: skip the main loop
  222. .align 3
  223. .L211:
  224. #ifdef DOUBLE
  // DOUBLE build: four rounds, each handling one vector of Y (two lanes)
  // against one X position, followed by X += INCX.
  // Round 1: Y offset 0, via VX2 -> X and t1/t2 -> VX0 -> Y.
  225. vld VX2, Y, 0 * SIZE
  226. ld.d t1, X, 0 * SIZE
  227. vstelm.d VX2, X, 0 * SIZE, 0
  228. ld.d t2, X, 1 * SIZE
  229. vstelm.d VX2, X, 1 * SIZE, 1
  230. vinsgr2vr.d VX0, t1, 0
  231. vinsgr2vr.d VX0, t2, 1
  232. add.d X, X, INCX
  233. vst VX0, Y, 0 * SIZE
  // Round 2: Y offset 2 * SIZE, via VX3 -> X and t3/t4 -> VX1 -> Y.
  234. vld VX3, Y, 2 * SIZE
  235. ld.d t3, X, 0 * SIZE
  236. vstelm.d VX3, X, 0 * SIZE, 0
  237. ld.d t4, X, 1 * SIZE
  238. vstelm.d VX3, X, 1 * SIZE, 1
  239. vinsgr2vr.d VX1, t3, 0
  240. vinsgr2vr.d VX1, t4, 1
  241. add.d X, X, INCX
  242. vst VX1, Y, 2 * SIZE
  // Round 3: Y offset 4 * SIZE (same register usage as round 1).
  243. vld VX2, Y, 4 * SIZE
  244. ld.d t1, X, 0 * SIZE
  245. vstelm.d VX2, X, 0 * SIZE, 0
  246. ld.d t2, X, 1 * SIZE
  247. vstelm.d VX2, X, 1 * SIZE, 1
  248. vinsgr2vr.d VX0, t1, 0
  249. vinsgr2vr.d VX0, t2, 1
  250. add.d X, X, INCX
  251. vst VX0, Y, 4 * SIZE
  // Round 4: Y offset 6 * SIZE (same register usage as round 2).
  252. vld VX3, Y, 6 * SIZE
  253. ld.d t3, X, 0 * SIZE
  254. vstelm.d VX3, X, 0 * SIZE, 0
  255. ld.d t4, X, 1 * SIZE
  256. vstelm.d VX3, X, 1 * SIZE, 1
  257. vinsgr2vr.d VX1, t3, 0
  258. vinsgr2vr.d VX1, t4, 1
  259. add.d X, X, INCX
  260. vst VX1, Y, 6 * SIZE
  261. #else
  // Non-DOUBLE build: two rounds, each handling one vector of Y (four
  // lanes) against two X positions: lanes 0/1 go to the current X position,
  // lanes 2/3 to the next one after X += INCX.
  // Round 1: Y offset 0, via VX2 -> X and t1..t4 -> VX0 -> Y.
  262. vld VX2, Y, 0 * SIZE
  263. ld.w t1, X, 0 * SIZE
  264. vstelm.w VX2, X, 0 * SIZE, 0
  265. ld.w t2, X, 1 * SIZE
  266. vstelm.w VX2, X, 1 * SIZE, 1
  267. add.d X, X, INCX
  268. ld.w t3, X, 0 * SIZE
  269. vstelm.w VX2, X, 0 * SIZE, 2
  270. ld.w t4, X, 1 * SIZE
  271. vstelm.w VX2, X, 1 * SIZE, 3
  272. vinsgr2vr.w VX0, t1, 0
  273. vinsgr2vr.w VX0, t2, 1
  274. vinsgr2vr.w VX0, t3, 2
  275. vinsgr2vr.w VX0, t4, 3
  276. add.d X, X, INCX
  277. vst VX0, Y, 0 * SIZE
  // Round 2: Y offset 4 * SIZE, via VX3 -> X and t1..t4 -> VX1 -> Y.
  278. vld VX3, Y, 4 * SIZE
  279. ld.w t1, X, 0 * SIZE
  280. vstelm.w VX3, X, 0 * SIZE, 0
  281. ld.w t2, X, 1 * SIZE
  282. vstelm.w VX3, X, 1 * SIZE, 1
  283. add.d X, X, INCX
  284. ld.w t3, X, 0 * SIZE
  285. vstelm.w VX3, X, 0 * SIZE, 2
  286. ld.w t4, X, 1 * SIZE
  287. vstelm.w VX3, X, 1 * SIZE, 3
  288. vinsgr2vr.w VX1, t1, 0
  289. vinsgr2vr.w VX1, t2, 1
  290. vinsgr2vr.w VX1, t3, 2
  291. vinsgr2vr.w VX1, t4, 3
  292. add.d X, X, INCX
  293. vst VX1, Y, 4 * SIZE
  294. #endif
  // X has already been advanced four times by INCX above; only Y still
  // needs its fixed 8 * SIZE step.
  295. addi.d Y, Y, 8 * SIZE
  296. addi.d I, I, -1
  297. blt $r0, I, .L211
  298. .align 3
  299. .L212:
  300. andi I, N, 3 // leftover count: N & 3
  301. bge $r0, I, .L999 // no leftovers: done
  302. .align 3
  303. .L213:
  // Tail loop: one pass exchanges the two values at offsets 0 and 1 * SIZE
  // of X with the two at Y; X steps by INCX, Y by 2 * SIZE.
  304. LD a1, X, 0 * SIZE
  305. LD a2, X, 1 * SIZE
  306. LD a3, Y, 0 * SIZE
  307. LD a4, Y, 1 * SIZE
  308. ST a1, Y, 0 * SIZE
  309. ST a2, Y, 1 * SIZE
  310. ST a3, X, 0 * SIZE
  311. ST a4, X, 1 * SIZE
  312. addi.d I, I, -1
  313. add.d X, X, INCX
  314. addi.d Y, Y, 2 * SIZE
  315. blt $r0, I, .L213
  316. b .L999
  // Path .L22: INCX!=1 and INCY!=1. Everything goes through the scalar
  // LD / ST macros; X and Y are stepped by INCX / INCY.
  //
  // Main loop (.L222) handles four positions per pass, in two halves of two.
  // X is only *read* through the X pointer; the old Y values are written
  // back into X through XX, which starts equal to X and lags behind it.
  // Y is read and overwritten in place through the Y pointer.
  //
  // Change from the previous layout: the `.align 3` intended for the loop
  // head used to sit *before* `move XX, X`, so the instruction that ended up
  // aligned was the move and not .L222. The move is now issued first, which
  // makes .L222 the aligned instruction like every other loop head in this
  // file. The instruction sequence inside the loop is unchanged.
    .align 3
.L22:                                   // INCX!=1 and INCY!=1
    bge     $r0, I, .L223               // I = N >> 2 is zero: skip the main loop
    move    XX, X                       // XX = write-back cursor into X
    .align 3
.L222:
    // First half: read two positions of X into a1..a4.
    LD      a1, X, 0 * SIZE
    LD      a2, X, 1 * SIZE
    add.d   X, X, INCX
    LD      a3, X, 0 * SIZE
    LD      a4, X, 1 * SIZE
    add.d   X, X, INCX
    // Save the matching two positions of Y in b1..b4 while overwriting them
    // with the X values just read.
    LD      b1, Y, 0 * SIZE
    ST      a1, Y, 0 * SIZE
    LD      b2, Y, 1 * SIZE
    ST      a2, Y, 1 * SIZE
    add.d   Y, Y, INCY
    LD      b3, Y, 0 * SIZE
    ST      a3, Y, 0 * SIZE
    LD      b4, Y, 1 * SIZE
    ST      a4, Y, 1 * SIZE
    add.d   Y, Y, INCY
    // Second half: read the next two positions of X into a1..a4,
    // interleaved with writing the saved Y values (b1..b4) back through XX.
    LD      a1, X, 0 * SIZE
    ST      b1, XX, 0 * SIZE
    LD      a2, X, 1 * SIZE
    add.d   X, X, INCX
    ST      b2, XX, 1 * SIZE
    add.d   XX, XX, INCX
    LD      a3, X, 0 * SIZE
    ST      b3, XX, 0 * SIZE
    LD      a4, X, 1 * SIZE
    add.d   X, X, INCX
    ST      b4, XX, 1 * SIZE
    add.d   XX, XX, INCX
    // Save / overwrite the matching two positions of Y.
    LD      b1, Y, 0 * SIZE
    ST      a1, Y, 0 * SIZE
    LD      b2, Y, 1 * SIZE
    ST      a2, Y, 1 * SIZE
    add.d   Y, Y, INCY
    LD      b3, Y, 0 * SIZE
    ST      a3, Y, 0 * SIZE
    LD      b4, Y, 1 * SIZE
    ST      a4, Y, 1 * SIZE
    add.d   Y, Y, INCY
    // Write the second pair of saved Y values back through XX; after this
    // XX has caught up with X again.
    ST      b1, XX, 0 * SIZE
    ST      b2, XX, 1 * SIZE
    add.d   XX, XX, INCX
    ST      b3, XX, 0 * SIZE
    ST      b4, XX, 1 * SIZE
    add.d   XX, XX, INCX
    addi.d  I, I, -1
    blt     $r0, I, .L222
    .align 3
.L223:
    andi    I, N, 3                     // leftover count: N & 3
    bge     $r0, I, .L999               // no leftovers: done
    .align 3
.L224:
    // Tail loop: one pass exchanges the two values at offsets 0 and
    // 1 * SIZE of X with the two at Y; X steps by INCX, Y by INCY.
    LD      a1, X, 0 * SIZE
    LD      a2, X, 1 * SIZE
    LD      a3, Y, 0 * SIZE
    LD      a4, Y, 1 * SIZE
    ST      a1, Y, 0 * SIZE
    ST      a2, Y, 1 * SIZE
    ST      a3, X, 0 * SIZE
    ST      a4, X, 1 * SIZE
    addi.d  I, I, -1
    add.d   X, X, INCX
    add.d   Y, Y, INCY
    blt     $r0, I, .L224               // when the loop ends, execution falls through to the exit label
  // Common exit, reached from every path and from the early N <= 0 check.
  //
  // The integer return register ($r4) is set to 0 by copying the hard-wired
  // zero register. It was previously loaded from $r12, a register that no
  // instruction in this routine ever writes, so callers received whatever
  // value happened to be left in it.
    .align 3
.L999:
    move    $r4, $r0                    // return value = 0
    jirl    $r0, $r1, 0x0               // return to the address held in $r1
    .align 3
    EPILOGUE