You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

copy_lsx.S 6.9 kB


  1. /*****************************************************************************
  2. Copyright (c) 2023, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written
  16. permission.
  17. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  23. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  24. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  25. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  26. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. **********************************************************************************/
  /*
   * ASSEMBLER is defined before common.h is included; presumably it selects
   * the assembler-side definitions in that header (PROLOGUE, EPILOGUE, LD,
   * ST, SIZE and BASE_SHIFT are used below but not defined in this file).
   */
#define ASSEMBLER
#include "common.h"

/* Operands of the routine, in the integer registers it reads them from. */
#define N       $r4     /* number of elements to copy                    */
#define X       $r5     /* source pointer, advanced as the copy proceeds */
#define INCX    $r6     /* source increment; scaled to bytes on entry    */
#define Y       $r7     /* destination pointer, advanced likewise        */
#define INCY    $r8     /* destination increment; scaled to bytes        */

/* Loop bookkeeping. */
#define I       $r17    /* countdown: 8-element groups, then the tail    */
#define TEMP    $r18    /* byte stride that corresponds to increment 1   */

/* Integer scratch used to gather strided source elements. */
#define t1      $r14
#define t2      $r15
#define t3      $r16
#define t4      $r19

/* Floating-point scratch used by the element-by-element copy path. */
#define a1      $f12
#define a2      $f13
#define a3      $f14
#define a4      $f15

/* LSX vector scratch; note these reuse register numbers 12 and 13. */
#define VX0     $vr12
#define VX1     $vr13
  /*
   * Strided vector copy: for i in 0 .. N-1, store the element read from
   * X + i*INCX into Y + i*INCY (increments given in elements).
   *
   * In:     N    = element count; nothing is copied when N <= 0
   *         X    = source pointer
   *         INCX = source increment, in elements
   *         Y    = destination pointer
   *         INCY = destination increment, in elements
   * Out:    $r4  = 0
   * Writes: X, Y, INCX, INCY, I, TEMP, t1-t4, a1-a4, VX0, VX1
   *
   * Elements are handled in groups of eight (I = N >> 3) by one of four
   * loops chosen from whether each increment is 1, followed by a scalar
   * loop for the remaining N & 7 elements.  With DOUBLE defined a group of
   * eight spans 64 bytes, otherwise 32 bytes.
   *
   * The original exit path copied $r12 into $r4, but nothing in this routine
   * ever writes $r12, so the value handed back was whatever the caller had
   * left there.  $r4 is the integer return register under the LoongArch
   * calling convention; it is now cleared so the result is always 0.
   */
    PROLOGUE

    bge     $r0, N, .L999               // N <= 0: nothing to do
    li.d    TEMP, 1
    slli.d  TEMP, TEMP, BASE_SHIFT      // TEMP = byte stride of increment 1
    slli.d  INCX, INCX, BASE_SHIFT      // increments: elements -> bytes
    slli.d  INCY, INCY, BASE_SHIFT
    srai.d  I, N, 3                     // I = number of 8-element groups
    bne     INCX, TEMP, .L20
    bne     INCY, TEMP, .L12            // INCX==1 and INCY!=1
    b       .L11                        // INCX==1 and INCY==1

.L20:
    bne     INCY, TEMP, .L22            // INCX!=1 and INCY!=1
    b       .L21                        // INCX!=1 and INCY==1

/* INCX==1 and INCY==1: contiguous load, contiguous store */
.L11:
    bge     $r0, I, .L112               // no full group: go to the tail
    .align 3

.L111:
    vld     VX0, X, 0
    vld     VX1, X, 16
    addi.d  I, I, -1
    vst     VX0, Y, 0
    vst     VX1, Y, 16
#ifdef DOUBLE
    // eight doubles occupy 64 bytes: move the second half as well
    vld     VX0, X, 32
    vld     VX1, X, 48
    vst     VX0, Y, 32
    vst     VX1, Y, 48
#endif
    addi.d  X, X, 8 * SIZE
    addi.d  Y, Y, 8 * SIZE
    blt     $r0, I, .L111
    .align 3

.L112:
    andi    I, N, 7                     // I = leftover element count
    bge     $r0, I, .L999
    .align 3

.L113:
    LD      a1, X, 0
    addi.d  I, I, -1
    addi.d  X, X, SIZE
    ST      a1, Y, 0
    addi.d  Y, Y, SIZE
    blt     $r0, I, .L113
    b       .L999
    .align 3

/* INCX==1 and INCY!=1: contiguous load, one store per vector lane */
.L12:
    bge     $r0, I, .L122
    .align 3

.L121:
#ifdef DOUBLE
    vld     VX0, X, 0
    vld     VX1, X, 16
    vstelm.d VX0, Y, 0, 0
    add.d   Y, Y, INCY
    vstelm.d VX0, Y, 0, 1
    add.d   Y, Y, INCY
    vstelm.d VX1, Y, 0, 0
    add.d   Y, Y, INCY
    vstelm.d VX1, Y, 0, 1
    add.d   Y, Y, INCY
    vld     VX0, X, 32
    vld     VX1, X, 48
    vstelm.d VX0, Y, 0, 0
    add.d   Y, Y, INCY
    vstelm.d VX0, Y, 0, 1
    add.d   Y, Y, INCY
    vstelm.d VX1, Y, 0, 0
    add.d   Y, Y, INCY
    vstelm.d VX1, Y, 0, 1
    add.d   Y, Y, INCY
#else
    vld     VX0, X, 0
    vld     VX1, X, 16
    vstelm.w VX0, Y, 0, 0
    add.d   Y, Y, INCY
    vstelm.w VX0, Y, 0, 1
    add.d   Y, Y, INCY
    vstelm.w VX0, Y, 0, 2
    add.d   Y, Y, INCY
    vstelm.w VX0, Y, 0, 3
    add.d   Y, Y, INCY
    vstelm.w VX1, Y, 0, 0
    add.d   Y, Y, INCY
    vstelm.w VX1, Y, 0, 1
    add.d   Y, Y, INCY
    vstelm.w VX1, Y, 0, 2
    add.d   Y, Y, INCY
    vstelm.w VX1, Y, 0, 3
    add.d   Y, Y, INCY
#endif
    addi.d  X, X, 8 * SIZE
    addi.d  I, I, -1
    blt     $r0, I, .L121
    .align 3

.L122:
    andi    I, N, 7                     // I = leftover element count
    bge     $r0, I, .L999
    .align 3

.L123:
    LD      a1, X, 0
    addi.d  I, I, -1
    addi.d  X, X, SIZE
    ST      a1, Y, 0
    add.d   Y, Y, INCY
    blt     $r0, I, .L123
    b       .L999
    .align 3

/* INCX!=1 and INCY==1: gather through integer registers, contiguous store */
.L21:
    bge     $r0, I, .L212
    .align 3

.L211:
#ifdef DOUBLE
    ld.d    t1, X, 0
    add.d   X, X, INCX
    ld.d    t2, X, 0
    add.d   X, X, INCX
    ld.d    t3, X, 0
    add.d   X, X, INCX
    ld.d    t4, X, 0
    add.d   X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    vst     VX0, Y, 0
    vst     VX1, Y, 16
    ld.d    t1, X, 0
    add.d   X, X, INCX
    ld.d    t2, X, 0
    add.d   X, X, INCX
    ld.d    t3, X, 0
    add.d   X, X, INCX
    ld.d    t4, X, 0
    add.d   X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    vst     VX0, Y, 32
    vst     VX1, Y, 48
#else
    ld.w    t1, X, 0
    add.d   X, X, INCX
    ld.w    t2, X, 0
    add.d   X, X, INCX
    ld.w    t3, X, 0
    add.d   X, X, INCX
    ld.w    t4, X, 0
    add.d   X, X, INCX
    vinsgr2vr.w VX0, t1, 0
    vinsgr2vr.w VX0, t2, 1
    vinsgr2vr.w VX0, t3, 2
    vinsgr2vr.w VX0, t4, 3
    vst     VX0, Y, 0
    ld.w    t1, X, 0
    add.d   X, X, INCX
    ld.w    t2, X, 0
    add.d   X, X, INCX
    ld.w    t3, X, 0
    add.d   X, X, INCX
    ld.w    t4, X, 0
    add.d   X, X, INCX
    vinsgr2vr.w VX1, t1, 0
    vinsgr2vr.w VX1, t2, 1
    vinsgr2vr.w VX1, t3, 2
    vinsgr2vr.w VX1, t4, 3
    vst     VX1, Y, 16
#endif
    addi.d  Y, Y, 8 * SIZE
    addi.d  I, I, -1
    blt     $r0, I, .L211
    .align 3

.L212:
    andi    I, N, 7                     // I = leftover element count
    bge     $r0, I, .L999
    .align 3

.L213:
    LD      a1, X, 0
    addi.d  I, I, -1
    ST      a1, Y, 0
    add.d   X, X, INCX
    addi.d  Y, Y, SIZE
    blt     $r0, I, .L213
    b       .L999
    .align 3

/* INCX!=1 and INCY!=1: scalar copy, unrolled as two batches of four */
.L22:
    bge     $r0, I, .L223
    .align 3

.L222:
    LD      a1, X, 0
    add.d   X, X, INCX
    LD      a2, X, 0
    add.d   X, X, INCX
    LD      a3, X, 0
    add.d   X, X, INCX
    LD      a4, X, 0
    add.d   X, X, INCX
    ST      a1, Y, 0
    add.d   Y, Y, INCY
    ST      a2, Y, 0
    add.d   Y, Y, INCY
    ST      a3, Y, 0
    add.d   Y, Y, INCY
    ST      a4, Y, 0
    add.d   Y, Y, INCY
    LD      a1, X, 0
    add.d   X, X, INCX
    LD      a2, X, 0
    add.d   X, X, INCX
    LD      a3, X, 0
    add.d   X, X, INCX
    LD      a4, X, 0
    add.d   X, X, INCX
    ST      a1, Y, 0
    add.d   Y, Y, INCY
    ST      a2, Y, 0
    add.d   Y, Y, INCY
    ST      a3, Y, 0
    add.d   Y, Y, INCY
    ST      a4, Y, 0
    add.d   Y, Y, INCY
    addi.d  I, I, -1
    blt     $r0, I, .L222
    .align 3

.L223:
    andi    I, N, 7                     // I = leftover element count
    bge     $r0, I, .L999
    .align 3

.L224:
    LD      a1, X, 0
    addi.d  I, I, -1
    ST      a1, Y, 0
    add.d   X, X, INCX
    add.d   Y, Y, INCY
    blt     $r0, I, .L224
    .align 3

/* Common exit for every path, including the N <= 0 early-out. */
.L999:
    move    $r4, $r0                    // return 0 ($r12 was never written here)
    jirl    $r0, $r1, 0x0               // jump to the address in $r1
    .align 3

    EPILOGUE