You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or number, can include dashes ('-') and can be up to 35 characters long.

csum_lasx.S 7.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. /*******************************************************************************
  2. Copyright (c) 2023, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  /* ASSEMBLER must be defined before common.h is pulled in
   * (presumably it selects the assembler-side definitions there -- confirm). */
  #define ASSEMBLER
  #include "common.h"

  /* Incoming arguments (integer registers). */
  #define N       $r4     /* element count; compared against zero on entry    */
  #define X       $r5     /* pointer to the current element                   */
  #define INCX    $r6     /* element stride; scaled by ZBASE_SHIFT on entry   */

  /* Integer scratch registers. */
  #define I       $r17    /* loop counter                                     */
  #define TEMP    $r18    /* 1 << ZBASE_SHIFT, compared against scaled INCX   */
  #define t1      $r15    /* t1..t4: raw words staged for xvinsgr2vr          */
  #define t2      $r12
  #define t3      $r13
  #define t4      $r14

  /* Scalar floating-point registers. */
  #define a1      $f12    /* a1/a2: operands of the scalar tail loops         */
  #define a2      $f13
  #define a3      $f14
  #define a4      $f15
  #define s1      $f16    /* scalar accumulator; same number as res1 ($xr16)  */

  /* 256-bit LASX vector registers. */
  #define VX0     $xr12   /* VX0..VX3: loaded or gathered input vectors       */
  #define VX1     $xr13
  #define VX2     $xr14
  #define VX3     $xr15
  #define res1    $xr16   /* running vector sum                               */
  #define res2    $xr17   /* per-iteration partial sum                        */
  /*
   * Plain sum (no absolute value) of every scalar component of a strided
   * complex vector: for each of the N elements, the two scalars at offsets
   * 0 * SIZE and 1 * SIZE are both added to the result.
   *
   *   In:   N    ($r4) element count
   *         X    ($r5) pointer to the first element
   *         INCX ($r6) stride, in elements
   *   Out:  $f0  the sum; 0 when N <= 0 or INCX <= 0
   *
   * Precision is selected at build time by DOUBLE.
   * Only res1 accumulates across iterations; res2 is per-iteration scratch
   * and must never be folded into the final reduction.
   * Scalar register s1 and vector register res1 share register number 16,
   * so after the horizontal reduction the vector total sits in s1 and the
   * scalar tail loops simply keep adding to it.
   */
  PROLOGUE

          xvxor.v         res1, res1, res1        /* accumulator (and s1) = 0 */
          bge             $r0, N, .L999           /* N <= 0: return 0 */
          bge             $r0, INCX, .L999        /* INCX <= 0: return 0 */
          li.d            TEMP, 1
          slli.d          TEMP, TEMP, ZBASE_SHIFT /* scaled stride for INCX == 1 */
          slli.d          INCX, INCX, ZBASE_SHIFT /* INCX is now what gets added to X */
          srai.d          I, N, 3                 /* I = number of 8-element blocks */
          bne             INCX, TEMP, .L20        /* non-unit stride: gather path */
          bge             $r0, I, .L13            /* fewer than 8 elements: tail only */
          .align 3

  /* Unit stride: 16 consecutive scalars (8 elements) per iteration. */
  .L11:
  #ifdef DOUBLE
          xvld            VX0, X, 0 * SIZE
          xvld            VX1, X, 4 * SIZE
          xvfadd.d        res2, VX0, VX1
          xvfadd.d        res1, res1, res2
          xvld            VX2, X, 8 * SIZE
          xvld            VX3, X, 12 * SIZE
          xvfadd.d        res2, VX2, VX3
          xvfadd.d        res1, res1, res2
  #else
          xvld            VX0, X, 0 * SIZE
          xvld            VX1, X, 8 * SIZE
          xvfadd.s        res2, VX0, VX1
          xvfadd.s        res1, res2, res1
  #endif
          addi.d          X, X, 16 * SIZE
          addi.d          I, I, -1
          blt             $r0, I, .L11
          .align 3

  /* Horizontal reduction: fold every lane of res1 into lane 0. */
  .L12:
  #ifdef DOUBLE
          xvpickve.d      VX1, res1, 1
          xvpickve.d      VX2, res1, 2
          xvpickve.d      VX3, res1, 3
          xvfadd.d        res1, VX1, res1
          xvfadd.d        res1, VX2, res1
          xvfadd.d        res1, VX3, res1
  #else
          xvpickve.w      VX1, res1, 1
          xvpickve.w      VX2, res1, 2
          xvpickve.w      VX3, res1, 3
          xvfadd.s        res1, VX1, res1
          xvfadd.s        res1, VX2, res1
          xvfadd.s        res1, VX3, res1
          /* Take lanes 4..7 from res1 itself. res2 still holds the last
           * iteration's partial sum, so mixing it in would count that
           * block twice. */
          xvpickve.w      VX0, res1, 4
          xvpickve.w      VX1, res1, 5
          xvpickve.w      VX2, res1, 6
          xvpickve.w      VX3, res1, 7
          xvfadd.s        res1, VX0, res1
          xvfadd.s        res1, VX1, res1
          xvfadd.s        res1, VX2, res1
          xvfadd.s        res1, VX3, res1
  #endif
          .align 3

  /* Unit stride: remaining N % 8 elements, one element per iteration. */
  .L13:
          andi            I, N, 7
          bge             $r0, I, .L999
          .align 3

  .L14:
          LD              a1, X, 0 * SIZE
          LD              a2, X, 1 * SIZE
          ADD             a1, a1, a2
          ADD             s1, a1, s1
          addi.d          I, I, -1
          addi.d          X, X, 2 * SIZE
          blt             $r0, I, .L14
          b               .L999
          .align 3

  /* Non-unit stride: gather elements through integer registers. */
  .L20:
          bge             $r0, I, .L23            /* fewer than 8 elements: tail only */
          .align 3

  .L21:
  #ifdef DOUBLE
          /* Elements 0-1 -> VX0 */
          ld.d            t1, X, 0 * SIZE
          ld.d            t2, X, 1 * SIZE
          add.d           X, X, INCX
          ld.d            t3, X, 0 * SIZE
          ld.d            t4, X, 1 * SIZE
          add.d           X, X, INCX
          xvinsgr2vr.d    VX0, t1, 0
          xvinsgr2vr.d    VX0, t2, 1
          xvinsgr2vr.d    VX0, t3, 2
          xvinsgr2vr.d    VX0, t4, 3
          /* Elements 2-3 -> VX1 */
          ld.d            t1, X, 0 * SIZE
          ld.d            t2, X, 1 * SIZE
          add.d           X, X, INCX
          ld.d            t3, X, 0 * SIZE
          ld.d            t4, X, 1 * SIZE
          add.d           X, X, INCX
          xvinsgr2vr.d    VX1, t1, 0
          xvinsgr2vr.d    VX1, t2, 1
          xvinsgr2vr.d    VX1, t3, 2
          xvinsgr2vr.d    VX1, t4, 3
          xvfadd.d        res2, VX0, VX1
          xvfadd.d        res1, res1, res2
          /* Elements 4-5 -> VX0 */
          ld.d            t1, X, 0 * SIZE
          ld.d            t2, X, 1 * SIZE
          add.d           X, X, INCX
          ld.d            t3, X, 0 * SIZE
          ld.d            t4, X, 1 * SIZE
          add.d           X, X, INCX
          xvinsgr2vr.d    VX0, t1, 0
          xvinsgr2vr.d    VX0, t2, 1
          xvinsgr2vr.d    VX0, t3, 2
          xvinsgr2vr.d    VX0, t4, 3
          /* Elements 6-7 -> VX1 */
          ld.d            t1, X, 0 * SIZE
          ld.d            t2, X, 1 * SIZE
          add.d           X, X, INCX
          ld.d            t3, X, 0 * SIZE
          ld.d            t4, X, 1 * SIZE
          add.d           X, X, INCX
          xvinsgr2vr.d    VX1, t1, 0
          xvinsgr2vr.d    VX1, t2, 1
          xvinsgr2vr.d    VX1, t3, 2
          xvinsgr2vr.d    VX1, t4, 3
          xvfadd.d        res2, VX0, VX1
          xvfadd.d        res1, res1, res2
  #else
          /* Elements 0-1 -> VX0 lanes 0..3 */
          ld.w            t1, X, 0 * SIZE
          ld.w            t2, X, 1 * SIZE
          add.d           X, X, INCX
          ld.w            t3, X, 0 * SIZE
          ld.w            t4, X, 1 * SIZE
          add.d           X, X, INCX
          xvinsgr2vr.w    VX0, t1, 0
          xvinsgr2vr.w    VX0, t2, 1
          xvinsgr2vr.w    VX0, t3, 2
          xvinsgr2vr.w    VX0, t4, 3
          /* Elements 2-3 -> VX0 lanes 4..7 */
          ld.w            t1, X, 0 * SIZE
          ld.w            t2, X, 1 * SIZE
          add.d           X, X, INCX
          ld.w            t3, X, 0 * SIZE
          ld.w            t4, X, 1 * SIZE
          add.d           X, X, INCX
          xvinsgr2vr.w    VX0, t1, 4
          xvinsgr2vr.w    VX0, t2, 5
          xvinsgr2vr.w    VX0, t3, 6
          xvinsgr2vr.w    VX0, t4, 7
          /* Elements 4-5 -> VX1 lanes 0..3 */
          ld.w            t1, X, 0 * SIZE
          ld.w            t2, X, 1 * SIZE
          add.d           X, X, INCX
          ld.w            t3, X, 0 * SIZE
          ld.w            t4, X, 1 * SIZE
          add.d           X, X, INCX
          xvinsgr2vr.w    VX1, t1, 0
          xvinsgr2vr.w    VX1, t2, 1
          xvinsgr2vr.w    VX1, t3, 2
          xvinsgr2vr.w    VX1, t4, 3
          /* Elements 6-7 -> VX1 lanes 4..7 */
          ld.w            t1, X, 0 * SIZE
          ld.w            t2, X, 1 * SIZE
          add.d           X, X, INCX
          ld.w            t3, X, 0 * SIZE
          ld.w            t4, X, 1 * SIZE
          add.d           X, X, INCX
          xvinsgr2vr.w    VX1, t1, 4
          xvinsgr2vr.w    VX1, t2, 5
          xvinsgr2vr.w    VX1, t3, 6
          xvinsgr2vr.w    VX1, t4, 7
          xvfadd.s        res2, VX0, VX1
          xvfadd.s        res1, res2, res1
  #endif
          addi.d          I, I, -1
          blt             $r0, I, .L21
          .align 3

  /* Horizontal reduction: fold every lane of res1 into lane 0. */
  .L22:
  #ifdef DOUBLE
          xvpickve.d      VX1, res1, 1
          xvpickve.d      VX2, res1, 2
          xvpickve.d      VX3, res1, 3
          xvfadd.d        res1, VX1, res1
          xvfadd.d        res1, VX2, res1
          xvfadd.d        res1, VX3, res1
  #else
          xvpickve.w      VX1, res1, 1
          xvpickve.w      VX2, res1, 2
          xvpickve.w      VX3, res1, 3
          xvfadd.s        res1, VX1, res1
          xvfadd.s        res1, VX2, res1
          xvfadd.s        res1, VX3, res1
          /* Lanes 4..7 come from res1, not from the scratch register res2,
           * so the last gathered block is not counted twice. */
          xvpickve.w      VX0, res1, 4
          xvpickve.w      VX1, res1, 5
          xvpickve.w      VX2, res1, 6
          xvpickve.w      VX3, res1, 7
          xvfadd.s        res1, VX0, res1
          xvfadd.s        res1, VX1, res1
          xvfadd.s        res1, VX2, res1
          xvfadd.s        res1, VX3, res1
  #endif
          .align 3

  /* Non-unit stride: remaining N % 8 elements, one element per iteration. */
  .L23:
          andi            I, N, 7
          bge             $r0, I, .L999
          .align 3

  .L24:
          LD              a1, X, 0 * SIZE
          LD              a2, X, 1 * SIZE
          ADD             a1, a1, a2
          ADD             s1, a1, s1
          addi.d          I, I, -1
          add.d           X, X, INCX
          blt             $r0, I, .L24
          .align 3

  /* Return the accumulator; the move width must match the build precision. */
  .L999:
  #ifdef DOUBLE
          fmov.d          $f0, s1
  #else
          fmov.s          $f0, s1
  #endif
          jirl            $r0, $r1, 0x0
          .align 3

  EPILOGUE