
asum_lsx.S

/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
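
/* LSX ASUM kernel: returns the sum of absolute values of the N elements of
 * the vector X, walked with stride INCX.  A vectorized path handles the
 * contiguous case (INCX == 1) with 128-bit loads; a second path gathers
 * strided elements into LSX registers; a scalar loop handles the leftovers. */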
#define ASSEMBLER
#include "common.h"

/* Function arguments */
#define N      $r4
#define X      $r5
#define INCX   $r6
/* Scalar temporaries */
#define I      $r17
#define TEMP   $r18
#define t1     $r15
#define t2     $r12
#define t3     $r13
#define t4     $r14
/* LSX vector registers */
#define VX0    $vr12
#define VX1    $vr13
#define VX2    $vr14
#define VX3    $vr15
#define VT0    $vr23
#define VT1    $vr22
#define res1   $vr16
#define res2   $vr17
#define res0   $vr18
#define neg1   $vr19
PROLOGUE

    /* res1/res2 accumulate partial sums; res0 stays zero for the sign test. */
    vxor.v      res1, res1, res1
    vxor.v      res2, res2, res2
    vxor.v      res0, res0, res0
    /* Return 0 if n <= 0 or incx <= 0. */
    bge         $r0, N, .L999
    bge         $r0, INCX, .L999
    /* Broadcast -1.0 into neg1 (used to negate negative lanes). */
#ifdef DOUBLE
    li.d        t1, -1
    vreplgr2vr.d  neg1, t1
    vffint.d.l  neg1, neg1
#else
    li.w        t1, -1
    vreplgr2vr.w  neg1, t1
    vffint.s.w  neg1, neg1
#endif
    /* Convert INCX to bytes; I = n / 8 vectorized iterations. */
    li.d        TEMP, SIZE
    slli.d      INCX, INCX, BASE_SHIFT
    srai.d      I, N, 3
    /* Non-unit stride takes the gather path; small n goes straight to the tail. */
    bne         INCX, TEMP, .L20
    bge         $r0, I, .L13
    .align 3
/* Unit-stride main loop: 8 elements per iteration.  |x| is formed by
   selecting -x (neg1 * x) wherever the compare x < 0 sets the mask. */
.L11:
#ifdef DOUBLE
    vld         VX0, X, 0 * SIZE
    vld         VX1, X, 2 * SIZE
    vfmul.d     VX2, neg1, VX0
    vfmul.d     VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v   VX0, VX0, VX2, VT0
    vbitsel.v   VX1, VX1, VX3, VT1
    vfadd.d     res2, VX0, VX1
    vfadd.d     res1, res1, res2
    vld         VX0, X, 4 * SIZE
    vld         VX1, X, 6 * SIZE
    vfmul.d     VX2, neg1, VX0
    vfmul.d     VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v   VX0, VX0, VX2, VT0
    vbitsel.v   VX1, VX1, VX3, VT1
    vfadd.d     res2, VX0, VX1
    vfadd.d     res1, res1, res2
#else
    vld         VX0, X, 0 * SIZE
    vld         VX1, X, 4 * SIZE
    vfmul.s     VX2, neg1, VX0
    vfmul.s     VX3, neg1, VX1
    vfcmp.clt.s VT0, VX0, res0
    vfcmp.clt.s VT1, VX1, res0
    vbitsel.v   VX0, VX0, VX2, VT0
    vbitsel.v   VX1, VX1, VX3, VT1
    vfadd.s     res2, VX0, VX1
    vfadd.s     res1, res1, res2
#endif
    addi.d      X, X, 8 * SIZE
    addi.d      I, I, -1
    blt         $r0, I, .L11
    .align 3
/* Horizontal add of the lanes of res1 into lane 0. */
.L12:
#ifdef DOUBLE
    vreplvei.d  VX1, res1, 1
    vfadd.d     res1, VX1, res1
#else
    vreplvei.w  VX1, res1, 1
    vreplvei.w  VX2, res1, 2
    vreplvei.w  VX3, res1, 3
    vfadd.s     res1, VX1, res1
    vfadd.s     res1, VX2, res1
    vfadd.s     res1, VX3, res1
#endif
    .align 3
/* Unit-stride scalar tail: up to 7 remaining elements.  LD/FABS/ADD are
   precision-dependent macros from common.h, and $f16 aliases lane 0 of
   res1, so the tail accumulates onto the vector sum. */
.L13:
    andi        I, N, 7
    bge         $r0, I, .L999
    .align 3
.L14:
    LD          $f12, X, 0 * SIZE
    FABS        $f12, $f12
    ADD         $f16, $f12, $f16
    addi.d      I, I, -1
    addi.d      X, X, SIZE
    blt         $r0, I, .L14
    b           .L999
    .align 3
/* Non-unit stride: gather 8 elements per iteration with scalar loads
   inserted into the vector registers, then take |x| and accumulate as in
   the contiguous path. */
.L20:
    bge         $r0, I, .L23
    .align 3
.L21:
#ifdef DOUBLE
    ld.d        t1, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t2, X, 0 * SIZE
    add.d       X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    ld.d        t1, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t2, X, 0 * SIZE
    vinsgr2vr.d VX1, t1, 0
    vinsgr2vr.d VX1, t2, 1
    add.d       X, X, INCX
    vfmul.d     VX2, neg1, VX0
    vfmul.d     VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v   VX0, VX0, VX2, VT0
    vbitsel.v   VX1, VX1, VX3, VT1
    vfadd.d     res2, VX0, VX1
    vfadd.d     res1, res1, res2
    ld.d        t3, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t4, X, 0 * SIZE
    add.d       X, X, INCX
    vinsgr2vr.d VX0, t3, 0
    vinsgr2vr.d VX0, t4, 1
    ld.d        t3, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t4, X, 0 * SIZE
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    add.d       X, X, INCX
    vfmul.d     VX2, neg1, VX0
    vfmul.d     VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v   VX0, VX0, VX2, VT0
    vbitsel.v   VX1, VX1, VX3, VT1
    vfadd.d     res2, VX0, VX1
    vfadd.d     res1, res1, res2
#else
    ld.w        t1, X, 0 * SIZE
    add.d       X, X, INCX
    ld.w        t2, X, 0 * SIZE
    add.d       X, X, INCX
    ld.w        t3, X, 0 * SIZE
    add.d       X, X, INCX
    ld.w        t4, X, 0 * SIZE
    add.d       X, X, INCX
    vinsgr2vr.w VX0, t1, 0
    vinsgr2vr.w VX0, t2, 1
    vinsgr2vr.w VX0, t3, 2
    vinsgr2vr.w VX0, t4, 3
    ld.w        t1, X, 0 * SIZE
    add.d       X, X, INCX
    ld.w        t2, X, 0 * SIZE
    add.d       X, X, INCX
    ld.w        t3, X, 0 * SIZE
    add.d       X, X, INCX
    ld.w        t4, X, 0 * SIZE
    add.d       X, X, INCX
    vinsgr2vr.w VX1, t1, 0
    vinsgr2vr.w VX1, t2, 1
    vinsgr2vr.w VX1, t3, 2
    vinsgr2vr.w VX1, t4, 3
    vfmul.s     VX2, neg1, VX0
    vfmul.s     VX3, neg1, VX1
    vfcmp.clt.s VT0, VX0, res0
    vfcmp.clt.s VT1, VX1, res0
    vbitsel.v   VX0, VX0, VX2, VT0
    vbitsel.v   VX1, VX1, VX3, VT1
    vfadd.s     res2, VX0, VX1
    vfadd.s     res1, res1, res2
#endif
    addi.d      I, I, -1
    blt         $r0, I, .L21
    .align 3
/* Horizontal add of the lanes of res1 into lane 0. */
.L22:
#ifdef DOUBLE
    vreplvei.d  VX1, res1, 1
    vfadd.d     res1, VX1, res1
#else
    vreplvei.w  VX1, res1, 1
    vreplvei.w  VX2, res1, 2
    vreplvei.w  VX3, res1, 3
    vfadd.s     res1, VX1, res1
    vfadd.s     res1, VX2, res1
    vfadd.s     res1, VX3, res1
#endif
    .align 3
/* Strided scalar tail: up to 7 remaining elements. */
.L23:
    andi        I, N, 7
    bge         $r0, I, .L999
    .align 3
.L24:
    LD          $f12, X, 0 * SIZE
    FABS        $f12, $f12
    ADD         $f16, $f12, $f16
    addi.d      I, I, -1
    add.d       X, X, INCX
    blt         $r0, I, .L24
    .align 3
/* Return the accumulated sum in $f0 ($f16 aliases lane 0 of res1). */
.L999:
    MOV         $f0, $f16
    jirl        $r0, $r1, 0x0
    .align 3
EPILOGUE
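
For reference, the computation the kernel above implements is the BLAS ASUM reduction, the sum of |x[i]| over n elements read with stride incx. A minimal scalar C sketch follows; the function name and signature are illustrative only (not the OpenBLAS kernel interface), and only the double-precision case is shown, whereas the assembly selects single or double precision via the DOUBLE macro.

/* asum_ref: scalar reference for the LSX kernel above (hypothetical name). */
#include <math.h>
#include <stddef.h>

double asum_ref(ptrdiff_t n, const double *x, ptrdiff_t incx)
{
    double sum = 0.0;
    /* Same early-out as the assembly: n <= 0 or incx <= 0 returns 0. */
    if (n <= 0 || incx <= 0)
        return 0.0;
    for (ptrdiff_t i = 0; i < n; i++) {
        sum += fabs(*x);   /* accumulate |x[i]| */
        x += incx;         /* advance by the element stride */
    }
    return sum;
}

The assembly gets its speed from processing eight elements per loop iteration and from forming |x| branchlessly: each lane is compared against zero and the negated value is selected only where the lane was negative.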