
dnrm2_lsx.S

/*****************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.

3. Neither the name of the OpenBLAS project nor the names of
   its contributors may be used to endorse or promote products
   derived from this software without specific prior written
   permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#define ASSEMBLER

#include "common.h"

#define N      $r4
#define X      $r5
#define INCX   $r6
#define XX     $r19
#define I      $r17
#define TEMP   $r18
#define t1     $r12
#define t2     $r13
#define t3     $r14
#define t4     $r15
/* Don't change the following FRs unless you know the effects. */
#define VX0    $vr15
#define VX1    $vr16
#define VM0    $vr17
#define VM1    $vr18
#define VM2    $vr13
#define VM3    $vr14
#define res1   $vr19
#define res2   $vr20
#define VALPHA $vr21
#define INF    $f23
#define a1     $f22
#define max    $f17
#define ALPHA  $f12
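
/* dnrm2 kernel for LoongArch64 LSX: computes the Euclidean norm ||x||_2
 * in two passes to avoid overflow/underflow in the intermediate squares.
 * A scalar reference sketch of the scheme (illustrative only, not part of
 * the original source):
 *
 *     double max = 0.0, sum = 0.0, alpha, t;
 *     for (i = 0; i < n; i++)                          // pass 1: max |x|
 *         if (fabs(x[i * incx]) > max) max = fabs(x[i * incx]);
 *     if (max == 0.0 || isinf(1.0 / max)) return 0.0;  // matches the .L999 exits
 *     alpha = 1.0 / max;
 *     for (i = 0; i < n; i++) {                        // pass 2: sum of squares
 *         t = x[i * incx] * alpha;
 *         sum += t * t;
 *     }
 *     return max * sqrt(sum);
 */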
    PROLOGUE

#ifdef F_INTERFACE
    LDINT N, 0(N)
    LDINT INCX, 0(INCX)
#endif

    vxor.v res1, res1, res1
    vxor.v res2, res2, res2
    vxor.v VM0, VM0, VM0
    bge $r0, N, .L999
    beq $r0, INCX, .L999
    move XX, X
    // Init INF
    addi.d TEMP, $r0, 0x7FF
    slli.d TEMP, TEMP, 52
    MTC INF, TEMP
    li.d TEMP, SIZE
    slli.d INCX, INCX, BASE_SHIFT
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bge $r0, I, .L97
    .align 3
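/* Pass 1 (INCX == 1): scan 8 doubles per iteration. vfmaxa.d selects the
 * operand with the larger magnitude, so VM0 tracks a running max(|x|). */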
.L10:
    vld VX0, X, 0 * SIZE
    vld VX1, X, 2 * SIZE
    vfmaxa.d VM1, VX1, VX0
    vld VX0, X, 4 * SIZE
    vld VX1, X, 6 * SIZE
    vfmaxa.d VM2, VX1, VX0
    vfmaxa.d VM3, VM1, VM2
    vfmaxa.d VM0, VM0, VM3
    addi.d I, I, -1
    addi.d X, X, 8 * SIZE
    blt $r0, I, .L10
    b .L96
    .align 3

.L20: // INCX != 1
    bge $r0, I, .L97
    .align 3
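/* Pass 1 (strided): gather 8 doubles into vector lanes with vinsgr2vr.d,
 * then reduce by magnitude exactly as in the contiguous path. */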
.L21:
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t2, 1
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX1, t3, 0
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX1, t4, 1
    vfmaxa.d VM1, VX0, VX1
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t2, 1
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX1, t3, 0
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX1, t4, 1
    vfmaxa.d VM2, VX0, VX1
    vfmaxa.d VM3, VM1, VM2
    vfmaxa.d VM0, VM0, VM3
    addi.d I, I, -1
    blt $r0, I, .L21
    b .L96
    .align 3
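// Fold the two lanes of VM0 into lane 0, which aliases the scalar max ($f17).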
.L96:
    vreplvei.d VX0, VM0, 0
    vreplvei.d VX1, VM0, 1
    vfmaxa.d VM0, VX0, VX1
    .align 3
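/* Tail of pass 1: the remaining N & 7 elements. vld fetches two doubles,
 * but only lane 0 holds the current element; the final scalar max is read
 * from lane 0. */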
.L97:
    andi I, N, 7
    bge $r0, I, .L99
    .align 3

.L98:
    vld VX1, X, 0
    vfmaxa.d VM0, VM0, VX1
    addi.d I, I, -1
    add.d X, X, INCX
    blt $r0, I, .L98
    .align 3
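/* Compute ALPHA = 1.0 / max(|x|). Exit early if max == 0, or if 1/max
 * overflows to +INF. */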
.L99:
    fabs.d max, max
    lu12i.w TEMP, 0x3f800 // 0x3f800000 is 1.0f
    movgr2fr.d a1, $r0
    movgr2fr.w ALPHA, TEMP
    CMPEQ $fcc0, max, a1
    fcvt.d.s ALPHA, ALPHA
    bcnez $fcc0, .L999
    fdiv.d ALPHA, ALPHA, max
    CMPEQ $fcc0, INF, ALPHA
    bcnez $fcc0, .L999
    movfr2gr.d TEMP, ALPHA
    vreplgr2vr.d VALPHA, TEMP
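
/* Pass 2: rescan the vector from the saved base XX, scaling each element
 * by ALPHA and accumulating the squares into res1/res2 via fused
 * multiply-add. */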
.L100:
    li.d TEMP, SIZE
    bne INCX, TEMP, .L120
    srai.d I, N, 3
    bge $r0, I, .L997
    .align 3

.L110:
    vld VX0, XX, 0 * SIZE
    vld VX1, XX, 2 * SIZE
    vfmul.d VM2, VX0, VALPHA
    vfmul.d VM3, VX1, VALPHA
    vfmadd.d res1, VM2, VM2, res1
    vfmadd.d res2, VM3, VM3, res2
    vld VX0, XX, 4 * SIZE
    vld VX1, XX, 6 * SIZE
    vfmul.d VM2, VX0, VALPHA
    vfmul.d VM3, VX1, VALPHA
    vfmadd.d res1, VM2, VM2, res1
    vfmadd.d res2, VM3, VM3, res2
    addi.d XX, XX, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L110
    b .L996
    .align 3
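/* Pass 2 (strided): same scaled accumulation; the gathers are interleaved
 * with the multiplies. */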
.L120:
    srai.d I, N, 3
    bge $r0, I, .L997
    .align 3

.L121:
    ld.d t1, XX, 0 * SIZE
    add.d XX, XX, INCX
    ld.d t2, XX, 0 * SIZE
    add.d XX, XX, INCX
    ld.d t3, XX, 0 * SIZE
    add.d XX, XX, INCX
    ld.d t4, XX, 0 * SIZE
    add.d XX, XX, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    vfmul.d VM2, VX0, VALPHA
    ld.d t1, XX, 0 * SIZE
    add.d XX, XX, INCX
    vfmul.d VM3, VX1, VALPHA
    ld.d t2, XX, 0 * SIZE
    add.d XX, XX, INCX
    vfmadd.d res1, VM2, VM2, res1
    vfmadd.d res2, VM3, VM3, res2
    ld.d t3, XX, 0 * SIZE
    add.d XX, XX, INCX
    ld.d t4, XX, 0 * SIZE
    add.d XX, XX, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    vfmul.d VM2, VX0, VALPHA
    vfmul.d VM3, VX1, VALPHA
    vfmadd.d res1, VM2, VM2, res1
    vfmadd.d res2, VM3, VM3, res2
    addi.d I, I, -1
    blt $r0, I, .L121
    b .L996
    .align 3
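/* Horizontal reduction: res1 += res2, then fold the upper lane into lane 0,
 * which aliases the scalar accumulator $f19. */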
.L996:
    vfadd.d res1, res1, res2
    vreplvei.d VX1, res1, 1
    vfadd.d res1, VX1, res1
    .align 3

.L997:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3
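// Tail of pass 2: scale and square the remaining elements one at a time.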
.L998:
    fld.d $f15, XX, 0 * SIZE
    addi.d I, I, -1
    fmul.d $f15, $f15, ALPHA
    fmadd.d $f19, $f15, $f15, $f19
    add.d XX, XX, INCX
    blt $r0, I, .L998
    .align 3
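// Result: max(|x|) * sqrt(sum((x[i]/max)^2)) = ||x||_2.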
.L999:
    fsqrt.d $f19, $f19
    fmul.d $f0, max, $f19
    jirl $r0, $r1, 0x0

    EPILOGUE