You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

znrm2_lasx.S 6.2 kB


  1. /***************************************************************************
  2. Copyright (c) 2023, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  // ASSEMBLER is defined before common.h is included. The macros used further
  // down (PROLOGUE, EPILOGUE, LDINT, MTC, CMPEQ, SIZE, ZBASE_SHIFT) are not
  // defined in this file; presumably common.h provides them.
  27. #define ASSEMBLER
  28. #include "common.h"
  // ---- Integer registers ----
  // N / X / INCX: element count, vector pointer and stride as received on
  // entry (presumably a0-a2 of the LoongArch calling convention -- confirm).
  // INCX is converted from elements to bytes early in the routine.
  29. #define N $r4
  30. #define X $r5
  31. #define INCX $r6
  // XX: copy of the original X, walked by the second (sum-of-squares) pass.
  32. #define XX $r19
  // I: loop counter.
  33. #define I $r17
  // TEMP: general scratch (constants, stride comparison, bit moves).
  34. #define TEMP $r18
  // t1-t4: raw 64-bit loads used to assemble vectors in the strided loops.
  35. #define t1 $r12
  36. #define t2 $r13
  37. #define t3 $r14
  38. #define t4 $r15
  // ---- Scalar FP registers ----
  // The routine relies on $fN being lane 0 of $xrN, so several scalar and
  // vector aliases below deliberately share a register number.
  // INF: +Infinity constant, compared against 1/max.
  39. #define INF $f23
  // a1: holds 0.0 for the max == 0 test.
  40. #define a1 $f22
  // max: largest magnitude found so far; same register number as VM0 ($xr17).
  41. #define max $f17
  // ALPHA: scale factor 1/max.
  42. #define ALPHA $f12
  // a3 / a2: real / imaginary part in the scalar tail loops. They share
  // numbers with VX0 / VX1, which are no longer live when the tails run.
  43. #define a3 $f15
  44. #define a2 $f16
  // ---- 256-bit vector registers (four doubles each) ----
  // VX0 / VX1: input data for the current iteration.
  45. #define VX0 $xr15
  46. #define VX1 $xr16
  // VM0: per-lane running maximum magnitude. VM1-VM3: temporaries.
  47. #define VM0 $xr17
  48. #define VM1 $xr18
  49. #define VM2 $xr13
  50. #define VM3 $xr14
  // res1 / res2: sum-of-squares accumulators. Lane 0 of res1 is $f19, which
  // the scalar tail and the final square root address by its raw name.
  51. #define res1 $xr19
  52. #define res2 $xr20
  // VALPHA: ALPHA replicated into every lane.
  53. #define VALPHA $xr21
  // Two-pass scaled 2-norm of a double-precision complex vector.
  //
  // In:   N    = number of elements
  //       X    = address of the first element (real part, then imaginary part)
  //       INCX = stride between elements, in elements
  // Out:  $f0  = max * sqrt( sum of (part * (1/max))^2 over every real and
  //              imaginary part ), where max is the largest absolute value
  //              of any part.
  //       Returns 0 when N <= 0, INCX == 0, max == 0 or 1/max == +Inf.
  //
  // Pass 1 (.L10/.L21/.L98) finds max; pass 2 (.L110/.L121/.L998) accumulates
  // the scaled squares. Both passes handle four elements per vector iteration
  // (I = N >> 2) and finish with a scalar tail of N & 3 elements.
  54. PROLOGUE
  55. #ifdef F_INTERFACE
  // With F_INTERFACE, N and INCX arrive as addresses and are replaced by the
  // values LDINT reads from them.
  56. LDINT N, 0(N)
  57. LDINT INCX, 0(INCX)
  58. #endif
  // Zero both accumulators and the running maximum. Because max and $f19 are
  // lane 0 of VM0 and res1, every early exit to .L999 yields 0 * sqrt(0) = 0.
  59. xvxor.v res1, res1, res1
  60. xvxor.v res2, res2, res2
  61. xvxor.v VM0, VM0, VM0
  // Nothing to do for N <= 0 or a zero stride.
  62. bge $r0, N, .L999
  63. beq $r0, INCX, .L999
  // Remember the start of the vector for pass 2.
  64. move XX, X
  // INF = 0x7FF << 52, the bit pattern of +Infinity in IEEE-754 double.
  65. // Init INF
  66. addi.d TEMP, $r0, 0x7FF
  67. slli.d TEMP, TEMP, 52
  68. MTC INF, TEMP
  // TEMP = byte size of one element (1 << ZBASE_SHIFT); INCX becomes a byte
  // stride, so INCX == TEMP means the elements are contiguous.
  69. li.d TEMP, 1
  70. slli.d TEMP, TEMP, ZBASE_SHIFT
  71. slli.d INCX, INCX, ZBASE_SHIFT
  // I = number of four-element groups.
  72. srai.d I, N, 2
  73. bne INCX, TEMP, .L20
  // Contiguous data with fewer than four elements: scalar tail only.
  74. bge $r0, I, .L97
  75. .align 3
  // ---- Pass 1, contiguous: two vector loads and 8 * SIZE bytes per iteration.
  // xvfmaxa.d keeps, per lane, the operand with the larger absolute value; the
  // sign is kept too, which is why .L99 applies fabs.d to the final result.
  76. .L10:
  77. xvld VX0, X, 0 * SIZE
  78. xvld VX1, X, 4 * SIZE
  79. xvfmaxa.d VM1, VX1, VX0
  80. xvfmaxa.d VM0, VM0, VM1
  81. addi.d I, I, -1
  82. addi.d X, X, 8 * SIZE
  83. blt $r0, I, .L10
  84. b .L96
  85. .align 3
  // ---- Pass 1, strided.
  86. .L20: // INCX != 1 element
  87. bge $r0, I, .L97
  88. .align 3
  // Each element is fetched as two raw 64-bit words (real at 0 * SIZE,
  // imaginary at 1 * SIZE) and inserted into a vector lane; two elements fill
  // VX0, the next two fill VX1. X advances by INCX after every element.
  89. .L21:
  90. ld.d t1, X, 0 * SIZE
  91. ld.d t2, X, 1 * SIZE
  92. add.d X, X, INCX
  93. ld.d t3, X, 0 * SIZE
  94. ld.d t4, X, 1 * SIZE
  95. add.d X, X, INCX
  96. xvinsgr2vr.d VX0, t1, 0
  97. xvinsgr2vr.d VX0, t2, 1
  98. xvinsgr2vr.d VX0, t3, 2
  99. xvinsgr2vr.d VX0, t4, 3
  100. ld.d t1, X, 0 * SIZE
  101. ld.d t2, X, 1 * SIZE
  102. add.d X, X, INCX
  103. ld.d t3, X, 0 * SIZE
  104. ld.d t4, X, 1 * SIZE
  105. xvinsgr2vr.d VX1, t1, 0
  106. xvinsgr2vr.d VX1, t2, 1
  107. xvinsgr2vr.d VX1, t3, 2
  108. xvinsgr2vr.d VX1, t4, 3
  109. add.d X, X, INCX
  110. xvfmaxa.d VM1, VX0, VX1
  111. xvfmaxa.d VM0, VM0, VM1
  112. addi.d I, I, -1
  113. blt $r0, I, .L21
  114. b .L96
  115. .align 3
  // ---- Horizontal reduction: move lanes 1, 2 and 3 of VM0 into lane 0 of
  // three temporaries and combine them with VM0, leaving the overall
  // max-magnitude value in lane 0 of VM0, i.e. in the scalar register max.
  116. .L96:
  117. xvpickve.d VX0, VM0, 1
  118. xvpickve.d VX1, VM0, 2
  119. xvpickve.d VM3, VM0, 3
  120. xvfmaxa.d VM1, VX0, VX1
  121. xvfmaxa.d VM2, VM3, VM0
  122. xvfmaxa.d VM0, VM1, VM2
  123. .align 3
  // ---- Pass 1, scalar tail: the remaining N & 3 elements.
  124. .L97:
  125. andi I, N, 3
  126. bge $r0, I, .L99
  127. .align 3
  128. .L98:
  129. fld.d a3, X, 0 * SIZE
  130. fld.d a2, X, 1 * SIZE
  131. fmaxa.d a3, a2, a3
  132. fmaxa.d max, a3, max
  133. addi.d I, I, -1
  134. add.d X, X, INCX
  135. blt $r0, I, .L98
  136. .align 3
  // ---- Derive the scale factor ALPHA = 1 / |max|.
  137. .L99:
  138. fabs.d max, max
  139. lu12i.w TEMP, 0x3f800 // TEMP = 0x3f800000, the bits of single-precision 1.0
  140. movgr2fr.d a1, $r0
  141. movgr2fr.w ALPHA, TEMP
  142. CMPEQ $fcc0, max, a1
  143. fcvt.d.s ALPHA, ALPHA
  // max == 0: exit with the accumulator still zero, so the result is 0.
  144. bcnez $fcc0, .L999
  145. fdiv.d ALPHA, ALPHA, max
  // NOTE(review): when 1/max overflows to +Inf the routine exits with the
  // accumulator still zero and therefore returns 0 although max is non-zero
  // -- confirm this is the intended underflow behaviour.
  // NOTE(review): when max is +Inf, ALPHA is 0 and pass 2 computes Inf * 0,
  // which is NaN under IEEE-754 -- confirm whether Inf should be returned.
  146. CMPEQ $fcc0, INF, ALPHA
  147. bcnez $fcc0, .L999
  // Replicate the bits of ALPHA into all four lanes of VALPHA.
  148. movfr2gr.d TEMP, ALPHA
  149. xvreplgr2vr.d VALPHA, TEMP
  // ---- Pass 2: same stride dispatch as pass 1, now reading through XX.
  // (No branch in this file targets .L100; it is reached by fall-through.)
  150. .L100:
  151. li.d TEMP, 1
  152. slli.d TEMP, TEMP, ZBASE_SHIFT
  153. srai.d I, N, 2
  154. bne INCX, TEMP, .L120
  155. bge $r0, I, .L997
  156. .align 3
  // ---- Pass 2, contiguous: scale by ALPHA, then res += scaled * scaled.
  // Two independent accumulators (res1, res2) are used, one per input vector.
  157. .L110:
  158. xvld VX0, XX, 0 * SIZE
  159. xvld VX1, XX, 4 * SIZE
  160. xvfmul.d VM2, VX0, VALPHA
  161. xvfmul.d VM3, VX1, VALPHA
  162. xvfmadd.d res1, VM2, VM2, res1
  163. xvfmadd.d res2, VM3, VM3, res2
  164. addi.d XX, XX, 8 * SIZE
  165. addi.d I, I, -1
  166. blt $r0, I, .L110
  167. b .L996
  168. .align 3
  // ---- Pass 2, strided.
  169. .L120:
  170. bge $r0, I, .L997
  171. .align 3
  // Gather four elements into VX0 / VX1 exactly as in .L21, then accumulate.
  172. .L121:
  173. ld.d t1, XX, 0 * SIZE
  174. ld.d t2, XX, 1 * SIZE
  175. add.d XX, XX, INCX
  176. ld.d t3, XX, 0 * SIZE
  177. ld.d t4, XX, 1 * SIZE
  178. add.d XX, XX, INCX
  179. xvinsgr2vr.d VX0, t1, 0
  180. xvinsgr2vr.d VX0, t2, 1
  181. xvinsgr2vr.d VX0, t3, 2
  182. xvinsgr2vr.d VX0, t4, 3
  183. ld.d t1, XX, 0 * SIZE
  184. ld.d t2, XX, 1 * SIZE
  185. add.d XX, XX, INCX
  186. ld.d t3, XX, 0 * SIZE
  187. ld.d t4, XX, 1 * SIZE
  188. add.d XX, XX, INCX
  189. xvinsgr2vr.d VX1, t1, 0
  190. xvinsgr2vr.d VX1, t2, 1
  191. xvinsgr2vr.d VX1, t3, 2
  192. xvinsgr2vr.d VX1, t4, 3
  193. xvfmul.d VM2, VX0, VALPHA
  194. xvfmul.d VM3, VX1, VALPHA
  195. xvfmadd.d res1, VM2, VM2, res1
  196. xvfmadd.d res2, VM3, VM3, res2
  197. addi.d I, I, -1
  198. blt $r0, I, .L121
  199. b .L996
  200. .align 3
  // ---- Horizontal sum: merge res2 into res1, then add lanes 1, 2 and 3 into
  // lane 0 of res1, which is the scalar register $f19.
  201. .L996:
  202. xvfadd.d res1, res1, res2
  203. xvpickve.d VX0, res1, 1
  204. xvpickve.d VX1, res1, 2
  205. xvpickve.d VM2, res1, 3
  206. xvfadd.d res1, VX0, res1
  207. xvfadd.d VX1, VX1, VM2
  208. xvfadd.d res1, VX1, res1
  209. .align 3
  // ---- Pass 2, scalar tail: the remaining N & 3 elements.
  210. .L997:
  211. andi I, N, 3
  212. bge $r0, I, .L999
  213. .align 3
  // $f19 += (real * ALPHA)^2 + (imag * ALPHA)^2 for each remaining element.
  214. .L998:
  215. fld.d a3, XX, 0 * SIZE
  216. fld.d a2, XX, 1 * SIZE
  217. addi.d I, I, -1
  218. fmul.d a3, a3, ALPHA
  219. fmadd.d $f19, a3, a3, $f19
  220. fmul.d a2, a2, ALPHA
  221. fmadd.d $f19, a2, a2, $f19
  222. add.d XX, XX , INCX
  223. blt $r0, I, .L998
  224. .align 3
  // ---- Result: $f0 = max * sqrt(sum of scaled squares); jump to the address
  // in $r1 without writing a link register.
  225. .L999:
  226. fsqrt.d $f19, $f19
  227. fmul.d $f0, max, $f19
  228. jirl $r0, $r1, 0x0
  229. .align 3
  230. EPILOGUE