You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dnrm2.S 7.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
  /* Build setup and register map for the kernel that follows.
   * The "NN. " line-number residue that was fused onto every line has been
   * removed so that each directive is again the first token on its line. */
#define ASSEMBLER
#include "common.h"

/* ---- Integer registers ------------------------------------------------ */
#define N       $r4     /* element count; decremented/restored between passes   */
#define X       $r5     /* walking pointer used by the first (max) pass          */
#define INCX    $r6     /* stride; shifted left by BASE_SHIFT before it is used  */
#define XX      $r7     /* copy of the original X, walked by the second pass     */
#define I       $r17    /* loop trip counter                                     */
#define TEMP    $r18    /* scratch for building floating-point bit patterns      */

/* ---- Loaded vector elements (eight per unrolled iteration) ------------ */
#define a1      $f10    /* also holds the constant 0.0 between the two passes    */
#define a2      $f11
#define a3      $f12
#define a4      $f13
#define a5      $f14
#define a6      $f15
#define a7      $f16
#define a8      $f17

/* ---- Temporaries: |x| in the first pass, ALPHA * x in the second ------ */
#define t1      $f0
#define t2      $f1
#define t3      $f2
#define t4      $f3

/* ---- Four independent accumulators ------------------------------------
 * Running maxima in the first pass, partial sums of squares in the second. */
#define s1      $f22
#define s2      $f8
#define s3      $f23
#define s4      $f9

/* ---- Scalars ----------------------------------------------------------- */
#define ALPHA   $f4     /* 1.0, then 1.0 / max: the scale used by the second pass */
#define max     $f5     /* largest absolute value found by the first pass         */
#define INF     $f6     /* bit pattern 0x7FF << 52 (+Inf as an IEEE-754 double)   */
  // =========================================================================
  // nrm2 kernel (dnrm2.S): Euclidean norm of a strided vector, in two passes
  // so that the squares are formed on scaled values:
  //
  //     pass 1:  max    = largest |x[i]|
  //     pass 2:  sum    = sum over i of (x[i] * (1 / max))^2
  //     result:  $f0    = max * sqrt(sum)
  //
  // In:   N ($r4) element count, X ($r5) base address, INCX ($r6) stride.
  //       With F_INTERFACE defined, N and INCX arrive as addresses and are
  //       loaded through LDINT.  INCX is shifted left by BASE_SHIFT (from
  //       common.h, presumably log2 of the element size) before use.
  // Out:  $f0.
  //
  // Early exits through .L999 return s1 ($f22):
  //     N <= 0 or INCX == 0      -> 0.0
  //     N == 1                   -> |x[0]|
  //     1 / max overflows to Inf -> max
  //       NOTE(review): this last case returns only the largest magnitude,
  //       not the full norm, for vectors of very small values.
  //
  // Special values (changed in this revision):
  //     The compares in pass 1 look like ordered less-than tests (CMPLT comes
  //     from common.h), so a NaN that is not the first element never becomes
  //     the maximum.  Previously max == 0 returned 0 immediately, hiding such
  //     a NaN, and max == +Inf made ALPHA = 0 so pass 2 produced NaN from
  //     0 * Inf.  Both cases now run pass 2 with a scale of 1.0: an all-zero
  //     vector still gives 0, an infinite element gives +Inf, and a NaN
  //     propagates to the result.
  //
  // The "NN. " line-number residue fused onto every line has been removed.
  // =========================================================================
    PROLOGUE

#ifdef F_INTERFACE
    LDINT      N,    0(N)
    LDINT      INCX, 0(INCX)
#endif

    // Init INF: 0x7FF << 52 is the bit pattern of +Inf for an IEEE-754 double.
    addi.d     TEMP, $r0, 0x7FF
    slli.d     TEMP, TEMP, 52
    MTC        INF,  TEMP

    MTC        s1,   $r0                // s1 = 0, the value returned by the guards below
    bge        $r0,  N, .L999           // nothing to do when N <= 0
    slli.d     INCX, INCX, BASE_SHIFT
    beq        $r0,  INCX, .L999        // zero stride is rejected
    move       XX,   X                  // keep the start address for pass 2
    NOP

    // ---------------------------------------------------------------------
    // Pass 1: find max |x[i]|.  The first element seeds all four maxima.
    // ---------------------------------------------------------------------
    LD         a1,   X, 0 * SIZE
    addi.d     N,    N, -1              // N = elements still to scan
    add.d      X,    X, INCX
    FABS       s1,   a1
    FABS       s2,   a1
    bge        $r0,  N, .L999           // single element: result is |x[0]|
    FABS       s3,   a1
    srai.d     I,    N, 3               // I = number of groups of eight
    FABS       s4,   a1
    bge        $r0,  I, .L15

    // Preload the first group of eight so the loop can overlap loads with
    // the compares of the previous group.
    LD         a1,   X, 0 * SIZE
    add.d      X,    X, INCX
    LD         a2,   X, 0 * SIZE
    add.d      X,    X, INCX
    LD         a3,   X, 0 * SIZE
    add.d      X,    X, INCX
    LD         a4,   X, 0 * SIZE
    add.d      X,    X, INCX
    LD         a5,   X, 0 * SIZE
    add.d      X,    X, INCX
    LD         a6,   X, 0 * SIZE
    add.d      X,    X, INCX
    LD         a7,   X, 0 * SIZE
    add.d      X,    X, INCX
    LD         a8,   X, 0 * SIZE
    addi.d     I,    I, -1
    add.d      X,    X, INCX
    bge        $r0,  I, .L13            // only one group: go straight to the drain
    .align 3

.L12:
    // Steady state: consume a1..a8 while loading the next group into them.
    FABS       t1,   a1
    LD         a1,   X, 0 * SIZE
    FABS       t2,   a2
    add.d      X,    X, INCX
    FABS       t3,   a3
    LD         a2,   X, 0 * SIZE
    FABS       t4,   a4
    add.d      X,    X, INCX
    CMPLT      $fcc0, s1, t1
    LD         a3,   X, 0 * SIZE
    CMPLT      $fcc1, s2, t2
    add.d      X,    X, INCX
    CMPLT      $fcc2, s3, t3
    LD         a4,   X, 0 * SIZE
    CMPLT      $fcc3, s4, t4
    add.d      X,    X, INCX
    CMOVT      s1,   s1, t1, $fcc0      // s_k = t_k where s_k < t_k
    CMOVT      s2,   s2, t2, $fcc1
    CMOVT      s3,   s3, t3, $fcc2
    CMOVT      s4,   s4, t4, $fcc3
    FABS       t1,   a5
    LD         a5,   X, 0 * SIZE
    FABS       t2,   a6
    add.d      X,    X, INCX
    FABS       t3,   a7
    LD         a6,   X, 0 * SIZE
    FABS       t4,   a8
    add.d      X,    X, INCX
    CMPLT      $fcc0, s1, t1
    LD         a7,   X, 0 * SIZE
    CMPLT      $fcc1, s2, t2
    add.d      X,    X, INCX
    CMPLT      $fcc2, s3, t3
    LD         a8,   X, 0 * SIZE
    CMPLT      $fcc3, s4, t4
    add.d      X,    X, INCX
    CMOVT      s1,   s1, t1, $fcc0
    addi.d     I,    I, -1
    CMOVT      s2,   s2, t2, $fcc1
    CMOVT      s3,   s3, t3, $fcc2
    CMOVT      s4,   s4, t4, $fcc3
    blt        $r0,  I, .L12
    .align 3

.L13:
    // Drain: the last preloaded group, with no further loads.
    FABS       t1,   a1
    FABS       t2,   a2
    FABS       t3,   a3
    FABS       t4,   a4
    CMPLT      $fcc0, s1, t1
    CMPLT      $fcc1, s2, t2
    CMPLT      $fcc2, s3, t3
    CMPLT      $fcc3, s4, t4
    CMOVT      s1,   s1, t1, $fcc0
    CMOVT      s2,   s2, t2, $fcc1
    CMOVT      s3,   s3, t3, $fcc2
    CMOVT      s4,   s4, t4, $fcc3
    FABS       t1,   a5
    FABS       t2,   a6
    FABS       t3,   a7
    FABS       t4,   a8
    CMPLT      $fcc0, s1, t1
    CMPLT      $fcc1, s2, t2
    CMPLT      $fcc2, s3, t3
    CMPLT      $fcc3, s4, t4
    CMOVT      s1,   s1, t1, $fcc0
    CMOVT      s2,   s2, t2, $fcc1
    CMOVT      s3,   s3, t3, $fcc2
    CMOVT      s4,   s4, t4, $fcc3
    .align 3

.L15:
    andi       I,    N, 7               // leftover elements (fewer than eight)
    bge        $r0,  I, .L100
    .align 3

.L16:
    // Scalar tail of pass 1; only s1 is updated here.
    LD         a1,   X, 0 * SIZE
    addi.d     I,    I, -1
    FABS       t1,   a1
    CMPLT      $fcc0, s1, t1
    CMOVT      s1,   s1, t1, $fcc0
    add.d      X,    X, INCX
    blt        $r0,  I, .L16
    .align 3

.L100:
    // Reduce the four running maxima into s1.
    CMPLT      $fcc0, s1, s2
    CMPLT      $fcc1, s3, s4
    CMOVT      s1,   s1, s2, $fcc0
    CMOVT      s3,   s3, s4, $fcc1
    CMPLT      $fcc0, s1, s3
    CMOVT      s1,   s1, s3, $fcc0

    addi.d     N,    N, 1               // undo the N - 1 from pass 1: pass 2 visits every element
    lu12i.w    TEMP, 0x3f800            // 0x3f800000 = 1.0 as a single-precision bit pattern
    movgr2fr.d a1,   $r0                // a1 = 0.0
    movgr2fr.w ALPHA, TEMP
    CMPEQ      $fcc0, s1, a1            // max == 0 ?
    fcvt.d.s   ALPHA, ALPHA             // ALPHA = 1.0 (double)
    CMPEQ      $fcc1, INF, s1           // max == +Inf ?

    // A maximum of 0 or +Inf cannot serve as the scale (see the header).
    // Replace it with 1.0 so that pass 2 accumulates the unscaled squares.
    CMOVT      s1,   s1, ALPHA, $fcc0
    CMOVT      s1,   s1, ALPHA, $fcc1

    fdiv.d     ALPHA, ALPHA, s1         // ALPHA = 1 / max
    CMPEQ      $fcc0, INF, ALPHA
    bcnez      $fcc0, .L999             // 1 / max overflowed: return max itself

    // ---------------------------------------------------------------------
    // Pass 2: sum of (ALPHA * x[i])^2, restarting from XX.
    // ---------------------------------------------------------------------
    MOV        max,  s1
    MOV        s1,   a1                 // clear the four partial sums
    MOV        s2,   a1
    MOV        s3,   a1
    MOV        s4,   a1
    srai.d     I,    N, 3
    bge        $r0,  I, .L105

    // Preload the first group of eight, as in pass 1.
    LD         a1,   XX, 0 * SIZE
    add.d      XX,   XX, INCX
    LD         a2,   XX, 0 * SIZE
    add.d      XX,   XX, INCX
    LD         a3,   XX, 0 * SIZE
    add.d      XX,   XX, INCX
    LD         a4,   XX, 0 * SIZE
    add.d      XX,   XX, INCX
    LD         a5,   XX, 0 * SIZE
    add.d      XX,   XX, INCX
    LD         a6,   XX, 0 * SIZE
    add.d      XX,   XX, INCX
    LD         a7,   XX, 0 * SIZE
    add.d      XX,   XX, INCX
    LD         a8,   XX, 0 * SIZE
    addi.d     I,    I, -1
    add.d      XX,   XX, INCX
    bge        $r0,  I, .L104
    .align 3

.L103:
    // Steady state: scale and accumulate a1..a8 while loading the next group.
    MUL        t1,   ALPHA, a1
    LD         a1,   XX, 0 * SIZE
    MUL        t2,   ALPHA, a2
    add.d      XX,   XX, INCX
    MUL        t3,   ALPHA, a3
    LD         a2,   XX, 0 * SIZE
    MUL        t4,   ALPHA, a4
    add.d      XX,   XX, INCX
    MADD       s1,   t1, t1, s1         // s_k += t_k * t_k
    LD         a3,   XX, 0 * SIZE
    MADD       s2,   t2, t2, s2
    add.d      XX,   XX, INCX
    MADD       s3,   t3, t3, s3
    LD         a4,   XX, 0 * SIZE
    MADD       s4,   t4, t4, s4
    add.d      XX,   XX, INCX
    MUL        t1,   ALPHA, a5
    LD         a5,   XX, 0 * SIZE
    MUL        t2,   ALPHA, a6
    add.d      XX,   XX, INCX
    MUL        t3,   ALPHA, a7
    LD         a6,   XX, 0 * SIZE
    MUL        t4,   ALPHA, a8
    add.d      XX,   XX, INCX
    MADD       s1,   t1, t1, s1
    LD         a7,   XX, 0 * SIZE
    MADD       s2,   t2, t2, s2
    add.d      XX,   XX, INCX
    MADD       s3,   t3, t3, s3
    LD         a8,   XX, 0 * SIZE
    MADD       s4,   t4, t4, s4
    addi.d     I,    I, -1
    add.d      XX,   XX, INCX
    blt        $r0,  I, .L103
    .align 3

.L104:
    // Drain: the last preloaded group, with no further loads.
    MUL        t1,   ALPHA, a1
    MUL        t2,   ALPHA, a2
    MUL        t3,   ALPHA, a3
    MUL        t4,   ALPHA, a4
    MADD       s1,   t1, t1, s1
    MADD       s2,   t2, t2, s2
    MADD       s3,   t3, t3, s3
    MADD       s4,   t4, t4, s4
    MUL        t1,   ALPHA, a5
    MUL        t2,   ALPHA, a6
    MUL        t3,   ALPHA, a7
    MUL        t4,   ALPHA, a8
    MADD       s1,   t1, t1, s1
    MADD       s2,   t2, t2, s2
    MADD       s3,   t3, t3, s3
    MADD       s4,   t4, t4, s4
    .align 3

.L105:
    andi       I,    N, 7               // leftover elements (fewer than eight)
    bge        $r0,  I, .L998
    .align 3

.L106:
    // Scalar tail of pass 2; accumulates into s1 only.
    LD         a1,   XX, 0 * SIZE
    addi.d     I,    I, -1
    MUL        t1,   ALPHA, a1
    add.d      XX,   XX, INCX
    MADD       s1,   t1, t1, s1
    blt        $r0,  I, .L106
    .align 3

.L998:
    // Combine the partial sums and undo the scaling: result = max * sqrt(sum).
    ADD        s1,   s1, s2
    ADD        s3,   s3, s4
    ADD        s1,   s1, s3
    fsqrt.d    s1,   s1
    // NOTE(review): copies the loop counter into $r4 although the result is
    // returned in $f0; looks vestigial, kept unchanged pending confirmation.
    move       $r4,  $r17
    MUL        $f0,  max, s1
    jirl       $r0,  $r1, 0x0
    .align 3

.L999:
    // Early-exit path: return whatever s1 ($f22) currently holds.
    move       $r4,  $r17               // NOTE(review): see the remark in .L998
    fmov.d     $f0,  $f22
    jirl       $r0,  $r1, 0x0
    EPILOGUE