You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

scal.S 7.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. /***************************************************************************
  2. Copyright (c) 2021, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /* Integer register aliases used by the kernel below. */
  29. #define N $r4 /* element count; tested against zero and split into N >> 3 and N & 7 */
  30. #define X $r7 /* pointer into the vector; advanced as elements are processed */
  31. #define INCX $r8 /* stride; shifted left by BASE_SHIFT on entry and then added to X */
  32. #define I $r17 /* loop counter for the unrolled and tail loops */
  33. #define TEMP $r18 /* holds SIZE for the stride and dummy2 comparisons */
  34. #define XX $r5 /* first the word loaded from 0($sp); later the store pointer of the strided multiply path */
  /* Floating-point register aliases. */
  35. #define ALPHA $f0 /* scale factor */
  36. #define a1 $f22 /* a1..a8: elements loaded from X; a1 is also set to zero on entry */
  37. #define a2 $f8
  38. #define a3 $f23
  39. #define a4 $f9
  40. #define a5 $f10
  41. #define a6 $f11
  42. #define a7 $f12
  43. #define a8 $f13
  44. #define t1 $f14 /* t1..t4: products ALPHA * a<k> waiting to be stored */
  45. #define t2 $f15
  46. #define t3 $f16
  47. #define t4 $f17
  /*
   * Scale a strided vector in place: x[i] = ALPHA * x[i] for N elements,
   * stepping INCX elements between consecutive x[i].
   *
   * Inputs (register aliases are the #defines at the top of the file):
   *   N = $r4, ALPHA = $f0, X = $r7, INCX = $r8, plus one 64-bit word
   *   (called dummy2 below) loaded from 0($sp).
   *
   * Paths:
   *   N <= 0                           return immediately (.L999)
   *   ALPHA == 0 and dummy2 test fails store zero without reading X
   *       byte stride == SIZE: .L12 / .L16     otherwise: .L22 / .L26
   *   everything else                  load, multiply by ALPHA, store
   *       byte stride == SIZE: .L52 / .L56     otherwise: .L62 / .L66
   * Each path handles 8 elements per main-loop iteration and then the
   * N & 7 leftovers one at a time.
   *
   * The dummy2 test compares (dummy2 << BASE_SHIFT) with SIZE; that is
   * equivalent to dummy2 == 1 provided SIZE == 1 << BASE_SHIFT (both come
   * from common.h -- confirm there).
   *
   * Every exit copies I ($r17) into $r4 and a1 ($f22) into $f0 before
   * returning through $r1.
   * NOTE(review): on the N <= 0 early exit I has not been written yet, so
   * $r4 receives whatever $r17 held on entry -- confirm callers ignore the
   * returned values.
   */
  48. PROLOGUE
  49. li.d TEMP, SIZE // TEMP = byte size of one element
  50. ld.d XX, $sp, 0 // Load dummy2
  51. slli.d XX, XX, BASE_SHIFT // scale dummy2 so it can be compared with TEMP
  52. MTC a1, $r0 // a1 = 0, taken from the zero register
  53. slli.d INCX, INCX, BASE_SHIFT // INCX becomes a byte stride
  54. bge $r0, N, .L999 // N <= 0: nothing to do
  55. CMPEQ $fcc0, ALPHA, a1 // ALPHA == 0 ?
  56. bceqz $fcc0, .L50 // no: take the multiply path
  57. beq XX, TEMP, .L50 // if dummy2 == 1, do not directly copy 0
  58. srai.d I, N, 3 // I = number of 8-element groups
  59. bne INCX, TEMP, .L20 // byte stride != element size: strided zero fill
  60. bge $r0, I, .L15 // fewer than 8 elements: tail only
  61. .align 3
  62. .L12: // contiguous zero fill, 8 elements per iteration
  63. ST a1, X, 0 * SIZE
  64. ST a1, X, 1 * SIZE
  65. ST a1, X, 2 * SIZE
  66. ST a1, X, 3 * SIZE
  67. ST a1, X, 4 * SIZE
  68. ST a1, X, 5 * SIZE
  69. ST a1, X, 6 * SIZE
  70. ST a1, X, 7 * SIZE
  71. addi.d I, I, -1 // 64-bit decrement: I is derived from the 64-bit N, a 32-bit add would wrap for huge N
  72. addi.d X, X, 8 * SIZE
  73. blt $r0, I, .L12
  74. .align 3
  75. .L15: // contiguous zero fill, leftovers
  76. andi I, N, 7 // I = N mod 8
  77. bge $r0, I, .L999
  78. .align 3
  79. .L16:
  80. ST a1, X, 0 * SIZE
  81. addi.d I, I, -1
  82. addi.d X, X, SIZE
  83. blt $r0, I, .L16
  84. move $r4, $r17 // same return sequence as .L999
  85. fmov.d $f0, $f22
  86. jirl $r0, $r1, 0x0
  87. .align 3
  88. .L20: // strided zero fill
  89. srai.d I, N, 3 // recomputes the group count already held in I
  90. bge $r0, I, .L25 // fewer than 8 elements: tail only
  91. .align 3
  92. .L22: // 8 strided stores per iteration
  93. ST a1, X, 0 * SIZE
  94. add.d X, X, INCX
  95. ST a1, X, 0 * SIZE
  96. add.d X, X, INCX
  97. ST a1, X, 0 * SIZE
  98. add.d X, X, INCX
  99. ST a1, X, 0 * SIZE
  100. add.d X, X, INCX
  101. ST a1, X, 0 * SIZE
  102. add.d X, X, INCX
  103. ST a1, X, 0 * SIZE
  104. add.d X, X, INCX
  105. ST a1, X, 0 * SIZE
  106. add.d X, X, INCX
  107. ST a1, X, 0 * SIZE
  108. addi.d I, I, -1
  109. add.d X, X, INCX
  110. blt $r0, I, .L22
  111. .align 3
  112. .L25: // strided zero fill, leftovers
  113. andi I, N, 7 // I = N mod 8
  114. bge $r0, I, .L999
  115. .align 3
  116. .L26:
  117. addi.d I, I, -1
  118. ST a1, X, 0 * SIZE
  119. add.d X, X, INCX
  120. blt $r0, I, .L26
  121. move $r4, $r17 // same return sequence as .L999
  122. fmov.d $f0, $f22
  123. jirl $r0, $r1, 0x0
  124. .align 3
  125. .L50: // multiply path
  126. srai.d I, N, 3 // I = number of 8-element groups
  127. bne INCX, TEMP, .L60 // byte stride != element size: strided multiply
  128. addi.d I, I, -1 // I = groups - 1; the last group is drained at .L53
  129. blt I, $r0, .L55 // no full group: tail only
  130. LD a1, X, 0 * SIZE // preload the first group
  131. LD a2, X, 1 * SIZE
  132. LD a3, X, 2 * SIZE
  133. LD a4, X, 3 * SIZE
  134. LD a5, X, 4 * SIZE
  135. LD a6, X, 5 * SIZE
  136. LD a7, X, 6 * SIZE
  137. LD a8, X, 7 * SIZE
  138. bge $r0, I, .L53 // exactly one group: skip the steady-state loop
  139. .align 3
  140. .L52: // multiply and store the current group while loading the next one (offsets 8..15)
  141. MUL t1, ALPHA, a1
  142. LD a1, X, 8 * SIZE
  143. MUL t2, ALPHA, a2
  144. LD a2, X, 9 * SIZE
  145. MUL t3, ALPHA, a3
  146. LD a3, X, 10 * SIZE
  147. MUL t4, ALPHA, a4
  148. LD a4, X, 11 * SIZE
  149. ST t1, X, 0 * SIZE
  150. MUL t1, ALPHA, a5
  151. LD a5, X, 12 * SIZE
  152. ST t2, X, 1 * SIZE
  153. MUL t2, ALPHA, a6
  154. LD a6, X, 13 * SIZE
  155. ST t3, X, 2 * SIZE
  156. MUL t3, ALPHA, a7
  157. LD a7, X, 14 * SIZE
  158. ST t4, X, 3 * SIZE
  159. MUL t4, ALPHA, a8
  160. LD a8, X, 15 * SIZE
  161. addi.d I, I, -1
  162. ST t1, X, 4 * SIZE
  163. ST t2, X, 5 * SIZE
  164. ST t3, X, 6 * SIZE
  165. ST t4, X, 7 * SIZE
  166. addi.d X, X, 8 * SIZE
  167. blt $r0, I, .L52
  168. .align 3
  169. .L53: // drain: multiply and store the last loaded group, no further loads
  170. MUL t1, ALPHA, a1
  171. MUL t2, ALPHA, a2
  172. MUL t3, ALPHA, a3
  173. MUL t4, ALPHA, a4
  174. ST t1, X, 0 * SIZE
  175. MUL t1, ALPHA, a5
  176. ST t2, X, 1 * SIZE
  177. MUL t2, ALPHA, a6
  178. ST t3, X, 2 * SIZE
  179. MUL t3, ALPHA, a7
  180. ST t4, X, 3 * SIZE
  181. MUL t4, ALPHA, a8
  182. ST t1, X, 4 * SIZE
  183. ST t2, X, 5 * SIZE
  184. ST t3, X, 6 * SIZE
  185. ST t4, X, 7 * SIZE
  186. addi.d X, X, 8 * SIZE
  187. .align 3
  188. .L55: // contiguous multiply, leftovers
  189. andi I, N, 7 // I = N mod 8
  190. bge $r0, I, .L999
  191. .align 3
  192. .L56:
  193. LD a1, X, 0 * SIZE
  194. MUL t1, ALPHA, a1
  195. addi.d X, X, SIZE
  196. addi.d I, I, -1
  197. ST t1, X, -1 * SIZE // X was already advanced, so the element just read is at -1
  198. blt $r0, I, .L56
  199. move $r4, $r17 // same return sequence as .L999
  200. fmov.d $f0, $f22
  201. jirl $r0, $r1, 0x0
  202. .align 3
  203. .L60: // strided multiply: X is the load pointer, XX the store pointer
  204. srai.d I, N, 3 // recomputes the group count already held in I
  205. move XX, X // XX is free again once the dummy2 test is done
  206. addi.d I, I, -1 // I = groups - 1; the last group is drained at .L63
  207. blt I, $r0, .L65 // no full group: tail only
  208. LD a1, X, 0 * SIZE // preload the first group
  209. add.d X, X, INCX
  210. LD a2, X, 0 * SIZE
  211. add.d X, X, INCX
  212. LD a3, X, 0 * SIZE
  213. add.d X, X, INCX
  214. LD a4, X, 0 * SIZE
  215. add.d X, X, INCX
  216. LD a5, X, 0 * SIZE
  217. add.d X, X, INCX
  218. LD a6, X, 0 * SIZE
  219. add.d X, X, INCX
  220. LD a7, X, 0 * SIZE
  221. add.d X, X, INCX
  222. LD a8, X, 0 * SIZE
  223. add.d X, X, INCX
  224. bge $r0, I, .L63 // exactly one group: skip the steady-state loop
  225. .align 3
  226. .L62: // multiply and store the current group through XX while loading the next one through X
  227. MUL t1, ALPHA, a1
  228. LD a1, X, 0 * SIZE
  229. add.d X, X, INCX
  230. MUL t2, ALPHA, a2
  231. LD a2, X, 0 * SIZE
  232. add.d X, X, INCX
  233. MUL t3, ALPHA, a3
  234. LD a3, X, 0 * SIZE
  235. add.d X, X, INCX
  236. MUL t4, ALPHA, a4
  237. LD a4, X, 0 * SIZE
  238. add.d X, X, INCX
  239. ST t1, XX, 0 * SIZE
  240. add.d XX, XX, INCX
  241. ST t2, XX, 0 * SIZE
  242. add.d XX, XX, INCX
  243. ST t3, XX, 0 * SIZE
  244. add.d XX, XX, INCX
  245. ST t4, XX, 0 * SIZE
  246. add.d XX, XX, INCX
  247. MUL t1, ALPHA, a5
  248. LD a5, X, 0 * SIZE
  249. add.d X, X, INCX
  250. MUL t2, ALPHA, a6
  251. LD a6, X, 0 * SIZE
  252. add.d X, X, INCX
  253. MUL t3, ALPHA, a7
  254. LD a7, X, 0 * SIZE
  255. add.d X, X, INCX
  256. MUL t4, ALPHA, a8
  257. LD a8, X, 0 * SIZE
  258. add.d X, X, INCX
  259. ST t1, XX, 0 * SIZE
  260. add.d XX, XX, INCX
  261. ST t2, XX, 0 * SIZE
  262. add.d XX, XX, INCX
  263. ST t3, XX, 0 * SIZE
  264. add.d XX, XX, INCX
  265. ST t4, XX, 0 * SIZE
  266. addi.d I, I, -1
  267. add.d XX, XX, INCX
  268. blt $r0, I, .L62
  269. .align 3
  270. .L63: // drain: multiply and store the last loaded group; afterwards XX has caught up with X
  271. MUL t1, ALPHA, a1
  272. MUL t2, ALPHA, a2
  273. MUL t3, ALPHA, a3
  274. MUL t4, ALPHA, a4
  275. ST t1, XX, 0 * SIZE
  276. add.d XX, XX, INCX
  277. ST t2, XX, 0 * SIZE
  278. add.d XX, XX, INCX
  279. ST t3, XX, 0 * SIZE
  280. add.d XX, XX, INCX
  281. ST t4, XX, 0 * SIZE
  282. add.d XX, XX, INCX
  283. MUL t1, ALPHA, a5
  284. MUL t2, ALPHA, a6
  285. MUL t3, ALPHA, a7
  286. MUL t4, ALPHA, a8
  287. ST t1, XX, 0 * SIZE
  288. add.d XX, XX, INCX
  289. ST t2, XX, 0 * SIZE
  290. add.d XX, XX, INCX
  291. ST t3, XX, 0 * SIZE
  292. add.d XX, XX, INCX
  293. ST t4, XX, 0 * SIZE
  294. add.d XX, XX, INCX
  295. .align 3
  296. .L65: // strided multiply, leftovers; X alone is used for both load and store
  297. andi I, N, 7 // I = N mod 8
  298. bge $r0, I, .L999
  299. .align 3
  300. .L66:
  301. LD a1, X, 0 * SIZE
  302. MUL t1, ALPHA, a1
  303. addi.d I, I, -1
  304. ST t1, X, 0 * SIZE
  305. add.d X, X, INCX
  306. blt $r0, I, .L66
  307. .align 3
  308. .L999: // common exit
  309. move $r4, $r17 // $r4 = I
  310. fmov.d $f0, $f22 // $f0 = a1
  311. jirl $r0, $r1, 0x0 // jump to the address in $r1 without linking
  312. EPILOGUE