
scal_lasx.S

/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
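
/* SCAL kernel for LoongArch64 LASX: scales a vector in place,
 * x[i] = ALPHA * x[i] for N elements with stride INCX.
 * ALPHA == 1 returns immediately; ALPHA == 0 stores zeros directly
 * unless the dummy2 flag (loaded from the stack) is set, in which
 * case the generic multiply path is taken instead. */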
#define N      $r4
#define ALPHA  $f0
#define X      $r7
#define INCX   $r8
#define I      $r12
#define TEMP   $r13
#define t1     $r14
#define t2     $r18
#define t3     $r15
#define t4     $r17
#define XX     $r16
#define VX0    $xr12
#define VX1    $xr13
#define VT0    $xr14
#define VT1    $xr15
#define VALPHA $xr19
#define a1     $f8
#define a2     $f23
PROLOGUE

    bge     $r0, N, .L999
    bge     $r0, INCX, .L999
    li.d    TEMP, 1
    ld.d    t1, $sp, 0          // load the dummy2 flag
    movgr2fr.d a1, $r0
    FFINT   a1, a1              // a1 = 0.0
    movgr2fr.d a2, TEMP
    FFINT   a2, a2              // a2 = 1.0
    slli.d  TEMP, TEMP, BASE_SHIFT
    slli.d  INCX, INCX, BASE_SHIFT
    slli.d  t1, t1, BASE_SHIFT
    CMPEQ   $fcc0, ALPHA, a1
    bcnez   $fcc0, .L20         // ALPHA == 0
    CMPEQ   $fcc0, ALPHA, a2
    bcnez   $fcc0, .L999        // ALPHA == 1, nothing to do
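
/* ALPHA is neither 0 nor 1: dispatch on stride, .L30 for unit stride
 * (INCX == 1) or .L10 for the general strided case. */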
.L1:
    srai.d  I, N, 3
    beq     INCX, TEMP, .L30    // ALPHA != 0|1 and INCX == 1
    MTG     TEMP, ALPHA
#ifdef DOUBLE
    xvreplgr2vr.d VALPHA, TEMP
#else
    xvreplgr2vr.w VALPHA, TEMP
#endif
    move    XX, X
    .align 3
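
/* Strided case: gather eight elements with scalar loads, multiply in
 * LASX registers, and scatter the products back with xvstelm. */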
.L10: // ALPHA != 0|1 and INCX != 1
    bge     $r0, I, .L32
    .align 3
.L11:
#ifdef DOUBLE
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    xvfmul.d VT0, VX0, VALPHA
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvstelm.d VT0, XX, 0, 0
    add.d   XX, XX, INCX
    xvstelm.d VT0, XX, 0, 1
    add.d   XX, XX, INCX
    xvstelm.d VT0, XX, 0, 2
    add.d   XX, XX, INCX
    xvstelm.d VT0, XX, 0, 3
    add.d   XX, XX, INCX
    xvfmul.d VT1, VX1, VALPHA
    xvstelm.d VT1, XX, 0, 0
    add.d   XX, XX, INCX
    xvstelm.d VT1, XX, 0, 1
    add.d   XX, XX, INCX
    xvstelm.d VT1, XX, 0, 2
    add.d   XX, XX, INCX
    xvstelm.d VT1, XX, 0, 3
#else
    ld.w    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.w VX0, t1, 0
    xvinsgr2vr.w VX0, t2, 1
    xvinsgr2vr.w VX0, t3, 2
    xvinsgr2vr.w VX0, t4, 3
    ld.w    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.w VX0, t1, 4
    xvinsgr2vr.w VX0, t2, 5
    xvinsgr2vr.w VX0, t3, 6
    xvinsgr2vr.w VX0, t4, 7
    xvfmul.s VT0, VX0, VALPHA
    xvstelm.w VT0, XX, 0, 0
    add.d   XX, XX, INCX
    xvstelm.w VT0, XX, 0, 1
    add.d   XX, XX, INCX
    xvstelm.w VT0, XX, 0, 2
    add.d   XX, XX, INCX
    xvstelm.w VT0, XX, 0, 3
    add.d   XX, XX, INCX
    xvstelm.w VT0, XX, 0, 4
    add.d   XX, XX, INCX
    xvstelm.w VT0, XX, 0, 5
    add.d   XX, XX, INCX
    xvstelm.w VT0, XX, 0, 6
    add.d   XX, XX, INCX
    xvstelm.w VT0, XX, 0, 7
#endif
    add.d   XX, XX, INCX
    addi.d  I, I, -1
    blt     $r0, I, .L11
    b       .L32
    .align 3
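
/* ALPHA == 0: store zeros directly. .L21/.L23 handle the strided case
 * with scalar stores; .L25/.L27 handle unit stride with vector stores. */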
.L20:
    beq     t1, TEMP, .L1       // if dummy2 == 1, do not directly copy 0
    srai.d  I, N, 3
    beq     INCX, TEMP, .L24
    bge     $r0, I, .L22
    .align 3
.L21:
    ST      a1, X, 0
    add.d   X, X, INCX
    ST      a1, X, 0
    add.d   X, X, INCX
    ST      a1, X, 0
    add.d   X, X, INCX
    ST      a1, X, 0
    add.d   X, X, INCX
    ST      a1, X, 0
    add.d   X, X, INCX
    ST      a1, X, 0
    add.d   X, X, INCX
    ST      a1, X, 0
    add.d   X, X, INCX
    ST      a1, X, 0
    add.d   X, X, INCX
    addi.d  I, I, -1
    blt     $r0, I, .L21
    .align 3
.L22:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3
.L23:
    ST      a1, X, 0 * SIZE
    addi.d  I, I, -1
    add.d   X, X, INCX
    blt     $r0, I, .L23
    jirl    $r0, $r1, 0
    .align 3
.L24:
    bge     $r0, I, .L26        // N < 8 and INCX == 1
    .align 3
.L25:
    xvxor.v VX0, VX0, VX0
    xvst    VX0, X, 0 * SIZE
#ifdef DOUBLE
    xvst    VX0, X, 4 * SIZE
#endif
    addi.d  I, I, -1
    addi.d  X, X, 8 * SIZE
    blt     $r0, I, .L25
    .align 3
.L26:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3
.L27:
    ST      a1, X, 0 * SIZE
    addi.d  I, I, -1
    addi.d  X, X, SIZE
    blt     $r0, I, .L27
    jirl    $r0, $r1, 0
    .align 3
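
/* ALPHA != 0|1 and INCX == 1: broadcast ALPHA, then scale eight
 * elements per iteration with full-width LASX loads, multiplies, and
 * stores. */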
.L30:
    bge     $r0, I, .L32        // N < 8 and INCX == 1
    MTG     TEMP, ALPHA
#ifdef DOUBLE
    xvreplgr2vr.d VALPHA, TEMP
#else
    xvreplgr2vr.w VALPHA, TEMP
#endif
    .align 3
.L31:
    xvld    VX0, X, 0 * SIZE
#ifdef DOUBLE
    xvld    VX1, X, 4 * SIZE
    xvfmul.d VT0, VX0, VALPHA
    xvfmul.d VT1, VX1, VALPHA
    xvst    VT0, X, 0 * SIZE
    xvst    VT1, X, 4 * SIZE
#else
    xvfmul.s VT0, VX0, VALPHA
    xvst    VT0, X, 0 * SIZE
#endif
    addi.d  I, I, -1
    addi.d  X, X, 8 * SIZE
    blt     $r0, I, .L31
    .align 3
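
/* Scalar tail: scale the remaining N % 8 elements one at a time. */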
.L32:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3
.L33:
    LD      a1, X, 0 * SIZE
    addi.d  I, I, -1
    MUL     a1, ALPHA, a1
    ST      a1, X, 0 * SIZE
    add.d   X, X, INCX
    blt     $r0, I, .L33
    jirl    $r0, $r1, 0
    .align 3
.L999:
    jirl    $r0, $r1, 0x0

EPILOGUE