You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

casum_lsx.S 9.3 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. /***************************************************************************
  2. Copyright (c) 2023, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. #define N $r4
  30. #define X $r5
  31. #define INCX $r6
  32. #define I $r17
  33. #define TEMP $r18
  34. #define t1 $r15
  35. #define t2 $r12
  36. #define t3 $r13
  37. #define t4 $r14
  38. #define a1 $f12
  39. #define a2 $f13
  40. #define a3 $f14
  41. #define a4 $f15
  42. #define s1 $f16
  43. #define VX0 $vr12
  44. #define VX1 $vr13
  45. #define VX2 $vr14
  46. #define VX3 $vr15
  47. #define res1 $vr16
  48. #define res2 $vr17
  49. #define res3 $vr18
  50. #define res0 $vr19
  51. #define neg1 $vr20
  52. #define VT0 $vr21
  53. #define VT1 $vr22
  54. PROLOGUE
  55. vxor.v res1, res1, res1
  56. vxor.v res2, res2, res2
  57. vxor.v res0, res0, res0
  58. bge $r0, N, .L999
  59. bge $r0, INCX, .L999
  60. #ifdef DOUBLE
  61. li.d t1, -1
  62. vreplgr2vr.d neg1, t1
  63. vffint.d.l neg1, neg1
  64. #else
  65. li.w t1, -1
  66. vreplgr2vr.w neg1, t1
  67. vffint.s.w neg1, neg1
  68. #endif
  69. li.d TEMP, 1
  70. slli.d TEMP, TEMP, ZBASE_SHIFT
  71. slli.d INCX, INCX, ZBASE_SHIFT
  72. srai.d I, N, 3
  73. bne INCX, TEMP, .L20
  74. bge $r0, I, .L13
  75. .align 3
  76. .L11:
  77. #ifdef DOUBLE
  78. vld VX0, X, 0 * SIZE
  79. vld VX1, X, 2 * SIZE
  80. vfmul.d VX2, neg1, VX0
  81. vfmul.d VX3, neg1, VX1
  82. vfcmp.clt.d VT0, VX0, res0
  83. vfcmp.clt.d VT1, VX1, res0
  84. vbitsel.v VX0, VX0, VX2, VT0
  85. vbitsel.v VX1, VX1, VX3, VT1
  86. vfadd.d res2, VX0, VX1
  87. vfadd.d res1, res1, res2
  88. vld VX2, X, 4 * SIZE
  89. vld VX3, X, 6 * SIZE
  90. vfmul.d VX0, neg1, VX2
  91. vfmul.d VX1, neg1, VX3
  92. vfcmp.clt.d VT0, VX2, res0
  93. vfcmp.clt.d VT1, VX3, res0
  94. vbitsel.v VX2, VX2, VX0, VT0
  95. vbitsel.v VX3, VX3, VX1, VT1
  96. vfadd.d res2, VX2, VX3
  97. vfadd.d res1, res1, res2
  98. vld VX0, X, 8 * SIZE
  99. vld VX1, X, 10 * SIZE
  100. vfmul.d VX2, neg1, VX0
  101. vfmul.d VX3, neg1, VX1
  102. vfcmp.clt.d VT0, VX0, res0
  103. vfcmp.clt.d VT1, VX1, res0
  104. vbitsel.v VX0, VX0, VX2, VT0
  105. vbitsel.v VX1, VX1, VX3, VT1
  106. vfadd.d res2, VX0, VX1
  107. vfadd.d res1, res1, res2
  108. vld VX2, X, 12 * SIZE
  109. vld VX3, X, 14 * SIZE
  110. vfmul.d VX0, neg1, VX2
  111. vfmul.d VX1, neg1, VX3
  112. vfcmp.clt.d VT0, VX2, res0
  113. vfcmp.clt.d VT1, VX3, res0
  114. vbitsel.v VX2, VX2, VX0, VT0
  115. vbitsel.v VX3, VX3, VX1, VT1
  116. vfadd.d res2, VX2, VX3
  117. vfadd.d res1, res1, res2
  118. addi.d I, I, -1
  119. #else
  120. vld VX0, X, 0 * SIZE
  121. vld VX1, X, 4 * SIZE
  122. vfmul.s VX2, neg1, VX0
  123. vfmul.s VX3, neg1, VX1
  124. vfcmp.clt.s VT0, VX0, res0
  125. vfcmp.clt.s VT1, VX1, res0
  126. vbitsel.v VX0, VX0, VX2, VT0
  127. vbitsel.v VX1, VX1, VX3, VT1
  128. vfadd.s res2, VX0, VX1
  129. vld VX0, X, 8 * SIZE
  130. vld VX1, X, 12 * SIZE
  131. addi.d I, I, -1
  132. vfmul.s VX2, neg1, VX0
  133. vfmul.s VX3, neg1, VX1
  134. vfcmp.clt.s VT0, VX0, res0
  135. vfcmp.clt.s VT1, VX1, res0
  136. vbitsel.v VX0, VX0, VX2, VT0
  137. vbitsel.v VX1, VX1, VX3, VT1
  138. vfadd.s res3, VX1, VX0
  139. vfadd.s res2, res3, res2
  140. vfadd.s res1, res1, res2
  141. #endif
  142. addi.d X, X, 16 * SIZE
  143. blt $r0, I, .L11
  144. .align 3
  145. .L12:
  146. #ifdef DOUBLE
  147. vreplvei.d VX1, res1, 1
  148. vfadd.d res1, VX1, res1
  149. #else
  150. vreplvei.w VX1, res1, 1
  151. vreplvei.w VX2, res1, 2
  152. vreplvei.w VX3, res1, 3
  153. vfadd.s res1, VX1, res1
  154. vfadd.s res1, VX2, res1
  155. vfadd.s res1, VX3, res1
  156. #endif
  157. .align 3
  158. .L13:
  159. andi I, N, 7
  160. bge $r0, I, .L999
  161. .align 3
  162. .L14:
  163. LD a1, X, 0 * SIZE
  164. LD a2, X, 1 * SIZE
  165. FABS a1, a1
  166. FABS a2, a2
  167. addi.d I, I, -1
  168. ADD a1, a1, a2
  169. ADD s1, a1, s1
  170. addi.d X, X, 2 * SIZE
  171. blt $r0, I, .L14
  172. b .L999
  173. .align 3
  174. .L20:
  175. bge $r0, I, .L23
  176. .align 3
  177. .L21:
  178. #ifdef DOUBLE
  179. ld.d t1, X, 0 * SIZE
  180. ld.d t2, X, 1 * SIZE
  181. add.d X, X, INCX
  182. vinsgr2vr.d VX0, t1, 0
  183. vinsgr2vr.d VX0, t2, 1
  184. ld.d t1, X, 0 * SIZE
  185. ld.d t2, X, 1 * SIZE
  186. vinsgr2vr.d VX1, t1, 0
  187. vinsgr2vr.d VX1, t2, 1
  188. add.d X, X, INCX
  189. vfmul.d VX2, neg1, VX0
  190. vfmul.d VX3, neg1, VX1
  191. vfcmp.clt.d VT0, VX0, res0
  192. vfcmp.clt.d VT1, VX1, res0
  193. vbitsel.v VX0, VX0, VX2, VT0
  194. vbitsel.v VX1, VX1, VX3, VT1
  195. vfadd.d res2, VX0, VX1
  196. vfadd.d res1, res1, res2
  197. ld.d t3, X, 0 * SIZE
  198. ld.d t4, X, 1 * SIZE
  199. add.d X, X, INCX
  200. vinsgr2vr.d VX0, t3, 0
  201. vinsgr2vr.d VX0, t4, 1
  202. ld.d t3, X, 0 * SIZE
  203. ld.d t4, X, 1 * SIZE
  204. vinsgr2vr.d VX1, t3, 0
  205. vinsgr2vr.d VX1, t4, 1
  206. add.d X, X, INCX
  207. vfmul.d VX2, neg1, VX0
  208. vfmul.d VX3, neg1, VX1
  209. vfcmp.clt.d VT0, VX0, res0
  210. vfcmp.clt.d VT1, VX1, res0
  211. vbitsel.v VX0, VX0, VX2, VT0
  212. vbitsel.v VX1, VX1, VX3, VT1
  213. vfadd.d res2, VX0, VX1
  214. vfadd.d res1, res1, res2
  215. ld.d t1, X, 0 * SIZE
  216. ld.d t2, X, 1 * SIZE
  217. add.d X, X, INCX
  218. vinsgr2vr.d VX0, t1, 0
  219. vinsgr2vr.d VX0, t2, 1
  220. ld.d t1, X, 0 * SIZE
  221. ld.d t2, X, 1 * SIZE
  222. vinsgr2vr.d VX1, t1, 0
  223. vinsgr2vr.d VX1, t2, 1
  224. add.d X, X, INCX
  225. vfmul.d VX2, neg1, VX0
  226. vfmul.d VX3, neg1, VX1
  227. vfcmp.clt.d VT0, VX0, res0
  228. vfcmp.clt.d VT1, VX1, res0
  229. vbitsel.v VX0, VX0, VX2, VT0
  230. vbitsel.v VX1, VX1, VX3, VT1
  231. vfadd.d res2, VX0, VX1
  232. vfadd.d res1, res1, res2
  233. ld.d t3, X, 0 * SIZE
  234. ld.d t4, X, 1 * SIZE
  235. add.d X, X, INCX
  236. vinsgr2vr.d VX0, t3, 0
  237. vinsgr2vr.d VX0, t4, 1
  238. ld.d t3, X, 0 * SIZE
  239. ld.d t4, X, 1 * SIZE
  240. vinsgr2vr.d VX1, t3, 0
  241. vinsgr2vr.d VX1, t4, 1
  242. add.d X, X, INCX
  243. vfmul.d VX2, neg1, VX0
  244. vfmul.d VX3, neg1, VX1
  245. vfcmp.clt.d VT0, VX0, res0
  246. vfcmp.clt.d VT1, VX1, res0
  247. vbitsel.v VX0, VX0, VX2, VT0
  248. vbitsel.v VX1, VX1, VX3, VT1
  249. vfadd.d res2, VX0, VX1
  250. vfadd.d res1, res1, res2
  251. #else
  252. ld.w t1, X, 0 * SIZE
  253. ld.w t2, X, 1 * SIZE
  254. add.d X, X, INCX
  255. ld.w t3, X, 0 * SIZE
  256. ld.w t4, X, 1 * SIZE
  257. add.d X, X, INCX
  258. vinsgr2vr.w VX0, t1, 0
  259. vinsgr2vr.w VX0, t2, 1
  260. vinsgr2vr.w VX0, t3, 2
  261. vinsgr2vr.w VX0, t4, 3
  262. ld.w t1, X, 0 * SIZE
  263. ld.w t2, X, 1 * SIZE
  264. add.d X, X, INCX
  265. ld.w t3, X, 0 * SIZE
  266. ld.w t4, X, 1 * SIZE
  267. add.d X, X, INCX
  268. vinsgr2vr.w VX1, t1, 0
  269. vinsgr2vr.w VX1, t2, 1
  270. vinsgr2vr.w VX1, t3, 2
  271. vinsgr2vr.w VX1, t4, 3
  272. vfmul.s VX2, neg1, VX0
  273. vfmul.s VX3, neg1, VX1
  274. vfcmp.clt.s VT0, VX0, res0
  275. vfcmp.clt.s VT1, VX1, res0
  276. vbitsel.v VX0, VX0, VX2, VT0
  277. vbitsel.v VX1, VX1, VX3, VT1
  278. vfadd.s res2, VX0, VX1
  279. ld.w t1, X, 0 * SIZE
  280. ld.w t2, X, 1 * SIZE
  281. add.d X, X, INCX
  282. ld.w t3, X, 0 * SIZE
  283. ld.w t4, X, 1 * SIZE
  284. add.d X, X, INCX
  285. vinsgr2vr.w VX2, t1, 0
  286. vinsgr2vr.w VX2, t2, 1
  287. vinsgr2vr.w VX2, t3, 2
  288. vinsgr2vr.w VX2, t4, 3
  289. ld.w t1, X, 0 * SIZE
  290. ld.w t2, X, 1 * SIZE
  291. add.d X, X, INCX
  292. ld.w t3, X, 0 * SIZE
  293. ld.w t4, X, 1 * SIZE
  294. add.d X, X, INCX
  295. vinsgr2vr.w VX3, t1, 0
  296. vinsgr2vr.w VX3, t2, 1
  297. vinsgr2vr.w VX3, t3, 2
  298. vinsgr2vr.w VX3, t4, 3
  299. vfmul.s VX0, neg1, VX2
  300. vfmul.s VX1, neg1, VX3
  301. vfcmp.clt.s VT0, VX2, res0
  302. vfcmp.clt.s VT1, VX3, res0
  303. vbitsel.v VX2, VX2, VX0, VT0
  304. vbitsel.v VX3, VX3, VX1, VT1
  305. vfadd.s res3, VX2, VX3
  306. vfadd.s res2, res3, res2
  307. vfadd.s res1, res1, res2
  308. #endif
  309. addi.d I, I, -1
  310. blt $r0, I, .L21
  311. .align 3
  312. .L22:
  313. #ifdef DOUBLE
  314. vreplvei.d VX1, res1, 1
  315. vfadd.d res1, VX1, res1
  316. #else
  317. vreplvei.w VX1, res1, 1
  318. vreplvei.w VX2, res1, 2
  319. vreplvei.w VX3, res1, 3
  320. vfadd.s res1, VX1, res1
  321. vfadd.s res1, VX2, res1
  322. vfadd.s res1, VX3, res1
  323. #endif
  324. .align 3
  325. .L23:
  326. andi I, N, 7
  327. bge $r0, I, .L999
  328. .align 3
  329. .L24:
  330. LD a1, X, 0 * SIZE
  331. LD a2, X, 1 * SIZE
  332. FABS a1, a1
  333. FABS a2, a2
  334. addi.d I, I, -1
  335. ADD a1, a1, a2
  336. ADD s1, a1, s1
  337. add.d X, X, INCX
  338. blt $r0, I, .L24
  339. .align 3
  340. .L999:
  341. MOV $f0, $f16
  342. jirl $r0, $r1, 0x0
  343. .align 3
  344. EPILOGUE