You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

iamin_lsx.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446
  1. /***************************************************************************
  2. Copyright (c) 2023, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. #define N $r4 // element count; also the register the result is returned in (see .L999)
  30. #define X $r5 // pointer to the current element
  31. #define INCX $r6 // stride; scaled to bytes by BASE_SHIFT at entry
  32. #define I $r12 // loop counter (N/8 in the vector loops, N%8 in the tail)
  33. #define t1 $r13 // t1..t4: scalar temporaries for strided loads
  34. #define t2 $r15
  35. #define t3 $r18
  36. #define t4 $r16
  37. #define i0 $r17 // result index (1-based; 0 on early exit)
  38. #define i1 $r14 // running index used by the scalar tail loop
  39. #define TEMP $r19 // scratch: unit-stride constant / look-ahead pointer in .L20
  40. #define x1 $vr9 // x1..x4: vector scratch (values or candidate indices)
  41. #define x2 $vr10
  42. #define x3 $vr11
  43. #define x4 $vr12
  44. #define VX0 $vr13 // VX0/VX1: data loaded in the current pass
  45. #define VX1 $vr14
  46. #define VM0 $vr15 // running per-lane minimum-magnitude value
  47. #define VM1 $vr16 // candidate minimum for the current pass
  48. #ifdef DOUBLE
  49. #define VINC2 $vr17 // index step of 2 (two doubles per vector)
  50. #define VINC4 $vr18 // index step of 4
  51. #else
  52. #define VINC4 $vr17 // index step of 4 (four floats per vector)
  53. #define VINC8 $vr18 // index step of 8
  54. #endif
  55. #define VI0 $vr20 // per-lane index belonging to VM0
  56. #define VI1 $vr21 // VI1..VI4: index vectors for the elements being examined
  57. #define VI2 $vr22
  58. #define VI3 $vr8
  59. #define VI4 $vr19
  60. #define VT0 $vr23 // comparison mask
  61. PROLOGUE // entry macro defined outside this file (presumably via common.h)
  62. li.d i0, 0 // result defaults to 0 for the early-exit paths
  63. bge $r0, N, .L999 // N <= 0: return 0
  64. bge $r0, INCX, .L999 // INCX <= 0: return 0
  65. li.d TEMP, 1
  66. slli.d TEMP, TEMP, BASE_SHIFT // TEMP = 1 << BASE_SHIFT (presumably the byte size of one element)
  67. slli.d INCX, INCX, BASE_SHIFT // scale INCX by the same shift; it is added to X as a byte offset below
  68. bne INCX, TEMP, .L20 // non-unit stride: use the scalar-gather path
  69. vld VM0, X, 0 // seed the running minimum with the first 16 bytes at X. NOTE(review): full-width load even when N is tiny - confirm the over-read is acceptable
  70. #ifdef DOUBLE
  71. addi.d i0, i0, 1 // i0 = 1: index reported if only the tail loop runs
  72. srai.d I, N, 3 // I = N / 8 vector passes
  73. bge $r0, I, .L21 // fewer than 8 elements: scalar tail only
  74. slli.d i0, i0, 1 //2
  75. vreplgr2vr.d VINC2, i0 // VINC2 = {2, 2}
  76. slli.d i0, i0, 1 //4
  77. vreplgr2vr.d VINC4, i0 // VINC4 = {4, 4}
  78. addi.d i0, i0, -7 // i0 = -3
  79. vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
  80. addi.d i0, i0, 1
  81. vinsgr2vr.d VI1, i0, 1 // VI1 = {-3, -2}; the first "+= VINC4" in the loop makes it {1, 2}
  82. addi.d i0, i0, 3
  83. vinsgr2vr.d VI0, i0, 0 //1
  84. addi.d i0, i0, 1
  85. vinsgr2vr.d VI0, i0, 1 //2
  86. #else
  87. addi.w i0, i0, 1 // i0 = 1: index reported if only the tail loop runs
  88. srai.d I, N, 3 // I = N / 8 vector passes
  89. bge $r0, I, .L21 // fewer than 8 elements: scalar tail only
  90. slli.w i0, i0, 2 //4
  91. vreplgr2vr.w VINC4, i0 // VINC4 = {4, 4, 4, 4}
  92. slli.w i0, i0, 1 //8
  93. vreplgr2vr.w VINC8, i0 // VINC8 = {8, 8, 8, 8}
  94. addi.w i0, i0, -15 // i0 = -7
  95. vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
  96. addi.w i0, i0, 1
  97. vinsgr2vr.w VI1, i0, 1
  98. addi.w i0, i0, 1
  99. vinsgr2vr.w VI1, i0, 2
  100. addi.w i0, i0, 1
  101. vinsgr2vr.w VI1, i0, 3 // VI1 = {-7, -6, -5, -4}; the first "+= VINC8" in the loop makes it {1, 2, 3, 4}
  102. addi.w i0, i0, 5
  103. vinsgr2vr.w VI0, i0, 0 //1
  104. addi.w i0, i0, 1
  105. vinsgr2vr.w VI0, i0, 1 //2
  106. addi.w i0, i0, 1
  107. vinsgr2vr.w VI0, i0, 2 //3
  108. addi.w i0, i0, 1
  109. vinsgr2vr.w VI0, i0, 3 //4
  110. #endif
  111. .align 3
  112. .L10: // unit-stride main loop: 8 contiguous elements per pass
  113. vld VX0, X, 0 * SIZE
  114. #ifdef DOUBLE
  115. vadd.d VI1, VI1, VINC4 // VI1 = 1-based indices of elements 0..1 of this pass
  116. vld VX1, X, 2 * SIZE
  117. vadd.d VI2, VI1, VINC2 // VI2 = indices of elements 2..3
  118. vfmina.d x1, VX0, VX1 // per lane: the value with the smaller magnitude
  119. vfcmp.ceq.d VT0, VX0, x1 // mask: lanes where VX0 supplied that value
  120. vbitsel.v x2, VI2, VI1, VT0 // x2 = VI1 where the mask is set, VI2 elsewhere
  121. vld VX0, X, 4 * SIZE
  122. vadd.d VI1, VI2, VINC2 // indices of elements 4..5
  123. vld VX1, X, 6 * SIZE
  124. vadd.d VI2, VI1, VINC2 // indices of elements 6..7
  125. vfmina.d x3, VX0, VX1
  126. vfcmp.ceq.d VT0, VX0, x3
  127. vbitsel.v x4, VI2, VI1, VT0 // x4 = index of the second-half winner
  128. vfmina.d x3, x1, x3 // fold both halves: per-lane minimum of this pass
  129. vfcmp.ceq.d VT0, x1, x3
  130. addi.d I, I, -1
  131. vbitsel.v x2, x4, x2, VT0 // x2 = index matching x3 (first half kept on a tie)
  132. vfmina.d VM1, VM0, x3 // candidate running minimum
  133. #else
  134. vadd.w VI1, VI1, VINC8 // VI1 = 1-based indices of elements 0..3 of this pass
  135. vld VX1, X, 4 * SIZE
  136. vadd.w VI2, VI1, VINC4 // VI2 = indices of elements 4..7
  137. vfmina.s VM1, VX0, VX1 // per lane: the value with the smaller magnitude
  138. vfcmp.ceq.s VT0, VX0, VM1 // mask: lanes where VX0 supplied that value
  139. addi.d I, I, -1
  140. vbitsel.v x2, VI2, VI1, VT0 // x2 = index of this pass's winner
  141. vfmina.s VM1, VM0, VM1 // candidate running minimum
  142. #endif
  143. VCMPEQ VT0, VM0, VM1 // macro defined outside this file (presumably the precision-matched vfcmp.ceq): mask where the old minimum still holds
  144. addi.d X, X, 8 * SIZE
  145. vbitsel.v VM0, VM1, VM0, VT0 // keep VM0 where the mask is set, else take VM1
  146. vbitsel.v VI0, x2, VI0, VT0 // same mask for the index, so value and index stay paired
  147. blt $r0, I, .L10
  148. .align 3
  149. .L15: // unit-stride path: start reducing the per-lane results to a single index
  150. #ifdef DOUBLE
  151. vreplvei.d VI1, VI0, 0 // broadcast the lane-0 index
  152. vreplvei.d VI2, VI0, 1 // broadcast the lane-1 index
  153. vreplvei.d x1, VM0, 0 // broadcast the lane-0 value
  154. vreplvei.d x2, VM0, 1 // broadcast the lane-1 value
  155. fcmp.ceq.d $fcc0, $f10, $f9 // scalar compare of x2 and x1 through the $f views of $vr10/$vr9
  156. bceqz $fcc0, .L26 // values differ: .L26 picks the smaller-magnitude one
  157. vfcmp.clt.d VT0, VI1, VI2 // tie: choose the lower index (the integer indices are compared with an FP compare)
  158. vbitsel.v VI0, VI2, VI1, VT0
  159. b .L27
  160. #else
  161. vreplvei.w VI1, VI0, 0 // broadcast each lane's index ...
  162. vreplvei.w VI2, VI0, 1
  163. vreplvei.w VI3, VI0, 2
  164. vreplvei.w VI4, VI0, 3
  165. vreplvei.w x1, VM0, 0 // ... and each lane's value
  166. vreplvei.w x2, VM0, 1
  167. vreplvei.w x3, VM0, 2
  168. vreplvei.w x4, VM0, 3
  169. vfmina.s VM1, x1, x2 // winner of lanes 0/1
  170. vfcmp.ceq.s VT0, VM1, x1
  171. vbitsel.v VINC4, VI2, VI1, VT0 // VINC4 is reused as scratch for the lane 0/1 index
  172. vfmina.s VM0, x3, x4 // winner of lanes 2/3
  173. vfcmp.ceq.s VT0, x3, VM0
  174. vbitsel.v VINC8, VI4, VI3, VT0 // VINC8 is reused as scratch for the lane 2/3 index
  175. vfmina.s VM0, VM0, VM1 // overall minimum-magnitude value
  176. vfcmp.ceq.s VT0, VM0, VM1
  177. vbitsel.v VI0, VINC8, VINC4, VT0 // index paired with the overall minimum
  178. fcmp.ceq.d $fcc0, $f15, $f9 // compares the low 64 bits of VM0 and x1 (both are broadcasts)
  179. bceqz $fcc0, .L26
  180. vfcmp.clt.s VT0, VI1, VI0 // lane 0 also holds the minimum: keep the lower index
  181. vbitsel.v VI0, VI0, VI1, VT0
  182. b .L26 // continue with the lane 1..3 tie checks
  183. #endif
  184. .align 3
  185. .L20: // INCX!=1
  186. move TEMP, X // TEMP walks ahead to seed VM0; X itself is not advanced here
  187. #ifdef DOUBLE
  188. addi.d i0, i0, 1 // i0 = 1: index reported if only the tail loop runs
  189. ld.d t1, TEMP, 0 * SIZE
  190. add.d TEMP, TEMP, INCX
  191. vinsgr2vr.d VM0, t1, 0 // VM0 lane 0 = first element
  192. srai.d I, N, 3 // I = N / 8 vector passes
  193. bge $r0, I, .L21 // fewer than 8 elements: scalar tail only
  194. ld.d t2, TEMP, 0 * SIZE
  195. add.d TEMP, TEMP, INCX
  196. vinsgr2vr.d VM0, t2, 1 // VM0 lane 1 = second element
  197. slli.d i0, i0, 1 //2
  198. vreplgr2vr.d VINC2, i0 // VINC2 = {2, 2}
  199. slli.d i0, i0, 1 //4
  200. vreplgr2vr.d VINC4, i0 // VINC4 = {4, 4}
  201. addi.d i0, i0, -7 // i0 = -3
  202. vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
  203. addi.d i0, i0, 1
  204. vinsgr2vr.d VI1, i0, 1 // VI1 = {-3, -2}; the first "+= VINC4" in the loop makes it {1, 2}
  205. addi.d i0, i0, 3
  206. vinsgr2vr.d VI0, i0, 0 //1
  207. addi.d i0, i0, 1
  208. vinsgr2vr.d VI0, i0, 1 //2
  209. #else
  210. addi.w i0, i0, 1 // i0 = 1: index reported if only the tail loop runs
  211. ld.w t1, TEMP, 0 * SIZE
  212. add.d TEMP, TEMP, INCX
  213. vinsgr2vr.w VM0, t1, 0 // VM0 lane 0 = first element
  214. srai.d I, N, 3 // I = N / 8 vector passes
  215. bge $r0, I, .L21 // fewer than 8 elements: scalar tail only
  216. ld.w t2, TEMP, 0 * SIZE
  217. add.d TEMP, TEMP, INCX
  218. vreplvei.d VI1, VI0, 0 // NOTE(review): looks dead - VI0 is not initialised yet and every lane of VI1 is overwritten below before it is read
  219. ld.w t3, TEMP, 0 * SIZE
  220. add.d TEMP, TEMP, INCX
  221. ld.w t4, TEMP, 0 * SIZE
  222. add.d TEMP, TEMP, INCX
  223. vinsgr2vr.w VM0, t2, 1 // VM0 lanes 1..3 = second..fourth elements
  224. vinsgr2vr.w VM0, t3, 2
  225. vinsgr2vr.w VM0, t4, 3
  226. slli.w i0, i0, 2 //4
  227. vreplgr2vr.w VINC4, i0 // VINC4 = {4, 4, 4, 4}
  228. slli.w i0, i0, 1 //8
  229. vreplgr2vr.w VINC8, i0 // VINC8 = {8, 8, 8, 8}
  230. addi.w i0, i0, -15 // i0 = -7
  231. vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
  232. addi.w i0, i0, 1
  233. vinsgr2vr.w VI1, i0, 1
  234. addi.w i0, i0, 1
  235. vinsgr2vr.w VI1, i0, 2
  236. addi.w i0, i0, 1
  237. vinsgr2vr.w VI1, i0, 3 // VI1 = {-7, -6, -5, -4}; the first "+= VINC8" in the loop makes it {1, 2, 3, 4}
  238. addi.w i0, i0, 5
  239. vinsgr2vr.w VI0, i0, 0 //1
  240. addi.w i0, i0, 1
  241. vinsgr2vr.w VI0, i0, 1 //2
  242. addi.w i0, i0, 1
  243. vinsgr2vr.w VI0, i0, 2 //3
  244. addi.w i0, i0, 1
  245. vinsgr2vr.w VI0, i0, 3 //4
  246. #endif
  247. .align 3
  248. .L24: // strided main loop: gather 8 elements per pass with scalar loads, X advances by INCX bytes per element
  249. #ifdef DOUBLE
  250. ld.d t1, X, 0 * SIZE
  251. add.d X, X, INCX
  252. ld.d t2, X, 0 * SIZE
  253. add.d X, X, INCX
  254. vinsgr2vr.d VX0, t1, 0
  255. vinsgr2vr.d VX0, t2, 1 // VX0 = elements 0..1
  256. vadd.d VI1, VI1, VINC4 // VI1 = 1-based indices of elements 0..1 of this pass
  257. ld.d t1, X, 0 * SIZE
  258. add.d X, X, INCX
  259. ld.d t2, X, 0 * SIZE
  260. add.d X, X, INCX
  261. vinsgr2vr.d VX1, t1, 0
  262. vinsgr2vr.d VX1, t2, 1 // VX1 = elements 2..3
  263. vadd.d VI2, VI1, VINC2 // VI2 = indices of elements 2..3
  264. vfmina.d x1, VX0, VX1 // per lane: the value with the smaller magnitude
  265. vfcmp.ceq.d VT0, VX0, x1 // mask: lanes where VX0 supplied that value
  266. vbitsel.v x2, VI2, VI1, VT0 // x2 = VI1 where the mask is set, VI2 elsewhere
  267. ld.d t1, X, 0 * SIZE
  268. add.d X, X, INCX
  269. ld.d t2, X, 0 * SIZE
  270. add.d X, X, INCX
  271. vinsgr2vr.d VX0, t1, 0
  272. vinsgr2vr.d VX0, t2, 1 // VX0 = elements 4..5
  273. vadd.d VI1, VI2, VINC2 // indices of elements 4..5
  274. ld.d t1, X, 0 * SIZE
  275. add.d X, X, INCX
  276. ld.d t2, X, 0 * SIZE
  277. add.d X, X, INCX
  278. vinsgr2vr.d VX1, t1, 0
  279. vinsgr2vr.d VX1, t2, 1 // VX1 = elements 6..7
  280. vadd.d VI2, VI1, VINC2 // indices of elements 6..7
  281. vfmina.d x3, VX0, VX1
  282. vfcmp.ceq.d VT0, VX0, x3
  283. vbitsel.v x4, VI2, VI1, VT0 // x4 = index of the second-half winner
  284. vfmina.d x3, x1, x3 // fold both halves: per-lane minimum of this pass
  285. vfcmp.ceq.d VT0, x1, x3
  286. addi.d I, I, -1
  287. vbitsel.v x2, x4, x2, VT0 // x2 = index matching x3 (first half kept on a tie)
  288. vfmina.d VM1, VM0, x3 // candidate running minimum
  289. vfcmp.ceq.d VT0, VM0, VM1 // mask must be computed BEFORE VM0 is updated so that the value and index selects below use the same, fresh mask
  290. vbitsel.v VM0, VM1, VM0, VT0 // keep VM0 where it is still the minimum, else take VM1
  291. vbitsel.v VI0, x2, VI0, VT0 // update the paired index under the same mask
  292. #else
  293. ld.w t1, X, 0 * SIZE
  294. add.d X, X, INCX
  295. ld.w t2, X, 0 * SIZE
  296. add.d X, X, INCX
  297. ld.w t3, X, 0 * SIZE
  298. add.d X, X, INCX
  299. ld.w t4, X, 0 * SIZE
  300. add.d X, X, INCX
  301. vinsgr2vr.w VX0, t1, 0
  302. vinsgr2vr.w VX0, t2, 1
  303. vinsgr2vr.w VX0, t3, 2
  304. vinsgr2vr.w VX0, t4, 3 // VX0 = elements 0..3
  305. vadd.w VI1, VI1, VINC8 // VI1 = 1-based indices of elements 0..3 of this pass
  306. ld.w t1, X, 0 * SIZE
  307. add.d X, X, INCX
  308. ld.w t2, X, 0 * SIZE
  309. add.d X, X, INCX
  310. ld.w t3, X, 0 * SIZE
  311. add.d X, X, INCX
  312. ld.w t4, X, 0 * SIZE
  313. add.d X, X, INCX
  314. vinsgr2vr.w VX1, t1, 0
  315. vinsgr2vr.w VX1, t2, 1
  316. vinsgr2vr.w VX1, t3, 2
  317. vinsgr2vr.w VX1, t4, 3 // VX1 = elements 4..7
  318. vadd.w VI2, VI1, VINC4 // VI2 = indices of elements 4..7
  319. vfmina.s VM1, VX0, VX1 // per lane: the value with the smaller magnitude
  320. vfcmp.ceq.s VT0, VX0, VM1 // mask: lanes where VX0 supplied that value
  321. vbitsel.v VI2, VI2, VI1, VT0 // VI2 = index of this pass's winner
  322. vfmina.s VM1, VM0, VM1 // candidate running minimum
  323. vfcmp.ceq.s VT0, VM0, VM1 // mask where the old minimum still holds
  324. addi.d I, I, -1
  325. vbitsel.v VM0, VM1, VM0, VT0 // keep VM0 where the mask is set, else take VM1
  326. vbitsel.v VI0, VI2, VI0, VT0 // same mask for the index
  327. #endif
  328. blt $r0, I, .L24
  329. .align 3
  330. .L25: // strided path: start reducing the per-lane results to a single index
  331. #ifdef DOUBLE
  332. vreplvei.d VI1, VI0, 0 // broadcast the lane-0 index
  333. vreplvei.d VI2, VI0, 1 // broadcast the lane-1 index
  334. vreplvei.d x1, VM0, 0 // broadcast the lane-0 value
  335. vreplvei.d x2, VM0, 1 // broadcast the lane-1 value
  336. fcmp.ceq.d $fcc0, $f10, $f9 // scalar compare of x2 and x1 through the $f views of $vr10/$vr9
  337. bceqz $fcc0, .L26 // values differ: .L26 picks the smaller-magnitude one
  338. vfcmp.clt.d VT0, VI1, VI2 // tie: choose the lower index (the integer indices are compared with an FP compare)
  339. vbitsel.v VI0, VI2, VI1, VT0
  340. b .L27
  341. #else
  342. vreplvei.w VI1, VI0, 0 // broadcast each lane's index ...
  343. vreplvei.w VI2, VI0, 1
  344. vreplvei.w VI3, VI0, 2
  345. vreplvei.w VI4, VI0, 3
  346. vreplvei.w x1, VM0, 0 // ... and each lane's value
  347. vreplvei.w x2, VM0, 1
  348. vreplvei.w x3, VM0, 2
  349. vreplvei.w x4, VM0, 3
  350. vfmina.s VM1, x1, x2 // winner of lanes 0/1
  351. vfcmp.ceq.s VT0, VM1, x1
  352. vbitsel.v VINC4, VI2, VI1, VT0 // VINC4 is reused as scratch for the lane 0/1 index
  353. vfmina.s VM0, x3, x4 // winner of lanes 2/3
  354. vfcmp.ceq.s VT0, x3, VM0
  355. vbitsel.v VINC8, VI4, VI3, VT0 // VINC8 is reused as scratch for the lane 2/3 index
  356. vfmina.s VM0, VM0, VM1 // overall minimum-magnitude value
  357. vfcmp.ceq.s VT0, VM0, VM1
  358. vbitsel.v VI0, VINC8, VINC4, VT0 // index paired with the overall minimum
  359. fcmp.ceq.d $fcc0, $f15, $f9 // compares the low 64 bits of VM0 and x1 (both are broadcasts)
  360. bceqz $fcc0, .L26
  361. vfcmp.clt.s VT0, VI1, VI0 // lane 0 also holds the minimum: keep the lower index
  362. vbitsel.v VI0, VI0, VI1, VT0 // falls through into .L26 for the lane 1..3 tie checks
  363. #endif
  364. .align 3
  365. .L26: // final index selection, shared by the unit-stride and strided paths
  366. #ifdef DOUBLE
  367. vfmina.d VM0, x1, x2 // VM0 = smaller-magnitude of the two lane values
  368. vfcmp.ceq.d VT0, x1, VM0
  369. vbitsel.v VI0, VI2, VI1, VT0 // lane-0 index if lane 0 won, otherwise lane-1 index
  370. .align 3
  371. .L27:
  372. movfr2gr.d i0, $f20 // i0 = low 64 bits of VI0 ($vr20)
  373. #else
  374. fcmp.ceq.d $fcc0, $f15, $f10 // does lane 1 (x2) also hold the minimum? (low 64 bits compared)
  375. bceqz $fcc0, .L27
  376. vfcmp.clt.s VT0, VI2, VI0
  377. vbitsel.v VI0, VI0, VI2, VT0 // take VI2 where it is lower than the current index
  378. .align 3
  379. .L27:
  380. fcmp.ceq.d $fcc0, $f15, $f11 // same check for lane 2 (x3)
  381. bceqz $fcc0, .L28
  382. vfcmp.clt.s VT0, VI3, VI0
  383. vbitsel.v VI0, VI0, VI3, VT0
  384. .align 3
  385. .L28:
  386. fcmp.ceq.d $fcc0, $f15, $f12 // same check for lane 3 (x4)
  387. bceqz $fcc0, .L29
  388. vfcmp.clt.s VT0, VI4, VI0
  389. vbitsel.v VI0, VI0, VI4, VT0
  390. .align 3
  391. .L29:
  392. movfr2gr.s i0, $f20 // i0 = low 32 bits of VI0 ($vr20)
  393. #endif
  394. .align 3
  395. .L21: //N<8
  396. andi I, N, 7 // I = N % 8 elements left for the scalar loop (also reached by fall-through after the vector paths)
  397. bge $r0, I, .L999 // nothing left: return i0
  398. srai.d i1, N, 3
  399. slli.d i1, i1, 3 // i1 = N rounded down to a multiple of 8
  400. addi.d i1, i1, 1 //current index
  401. movgr2fr.d $f21, i1 // low lane of VI1 = index of the element about to be examined
  402. movgr2fr.d $f20, i0 // low lane of VI0 = best index so far
  403. .align 3
  404. .L22: // scalar tail: one element per pass, using the low lane of the vector registers
  405. LD $f9, X, 0 // LD is a macro defined outside this file (presumably a precision-matched FP load) into the low lane of x1
  406. addi.d I, I, -1
  407. VFMINA VM1, x1, VM0 // macro defined outside this file; presumably the precision-matched vfmina
  408. VCMPEQ VT0, VM0, VM1 // mask where the old minimum still holds
  409. add.d X, X, INCX
  410. vbitsel.v VM0, VM1, VM0, VT0 // keep VM0 where the mask is set, else take VM1
  411. vbitsel.v VI0, VI1, VI0, VT0 // same mask for the index
  412. addi.d i1, i1, 1
  413. MTC $f21, i1 // macro defined outside this file; presumably moves i1 into $f21 for the next pass
  414. blt $r0, I, .L22
  415. movfr2gr.s i0, $f20 // reads only the low 32 bits of the index. NOTE(review): also used in DOUBLE builds - confirm indices always fit in 32 bits
  417. .L999: // common exit
  418. move $r4, $r17 // copy the result i0 ($r17) into $r4
  419. jirl $r0, $r1, 0x0 // jump to the address in $r1 without linking (return)
  420. .align 3
  421. EPILOGUE