You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

imin_lasx.S 14 kB


  1. /***************************************************************************
  2. Copyright (c) 2023, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  /*
   * Register aliases for the LASX index-of-minimum kernel.
   * ASSEMBLER must be defined before common.h is pulled in.
   */
#define ASSEMBLER
#include "common.h"

/* Incoming arguments. */
#define N      $r4      /* element count; also the return register at exit */
#define X      $r5      /* pointer to the current input element */
#define INCX   $r6      /* stride; scaled to bytes by the kernel on entry */

/* Integer scratch registers, listed by register number. */
#define I      $r12     /* loop counter */
#define t1     $r13     /* element staging for strided loads */
#define i1     $r14     /* index of the element visited by the scalar tail */
#define t2     $r15
#define t4     $r16
#define i0     $r17     /* index of the best element so far; moved to $r4 at exit */
#define t3     $r18
#define TEMP   $r19     /* element-size constant, then look-ahead pointer */

/* 256-bit vector registers, listed by register number. */
#define VI3    $xr8     /* extracted index lane */
#define x1     $xr9     /* extracted value lanes x1..x4 */
#define x2     $xr10
#define x3     $xr11
#define x4     $xr12
#define VX0    $xr13    /* freshly loaded values */
#define VX1    $xr14
#define VM0    $xr15    /* running minimum, per lane */
#define VM1    $xr16
#define VINC4  $xr17    /* index increment of 4, later reused as scratch */
#define VINC8  $xr18    /* index increment of 8, later reused as scratch */
#define VI4    $xr19    /* extracted index lane */
#define VI0    $xr20    /* index of the running minimum, per lane */
#define VI1    $xr21    /* index of the values currently being compared */
#define VI2    $xr22
#define VT0    $xr23    /* comparison mask */
  /*
   * Index-of-minimum kernel (LASX).
   *
   * In:   N ($r4) element count, X ($r5) input pointer, INCX ($r6) stride
   *       in elements.
   * Out:  $r4 = 1-based index of the smallest element found, or 0 when
   *       N <= 0 or INCX <= 0.
   *
   * Layout:
   *   .L10 / .L15   contiguous input: 8 elements per iteration, then
   *                 horizontal reduction of the per-lane minima
   *   .L20 - .L25   same for a non-unit stride, gathering through t1..t4
   *   .L26 - .L29   on equal minima prefer the smaller index
   *   .L252-.L292   single precision only: reduce lanes 4..7 and merge
   *   .L21 / .L22   scalar loop over the remaining N % 8 elements
   *
   * The scalar code names floating-point registers by number.  $fN is the
   * low part of $xrN, so with the aliases used in this file:
   *   $f9..$f12 = x1..x4    $f13 = VX0    $f15 = VM0    $f16 = VM1
   *   $f17 = VINC4          $f20 = VI0    $f21 = VI1    $f23 = VT0
   *
   * XVCMPLT, CMPLT, MTG, SIZE and BASE_SHIFT come from common.h.
   */
    PROLOGUE

    li.d    i0, 0                       // default result
    bge     $r0, N, .L999               // N <= 0: return 0
    bge     $r0, INCX, .L999            // INCX <= 0: return 0
    li.d    TEMP, 1
    slli.d  TEMP, TEMP, BASE_SHIFT      // TEMP = byte stride of contiguous data
    slli.d  INCX, INCX, BASE_SHIFT      // INCX is a byte stride from here on
    bne     INCX, TEMP, .L20            // not contiguous: strided path
    // NOTE(review): this 256-bit load is issued before N is known to be >= 8.
    xvld    VM0, X, 0                   // seed the running minimum
#ifdef DOUBLE
    addi.d  i0, i0, 1                   // i0 = 1, index of the first element
    srai.d  I, N, 3                     // I = number of 8-element iterations
    bge     $r0, I, .L21                // fewer than 8 elements: scalar loop only
    slli.d  i0, i0, 2 //4
    xvreplgr2vr.d VINC4, i0
    slli.d  i0, i0, 1 //8
    xvreplgr2vr.d VINC8, i0
    addi.d  i0, i0, -15                 // -7: the first add of VINC8 in the loop yields 1
    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI1, i0, 1
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI1, i0, 2
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI1, i0, 3
    addi.d  i0, i0, 5
    xvinsgr2vr.d VI0, i0, 0 //1
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI0, i0, 1 //2
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI0, i0, 2 //3
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI0, i0, 3 //4
#else
    addi.w  i0, i0, 1                   // i0 = 1, index of the first element
    srai.d  I, N, 3                     // I = number of 8-element iterations
    bge     $r0, I, .L21                // fewer than 8 elements: scalar loop only
    slli.w  i0, i0, 3 //8
    xvreplgr2vr.w VINC8, i0
    addi.w  i0, i0, -15                 // -7: the first add of VINC8 in the loop yields 1
    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI1, i0, 1
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI1, i0, 2
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI1, i0, 3
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI1, i0, 4
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI1, i0, 5
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI1, i0, 6
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI1, i0, 7
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI0, i0, 0 //1
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI0, i0, 1 //2
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI0, i0, 2 //3
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI0, i0, 3 //4
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI0, i0, 4 //5
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI0, i0, 5 //6
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI0, i0, 6 //7
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI0, i0, 7 //8
#endif
    .align 3

    // Contiguous main loop: 8 elements per iteration.
.L10:
    xvld    VX0, X, 0 * SIZE
#ifdef DOUBLE
    xvadd.d VI1, VI1, VINC8             // indices of VX0
    xvld    VX1, X, 4 * SIZE
    xvadd.d VI2, VI1, VINC4             // indices of VX1
    XVCMPLT VT0, VX1, VX0
    addi.d  I, I, -1
    xvbitsel.v VM1, VX0, VX1, VT0       // lane-wise min of the two loads
    xvbitsel.v VI2, VI1, VI2, VT0       // and the matching indices
    XVCMPLT VT0, VM1, VM0
    addi.d  X, X, 8 * SIZE
    xvbitsel.v VM0, VM0, VM1, VT0       // fold into the running minimum
    xvbitsel.v VI0, VI0, VI2, VT0
#else
    xvadd.w VI1, VI1, VINC8             // indices of VX0
    XVCMPLT VT0, VX0, VM0
    addi.d  I, I, -1
    xvbitsel.v VM0, VM0, VX0, VT0       // fold into the running minimum
    xvbitsel.v VI0, VI0, VI1, VT0
    addi.d  X, X, 8 * SIZE
#endif
    blt     $r0, I, .L10
    .align 3

    // Horizontal reduction over lanes 0..3 of VM0 / VI0.
.L15:
#ifdef DOUBLE
    xvpickve.d VI1, VI0, 0
    xvpickve.d VI2, VI0, 1
    xvpickve.d VI3, VI0, 2
    xvpickve.d VI4, VI0, 3
    xvpickve.d x1, VM0, 0
    xvpickve.d x2, VM0, 1
    xvpickve.d x3, VM0, 2
    xvpickve.d x4, VM0, 3
#else
    xvxor.v VX0, VX0, VX0
    xvor.v  VX0, VI0, VX0               // keep all 8 index lanes for .L252
    xvxor.v VX1, VX1, VX1
    xvor.v  VX1, VM0, VX1               // keep all 8 value lanes for .L252
    xvpickve.w VI1, VI0, 0
    xvpickve.w VI2, VI0, 1
    xvpickve.w VI3, VI0, 2
    xvpickve.w VI4, VI0, 3
    xvpickve.w x1, VM0, 0
    xvpickve.w x2, VM0, 1
    xvpickve.w x3, VM0, 2
    xvpickve.w x4, VM0, 3
#endif
    XVCMPLT VT0, x2, x1
    xvbitsel.v VM1, x1, x2, VT0         // min(x1, x2)
    xvbitsel.v VINC4, VI1, VI2, VT0     // VINC4 reused as scratch: its index
    XVCMPLT VT0, x4, x3
    xvbitsel.v VM0, x3, x4, VT0         // min(x3, x4)
    xvbitsel.v VINC8, VI3, VI4, VT0     // VINC8 reused as scratch: its index
    XVCMPLT VT0, VM1, VM0
    xvbitsel.v VM0, VM0, VM1, VT0       // minimum of the four lanes
    xvbitsel.v VI0, VINC8, VINC4, VT0
    fcmp.ceq.d $fcc0, $f15, $f9         // does lane 0 hold the minimum?
    bceqz   $fcc0, .L26
    // Index bit patterns are compared with the floating-point compare macro.
    XVCMPLT VT0, VI1, VI0
    xvbitsel.v VI0, VI0, VI1, VT0       // keep the smaller index
    b       .L26
    .align 3

.L20: // INCX!=1
    move    TEMP, X                     // TEMP walks ahead; X is re-read by .L24
#ifdef DOUBLE
    addi.d  i0, i0, 1                   // i0 = 1, index of the first element
    ld.d    t1, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    xvinsgr2vr.d VM0, t1, 0             // seed $f15 for the scalar loop
    srai.d  I, N, 3                     // I = number of 8-element iterations
    bge     $r0, I, .L21                // fewer than 8 elements: scalar loop only
    ld.d    t2, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    ld.d    t3, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    ld.d    t4, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    xvinsgr2vr.d VM0, t1, 0
    xvinsgr2vr.d VM0, t2, 1
    xvinsgr2vr.d VM0, t3, 2
    xvinsgr2vr.d VM0, t4, 3
    slli.d  i0, i0, 2 //4
    xvreplgr2vr.d VINC4, i0
    slli.d  i0, i0, 1 //8
    xvreplgr2vr.d VINC8, i0
    addi.d  i0, i0, -15                 // -7: the first add of VINC8 in the loop yields 1
    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI1, i0, 1
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI1, i0, 2
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI1, i0, 3
    addi.d  i0, i0, 5
    xvinsgr2vr.d VI0, i0, 0 //1
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI0, i0, 1 //2
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI0, i0, 2 //3
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI0, i0, 3 //4
#else
    addi.w  i0, i0, 1                   // i0 = 1, index of the first element
    ld.w    t1, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    // Seed the running minimum before the early exit so that the scalar
    // loop at .L22 compares against the first element when N < 8.
    xvinsgr2vr.w VM0, t1, 0
    srai.d  I, N, 3                     // I = number of 8-element iterations
    bge     $r0, I, .L21                // fewer than 8 elements: scalar loop only
    ld.w    t2, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    ld.w    t3, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    ld.w    t4, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    xvinsgr2vr.w VM0, t1, 0
    xvinsgr2vr.w VM0, t2, 1
    xvinsgr2vr.w VM0, t3, 2
    xvinsgr2vr.w VM0, t4, 3
    ld.w    t1, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    ld.w    t2, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    ld.w    t3, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    ld.w    t4, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    xvinsgr2vr.w VM0, t1, 4
    xvinsgr2vr.w VM0, t2, 5
    xvinsgr2vr.w VM0, t3, 6
    xvinsgr2vr.w VM0, t4, 7
    slli.w  i0, i0, 3 //8
    xvreplgr2vr.w VINC8, i0
    addi.w  i0, i0, -15                 // -7: the first add of VINC8 in the loop yields 1
    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI1, i0, 1
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI1, i0, 2
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI1, i0, 3
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI1, i0, 4
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI1, i0, 5
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI1, i0, 6
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI1, i0, 7
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI0, i0, 0 //1
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI0, i0, 1 //2
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI0, i0, 2 //3
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI0, i0, 3 //4
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI0, i0, 4 //5
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI0, i0, 5 //6
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI0, i0, 6 //7
    addi.w  i0, i0, 1
    xvinsgr2vr.w VI0, i0, 7 //8
#endif
    .align 3

    // Strided main loop: gather 8 elements per iteration, starting again
    // from the first element (X was not advanced by the prologue).
.L24:
#ifdef DOUBLE
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    // VI1 advances by 8 exactly once per iteration, as in .L10.
    xvadd.d VI1, VI1, VINC8             // indices of VX0
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvadd.d VI2, VI1, VINC4             // indices of VX1
    XVCMPLT VT0, VX1, VX0
    addi.d  I, I, -1
    xvbitsel.v VM1, VX0, VX1, VT0       // lane-wise min of the two gathers
    xvbitsel.v VI2, VI1, VI2, VT0       // and the matching indices
    XVCMPLT VT0, VM1, VM0
    xvbitsel.v VM0, VM0, VM1, VT0       // fold into the running minimum
    xvbitsel.v VI0, VI0, VI2, VT0
#else
    ld.w    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.w VX0, t1, 0
    xvinsgr2vr.w VX0, t2, 1
    xvinsgr2vr.w VX0, t3, 2
    xvinsgr2vr.w VX0, t4, 3
    ld.w    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.w VX0, t1, 4
    xvinsgr2vr.w VX0, t2, 5
    xvinsgr2vr.w VX0, t3, 6
    xvinsgr2vr.w VX0, t4, 7
    xvadd.w VI1, VI1, VINC8             // indices of VX0
    XVCMPLT VT0, VX0, VM0
    addi.d  I, I, -1
    xvbitsel.v VM0, VM0, VX0, VT0       // fold into the running minimum
    xvbitsel.v VI0, VI0, VI1, VT0
#endif
    blt     $r0, I, .L24
    .align 3

    // Horizontal reduction over lanes 0..3 (same sequence as .L15).
.L25:
#ifdef DOUBLE
    xvpickve.d VI1, VI0, 0
    xvpickve.d VI2, VI0, 1
    xvpickve.d VI3, VI0, 2
    xvpickve.d VI4, VI0, 3
    xvpickve.d x1, VM0, 0
    xvpickve.d x2, VM0, 1
    xvpickve.d x3, VM0, 2
    xvpickve.d x4, VM0, 3
#else
    xvxor.v VX0, VX0, VX0
    xvor.v  VX0, VI0, VX0               // keep all 8 index lanes for .L252
    xvxor.v VX1, VX1, VX1
    xvor.v  VX1, VM0, VX1               // keep all 8 value lanes for .L252
    xvpickve.w VI1, VI0, 0
    xvpickve.w VI2, VI0, 1
    xvpickve.w VI3, VI0, 2
    xvpickve.w VI4, VI0, 3
    xvpickve.w x1, VM0, 0
    xvpickve.w x2, VM0, 1
    xvpickve.w x3, VM0, 2
    xvpickve.w x4, VM0, 3
#endif
    XVCMPLT VT0, x2, x1
    xvbitsel.v VM1, x1, x2, VT0         // min(x1, x2)
    xvbitsel.v VINC4, VI1, VI2, VT0     // VINC4 reused as scratch: its index
    XVCMPLT VT0, x4, x3
    xvbitsel.v VM0, x3, x4, VT0         // min(x3, x4)
    xvbitsel.v VINC8, VI3, VI4, VT0     // VINC8 reused as scratch: its index
    XVCMPLT VT0, VM1, VM0
    xvbitsel.v VM0, VM0, VM1, VT0       // minimum of the four lanes
    xvbitsel.v VI0, VINC8, VINC4, VT0
    fcmp.ceq.d $fcc0, $f15, $f9         // does lane 0 hold the minimum?
    bceqz   $fcc0, .L26
    XVCMPLT VT0, VI1, VI0
    xvbitsel.v VI0, VI0, VI1, VT0       // keep the smaller index
    .align 3

    // Shared by both paths: for each remaining lane that equals the minimum,
    // keep the smaller of its index and the current result.
.L26:
    fcmp.ceq.d $fcc0, $f15, $f10
    bceqz   $fcc0, .L27
    XVCMPLT VT0, VI2, VI0
    xvbitsel.v VI0, VI0, VI2, VT0
    .align 3

.L27:
    fcmp.ceq.d $fcc0, $f15, $f11
    bceqz   $fcc0, .L28
    XVCMPLT VT0, VI3, VI0
    xvbitsel.v VI0, VI0, VI3, VT0
    .align 3

.L28:
    fcmp.ceq.d $fcc0, $f15, $f12
    bceqz   $fcc0, .L29
    XVCMPLT VT0, VI4, VI0
    xvbitsel.v VI0, VI0, VI4, VT0
    .align 3

.L29:
#ifdef DOUBLE
    MTG     i0, $f20                    // result index so far -> i0
#else
    fmov.s  $f16, $f20                  // park the lanes 0..3 index in $f16
#endif
    .align 3

#ifndef DOUBLE
    // Single precision: repeat the reduction for lanes 4..7, then merge with
    // the lanes 0..3 result parked in $f13 (value) and $f16 (index).
.L252:
    xvxor.v VI0, VI0, VI0
    xvor.v  VI0, VI0, VX0               // restore the 8 index lanes
    fmov.s  $f13, $f15                  // park the lanes 0..3 minimum
    xvxor.v VM0, VM0, VM0
    xvor.v  VM0, VM0, VX1               // restore the 8 value lanes
    xvpickve.w VI1, VI0, 4
    xvpickve.w VI2, VI0, 5
    xvpickve.w VI3, VI0, 6
    xvpickve.w VI4, VI0, 7
    xvpickve.w x1, VM0, 4
    xvpickve.w x2, VM0, 5
    xvpickve.w x3, VM0, 6
    xvpickve.w x4, VM0, 7
    XVCMPLT VT0, x2, x1
    xvbitsel.v x1, x1, x2, VT0
    xvbitsel.v VINC4, VI1, VI2, VT0
    XVCMPLT VT0, x4, x3
    xvbitsel.v VM0, x3, x4, VT0
    xvbitsel.v VINC8, VI3, VI4, VT0
    XVCMPLT VT0, x1, VM0
    xvbitsel.v VM0, VM0, x1, VT0        // minimum of lanes 4..7
    xvbitsel.v VI0, VINC8, VINC4, VT0
    li.d    TEMP, 1 // when values tie, take the smallest index
    movgr2fr.w $f17, TEMP
    ffint.s.w $f17, $f17                // $f17 = 1.0f
    // NOTE(review): the branches below test the xvfcmp result word in $f23
    // against 1.0f - confirm this matches the value xvfcmp.ceq.s produces
    // for equal lanes.
    xvfcmp.ceq.s VT0, VM0, x1
    fcmp.ceq.s $fcc0, $f23, $f17
    bceqz   $fcc0, .L262
    XVCMPLT VT0, VI1, VI0
    xvbitsel.v VI0, VI0, VI1, VT0
    .align 3

.L262:
    xvfcmp.ceq.s VT0, VM0, x2
    fcmp.ceq.s $fcc0, $f23, $f17
    bceqz   $fcc0, .L272
    XVCMPLT VT0, VI2, VI0
    xvbitsel.v VI0, VI0, VI2, VT0
    .align 3

.L272:
    xvfcmp.ceq.s VT0, VM0, x3
    fcmp.ceq.s $fcc0, $f23, $f17
    bceqz   $fcc0, .L282
    XVCMPLT VT0, VI3, VI0
    xvbitsel.v VI0, VI0, VI3, VT0
    .align 3

.L282:
    xvfcmp.ceq.s VT0, VM0, x4
    fcmp.ceq.s $fcc0, $f23, $f17
    bceqz   $fcc0, .L292
    XVCMPLT VT0, VI4, VI0
    xvbitsel.v VI0, VI0, VI4, VT0
    .align 3

.L292:
    CMPLT   $fcc0, $f13, $f15           // lanes 0..3 minimum < lanes 4..7 minimum?
    fsel    $f15, $f15, $f13, $fcc0     // if so, take the lanes 0..3 value
    fsel    $f20, $f20, $f16, $fcc0     // and its index
    MTG     i0, $f20
#endif

    // Scalar loop over the last N % 8 elements; $f15 = best value so far,
    // $f20 = its index, $f21 = index of the element being visited.
.L21: //N<8
    andi    I, N, 7
    bge     $r0, I, .L999               // nothing left: return i0
    srai.d  i1, N, 3
    slli.d  i1, i1, 3
    addi.d  i1, i1, 1 //current index
    movgr2fr.d $f21, i1
    movgr2fr.d $f20, i0
    .align 3

.L22:
    fld.d   $f9, X, 0                   // 8-byte load in both precisions
    addi.d  I, I, -1
    CMPLT   $fcc0, $f9, $f15
    add.d   X, X, INCX
    fsel    $f15, $f15, $f9, $fcc0      // new minimum if strictly smaller
    fsel    $f20, $f20, $f21, $fcc0     // and its index
    addi.d  i1, i1, 1
    movgr2fr.d $f21, i1
    blt     $r0, I, .L22
    MTG     i0, $f20
    .align 3

.L999:
    move    $r4, $r17                   // return i0
    jirl    $r0, $r1, 0x0
    .align 3

    EPILOGUE