/* icamin_lasx.S — LoongArch64 LASX kernel for ICAMIN/IZAMIN (OpenBLAS).
   Returns the 1-based index of the complex vector element with the
   smallest |Re| + |Im|.  (Web-scrape page chrome and line-number gutter
   removed from this copy.) */
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. #define N $r4
  30. #define X $r5
  31. #define INCX $r6
  32. #define I $r12
  33. #define t1 $r13
  34. #define t2 $r15
  35. #define t3 $r18
  36. #define t4 $r16
  37. #define i0 $r17
  38. #define i1 $r14
  39. #define TEMP $r19
  40. #define a0 $f12
  41. #define a1 $f13
  42. #define s1 $f15
  43. #define x1 $xr9
  44. #define x2 $xr10
  45. #define x3 $xr11
  46. #define x4 $xr12
  47. #define VX0 $xr13
  48. #define VX1 $xr14
  49. #define VM0 $xr15
  50. #define VM1 $xr16
  51. #define VINC4 $xr17
  52. #define VINC8 $xr18
  53. #define VI0 $xr20
  54. #define VI1 $xr21
  55. #define VI2 $xr22
  56. #define VI3 $xr8
  57. #define VI4 $xr19
  58. #define VT0 $xr23
  59. PROLOGUE
  60. li.d i0, 0
  61. bge $r0, N, .L999
  62. bge $r0, INCX, .L999
  63. li.d TEMP, 1
  64. slli.d TEMP, TEMP, ZBASE_SHIFT
  65. slli.d INCX, INCX, ZBASE_SHIFT
  66. LD a0, X, 0 * SIZE
  67. LD a1, X, 1 * SIZE
  68. FABS a0, a0
  69. FABS a1, a1
  70. ADD s1, a1, a0
  71. #ifdef DOUBLE
  72. xvxor.v VI3, VI3, VI3 // 0
  73. li.d I, -1
  74. xvreplgr2vr.d VI4, I
  75. xvffint.d.l VI4, VI4 // -1
  76. bne INCX, TEMP, .L20
  77. // Init VM0
  78. xvreplve0.d VM0, VM0
  79. xvld VX0, X, 0 * SIZE
  80. xvld VX1, X, 4 * SIZE
  81. xvpickev.d x1, VX1, VX0
  82. xvpickod.d x2, VX1, VX0
  83. xvfmul.d x3, VI4, x1
  84. xvfmul.d x4, VI4, x2
  85. xvfcmp.clt.d VT0, x1, VI3
  86. xvfcmp.clt.d VINC8, x2, VI3
  87. xvbitsel.v x1, x1, x3, VT0
  88. xvbitsel.v x2, x2, x4, VINC8
  89. xvfadd.d VM0, x1, x2
  90. addi.d i0, i0, 1
  91. srai.d I, N, 2
  92. bge $r0, I, .L21
  93. slli.d i0, i0, 2 //4
  94. xvreplgr2vr.d VINC4, i0
  95. addi.d i0, i0, -7
  96. xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
  97. addi.d i0, i0, 2
  98. xvinsgr2vr.d VI1, i0, 1
  99. addi.d i0, i0, -1
  100. xvinsgr2vr.d VI1, i0, 2
  101. addi.d i0, i0, 2
  102. xvinsgr2vr.d VI1, i0, 3
  103. addi.d i0, i0, 1
  104. xvinsgr2vr.d VI0, i0, 0 //1
  105. addi.d i0, i0, 2
  106. xvinsgr2vr.d VI0, i0, 1 //3
  107. addi.d i0, i0, -1
  108. xvinsgr2vr.d VI0, i0, 2 //2
  109. addi.d i0, i0, 2
  110. xvinsgr2vr.d VI0, i0, 3 //4
  111. #else
  112. xvxor.v VI3, VI3, VI3 // 0
  113. li.w I, -1
  114. xvreplgr2vr.w VI4, I
  115. xvffint.s.w VI4, VI4 // -1
  116. bne INCX, TEMP, .L20
  117. // Init VM0
  118. xvld VX0, X, 0 * SIZE
  119. xvld VX1, X, 8 * SIZE
  120. xvpickev.w x1, VX1, VX0
  121. xvpickod.w x2, VX1, VX0
  122. xvfmul.s x3, VI4, x1
  123. xvfmul.s x4, VI4, x2
  124. xvfcmp.clt.s VT0, x1, VI3
  125. xvfcmp.clt.s VINC4, x2, VI3
  126. xvbitsel.v x1, x1, x3, VT0
  127. xvbitsel.v x2, x2, x4, VINC4
  128. xvfadd.s VM0, x1, x2
  129. addi.w i0, i0, 1
  130. srai.d I, N, 3
  131. bge $r0, I, .L21
  132. slli.w i0, i0, 3 //8
  133. xvreplgr2vr.w VINC8, i0
  134. addi.w i0, i0, -15
  135. xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
  136. addi.w i0, i0, 1
  137. xvinsgr2vr.w VI1, i0, 1
  138. addi.w i0, i0, 3
  139. xvinsgr2vr.w VI1, i0, 2
  140. addi.w i0, i0, 1
  141. xvinsgr2vr.w VI1, i0, 3
  142. addi.w i0, i0, -3
  143. xvinsgr2vr.w VI1, i0, 4
  144. addi.w i0, i0, 1
  145. xvinsgr2vr.w VI1, i0, 5
  146. addi.w i0, i0, 3
  147. xvinsgr2vr.w VI1, i0, 6
  148. addi.w i0, i0, 1
  149. xvinsgr2vr.w VI1, i0, 7
  150. addi.w i0, i0, 1
  151. xvinsgr2vr.w VI0, i0, 0 //1
  152. addi.w i0, i0, 1
  153. xvinsgr2vr.w VI0, i0, 1 //2
  154. addi.w i0, i0, 3
  155. xvinsgr2vr.w VI0, i0, 2 //5
  156. addi.w i0, i0, 1
  157. xvinsgr2vr.w VI0, i0, 3 //6
  158. addi.w i0, i0, -3
  159. xvinsgr2vr.w VI0, i0, 4 //3
  160. addi.w i0, i0, 1
  161. xvinsgr2vr.w VI0, i0, 5 //4
  162. addi.w i0, i0, 3
  163. xvinsgr2vr.w VI0, i0, 6 //7
  164. addi.w i0, i0, 1
  165. xvinsgr2vr.w VI0, i0, 7 //8
  166. #endif
  167. .align 3
  168. .L10:
  169. xvld VX0, X, 0 * SIZE
  170. #ifdef DOUBLE
  171. xvadd.d VI1, VI1, VINC4
  172. xvld VX1, X, 4 * SIZE
  173. addi.d I, I, -1
  174. xvpickev.d x1, VX1, VX0
  175. xvpickod.d x2, VX1, VX0
  176. xvfmul.d x3, VI4, x1
  177. xvfmul.d x4, VI4, x2
  178. xvfcmp.clt.d VT0, x1, VI3
  179. xvfcmp.clt.d VINC8, x2, VI3
  180. xvbitsel.v x1, x1, x3, VT0
  181. xvbitsel.v x2, x2, x4, VINC8
  182. addi.d X, X, 8 * SIZE
  183. #else
  184. xvadd.w VI1, VI1, VINC8
  185. xvld VX1, X, 8 * SIZE
  186. addi.d I, I, -1
  187. xvpickev.w x1, VX1, VX0
  188. xvpickod.w x2, VX1, VX0
  189. xvfmul.s x3, VI4, x1
  190. xvfmul.s x4, VI4, x2
  191. xvfcmp.clt.s VT0, x1, VI3
  192. xvfcmp.clt.s VINC4, x2, VI3
  193. xvbitsel.v x1, x1, x3, VT0
  194. xvbitsel.v x2, x2, x4, VINC4
  195. addi.d X, X, 16 * SIZE
  196. #endif
  197. XVFADD x1, x1, x2
  198. XVFMIN x3, VM0, x1
  199. XVCMPEQ VT0, x3, VM0
  200. xvbitsel.v VM0, x3, VM0, VT0
  201. xvbitsel.v VI0, VI1, VI0, VT0
  202. blt $r0, I, .L10
  203. .align 3
  204. .L15:
  205. #ifdef DOUBLE
  206. xvpickve.d VI1, VI0, 0
  207. xvpickve.d VI2, VI0, 1
  208. xvpickve.d VI3, VI0, 2
  209. xvpickve.d VI4, VI0, 3
  210. xvpickve.d x1, VM0, 0
  211. xvpickve.d x2, VM0, 1
  212. xvpickve.d x3, VM0, 2
  213. xvpickve.d x4, VM0, 3
  214. xvfmin.d VM1, x1, x2
  215. xvfcmp.ceq.d VT0, VM1, x1
  216. xvbitsel.v VINC4, VI2, VI1, VT0
  217. xvfmin.d VM0, x3, x4
  218. xvfcmp.ceq.d VT0, x3, VM0
  219. xvbitsel.v VINC8, VI4, VI3, VT0
  220. xvfmin.d VM0, VM0, VM1
  221. xvfcmp.ceq.d VT0, VM0, VM1
  222. xvbitsel.v VI0, VINC8, VINC4, VT0
  223. #else
  224. xvxor.v VX0, VX0, VX0
  225. xvor.v VX0, VI0, VX0
  226. xvxor.v VX1, VX1, VX1
  227. xvor.v VX1, VM0, VX1
  228. xvpickve.w VI1, VI0, 0
  229. xvpickve.w VI2, VI0, 1
  230. xvpickve.w VI3, VI0, 2
  231. xvpickve.w VI4, VI0, 3
  232. xvpickve.w x1, VM0, 0
  233. xvpickve.w x2, VM0, 1
  234. xvpickve.w x3, VM0, 2
  235. xvpickve.w x4, VM0, 3
  236. xvfcmp.clt.s VT0, x2, x1
  237. xvbitsel.v VM1, x1, x2, VT0
  238. xvbitsel.v VINC4, VI1, VI2, VT0
  239. xvfcmp.clt.s VT0, x4, x3
  240. xvbitsel.v VM0, x3, x4, VT0
  241. xvbitsel.v VINC8, VI3, VI4, VT0
  242. xvfcmp.clt.s VT0, VM1, VM0
  243. xvbitsel.v VM0, VM0, VM1, VT0
  244. xvbitsel.v VI0, VINC8, VINC4, VT0
  245. #endif
  246. fcmp.ceq.d $fcc0, $f15, $f9
  247. bceqz $fcc0, .L26
  248. XVCMPLT VT0, VI1, VI0
  249. xvbitsel.v VI0, VI0, VI1, VT0
  250. b .L26
  251. .align 3
  252. .L20: // INCX!=1
  253. #ifdef DOUBLE
  254. // Init VM0
  255. ld.d t1, X, 0 * SIZE
  256. ld.d t2, X, 1 * SIZE
  257. add.d i1, X, INCX
  258. ld.d t3, i1, 0 * SIZE
  259. ld.d t4, i1, 1 * SIZE
  260. add.d i1, i1, INCX
  261. xvinsgr2vr.d x1, t1, 0
  262. xvinsgr2vr.d x2, t2, 0
  263. xvinsgr2vr.d x1, t3, 1
  264. xvinsgr2vr.d x2, t4, 1
  265. ld.d t1, i1, 0 * SIZE
  266. ld.d t2, i1, 1 * SIZE
  267. add.d i1, i1, INCX
  268. ld.d t3, i1, 0 * SIZE
  269. ld.d t4, i1, 1 * SIZE
  270. xvinsgr2vr.d x1, t1, 2
  271. xvinsgr2vr.d x2, t2, 2
  272. xvinsgr2vr.d x1, t3, 3
  273. xvinsgr2vr.d x2, t4, 3
  274. xvfmul.d x3, VI4, x1
  275. xvfmul.d x4, VI4, x2
  276. xvfcmp.clt.d VT0, x1, VI3
  277. xvfcmp.clt.d VINC8, x2, VI3
  278. xvbitsel.v x1, x1, x3, VT0
  279. xvbitsel.v x2, x2, x4, VINC8
  280. xvfadd.d VM0, x1, x2
  281. addi.d i0, i0, 1
  282. srai.d I, N, 2
  283. bge $r0, I, .L21
  284. slli.d i0, i0, 2 //4
  285. xvreplgr2vr.d VINC4, i0
  286. addi.d i0, i0, -7
  287. xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
  288. addi.d i0, i0, 1
  289. xvinsgr2vr.d VI1, i0, 1
  290. addi.d i0, i0, 1
  291. xvinsgr2vr.d VI1, i0, 2
  292. addi.d i0, i0, 1
  293. xvinsgr2vr.d VI1, i0, 3
  294. addi.d i0, i0, 1
  295. xvinsgr2vr.d VI0, i0, 0 //1
  296. addi.d i0, i0, 1
  297. xvinsgr2vr.d VI0, i0, 1 //2
  298. addi.d i0, i0, 1
  299. xvinsgr2vr.d VI0, i0, 2 //3
  300. addi.d i0, i0, 1
  301. xvinsgr2vr.d VI0, i0, 3 //4
  302. #else
  303. // Init VM0
  304. ld.w t1, X, 0 * SIZE
  305. ld.w t2, X, 1 * SIZE
  306. add.d i1, X, INCX
  307. ld.w t3, i1, 0 * SIZE
  308. ld.w t4, i1, 1 * SIZE
  309. add.d i1, i1, INCX
  310. xvinsgr2vr.w x1, t1, 0
  311. xvinsgr2vr.w x2, t2, 0
  312. xvinsgr2vr.w x1, t3, 1
  313. xvinsgr2vr.w x2, t4, 1
  314. ld.w t1, i1, 0 * SIZE
  315. ld.w t2, i1, 1 * SIZE
  316. add.d i1, i1, INCX
  317. ld.w t3, i1, 0 * SIZE
  318. ld.w t4, i1, 1 * SIZE
  319. add.d i1, i1, INCX
  320. xvinsgr2vr.w x1, t1, 2
  321. xvinsgr2vr.w x2, t2, 2
  322. xvinsgr2vr.w x1, t3, 3
  323. xvinsgr2vr.w x2, t4, 3
  324. ld.w t1, i1, 0 * SIZE
  325. ld.w t2, i1, 1 * SIZE
  326. add.d i1, i1, INCX
  327. ld.w t3, i1, 0 * SIZE
  328. ld.w t4, i1, 1 * SIZE
  329. add.d i1, i1, INCX
  330. xvinsgr2vr.w x1, t1, 4
  331. xvinsgr2vr.w x2, t2, 4
  332. xvinsgr2vr.w x1, t3, 5
  333. xvinsgr2vr.w x2, t4, 5
  334. ld.w t1, i1, 0 * SIZE
  335. ld.w t2, i1, 1 * SIZE
  336. add.d i1, i1, INCX
  337. ld.w t3, i1, 0 * SIZE
  338. ld.w t4, i1, 1 * SIZE
  339. add.d i1, i1, INCX
  340. xvinsgr2vr.w x1, t1, 6
  341. xvinsgr2vr.w x2, t2, 6
  342. xvinsgr2vr.w x1, t3, 7
  343. xvinsgr2vr.w x2, t4, 7
  344. xvfmul.s x3, VI4, x1
  345. xvfmul.s x4, VI4, x2
  346. xvfcmp.clt.s VT0, x1, VI3
  347. xvfcmp.clt.s VINC8, x2, VI3
  348. xvbitsel.v x1, x1, x3, VT0
  349. xvbitsel.v x2, x2, x4, VINC8
  350. xvfadd.s VM0, x1, x2
  351. addi.w i0, i0, 1
  352. srai.d I, N, 3
  353. bge $r0, I, .L21
  354. slli.w i0, i0, 3 //8
  355. xvreplgr2vr.w VINC8, i0
  356. addi.w i0, i0, -15
  357. xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
  358. addi.w i0, i0, 1
  359. xvinsgr2vr.w VI1, i0, 1
  360. addi.w i0, i0, 1
  361. xvinsgr2vr.w VI1, i0, 2
  362. addi.w i0, i0, 1
  363. xvinsgr2vr.w VI1, i0, 3
  364. addi.w i0, i0, 1
  365. xvinsgr2vr.w VI1, i0, 4
  366. addi.w i0, i0, 1
  367. xvinsgr2vr.w VI1, i0, 5
  368. addi.w i0, i0, 1
  369. xvinsgr2vr.w VI1, i0, 6
  370. addi.w i0, i0, 1
  371. xvinsgr2vr.w VI1, i0, 7
  372. addi.w i0, i0, 1
  373. xvinsgr2vr.w VI0, i0, 0 //1
  374. addi.w i0, i0, 1
  375. xvinsgr2vr.w VI0, i0, 1 //2
  376. addi.w i0, i0, 1
  377. xvinsgr2vr.w VI0, i0, 2 //3
  378. addi.w i0, i0, 1
  379. xvinsgr2vr.w VI0, i0, 3 //4
  380. addi.w i0, i0, 1
  381. xvinsgr2vr.w VI0, i0, 4 //5
  382. addi.w i0, i0, 1
  383. xvinsgr2vr.w VI0, i0, 5 //6
  384. addi.w i0, i0, 1
  385. xvinsgr2vr.w VI0, i0, 6 //7
  386. addi.w i0, i0, 1
  387. xvinsgr2vr.w VI0, i0, 7 //8
  388. #endif
  389. .align 3
  390. .L24:
  391. #ifdef DOUBLE
  392. ld.d t1, X, 0 * SIZE
  393. ld.d t2, X, 1 * SIZE
  394. add.d X, X, INCX
  395. ld.d t3, X, 0 * SIZE
  396. ld.d t4, X, 1 * SIZE
  397. add.d X, X, INCX
  398. xvinsgr2vr.d x1, t1, 0
  399. xvinsgr2vr.d x2, t2, 0
  400. xvinsgr2vr.d x1, t3, 1
  401. xvinsgr2vr.d x2, t4, 1
  402. xvadd.d VI1, VI1, VINC4
  403. ld.d t1, X, 0 * SIZE
  404. ld.d t2, X, 1 * SIZE
  405. add.d X, X, INCX
  406. ld.d t3, X, 0 * SIZE
  407. ld.d t4, X, 1 * SIZE
  408. add.d X, X, INCX
  409. xvinsgr2vr.d x1, t1, 2
  410. xvinsgr2vr.d x2, t2, 2
  411. xvinsgr2vr.d x1, t3, 3
  412. xvinsgr2vr.d x2, t4, 3
  413. #else
  414. ld.w t1, X, 0 * SIZE
  415. ld.w t2, X, 1 * SIZE
  416. add.d X, X, INCX
  417. ld.w t3, X, 0 * SIZE
  418. ld.w t4, X, 1 * SIZE
  419. add.d X, X, INCX
  420. xvinsgr2vr.w x1, t1, 0
  421. xvinsgr2vr.w x2, t2, 0
  422. xvinsgr2vr.w x1, t3, 1
  423. xvinsgr2vr.w x2, t4, 1
  424. ld.w t1, X, 0 * SIZE
  425. ld.w t2, X, 1 * SIZE
  426. add.d X, X, INCX
  427. ld.w t3, X, 0 * SIZE
  428. ld.w t4, X, 1 * SIZE
  429. add.d X, X, INCX
  430. xvinsgr2vr.w x1, t1, 2
  431. xvinsgr2vr.w x2, t2, 2
  432. xvinsgr2vr.w x1, t3, 3
  433. xvinsgr2vr.w x2, t4, 3
  434. xvadd.w VI1, VI1, VINC8
  435. ld.w t1, X, 0 * SIZE
  436. ld.w t2, X, 1 * SIZE
  437. add.d X, X, INCX
  438. ld.w t3, X, 0 * SIZE
  439. ld.w t4, X, 1 * SIZE
  440. add.d X, X, INCX
  441. xvinsgr2vr.w x1, t1, 4
  442. xvinsgr2vr.w x2, t2, 4
  443. xvinsgr2vr.w x1, t3, 5
  444. xvinsgr2vr.w x2, t4, 5
  445. ld.w t1, X, 0 * SIZE
  446. ld.w t2, X, 1 * SIZE
  447. add.d X, X, INCX
  448. ld.w t3, X, 0 * SIZE
  449. ld.w t4, X, 1 * SIZE
  450. add.d X, X, INCX
  451. xvinsgr2vr.w x1, t1, 6
  452. xvinsgr2vr.w x2, t2, 6
  453. xvinsgr2vr.w x1, t3, 7
  454. xvinsgr2vr.w x2, t4, 7
  455. #endif
  456. addi.d I, I, -1
  457. XVFMUL x3, VI4, x1
  458. XVFMUL x4, VI4, x2
  459. XVCMPLT VT0, x1, VI3
  460. XVCMPLT VINC8, x2, VI3
  461. xvbitsel.v x1, x1, x3, VT0
  462. xvbitsel.v x2, x2, x4, VINC8
  463. XVFADD x1, x1, x2
  464. XVFMIN x3, VM0, x1
  465. XVCMPEQ VT0, x3, VM0
  466. xvbitsel.v VM0, x3, VM0, VT0
  467. xvbitsel.v VI0, VI1, VI0, VT0
  468. blt $r0, I, .L24
  469. .align 3
  470. .L25:
  471. #ifdef DOUBLE
  472. xvpickve.d VI1, VI0, 0
  473. xvpickve.d VI2, VI0, 1
  474. xvpickve.d VI3, VI0, 2
  475. xvpickve.d VI4, VI0, 3
  476. xvpickve.d x1, VM0, 0
  477. xvpickve.d x2, VM0, 1
  478. xvpickve.d x3, VM0, 2
  479. xvpickve.d x4, VM0, 3
  480. xvfmina.d VM1, x1, x2
  481. xvfcmp.ceq.d VT0, VM1, x1
  482. xvbitsel.v VINC4, VI2, VI1, VT0
  483. xvfmina.d VM0, x3, x4
  484. xvfcmp.ceq.d VT0, x3, VM0
  485. xvbitsel.v VINC8, VI4, VI3, VT0
  486. xvfmina.d VM0, VM0, VM1
  487. xvfcmp.ceq.d VT0, VM0, VM1
  488. #else
  489. xvxor.v VX0, VX0, VX0
  490. xvor.v VX0, VI0, VX0
  491. xvxor.v VX1, VX1, VX1
  492. xvor.v VX1, VM0, VX1
  493. xvpickve.w VI1, VI0, 0
  494. xvpickve.w VI2, VI0, 1
  495. xvpickve.w VI3, VI0, 2
  496. xvpickve.w VI4, VI0, 3
  497. xvpickve.w x1, VM0, 0
  498. xvpickve.w x2, VM0, 1
  499. xvpickve.w x3, VM0, 2
  500. xvpickve.w x4, VM0, 3
  501. xvfcmp.clt.s VT0, x2, x1
  502. xvbitsel.v VM1, x1, x2, VT0
  503. xvbitsel.v VINC4, VI1, VI2, VT0
  504. xvfcmp.clt.s VT0, x4, x3
  505. xvbitsel.v VM0, x3, x4, VT0
  506. xvbitsel.v VINC8, VI3, VI4, VT0
  507. xvfcmp.clt.s VT0, VM1, VM0
  508. xvbitsel.v VM0, VM0, VM1, VT0
  509. #endif
  510. xvbitsel.v VI0, VINC8, VINC4, VT0
  511. fcmp.ceq.d $fcc0, $f15, $f9
  512. bceqz $fcc0, .L26
  513. XVCMPLT VT0, VI1, VI0
  514. xvbitsel.v VI0, VI0, VI1, VT0
  515. .align 3
  516. .L26:
  517. fcmp.ceq.d $fcc0, $f15, $f10
  518. bceqz $fcc0, .L27
  519. XVCMPLT VT0, VI2, VI0
  520. xvbitsel.v VI0, VI0, VI2, VT0
  521. .align 3
  522. .L27:
  523. fcmp.ceq.d $fcc0, $f15, $f11
  524. bceqz $fcc0, .L28
  525. XVCMPLT VT0, VI3, VI0
  526. xvbitsel.v VI0, VI0, VI3, VT0
  527. .align 3
  528. .L28:
  529. fcmp.ceq.d $fcc0, $f15, $f12
  530. bceqz $fcc0, .L29
  531. XVCMPLT VT0, VI4, VI0
  532. xvbitsel.v VI0, VI0, VI4, VT0
  533. .align 3
  534. .L29:
  535. #ifdef DOUBLE
  536. movfr2gr.d i0, $f20
  537. .align 3
  538. .L21: //N<4
  539. andi I, N, 3
  540. bge $r0, I, .L999
  541. srai.d i1, N, 2
  542. slli.d i1, i1, 2
  543. #else
  544. fmov.s $f16, $f20
  545. .align 3
  546. .L252:
  547. xvxor.v VI0, VI0, VI0
  548. xvor.v VI0, VI0, VX0
  549. fmov.s $f13, $f15
  550. xvxor.v VM0, VM0, VM0
  551. xvor.v VM0, VM0, VX1
  552. xvpickve.w VI1, VI0, 4
  553. xvpickve.w VI2, VI0, 5
  554. xvpickve.w VI3, VI0, 6
  555. xvpickve.w VI4, VI0, 7
  556. xvpickve.w x1, VM0, 4
  557. xvpickve.w x2, VM0, 5
  558. xvpickve.w x3, VM0, 6
  559. xvpickve.w x4, VM0, 7
  560. xvfcmp.clt.s VT0, x2, x1
  561. xvbitsel.v x1, x1, x2, VT0
  562. xvbitsel.v VINC4, VI1, VI2, VT0
  563. xvfcmp.clt.s VT0, x4, x3
  564. xvbitsel.v VM0, x3, x4, VT0
  565. xvbitsel.v VINC8, VI3, VI4, VT0
  566. xvfcmp.clt.s VT0, x1, VM0
  567. xvbitsel.v VM0, VM0, x1, VT0
  568. xvbitsel.v VI0, VINC8, VINC4, VT0
  569. fcmp.ceq.d $fcc0, $f15, $f9
  570. bceqz $fcc0, .L262
  571. xvfcmp.clt.s VT0, VI1, VI0
  572. xvbitsel.v VI0, VI0, VI1, VT0
  573. .align 3
  574. .L262:
  575. fcmp.ceq.d $fcc0, $f15, $f10
  576. bceqz $fcc0, .L272
  577. xvfcmp.clt.s VT0, VI2, VI0
  578. xvbitsel.v VI0, VI0, VI2, VT0
  579. .align 3
  580. .L272:
  581. fcmp.ceq.d $fcc0, $f15, $f11
  582. bceqz $fcc0, .L282
  583. xvfcmp.clt.s VT0, VI3, VI0
  584. xvbitsel.v VI0, VI0, VI3, VT0
  585. .align 3
  586. .L282:
  587. fcmp.ceq.d $fcc0, $f15, $f12
  588. bceqz $fcc0, .L292
  589. xvfcmp.clt.s VT0, VI4, VI0
  590. xvbitsel.v VI0, VI0, VI4, VT0
  591. .align 3
  592. .L292:
  593. fcmp.clt.s $fcc0, $f13, $f15
  594. fsel $f15, $f15, $f13, $fcc0
  595. fsel $f20, $f20, $f16, $fcc0
  596. movfr2gr.s i0, $f20
  597. .L21: //N<8
  598. andi I, N, 7
  599. bge $r0, I, .L999
  600. srai.d i1, N, 3
  601. slli.d i1, i1, 3
  602. #endif
  603. addi.d i1, i1, 1 //current index
  604. movgr2fr.d $f21, i1
  605. movgr2fr.d $f20, i0
  606. .align 3
  607. .L22:
  608. LD a0, X, 0 * SIZE
  609. LD a1, X, 1 * SIZE
  610. addi.d I, I, -1
  611. FABS a0, a0
  612. FABS a1, a1
  613. ADD a0, a0, a1
  614. FMIN a1, s1, a0
  615. CMPEQ $fcc0, s1, a1
  616. add.d X, X, INCX
  617. fsel s1, a1, s1, $fcc0
  618. fsel $f20, $f21, $f20, $fcc0
  619. addi.d i1, i1, 1
  620. movgr2fr.d $f21, i1
  621. blt $r0, I, .L22
  622. MTG i0, $f20
  623. .align 3
  624. .L999:
  625. move $r4, $r17
  626. jirl $r0, $r1, 0x0
  627. .align 3
  628. EPILOGUE