
axpy_lasx.S

/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
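
/* AXPY kernel for LoongArch64 using the 256-bit LASX vector extension:
 * computes Y := ALPHA * X + Y over N elements with strides INCX and INCY.
 * A rough scalar C sketch of the same operation (for reference only):
 *
 *     for (i = 0; i < n; i++)
 *         y[i * incy] += alpha * x[i * incx];
 *
 * The routine returns immediately when N <= 0 or ALPHA == 0, and uses a
 * plain vector add instead of an FMA when ALPHA == 1 and both strides are 1.
 */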
#define ASSEMBLER

#include "common.h"

#define N      $r4
#define XX     $r5
#define YY     $r6
#define ALPHA  $f0
#define X      $r7
#define INCX   $r8
#define Y      $r9
#define INCY   $r10
#define I      $r12
#define TEMP   $r13
#define t1     $r14
#define t2     $r16
#define t3     $r15
#define t4     $r17
#define a1     $f12
#define a2     $f13
#define a3     $f14
#define a4     $f15
#define b1     $f16
#define b2     $f17
#define b3     $f18
#define b4     $f19
#define VX0    $xr8
#define VX1    $xr20
#define VX2    $xr21
#define VX3    $xr22
#define VXA    $xr23

    PROLOGUE

    bge $r0, N, .L999
    li.d TEMP, 1
    movgr2fr.d a1, $r0
    FFINT a1, a1
    movgr2fr.d a2, TEMP
    FFINT a2, a2
    CMPEQ $fcc0, ALPHA, a1
    bcnez $fcc0, .L999
    slli.d TEMP, TEMP, BASE_SHIFT
    slli.d INCX, INCX, BASE_SHIFT
    slli.d INCY, INCY, BASE_SHIFT
    MTG t1, ALPHA
#ifdef DOUBLE
    xvreplgr2vr.d VXA, t1
#else
    xvreplgr2vr.w VXA, t1
#endif
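    /* I counts full 8-element blocks; the branches below pick a loop body
     * specialized for unit vs. non-unit strides of X and Y. */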
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bne INCY, TEMP, .L12    // INCX==1 and INCY!=1
    b .L11                  // INCX==1 and INCY==1

.L20:
    bne INCY, TEMP, .L22    // INCX!=1 and INCY!=1
    b .L21                  // INCX!=1 and INCY==1
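
/* INCX==1 and INCY==1: both vectors are contiguous. */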
.L11:
    bge $r0, I, .L113
    CMPEQ $fcc0, ALPHA, a2
    bceqz $fcc0, .L112
    .align 3
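/* ALPHA == 1.0: a plain vector add Y += X is enough. */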
.L111:
#ifdef DOUBLE
    xvld VX0, X, 0 * SIZE
    xvld VX2, Y, 0 * SIZE
    xvld VX1, X, 4 * SIZE
    xvld VX3, Y, 4 * SIZE
    xvfadd.d VX2, VX0, VX2
    xvfadd.d VX3, VX1, VX3
    addi.d I, I, -1
    xvst VX2, Y, 0 * SIZE
    xvst VX3, Y, 4 * SIZE
#else
    xvld VX0, X, 0 * SIZE
    xvld VX2, Y, 0 * SIZE
    addi.d I, I, -1
    xvfadd.s VX2, VX0, VX2
    xvst VX2, Y, 0 * SIZE
#endif
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L111
    b .L113
    .align 3
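/* General ALPHA: fused multiply-add Y += ALPHA * X, 8 elements per pass. */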
.L112:
#ifdef DOUBLE
    xvld VX0, X, 0 * SIZE
    xvld VX2, Y, 0 * SIZE
    xvld VX1, X, 4 * SIZE
    xvld VX3, Y, 4 * SIZE
    xvfmadd.d VX2, VX0, VXA, VX2
    xvfmadd.d VX3, VX1, VXA, VX3
    addi.d I, I, -1
    xvst VX2, Y, 0 * SIZE
    xvst VX3, Y, 4 * SIZE
#else
    xvld VX0, X, 0 * SIZE
    xvld VX2, Y, 0 * SIZE
    addi.d I, I, -1
    xvfmadd.s VX2, VX0, VXA, VX2
    xvst VX2, Y, 0 * SIZE
#endif
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L112
    .align 3

.L113:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3
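/* Scalar tail: process the remaining N % 8 elements one at a time. */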
.L114:
    LD $f12, X, 0 * SIZE
    LD $f14, Y, 0 * SIZE
    addi.d I, I, -1
    MADD $f14, $f12, $f0, $f14
    ST $f14, Y, 0 * SIZE
    addi.d X, X, SIZE
    addi.d Y, Y, SIZE
    blt $r0, I, .L114
    b .L999
    .align 3
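/* X contiguous, Y strided: vector-load X, gather Y lane by lane via INCY,
 * and scatter the results back through the separate pointer YY. */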
.L12:    // INCX==1 and INCY!=1
    bge $r0, I, .L122
    move YY, Y
    .align 3

.L121:
#ifdef DOUBLE
    xvld VX0, X, 0 * SIZE
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t4, Y, 0 * SIZE
    xvinsgr2vr.d VX2, t1, 0
    xvinsgr2vr.d VX2, t2, 1
    xvinsgr2vr.d VX2, t3, 2
    xvinsgr2vr.d VX2, t4, 3
    add.d Y, Y, INCY
    xvfmadd.d VX2, VX0, VXA, VX2
    xvld VX1, X, 4 * SIZE
    xvstelm.d VX2, YY, 0, 0
    add.d YY, YY, INCY
    xvstelm.d VX2, YY, 0, 1
    add.d YY, YY, INCY
    xvstelm.d VX2, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VX2, YY, 0, 3
    add.d YY, YY, INCY
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t4, Y, 0 * SIZE
    xvinsgr2vr.d VX3, t1, 0
    xvinsgr2vr.d VX3, t2, 1
    xvinsgr2vr.d VX3, t3, 2
    xvinsgr2vr.d VX3, t4, 3
    add.d Y, Y, INCY
    xvfmadd.d VX3, VX1, VXA, VX3
    addi.d I, I, -1
    xvstelm.d VX3, YY, 0, 0
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 1
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 3
#else
    xvld VX0, X, 0 * SIZE
    ld.w t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t4, Y, 0 * SIZE
    xvinsgr2vr.w VX2, t1, 0
    xvinsgr2vr.w VX2, t2, 1
    xvinsgr2vr.w VX2, t3, 2
    xvinsgr2vr.w VX2, t4, 3
    add.d Y, Y, INCY
    ld.w t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t4, Y, 0 * SIZE
    xvinsgr2vr.w VX2, t1, 4
    xvinsgr2vr.w VX2, t2, 5
    xvinsgr2vr.w VX2, t3, 6
    xvinsgr2vr.w VX2, t4, 7
    add.d Y, Y, INCY
    xvfmadd.s VX2, VX0, VXA, VX2
    addi.d I, I, -1
    xvstelm.w VX2, YY, 0, 0
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 1
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 3
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 4
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 5
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 6
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 7
#endif
    add.d YY, YY, INCY
    addi.d X, X, 8 * SIZE
    blt $r0, I, .L121
    .align 3

.L122:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L123:
    LD $f12, X, 0 * SIZE
    LD $f14, Y, 0 * SIZE
    addi.d I, I, -1
    MADD $f14, $f12, $f0, $f14
    ST $f14, Y, 0 * SIZE
    addi.d X, X, SIZE
    add.d Y, Y, INCY
    blt $r0, I, .L123
    b .L999
    .align 3
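/* X strided, Y contiguous: gather X lane by lane via INCX, then use full
 * vector loads and stores on Y. */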
.L21:    // INCX!=1 and INCY==1
    bge $r0, I, .L212
    .align 3

.L211:
#ifdef DOUBLE
    xvld VX2, Y, 0 * SIZE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    add.d X, X, INCX
    xvfmadd.d VX2, VX0, VXA, VX2
    xvld VX3, Y, 4 * SIZE
    xvst VX2, Y, 0 * SIZE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    add.d X, X, INCX
    xvfmadd.d VX3, VX1, VXA, VX3
    addi.d I, I, -1
    xvst VX3, Y, 4 * SIZE
    addi.d Y, Y, 8 * SIZE
#else
    xvld VX2, Y, 0 * SIZE
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    xvinsgr2vr.w VX0, t1, 0
    xvinsgr2vr.w VX0, t2, 1
    xvinsgr2vr.w VX0, t3, 2
    xvinsgr2vr.w VX0, t4, 3
    add.d X, X, INCX
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 4
    xvinsgr2vr.w VX0, t2, 5
    xvinsgr2vr.w VX0, t3, 6
    xvinsgr2vr.w VX0, t4, 7
    xvfmadd.s VX2, VX0, VXA, VX2
    addi.d I, I, -1
    xvst VX2, Y, 0 * SIZE
    addi.d Y, Y, 8 * SIZE
#endif
    blt $r0, I, .L211
    .align 3

.L212:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L213:
    LD $f12, X, 0 * SIZE
    LD $f14, Y, 0 * SIZE
    addi.d I, I, -1
    MADD $f14, $f12, $f0, $f14
    ST $f14, Y, 0 * SIZE
    add.d X, X, INCX
    addi.d Y, Y, SIZE
    blt $r0, I, .L213
    b .L999
    .align 3
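/* Both strides non-unit: gather X and Y lane by lane, then scatter the
 * result through YY. */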
.L22:
    bge $r0, I, .L223
    move YY, Y
    .align 3

.L222:
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    add.d X, X, INCX
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t4, Y, 0 * SIZE
    xvinsgr2vr.d VX2, t1, 0
    xvinsgr2vr.d VX2, t2, 1
    xvinsgr2vr.d VX2, t3, 2
    xvinsgr2vr.d VX2, t4, 3
    add.d Y, Y, INCY
    xvfmadd.d VX2, VX0, VXA, VX2
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvstelm.d VX2, YY, 0, 0
    add.d YY, YY, INCY
    xvstelm.d VX2, YY, 0, 1
    add.d YY, YY, INCY
    xvstelm.d VX2, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VX2, YY, 0, 3
    add.d YY, YY, INCY
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t4, Y, 0 * SIZE
    xvinsgr2vr.d VX3, t1, 0
    xvinsgr2vr.d VX3, t2, 1
    xvinsgr2vr.d VX3, t3, 2
    xvinsgr2vr.d VX3, t4, 3
    add.d Y, Y, INCY
    xvfmadd.d VX3, VX1, VXA, VX3
    addi.d I, I, -1
    xvstelm.d VX3, YY, 0, 0
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 1
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 3
#else
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 0
    xvinsgr2vr.w VX0, t2, 1
    xvinsgr2vr.w VX0, t3, 2
    xvinsgr2vr.w VX0, t4, 3
    ld.w t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t4, Y, 0 * SIZE
    add.d Y, Y, INCY
    xvinsgr2vr.w VX2, t1, 0
    xvinsgr2vr.w VX2, t2, 1
    xvinsgr2vr.w VX2, t3, 2
    xvinsgr2vr.w VX2, t4, 3
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 4
    xvinsgr2vr.w VX0, t2, 5
    xvinsgr2vr.w VX0, t3, 6
    xvinsgr2vr.w VX0, t4, 7
    ld.w t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t4, Y, 0 * SIZE
    xvinsgr2vr.w VX2, t1, 4
    xvinsgr2vr.w VX2, t2, 5
    xvinsgr2vr.w VX2, t3, 6
    xvinsgr2vr.w VX2, t4, 7
    add.d Y, Y, INCY
    xvfmadd.s VX2, VX0, VXA, VX2
    addi.d I, I, -1
    xvstelm.w VX2, YY, 0, 0
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 1
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 3
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 4
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 5
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 6
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 7
#endif
    add.d YY, YY, INCY
    blt $r0, I, .L222
    .align 3

.L223:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L224:
    LD $f12, X, 0 * SIZE
    LD $f14, Y, 0 * SIZE
    addi.d I, I, -1
    MADD $f14, $f12, $f0, $f14
    ST $f14, Y, 0 * SIZE
    add.d X, X, INCX
    add.d Y, Y, INCY
    blt $r0, I, .L224
    .align 3

.L999:
    move $r4, $r12
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE