/* axpy_lsx.S — LoongArch64 LSX AXPY kernel (OpenBLAS).
 * NOTE(review): web-viewer chrome and the rendered line-number gutter were
 * removed from this scrape; only the source text below is meaningful. */
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* AXPY kernel for LoongArch64 using LSX (128-bit SIMD):
 *     y[i] = alpha * x[i] + y[i]   for i = 0 .. N-1
 *
 * Assembled twice by the build system: with -DDOUBLE for daxpy
 * (2 doubles per vector register) and without for saxpy (4 floats
 * per vector register).  SIZE, BASE_SHIFT and the scalar macros
 * FFINT / CMPEQ / MTG / LD / ST / MADD are supplied by "common.h"
 * for the selected precision.
 *
 * Main loops process 8 elements per iteration; a scalar tail loop
 * handles the remaining N % 8 elements.  Four specializations are
 * dispatched on the increments:
 *   .L11  INCX==1, INCY==1   .L12  INCX==1, INCY!=1
 *   .L21  INCX!=1, INCY==1   .L22  INCX!=1, INCY!=1
 */
#define ASSEMBLER
#include "common.h"

/* ---- argument / scratch register aliases ---- */
#define N      $r4            // element count (first integer argument)
#define XX     $r5            // NOTE(review): defined but unused in this file
#define YY     $r6            // write cursor for y in the strided-store paths
#define ALPHA  $f0            // scalar alpha (first FP argument)
#define X      $r7            // pointer to x
#define INCX   $r8            // x stride: elements on entry, bytes after scaling
#define Y      $r9            // pointer to y (read cursor in strided paths)
#define INCY   $r10           // y stride: elements on entry, bytes after scaling
#define I      $r12           // loop counter
#define TEMP   $r13           // holds 1, then SIZE, for unit-stride comparison
#define t1     $r14
#define t2     $r16
#define t3     $r15
#define t4     $r17
#define a1     $f12
#define a2     $f13
#define a3     $f14
#define a4     $f15
#define b1     $f16
#define b2     $f17
#define b3     $f18
#define b4     $f19
#define VX0    $vr8           // vector data loaded from x
#define VX1    $vr20
#define VX2    $vr21          // vector data loaded from y / accumulation result
#define VX3    $vr22
#define VXA    $vr23          // alpha broadcast to every lane

    PROLOGUE

    bge     $r0, N, .L999           // n <= 0: nothing to do
    li.d    TEMP, 1
    movgr2fr.d a1, $r0
    FFINT   a1, a1                  // a1 = 0.0 (int->FP convert of 0)
    movgr2fr.d a2, TEMP
    FFINT   a2, a2                  // a2 = 1.0
    CMPEQ   $fcc0, ALPHA, a1
    bcnez   $fcc0, .L999            // alpha == 0: y is unchanged, early out
    slli.d  TEMP, TEMP, BASE_SHIFT  // TEMP = SIZE (element size in bytes)
    slli.d  INCX, INCX, BASE_SHIFT  // convert strides from elements to bytes
    slli.d  INCY, INCY, BASE_SHIFT
    MTG     t1, ALPHA               // move alpha's bit pattern to a GPR
#ifdef DOUBLE
    vreplgr2vr.d VXA, t1            // VXA = { alpha, alpha }
#else
    vreplgr2vr.w VXA, t1            // VXA = { alpha, alpha, alpha, alpha }
#endif
    srai.d  I, N, 3                 // I = N / 8 vector iterations
    bne     INCX, TEMP, .L20
    bne     INCY, TEMP, .L12        // INCX==1 and INCY!=1
    b       .L11                    // INCX==1 and INCY==1
.L20:
    bne     INCY, TEMP, .L22        // INCX!=1 and INCY!=1
    b       .L21                    // INCX!=1 and INCY==1

/* ======== path 1: both vectors contiguous ======== */
.L11:
    bge     $r0, I, .L113           // fewer than 8 elements: scalar tail only
    CMPEQ   $fcc0, ALPHA, a2
    bceqz   $fcc0, .L112            // alpha != 1.0: take the fmadd loop
    .align 3

/* alpha == 1.0 fast path: y += x, no multiply needed */
.L111:
#ifdef DOUBLE
    vld     VX0, X, 0 * SIZE
    vld     VX2, Y, 0 * SIZE
    vld     VX1, X, 2 * SIZE
    vld     VX3, Y, 2 * SIZE
    vfadd.d VX2, VX0, VX2
    vfadd.d VX3, VX1, VX3
    vst     VX2, Y, 0 * SIZE
    vst     VX3, Y, 2 * SIZE
    vld     VX0, X, 4 * SIZE
    vld     VX2, Y, 4 * SIZE
    vld     VX1, X, 6 * SIZE
    vld     VX3, Y, 6 * SIZE
    vfadd.d VX2, VX0, VX2
    vfadd.d VX3, VX1, VX3
    vst     VX2, Y, 4 * SIZE
    vst     VX3, Y, 6 * SIZE
#else
    vld     VX0, X, 0 * SIZE
    vld     VX2, Y, 0 * SIZE
    vld     VX1, X, 4 * SIZE
    vld     VX3, Y, 4 * SIZE
    vfadd.s VX2, VX0, VX2
    vfadd.s VX3, VX1, VX3
    vst     VX2, Y, 0 * SIZE
    vst     VX3, Y, 4 * SIZE
#endif
    addi.d  X, X, 8 * SIZE
    addi.d  Y, Y, 8 * SIZE
    addi.d  I, I, -1
    blt     $r0, I, .L111
    b       .L113
    .align 3

/* general alpha: y = alpha*x + y via vector fused multiply-add */
.L112:
#ifdef DOUBLE
    vld     VX0, X, 0 * SIZE
    vld     VX2, Y, 0 * SIZE
    vld     VX1, X, 2 * SIZE
    vld     VX3, Y, 2 * SIZE
    vfmadd.d VX2, VX0, VXA, VX2     // VX2 = x*alpha + y
    vfmadd.d VX3, VX1, VXA, VX3
    addi.d  I, I, -1
    vst     VX2, Y, 0 * SIZE
    vst     VX3, Y, 2 * SIZE
    vld     VX0, X, 4 * SIZE
    vld     VX2, Y, 4 * SIZE
    vld     VX1, X, 6 * SIZE
    vld     VX3, Y, 6 * SIZE
    addi.d  X, X, 8 * SIZE
    vfmadd.d VX2, VX0, VXA, VX2
    vfmadd.d VX3, VX1, VXA, VX3
    vst     VX2, Y, 4 * SIZE
    vst     VX3, Y, 6 * SIZE
    addi.d  Y, Y, 8 * SIZE
#else
    vld     VX0, X, 0 * SIZE
    vld     VX2, Y, 0 * SIZE
    vld     VX1, X, 4 * SIZE
    vld     VX3, Y, 4 * SIZE
    vfmadd.s VX2, VX0, VXA, VX2
    vfmadd.s VX3, VX1, VXA, VX3
    vst     VX2, Y, 0 * SIZE
    vst     VX3, Y, 4 * SIZE
    addi.d  X, X, 8 * SIZE
    addi.d  Y, Y, 8 * SIZE
    addi.d  I, I, -1
#endif
    blt     $r0, I, .L112
    .align 3

/* scalar tail for the contiguous path: N % 8 elements */
.L113:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3
.L114:
    LD      $f12, X, 0 * SIZE
    LD      $f14, Y, 0 * SIZE
    addi.d  I, I, -1
    MADD    $f14, $f12, $f0, $f14   // f14 = x*alpha + y
    ST      $f14, Y, 0 * SIZE
    addi.d  X, X, SIZE
    addi.d  Y, Y, SIZE
    blt     $r0, I, .L114
    b       .L999
    .align 3

/* ======== path 2: INCX==1 and INCY!=1 ========
 * x is loaded with full vector loads; y elements are gathered one at a
 * time through Y (read cursor) and scattered back through YY (write
 * cursor), each advanced by INCY bytes. */
.L12:
    bge     $r0, I, .L122
    move    YY, Y                   // YY trails Y: stores land where loads came from
    .align 3
.L121:
#ifdef DOUBLE
    vld     VX0, X, 0 * SIZE
    ld.d    t1, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.d    t2, Y, 0 * SIZE
    vinsgr2vr.d VX2, t1, 0          // pack two strided y values into VX2
    vinsgr2vr.d VX2, t2, 1
    add.d   Y, Y, INCY
    vfmadd.d VX2, VX0, VXA, VX2
    vld     VX1, X, 2 * SIZE
    vstelm.d VX2, YY, 0, 0          // scatter lane 0 back to y
    add.d   YY, YY, INCY
    vstelm.d VX2, YY, 0, 1
    add.d   YY, YY, INCY
    ld.d    t3, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.d    t4, Y, 0 * SIZE
    vinsgr2vr.d VX3, t3, 0
    vinsgr2vr.d VX3, t4, 1
    add.d   Y, Y, INCY
    vfmadd.d VX3, VX1, VXA, VX3
    vld     VX0, X, 4 * SIZE
    vstelm.d VX3, YY, 0, 0
    add.d   YY, YY, INCY
    vstelm.d VX3, YY, 0, 1
    add.d   YY, YY, INCY
    ld.d    t1, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.d    t2, Y, 0 * SIZE
    vinsgr2vr.d VX2, t1, 0
    vinsgr2vr.d VX2, t2, 1
    add.d   Y, Y, INCY
    vfmadd.d VX2, VX0, VXA, VX2
    vld     VX1, X, 6 * SIZE
    vstelm.d VX2, YY, 0, 0
    add.d   YY, YY, INCY
    vstelm.d VX2, YY, 0, 1
    add.d   YY, YY, INCY
    ld.d    t3, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.d    t4, Y, 0 * SIZE
    vinsgr2vr.d VX3, t3, 0
    vinsgr2vr.d VX3, t4, 1
    add.d   Y, Y, INCY
    vfmadd.d VX3, VX1, VXA, VX3
    vstelm.d VX3, YY, 0, 0
    add.d   YY, YY, INCY
    vstelm.d VX3, YY, 0, 1
    add.d   YY, YY, INCY
    addi.d  X, X, 8 * SIZE
    addi.d  I, I, -1
#else
    vld     VX0, X, 0 * SIZE
    ld.w    t1, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.w    t2, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.w    t3, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.w    t4, Y, 0 * SIZE
    vinsgr2vr.w VX2, t1, 0          // pack four strided y values into VX2
    vinsgr2vr.w VX2, t2, 1
    vinsgr2vr.w VX2, t3, 2
    vinsgr2vr.w VX2, t4, 3
    add.d   Y, Y, INCY
    vfmadd.s VX2, VX0, VXA, VX2
    vld     VX1, X, 4 * SIZE
    vstelm.w VX2, YY, 0, 0
    add.d   YY, YY, INCY
    vstelm.w VX2, YY, 0, 1
    add.d   YY, YY, INCY
    vstelm.w VX2, YY, 0, 2
    add.d   YY, YY, INCY
    vstelm.w VX2, YY, 0, 3
    add.d   YY, YY, INCY
    ld.w    t1, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.w    t2, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.w    t3, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.w    t4, Y, 0 * SIZE
    vinsgr2vr.w VX3, t1, 0
    vinsgr2vr.w VX3, t2, 1
    vinsgr2vr.w VX3, t3, 2
    vinsgr2vr.w VX3, t4, 3
    add.d   Y, Y, INCY
    vfmadd.s VX3, VX1, VXA, VX3
    addi.d  I, I, -1
    vstelm.w VX3, YY, 0, 0
    add.d   YY, YY, INCY
    vstelm.w VX3, YY, 0, 1
    add.d   YY, YY, INCY
    vstelm.w VX3, YY, 0, 2
    add.d   YY, YY, INCY
    vstelm.w VX3, YY, 0, 3
    add.d   YY, YY, INCY
    addi.d  X, X, 8 * SIZE
#endif
    blt     $r0, I, .L121
    .align 3

/* scalar tail: contiguous x, strided y */
.L122:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3
.L123:
    LD      $f12, X, 0 * SIZE
    LD      $f14, Y, 0 * SIZE
    addi.d  I, I, -1
    MADD    $f14, $f12, $f0, $f14
    ST      $f14, Y, 0 * SIZE
    addi.d  X, X, SIZE
    add.d   Y, Y, INCY
    blt     $r0, I, .L123
    b       .L999
    .align 3

/* ======== path 3: INCX!=1 and INCY==1 ========
 * x elements are gathered one at a time; y is loaded/stored with
 * full vector loads and stores in place. */
.L21:
    bge     $r0, I, .L212
    .align 3
.L211:
#ifdef DOUBLE
    vld     VX2, Y, 0 * SIZE
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    vinsgr2vr.d VX0, t1, 0          // pack two strided x values into VX0
    vinsgr2vr.d VX0, t2, 1
    add.d   X, X, INCX
    vfmadd.d VX2, VX0, VXA, VX2
    vld     VX3, Y, 2 * SIZE
    vst     VX2, Y, 0 * SIZE
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    add.d   X, X, INCX
    vfmadd.d VX3, VX1, VXA, VX3
    vld     VX2, Y, 4 * SIZE
    vst     VX3, Y, 2 * SIZE
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    add.d   X, X, INCX
    vfmadd.d VX2, VX0, VXA, VX2
    vld     VX3, Y, 6 * SIZE
    vst     VX2, Y, 4 * SIZE
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    add.d   X, X, INCX
    vfmadd.d VX3, VX1, VXA, VX3
    addi.d  I, I, -1
    vst     VX3, Y, 6 * SIZE
#else
    vld     VX2, Y, 0 * SIZE
    ld.w    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t4, X, 0 * SIZE
    vinsgr2vr.w VX0, t1, 0          // pack four strided x values into VX0
    vinsgr2vr.w VX0, t2, 1
    vinsgr2vr.w VX0, t3, 2
    vinsgr2vr.w VX0, t4, 3
    add.d   X, X, INCX
    vfmadd.s VX2, VX0, VXA, VX2
    vld     VX3, Y, 4 * SIZE
    vst     VX2, Y, 0 * SIZE
    ld.w    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t4, X, 0 * SIZE
    vinsgr2vr.w VX1, t1, 0
    vinsgr2vr.w VX1, t2, 1
    vinsgr2vr.w VX1, t3, 2
    vinsgr2vr.w VX1, t4, 3
    add.d   X, X, INCX
    vfmadd.s VX3, VX1, VXA, VX3
    addi.d  I, I, -1
    vst     VX3, Y, 4 * SIZE
#endif
    addi.d  Y, Y, 8 * SIZE
    blt     $r0, I, .L211
    .align 3

/* scalar tail: strided x, contiguous y */
.L212:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3
.L213:
    LD      $f12, X, 0 * SIZE
    LD      $f14, Y, 0 * SIZE
    addi.d  I, I, -1
    MADD    $f14, $f12, $f0, $f14
    ST      $f14, Y, 0 * SIZE
    add.d   X, X, INCX
    addi.d  Y, Y, SIZE
    blt     $r0, I, .L213
    b       .L999
    .align 3

/* ======== path 4: INCX!=1 and INCY!=1 ========
 * Both vectors are gathered element-by-element; results are scattered
 * back through the trailing YY cursor. */
.L22:
    bge     $r0, I, .L223
    move    YY, Y
    .align 3
.L222:
#ifdef DOUBLE
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    ld.d    t1, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.d    t2, Y, 0 * SIZE
    vinsgr2vr.d VX2, t1, 0
    vinsgr2vr.d VX2, t2, 1
    add.d   Y, Y, INCY
    vfmadd.d VX2, VX0, VXA, VX2
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    vstelm.d VX2, YY, 0, 0
    add.d   YY, YY, INCY
    vstelm.d VX2, YY, 0, 1
    add.d   YY, YY, INCY
    ld.d    t3, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.d    t4, Y, 0 * SIZE
    vinsgr2vr.d VX3, t3, 0
    vinsgr2vr.d VX3, t4, 1
    add.d   Y, Y, INCY
    vfmadd.d VX3, VX1, VXA, VX3
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    vstelm.d VX3, YY, 0, 0
    add.d   YY, YY, INCY
    vstelm.d VX3, YY, 0, 1
    add.d   YY, YY, INCY
    ld.d    t1, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.d    t2, Y, 0 * SIZE
    vinsgr2vr.d VX2, t1, 0
    vinsgr2vr.d VX2, t2, 1
    add.d   Y, Y, INCY
    vfmadd.d VX2, VX0, VXA, VX2
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    vstelm.d VX2, YY, 0, 0
    add.d   YY, YY, INCY
    vstelm.d VX2, YY, 0, 1
    add.d   YY, YY, INCY
    ld.d    t1, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.d    t2, Y, 0 * SIZE
    vinsgr2vr.d VX3, t1, 0
    vinsgr2vr.d VX3, t2, 1
    add.d   Y, Y, INCY
    vfmadd.d VX3, VX1, VXA, VX3
    addi.d  I, I, -1
    vstelm.d VX3, YY, 0, 0
    add.d   YY, YY, INCY
    vstelm.d VX3, YY, 0, 1
#else
    ld.w    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t4, X, 0 * SIZE
    vinsgr2vr.w VX0, t1, 0
    vinsgr2vr.w VX0, t2, 1
    vinsgr2vr.w VX0, t3, 2
    vinsgr2vr.w VX0, t4, 3
    add.d   X, X, INCX
    ld.w    t1, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.w    t2, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.w    t3, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.w    t4, Y, 0 * SIZE
    vinsgr2vr.w VX2, t1, 0
    vinsgr2vr.w VX2, t2, 1
    vinsgr2vr.w VX2, t3, 2
    vinsgr2vr.w VX2, t4, 3
    add.d   Y, Y, INCY
    vfmadd.s VX2, VX0, VXA, VX2
    ld.w    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.w    t4, X, 0 * SIZE
    add.d   X, X, INCX
    vinsgr2vr.w VX1, t1, 0
    vinsgr2vr.w VX1, t2, 1
    vinsgr2vr.w VX1, t3, 2
    vinsgr2vr.w VX1, t4, 3
    vstelm.w VX2, YY, 0, 0
    add.d   YY, YY, INCY
    vstelm.w VX2, YY, 0, 1
    add.d   YY, YY, INCY
    vstelm.w VX2, YY, 0, 2
    add.d   YY, YY, INCY
    vstelm.w VX2, YY, 0, 3
    add.d   YY, YY, INCY
    ld.w    t1, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.w    t2, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.w    t3, Y, 0 * SIZE
    add.d   Y, Y, INCY
    ld.w    t4, Y, 0 * SIZE
    vinsgr2vr.w VX3, t1, 0
    vinsgr2vr.w VX3, t2, 1
    vinsgr2vr.w VX3, t3, 2
    vinsgr2vr.w VX3, t4, 3
    add.d   Y, Y, INCY
    vfmadd.s VX3, VX1, VXA, VX3
    addi.d  I, I, -1
    vstelm.w VX3, YY, 0, 0
    add.d   YY, YY, INCY
    vstelm.w VX3, YY, 0, 1
    add.d   YY, YY, INCY
    vstelm.w VX3, YY, 0, 2
    add.d   YY, YY, INCY
    vstelm.w VX3, YY, 0, 3
#endif
    add.d   YY, YY, INCY            // advance past the last scattered element
    blt     $r0, I, .L222
    .align 3

/* scalar tail: both strided */
.L223:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3
.L224:
    LD      $f12, X, 0 * SIZE
    LD      $f14, Y, 0 * SIZE
    addi.d  I, I, -1
    MADD    $f14, $f12, $f0, $f14
    ST      $f14, Y, 0 * SIZE
    add.d   X, X, INCX
    add.d   Y, Y, INCY
    blt     $r0, I, .L224
    .align 3

.L999:
    move    $r4, $r12               // return value in $r4 (callers ignore it)
    jirl    $r0, $r1, 0x0           // return to caller ($r1 = link register)
    .align 3

    EPILOGUE