You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

daxpy_loongson3a_simd.S 16 kB


  1. /*****************************************************************************
  2. Copyright (c) 2011-2014, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written
  16. permission.
  17. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  23. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  24. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  25. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  26. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. **********************************************************************************/
  28. /*********************************************************************/
  29. /* Copyright 2009, 2010 The University of Texas at Austin. */
  30. /* All rights reserved. */
  31. /* */
  32. /* Redistribution and use in source and binary forms, with or */
  33. /* without modification, are permitted provided that the following */
  34. /* conditions are met: */
  35. /* */
  36. /* 1. Redistributions of source code must retain the above */
  37. /* copyright notice, this list of conditions and the following */
  38. /* disclaimer. */
  39. /* */
  40. /* 2. Redistributions in binary form must reproduce the above */
  41. /* copyright notice, this list of conditions and the following */
  42. /* disclaimer in the documentation and/or other materials */
  43. /* provided with the distribution. */
  44. /* */
  45. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  46. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  47. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  48. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  49. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  50. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  51. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  52. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  53. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  54. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  55. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  56. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  57. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  58. /* POSSIBILITY OF SUCH DAMAGE. */
  59. /* */
  60. /* The views and conclusions contained in the software and */
  61. /* documentation are those of the authors and should not be */
  62. /* interpreted as representing official policies, either expressed */
  63. /* or implied, of The University of Texas at Austin. */
  64. /*********************************************************************/
  // ASSEMBLER is defined before including common.h, which is expected to supply
  // the names used below but not defined in this file: SIZE, BASE_SHIFT, LD, ST,
  // MADD, NOP, PREFETCHD, PROLOGUE and EPILOGUE.
  65. #define ASSEMBLER
  66. #include "common.h"
  // Prefetch look-ahead in elements; the loops scale it by SIZE to get a byte offset.
  67. #define PREFETCH_DISTANCE 2016
  // Integer registers holding the incoming operands.
  // NOTE(review): this register assignment is assumed to match the caller's
  // argument-passing convention - confirm against the C prototype and the ABI.
  //   N    - number of elements to process
  //   X    - pointer to the current X element
  //   INCX - X stride; shifted left by BASE_SHIFT on entry
  //   Y    - pointer to the current Y element
  //   INCY - Y stride; shifted left by BASE_SHIFT on entry
  68. #define N $4
  69. #define X $8
  70. #define INCX $9
  71. #define Y $10
  72. #define INCY $11
  // Integer scratch registers:
  //   I    - loop counter
  //   TEMP - temporary for the stride and alignment tests
  //   YY   - store pointer of the strided loop (trails the Y load pointer)
  73. #define I $2
  74. #define TEMP $3
  75. #define YY $5
  // Scalar multiplier used by every MADD in this file.
  76. #define ALPHA $f15
  // a1..a16: values loaded from X. The numbering skips $f15 (ALPHA) and $f16,
  // so a15 is $f14 and a16 is $f17.
  77. #define a1 $f0
  78. #define a2 $f1
  79. #define a3 $f2
  80. #define a4 $f3
  81. #define a5 $f4
  82. #define a6 $f5
  83. #define a7 $f6
  84. #define a8 $f7
  85. #define a9 $f8
  86. #define a10 $f9
  87. #define a11 $f10
  88. #define a12 $f11
  89. #define a13 $f12
  90. #define a14 $f13
  91. #define a15 $f14
  92. #define a16 $f17
  // t1..t4: MADD results waiting to be stored back to Y.
  93. #define t1 $f18
  94. #define t2 $f19
  95. #define t3 $f20
  96. #define t4 $f21
  // b1..b8: values loaded from Y.
  97. #define b1 $f22
  98. #define b2 $f23
  99. #define b3 $f24
  100. #define b4 $f25
  101. #define b5 $f26
  102. #define b6 $f27
  103. #define b7 $f28
  104. #define b8 $f29
  // Bare register numbers for the hand-encoded gsLQC1/gsSQC1 words below.
  // Each upper-case value must stay equal to the register number of the
  // lower-case alias above (A1 <-> a1 = $f0, ..., A16 <-> a16 = $f17,
  // T1 <-> t1 = $f18, ..., B8 <-> b8 = $f29).
  105. #define A1 0
  106. #define A2 1
  107. #define A3 2
  108. #define A4 3
  109. #define A5 4
  110. #define A6 5
  111. #define A7 6
  112. #define A8 7
  113. #define A9 8
  114. #define A10 9
  115. #define A11 10
  116. #define A12 11
  117. #define A13 12
  118. #define A14 13
  119. #define A15 14
  120. #define A16 17
  121. #define T1 18
  122. #define T2 19
  123. #define T3 20
  124. #define T4 21
  125. #define B1 22
  126. #define B2 23
  127. #define B3 24
  128. #define B4 25
  129. #define B5 26
  130. #define B6 27
  131. #define B7 28
  132. #define B8 29
  // Register numbers of X ($8) and Y ($10), used as the base field of the encoders.
  133. #define X_BASE 8
  134. #define Y_BASE 10
  // gsLQC1_/gsSQC1_ emit one raw instruction word with .word instead of a mnemonic.
  // Field layout, as built by the expression:
  //   bits 31..26 opcode (0x32 for the load, 0x3A for the store)
  //   bits 25..21 base   (integer register number)
  //   bits 20..16 ft     (FP register number)
  //   bit  15     set
  //   bits 14..6  offset
  //   bit  5      set
  //   bits  4..0  fq     (FP register number)
  // Presumably these are the Loongson 128-bit FP pair load/store (gslqc1/gssqc1);
  // the callers step offset by 1 per register pair, which suggests a 16-byte unit
  // - confirm against the Loongson instruction manual.
  // gsLQC1/gsSQC1 are the public wrappers: they parenthesize every argument before
  // it is spliced into the shift/or expression.
  135. #define gsLQC1_(base,fq,ft,offset) .word (0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
  136. #define gsLQC1(base,fq,ft,offset) gsLQC1_((base), (fq), (ft), (offset))
  137. #define gsSQC1_(base,fq,ft,offset) .word (0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
  138. #define gsSQC1(base,fq,ft,offset) gsSQC1_((base), (fq), (ft), (offset))
  // Kernel entry. Going by the file name this is the DAXPY kernel, i.e. it updates
  // each Y element with ALPHA times the matching X element, for N elements.
  // LD/ST/MADD come from common.h; MADD d, c, a, b is presumed to compute
  // d = c + a * b - confirm there.
  // NOTE(review): branches are written with explicit delay slots (the instruction
  // after a branch is meant to execute whether or not the branch is taken). This
  // assumes PROLOGUE puts the assembler in noreorder mode - confirm in common.h.
  139. PROLOGUE
  // Spill FP registers that are restored again on every exit path.
  //   without __64BIT__: $f20, $f22, $f24, $f26, $f28 (40 bytes of stack)
  //   with __64BIT__:    $f24 .. $f29                  (48 bytes of stack)
  // NOTE(review): the routine also writes $f21/$f23/$f25/$f27/$f29 in the
  // non-64-bit build and $f20..$f23 in the 64-bit build without saving them;
  // presumably those are caller-saved under the respective ABI - confirm.
  140. #ifndef __64BIT__
  141. daddiu $sp, $sp, -40
  142. sdc1 $f20, 0($sp)
  143. sdc1 $f22, 8($sp)
  144. sdc1 $f24, 16($sp)
  145. sdc1 $f26, 24($sp)
  146. sdc1 $f28, 32($sp)
  147. #else
  148. daddiu $sp, $sp, -48
  149. sdc1 $f24, 0($sp)
  150. sdc1 $f25, 8($sp)
  151. sdc1 $f26, 16($sp)
  152. sdc1 $f27, 24($sp)
  153. sdc1 $f28, 32($sp)
  154. sdc1 $f29, 40($sp)
  155. #endif
  // TEMP = SIZE, the value a scaled stride must equal to count as unit stride.
  156. li TEMP, SIZE
  // Nothing to do when N <= 0; .L999 restores the registers and returns.
  157. blez N, .L999
  // Delay slot: scale INCX by BASE_SHIFT.
  158. dsll INCX, INCX, BASE_SHIFT
  // Any non-unit stride is handled by the generic loop at .L20.
  159. bne INCX, TEMP, .L20
  // Delay slot: scale INCY by BASE_SHIFT; it sits in the slot so that it is done
  // on the way to .L20 as well.
  160. dsll INCY, INCY, BASE_SHIFT
  161. bne INCY, TEMP, .L20
  162. // Is the address of Y aligned to 16 bytes? Only bit 3 is tested, which presumes Y is at least 8-byte aligned.
  // (This andi also sits in the delay slot of the branch above; .L20 does not read TEMP.)
  163. andi TEMP, Y, 8
  164. beq TEMP, $0, .L10
  165. // Y is not aligned: handle this one leading element with scalar code.
  // (The LD of a1 is also the delay slot of the beq; the .L10 path reloads a1.)
  166. LD a1, 0 * SIZE(X)
  167. LD b1, 0 * SIZE(Y)
  168. daddiu X, X, SIZE
  169. daddiu Y, Y, SIZE
  170. MADD t1, b1, ALPHA, a1
  171. daddiu N, N, -1
  // Y has already been advanced, hence the -1 * SIZE displacement.
  172. ST t1, -1 * SIZE(Y)
  // Done if that was the only element. No explicit delay-slot instruction follows:
  // the slot is whatever the assembler places next (alignment padding or the
  // first instruction at .L10, which only writes I).
  173. blez N, .L999
  174. .align 5
  // Unit-stride path with Y 16-byte aligned: process blocks of 16 elements.
  // NOTE(review): branch delay slots are assumed to be explicit (noreorder mode,
  // presumably set by PROLOGUE) - confirm in common.h.
  175. .L10:
  // I = N / 16 = number of full 16-element blocks; none -> scalar tail at .L15.
  176. dsra I, N, 4
  177. blez I, .L15
  // Delay slot: I now counts the blocks left after the one preloaded at .L11
  // (.L15 recomputes I, so the decrement is harmless when the branch is taken).
  178. daddiu I, I, -1
  179. // Y is aligned; now test the address of X.
  180. // Is the address of X aligned to 16 bytes? (Tests bit 3 only.)
  181. andi TEMP, X, 8
  182. bne TEMP, $0, .L30 ///
  // NOTE(review): no explicit delay-slot instruction follows the bne; if .align 5
  // emitted no padding here, the first quad load of X below would run in the slot
  // even when X is misaligned - verify the padding.
  183. .align 5
  184. .L11:
  185. // X and Y are both aligned.
  // Preload the whole first block of X (16 values, a1..a16) ...
  186. gsLQC1(X_BASE,A2,A1,0)
  187. gsLQC1(X_BASE,A4,A3,1)
  188. gsLQC1(X_BASE,A6,A5,2)
  189. gsLQC1(X_BASE,A8,A7,3)
  190. gsLQC1(X_BASE,A10,A9,4)
  191. gsLQC1(X_BASE,A12,A11,5)
  192. gsLQC1(X_BASE,A14,A13,6)
  193. gsLQC1(X_BASE,A16,A15,7)
  // ... and the first half of the first block of Y (8 values, b1..b8).
  194. gsLQC1(Y_BASE,B2,B1,0)
  195. gsLQC1(Y_BASE,B4,B3,1)
  196. gsLQC1(Y_BASE,B6,B5,2)
  197. gsLQC1(Y_BASE,B8,B7,3)
  // Only one block in total: skip the steady-state loop and drain at .L13.
  198. blez I, .L13
  199. NOP
  200. .align 5
  // Steady-state loop. Each pass finishes the block that is already in registers
  // and loads the next one. Only 8 Y registers exist, so Y is loaded in halves:
  // the store at quad offset k is followed by a load at offset k + 4, which
  // refills the b registers just consumed (offsets 4..7 are the second half of
  // this block, offsets 8..11 the first half of the next block).
  201. .L12:
  202. MADD t1, b1, ALPHA, a1
  203. MADD t2, b2, ALPHA, a2
  204. gsSQC1(Y_BASE, T2, T1, 0)
  205. gsLQC1(Y_BASE,B2,B1,4)
  206. MADD t3, b3, ALPHA, a3
  207. MADD t4, b4, ALPHA, a4
  208. gsSQC1(Y_BASE, T4, T3, 1)
  209. gsLQC1(Y_BASE,B4,B3,5)
  // Prefetch Y at PREFETCH_DISTANCE and PREFETCH_DISTANCE + 4 elements ahead.
  210. PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
  211. PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))
  212. MADD t1, b5, ALPHA, a5
  213. MADD t2, b6, ALPHA, a6
  214. gsSQC1(Y_BASE, T2, T1, 2)
  215. gsLQC1(Y_BASE,B6,B5,6)
  216. MADD t3, b7, ALPHA, a7
  217. MADD t4, b8, ALPHA, a8
  218. gsSQC1(Y_BASE, T4, T3, 3)
  219. gsLQC1(Y_BASE,B8,B7, 7)
  // Prefetch Y at PREFETCH_DISTANCE + 8 and + 12 elements ahead.
  220. PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
  221. PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))
  // Second half of the block: b1..b8 now hold the Y values from quad offsets 4..7.
  222. MADD t1, b1, ALPHA, a9
  223. MADD t2, b2, ALPHA, a10
  224. gsSQC1(Y_BASE, T2, T1, 4)
  225. gsLQC1(Y_BASE,B2,B1,8)
  226. MADD t3, b3, ALPHA, a11
  227. MADD t4, b4, ALPHA, a12
  228. gsSQC1(Y_BASE, T4, T3, 5)
  229. gsLQC1(Y_BASE,B4,B3,9)
  // Prefetch X at PREFETCH_DISTANCE and PREFETCH_DISTANCE + 4 elements ahead.
  230. PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
  231. PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))
  232. MADD t1, b5, ALPHA, a13
  233. MADD t2, b6, ALPHA, a14
  234. gsSQC1(Y_BASE, T2, T1, 6)
  235. gsLQC1(Y_BASE,B6,B5,10)
  236. MADD t3, b7, ALPHA, a15
  237. MADD t4, b8, ALPHA, a16
  238. gsSQC1(Y_BASE, T4, T3, 7)
  239. gsLQC1(Y_BASE,B8,B7,11)
  // Prefetch X at PREFETCH_DISTANCE + 8 and + 12 elements ahead.
  240. PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
  241. PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))
  // All 16 X values of this block are consumed: load the next block (quad offsets 8..15).
  242. gsLQC1(X_BASE,A2,A1,8)
  243. gsLQC1(X_BASE,A4,A3,9)
  244. gsLQC1(X_BASE,A6,A5,10)
  245. gsLQC1(X_BASE,A8,A7,11)
  246. gsLQC1(X_BASE,A10,A9,12)
  247. gsLQC1(X_BASE,A12,A11,13)
  248. gsLQC1(X_BASE,A14,A13,14)
  249. gsLQC1(X_BASE,A16,A15,15)
  250. daddiu I, I, -1
  // Step both pointers to the block that was just loaded.
  251. daddiu Y, Y, 16 * SIZE
  252. daddiu X, X, 16 * SIZE
  // No explicit delay-slot instruction follows: the slot holds either alignment
  // padding or the first MADD of .L13, whose result t1 is recomputed at .L12.
  253. bgtz I, .L12
  254. .align 5
  // Drain: finish the last block held in registers. The second half of its Y is
  // still loaded here (quad offsets 4..7), but nothing past this block is read.
  255. .L13:
  256. MADD t1, b1, ALPHA, a1
  257. MADD t2, b2, ALPHA, a2
  258. gsSQC1(Y_BASE, T2, T1, 0)
  259. gsLQC1(Y_BASE,B2,B1,4)
  260. MADD t3, b3, ALPHA, a3
  261. MADD t4, b4, ALPHA, a4
  262. gsSQC1(Y_BASE, T4, T3, 1)
  263. gsLQC1(Y_BASE,B4,B3,5)
  264. MADD t1, b5, ALPHA, a5
  265. MADD t2, b6, ALPHA, a6
  266. gsSQC1(Y_BASE, T2, T1, 2)
  267. gsLQC1(Y_BASE,B6,B5,6)
  268. MADD t3, b7, ALPHA, a7
  269. MADD t4, b8, ALPHA, a8
  270. gsSQC1(Y_BASE, T4, T3, 3)
  271. gsLQC1(Y_BASE,B8,B7,7)
  272. MADD t1, b1, ALPHA, a9
  273. MADD t2, b2, ALPHA, a10
  274. gsSQC1(Y_BASE, T2, T1, 4)
  275. MADD t3, b3, ALPHA, a11
  276. MADD t4, b4, ALPHA, a12
  277. gsSQC1(Y_BASE, T4, T3, 5)
  278. MADD t1, b5, ALPHA, a13
  279. MADD t2, b6, ALPHA, a14
  280. gsSQC1(Y_BASE, T2, T1, 6)
  281. MADD t3, b7, ALPHA, a15
  282. MADD t4, b8, ALPHA, a16
  283. gsSQC1(Y_BASE, T4, T3, 7)
  // Step past the final block so X and Y point at the leftover elements for .L15.
  284. daddiu X, X, 16 * SIZE
  285. daddiu Y, Y, 16 * SIZE
  286. .align 5
  // Unit-stride tail: the N mod 16 elements left after the 16-element blocks are
  // handled one at a time. (N is the count remaining after any one-element
  // alignment peel done at entry.)
  287. .L15:
  288. andi I, N, 15
  289. blez I, .L999
  290. NOP
  291. .align 5
  292. .L16:
  293. LD a1, 0 * SIZE(X)
  294. LD b1, 0 * SIZE(Y)
  295. daddiu X, X, SIZE
  296. daddiu Y, Y, SIZE
  297. MADD t1, b1, ALPHA, a1
  298. daddiu I, I, -1
  299. bgtz I, .L16
  // Positioned as the delay slot of the bgtz (assuming noreorder mode): it stores
  // the result of every pass, including the last one. Y was already advanced,
  // hence the -1 * SIZE displacement.
  300. ST t1, -1 * SIZE(Y)
  // Inline copy of the exit sequence at .L999: reload the FP registers spilled
  // at entry, release the stack frame and return through $31.
  301. #ifndef __64BIT__
  302. ldc1 $f20, 0($sp)
  303. ldc1 $f22, 8($sp)
  304. ldc1 $f24, 16($sp)
  305. ldc1 $f26, 24($sp)
  306. ldc1 $f28, 32($sp)
  307. daddiu $sp, $sp, 40
  308. #else
  309. ldc1 $f24, 0($sp)
  310. ldc1 $f25, 8($sp)
  311. ldc1 $f26, 16($sp)
  312. ldc1 $f27, 24($sp)
  313. ldc1 $f28, 32($sp)
  314. ldc1 $f29, 40($sp)
  315. daddiu $sp, $sp, 48
  316. #endif
  317. j $31
  318. NOP
  319. .align 5
  // Reached from the X alignment test when bit 3 of X is set. On arrival I holds
  // (number of 16-element blocks) - 1.
  // NOTE(review): branch delay slots are assumed to be explicit (noreorder mode,
  // presumably set by PROLOGUE) - confirm in common.h.
  320. .L30:
  321. // Y aligned, X unaligned, INCX == INCY == 1.
  322. // Unrolled by 16.
  // Load the first X element on its own and step X by one element; the quad
  // loads below then start at the stepped address. The register pairing shifts by
  // one relative to the aligned loop: (a2,a3) ... (a14,a15) come from quad loads,
  // while a1 and a16 are loaded individually.
  323. LD a1, 0 * SIZE(X)
  324. daddiu X, X, SIZE
  325. gsLQC1(X_BASE,A3,A2,0)
  326. gsLQC1(X_BASE,A5,A4,1)
  327. gsLQC1(X_BASE,A7,A6,2)
  328. gsLQC1(X_BASE,A9,A8,3)
  329. gsLQC1(X_BASE,A11,A10,4)
  330. gsLQC1(X_BASE,A13,A12,5)
  331. gsLQC1(X_BASE,A15,A14,6)
  // X already points one element into the block, so 14 * SIZE is its 16th element.
  332. LD a16, 14 * SIZE(X)
  // Preload the first half of the first block of Y (8 values, b1..b8).
  333. gsLQC1(Y_BASE,B2,B1,0)
  334. gsLQC1(Y_BASE,B4,B3,1)
  335. gsLQC1(Y_BASE,B6,B5,2)
  336. gsLQC1(Y_BASE,B8,B7,3)
  // Only one block in total: skip the steady-state loop and drain at .L32.
  337. blez I, .L32
  338. NOP
  339. .align 5
  // Steady-state loop: same schedule as the aligned loop. Each pass finishes the
  // block held in registers, refills the b registers from Y four quads ahead of
  // each store, and finally loads the next block of X.
  340. .L31:
  341. MADD t1, b1, ALPHA, a1
  342. MADD t2, b2, ALPHA, a2
  343. gsSQC1(Y_BASE, T2, T1, 0)
  344. gsLQC1(Y_BASE,B2,B1,4)
  345. MADD t3, b3, ALPHA, a3
  346. MADD t4, b4, ALPHA, a4
  347. gsSQC1(Y_BASE, T4, T3, 1)
  348. gsLQC1(Y_BASE,B4,B3,5)
  // Prefetch Y at PREFETCH_DISTANCE and PREFETCH_DISTANCE + 4 elements ahead.
  349. PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
  350. PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))
  351. MADD t1, b5, ALPHA, a5
  352. MADD t2, b6, ALPHA, a6
  353. gsSQC1(Y_BASE, T2, T1, 2)
  354. gsLQC1(Y_BASE,B6,B5,6)
  355. MADD t3, b7, ALPHA, a7
  356. MADD t4, b8, ALPHA, a8
  357. gsSQC1(Y_BASE, T4, T3, 3)
  358. gsLQC1(Y_BASE,B8,B7,7)
  // Prefetch Y at PREFETCH_DISTANCE + 8 and + 12 elements ahead.
  359. PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
  360. PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))
  // Second half of the block: b1..b8 now hold the Y values from quad offsets 4..7.
  361. MADD t1, b1, ALPHA, a9
  362. MADD t2, b2, ALPHA, a10
  363. gsSQC1(Y_BASE, T2, T1, 4)
  364. gsLQC1(Y_BASE,B2,B1,8)
  365. MADD t3, b3, ALPHA, a11
  366. MADD t4, b4, ALPHA, a12
  367. gsSQC1(Y_BASE, T4, T3, 5)
  368. gsLQC1(Y_BASE,B4,B3,9)
  // Prefetch X at PREFETCH_DISTANCE and PREFETCH_DISTANCE + 4 elements ahead.
  369. PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
  370. PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))
  371. MADD t1, b5, ALPHA, a13
  372. MADD t2, b6, ALPHA, a14
  373. gsSQC1(Y_BASE, T2, T1, 6)
  374. gsLQC1(Y_BASE,B6,B5,10)
  375. MADD t3, b7, ALPHA, a15
  376. MADD t4, b8, ALPHA, a16
  377. gsSQC1(Y_BASE, T4, T3, 7)
  378. gsLQC1(Y_BASE,B8,B7,11)
  // Prefetch X at PREFETCH_DISTANCE + 8 and + 12 elements ahead.
  379. PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
  380. PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))
  // Load the next block of X. X points one element into the current block, so
  // 15 * SIZE is the first element of the next block, quad offsets 8..14 are its
  // elements 2..15, and 30 * SIZE is its 16th element.
  381. LD a1, 15 * SIZE(X)
  382. gsLQC1(X_BASE,A3,A2,8)
  383. gsLQC1(X_BASE,A5,A4,9)
  384. gsLQC1(X_BASE,A7,A6,10)
  385. gsLQC1(X_BASE,A9,A8,11)
  386. gsLQC1(X_BASE,A11,A10,12)
  387. gsLQC1(X_BASE,A13,A12,13)
  388. gsLQC1(X_BASE,A15,A14,14)
  389. LD a16, 30 * SIZE(X)
  390. daddiu I, I, -1
  391. daddiu Y, Y, 16 * SIZE
  392. daddiu X, X, 16 * SIZE
  // No explicit delay-slot instruction follows: the slot holds either alignment
  // padding or the first MADD of .L32, whose result t1 is recomputed at .L31.
  393. bgtz I, .L31
  394. .align 5
  395. // Loop end: drain the last block held in registers; nothing past it is read.
  396. .L32:
  397. MADD t1, b1, ALPHA, a1
  398. MADD t2, b2, ALPHA, a2
  399. gsSQC1(Y_BASE, T2, T1, 0)
  400. gsLQC1(Y_BASE,B2,B1,4)
  401. MADD t3, b3, ALPHA, a3
  402. MADD t4, b4, ALPHA, a4
  403. gsSQC1(Y_BASE, T4, T3, 1)
  404. gsLQC1(Y_BASE,B4,B3,5)
  405. MADD t1, b5, ALPHA, a5
  406. MADD t2, b6, ALPHA, a6
  407. gsSQC1(Y_BASE, T2, T1, 2)
  408. gsLQC1(Y_BASE,B6,B5,6)
  409. MADD t3, b7, ALPHA, a7
  410. MADD t4, b8, ALPHA, a8
  411. gsSQC1(Y_BASE, T4, T3, 3)
  412. gsLQC1(Y_BASE,B8,B7,7)
  413. MADD t1, b1, ALPHA, a9
  414. MADD t2, b2, ALPHA, a10
  415. gsSQC1(Y_BASE, T2, T1, 4)
  416. MADD t3, b3, ALPHA, a11
  417. MADD t4, b4, ALPHA, a12
  418. gsSQC1(Y_BASE, T4, T3, 5)
  419. MADD t1, b5, ALPHA, a13
  420. MADD t2, b6, ALPHA, a14
  421. gsSQC1(Y_BASE, T2, T1, 6)
  422. MADD t3, b7, ALPHA, a15
  423. MADD t4, b8, ALPHA, a16
  424. gsSQC1(Y_BASE, T4, T3, 7)
  // X advances by only 15 elements because it was stepped by one at .L30;
  // Y advances by the full 16.
  425. daddiu X, X, 15 * SIZE
  426. daddiu Y, Y, 16 * SIZE
  427. // Jump back to the remainder processing.
  // NOTE(review): no explicit delay-slot instruction follows this branch; it relies
  // on .align 5 emitting at least one padding word, otherwise the beq at .L20
  // would sit in the delay slot - verify.
  428. b .L15
  429. .align 5
  430. // INCX != 1 or INCY != 1.
  // Generic strided path, 8 elements per pass. INCX and INCY were shifted by
  // BASE_SHIFT on entry and are added directly to the pointers here. Y is the
  // load pointer and YY the store pointer, which lags behind it.
  // NOTE(review): branch delay slots are assumed to be explicit (noreorder mode,
  // presumably set by PROLOGUE) - confirm in common.h.
  431. .L20:
  // INCY == 0 has its own path at .L27.
  432. beq INCY, $0, .L27
  // Delay slot: I = N / 8 = number of full 8-element groups.
  433. dsra I, N, 3
  434. move YY, Y
  // No full group: go straight to the remainder loop.
  435. blez I, .L25
  // Delay slot: I now counts the groups left after the preloaded one
  // (.L25 recomputes I, so the decrement is harmless when the branch is taken).
  436. daddiu I, I, -1
  // Preload the first group: 8 X values into a1..a8 and 8 Y values into b1..b8.
  437. LD a1, 0 * SIZE(X)
  438. daddu X, X, INCX
  439. LD b1, 0 * SIZE(Y)
  440. daddu Y, Y, INCY
  441. LD a2, 0 * SIZE(X)
  442. daddu X, X, INCX
  443. LD b2, 0 * SIZE(Y)
  444. daddu Y, Y, INCY
  445. LD a3, 0 * SIZE(X)
  446. daddu X, X, INCX
  447. LD b3, 0 * SIZE(Y)
  448. daddu Y, Y, INCY
  449. LD a4, 0 * SIZE(X)
  450. daddu X, X, INCX
  451. LD b4, 0 * SIZE(Y)
  452. daddu Y, Y, INCY
  453. LD a5, 0 * SIZE(X)
  454. daddu X, X, INCX
  455. LD b5, 0 * SIZE(Y)
  456. daddu Y, Y, INCY
  457. LD a6, 0 * SIZE(X)
  458. daddu X, X, INCX
  459. LD b6, 0 * SIZE(Y)
  460. daddu Y, Y, INCY
  461. LD a7, 0 * SIZE(X)
  462. daddu X, X, INCX
  463. LD b7, 0 * SIZE(Y)
  464. daddu Y, Y, INCY
  465. LD a8, 0 * SIZE(X)
  466. daddu X, X, INCX
  467. LD b8, 0 * SIZE(Y)
  468. daddu Y, Y, INCY
  // Only one group in total: skip the steady-state loop and drain at .L23.
  469. blez I, .L23
  470. NOP
  471. .align 5
  // Steady-state loop: each MADD consumes one preloaded a/b pair, which is then
  // immediately reloaded with the matching element of the next group. Results
  // rotate through t1..t4, so each t register is stored through YY before the
  // MADD that reuses it.
  472. .L22:
  473. MADD t1, b1, ALPHA, a1
  474. LD a1, 0 * SIZE(X)
  475. LD b1, 0 * SIZE(Y)
  476. daddu X, X, INCX
  477. daddu Y, Y, INCY
  478. MADD t2, b2, ALPHA, a2
  479. LD a2, 0 * SIZE(X)
  480. LD b2, 0 * SIZE(Y)
  481. daddu X, X, INCX
  482. daddu Y, Y, INCY
  483. MADD t3, b3, ALPHA, a3
  484. LD a3, 0 * SIZE(X)
  485. LD b3, 0 * SIZE(Y)
  486. daddu X, X, INCX
  487. daddu Y, Y, INCY
  488. MADD t4, b4, ALPHA, a4
  489. LD a4, 0 * SIZE(X)
  490. LD b4, 0 * SIZE(Y)
  491. daddu X, X, INCX
  492. daddu Y, Y, INCY
  // Store result 1, then reuse t1 for element 5 (likewise t2..t4 below).
  493. ST t1, 0 * SIZE(YY)
  494. daddu YY, YY, INCY
  495. MADD t1, b5, ALPHA, a5
  496. LD a5, 0 * SIZE(X)
  497. LD b5, 0 * SIZE(Y)
  498. daddu X, X, INCX
  499. daddu Y, Y, INCY
  500. ST t2, 0 * SIZE(YY)
  501. daddu YY, YY, INCY
  502. MADD t2, b6, ALPHA, a6
  503. LD a6, 0 * SIZE(X)
  504. LD b6, 0 * SIZE(Y)
  505. daddu X, X, INCX
  506. daddu Y, Y, INCY
  507. ST t3, 0 * SIZE(YY)
  508. daddu YY, YY, INCY
  509. MADD t3, b7, ALPHA, a7
  510. LD a7, 0 * SIZE(X)
  511. LD b7, 0 * SIZE(Y)
  512. daddu X, X, INCX
  513. daddu Y, Y, INCY
  514. ST t4, 0 * SIZE(YY)
  515. daddu YY, YY, INCY
  516. MADD t4, b8, ALPHA, a8
  517. LD a8, 0 * SIZE(X)
  518. daddu X, X, INCX
  519. LD b8, 0 * SIZE(Y)
  520. daddu Y, Y, INCY
  // Store results 5..8 of this group.
  521. ST t1, 0 * SIZE(YY)
  522. daddu YY, YY, INCY
  523. ST t2, 0 * SIZE(YY)
  524. daddu YY, YY, INCY
  525. ST t3, 0 * SIZE(YY)
  526. daddu YY, YY, INCY
  527. ST t4, 0 * SIZE(YY)
  528. daddiu I, I, -1
  529. bgtz I, .L22
  // Delay slot: the YY advance that belongs to the last store above.
  530. daddu YY, YY, INCY
  531. .align 5
  // Drain: compute and store the last preloaded group; no further loads.
  532. .L23:
  533. MADD t1, b1, ALPHA, a1
  534. MADD t2, b2, ALPHA, a2
  535. MADD t3, b3, ALPHA, a3
  536. MADD t4, b4, ALPHA, a4
  537. ST t1, 0 * SIZE(YY)
  538. daddu YY, YY, INCY
  539. MADD t1, b5, ALPHA, a5
  540. ST t2, 0 * SIZE(YY)
  541. daddu YY, YY, INCY
  542. MADD t2, b6, ALPHA, a6
  543. ST t3, 0 * SIZE(YY)
  544. daddu YY, YY, INCY
  545. MADD t3, b7, ALPHA, a7
  546. ST t4, 0 * SIZE(YY)
  547. daddu YY, YY, INCY
  548. MADD t4, b8, ALPHA, a8
  549. ST t1, 0 * SIZE(YY)
  550. daddu YY, YY, INCY
  551. ST t2, 0 * SIZE(YY)
  552. daddu YY, YY, INCY
  553. ST t3, 0 * SIZE(YY)
  554. daddu YY, YY, INCY
  555. ST t4, 0 * SIZE(YY)
  556. daddu YY, YY, INCY
  557. .align 5
  // Remainder: the last N mod 8 elements, one at a time. Y serves as both load
  // and store pointer here; it has been advanced past every full group above.
  558. .L25:
  559. andi I, N, 7
  560. blez I, .L999
  561. NOP
  562. .align 5
  563. .L26:
  564. LD a1, 0 * SIZE(X)
  565. LD b1, 0 * SIZE(Y)
  566. MADD t1, b1, ALPHA, a1
  567. daddu X, X, INCX
  568. ST t1, 0 * SIZE(Y)
  569. daddiu I, I, -1
  570. bgtz I, .L26
  // Delay slot: advance Y after the store; falls through to .L999 when done.
  571. daddu Y, Y, INCY
  572. .align 5
  // Common exit: reload the FP registers spilled at entry (same registers and
  // offsets as the sdc1 sequence after PROLOGUE), release the stack frame and
  // return through $31.
  573. .L999:
  574. #ifndef __64BIT__
  575. ldc1 $f20, 0($sp)
  576. ldc1 $f22, 8($sp)
  577. ldc1 $f24, 16($sp)
  578. ldc1 $f26, 24($sp)
  579. ldc1 $f28, 32($sp)
  580. daddiu $sp, $sp, 40
  581. #else
  582. ldc1 $f24, 0($sp)
  583. ldc1 $f25, 8($sp)
  584. ldc1 $f26, 16($sp)
  585. ldc1 $f27, 24($sp)
  586. ldc1 $f28, 32($sp)
  587. ldc1 $f29, 40($sp)
  588. daddiu $sp, $sp, 48
  589. #endif
  590. j $31
  // Positioned as the delay slot of the return (assuming noreorder mode).
  591. NOP
  592. .align 3
  // INCY == 0: every update targets the same Y element. It is loaded once,
  // accumulated in b1 across all N elements of X, and stored once at the end.
  // NOTE(review): branch delay slots are assumed to be explicit (noreorder mode,
  // presumably set by PROLOGUE) - confirm in common.h.
  593. .L27:
  594. LD b1, 0 * SIZE(Y)
  595. .L28:
  596. daddiu N, N, -1
  597. LD a1, 0 * SIZE(X)
  598. daddu X, X, INCX
  599. bgtz N, .L28
  // Delay slot: runs on every pass, including the last, so each of the N loaded
  // X values is accumulated exactly once.
  600. MADD b1, b1, ALPHA, a1
  601. j .L999
  // Delay slot: write the accumulated value back before the exit sequence runs.
  602. ST b1, 0 * SIZE(Y)
  603. EPILOGUE