You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

symv_U.S 15 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer register aliases.
     M             : dimension count; used below as the bound of the row and
                     column loops.
     A, X, Y       : base pointers of the matrix and of the two vectors.
     LDA/INCX/INCY : strides; the prologue shifts each one left by BASE_SHIFT
                     before any use.
     NOTE(review): under a positional argument-register convention such as
     MIPS n64 this layout would correspond to a C prototype of
     (m, alpha, a, lda, x, incx, y, incy, buffer) -- confirm against the
     caller, in particular that no extra argument precedes alpha. */
  40. #define M $4
  41. #define A $6
  42. #define LDA $7
  43. #define X $8
  44. #define INCX $9
  45. #define Y $10
  46. #define INCY $11
  /* BUFFER is not taken from an argument register: the prologue loads it
     from 0($sp) on entry. */
  47. #define BUFFER $5
  /* Working registers.  XX/YY walk through x and y, I is the inner loop
     counter, IS is the current column index (it also briefly holds SIZE for
     the stride comparisons). */
  48. #define XX $12
  49. #define YY $13
  50. #define I $14
  51. #define IS $15
  /* $16-$19 are the registers the prologue saves and the epilogue restores.
     AO1/AO2 point into two adjacent columns of A, Y1 is the base of the y
     data that is actually updated (Y itself or a packed copy), TEMP is
     scratch. */
  52. #define AO1 $16
  53. #define AO2 $17
  54. #define Y1 $18
  55. #define TEMP $19
  /* Floating-point aliases.  ALPHA only ever appears as a source operand of
     MUL in this file. */
  56. #define ALPHA $f13
  /* a1-a8: elements of A currently in flight. */
  57. #define a1 $f0
  58. #define a2 $f1
  59. #define a3 $f2
  60. #define a4 $f3
  61. #define a5 $f4
  62. #define a6 $f5
  63. #define a7 $f6
  64. #define a8 $f7
  /* alpha1/alpha2: ALPHA multiplied by the x element of the current
     column(s). */
  65. #define alpha1 $f8
  66. #define alpha2 $f9
  /* x1-x4: elements of x ($f13 is skipped because it holds ALPHA). */
  67. #define x1 $f10
  68. #define x2 $f11
  69. #define x3 $f12
  70. #define x4 $f14
  /* xsum1/xsum2: running sums of x[i] * A[i, column]. */
  71. #define xsum1 $f15
  72. #define xsum2 $f16
  /* ysum1-ysum4: elements of y being updated.
     NOTE(review): $f20 is written by this routine but is never saved or
     restored here; if the target ABI treats $f20 as callee-saved (e.g. n32)
     a save/restore is needed -- confirm. */
  73. #define ysum1 $f17
  74. #define ysum2 $f18
  75. #define ysum3 $f19
  76. #define ysum4 $f20
  /* Entry point.  As written below, for every column j the routine updates
     y[i] += (ALPHA * x[j]) * A[i, j] for the rows i above the diagonal,
     accumulates sum_i x[i] * A[i, j], and finally adds ALPHA times that sum
     plus the diagonal contribution to y[j]; only elements on or above the
     diagonal of A are used in the arithmetic.  This matches the "symv_U"
     file name shown above the listing.

     The instruction written after each branch is used as its delay slot and
     runs whether or not the branch is taken (this assumes PROLOGUE selects
     noreorder mode -- confirm in common.h).  LDARG/SDARG/LD/ST, SIZE and
     BASE_SHIFT come from common.h; they are presumably pointer-sized and
     element-sized load/store macros, the element size in bytes and its
     log2 -- confirm. */
  77. PROLOGUE
  /* Fetch BUFFER from the caller's stack before $sp is moved. */
  78. LDARG BUFFER, 0($sp)
  /* Reserve 32 bytes and save $16-$19; the stride scaling by BASE_SHIFT is
     interleaved with the stores. */
  79. daddiu $sp, $sp, -32
  80. SDARG $16, 0($sp)
  81. dsll LDA, LDA, BASE_SHIFT
  82. SDARG $17, 8($sp)
  83. dsll INCX, INCX, BASE_SHIFT
  84. SDARG $18, 16($sp)
  85. dsll INCY, INCY, BASE_SHIFT
  86. SDARG $19, 24($sp)
  87. nop
  /* Nothing to do for M <= 0.  Delay slot: IS = SIZE, used for the stride
     comparisons that follow. */
  88. blez M, .L999
  89. li IS, SIZE
  /* If the scaled INCX equals SIZE, x is used in place and packing is
     skipped.  Delay slot: Y1 defaults to Y. */
  90. beq IS, INCX, .L05
  91. move Y1, Y
  /* Pack x into BUFFER: I = M / 4 groups of four, XX = source cursor.
     Delay slot: X is redirected to the start of the packed copy. */
  92. dsra I, M, 2
  93. move XX, X
  94. blez I, .L02
  95. move X, BUFFER
  96. .align 3
  /* Copy four strided elements of x per iteration; BUFFER advances by four
     elements in the delay slot. */
  97. .L01:
  98. LD a1, 0 * SIZE(XX)
  99. daddu XX, XX, INCX
  100. LD a2, 0 * SIZE(XX)
  101. daddu XX, XX, INCX
  102. LD a3, 0 * SIZE(XX)
  103. daddu XX, XX, INCX
  104. LD a4, 0 * SIZE(XX)
  105. daddu XX, XX, INCX
  106. ST a1, 0 * SIZE(BUFFER)
  107. ST a2, 1 * SIZE(BUFFER)
  108. ST a3, 2 * SIZE(BUFFER)
  109. ST a4, 3 * SIZE(BUFFER)
  110. daddiu I, I, -1
  111. bgtz I, .L01
  112. daddiu BUFFER, BUFFER, 4 * SIZE
  113. .align 3
  /* Remaining M mod 4 elements of x. */
  114. .L02:
  115. andi I, M, 3
  116. blez I, .L05
  117. NOP
  118. .align 3
  /* Copy one element per iteration. */
  119. .L03:
  120. LD a1, 0 * SIZE(XX)
  121. daddu XX, XX, INCX
  122. ST a1, 0 * SIZE(BUFFER)
  123. daddiu I, I, -1
  124. bgtz I, .L03
  125. daddiu BUFFER, BUFFER, 1 * SIZE
  /* y packing.  IS still holds SIZE here.  If the scaled INCY equals SIZE,
     y is updated in place (Y1 was set to Y earlier) and the copy is skipped.
     The instruction after each branch is its delay slot (assumes noreorder
     mode -- confirm in common.h). */
  126. .align 3
  127. .L05:
  /* Delay slot: first half of rounding BUFFER up to a multiple of 256.  It
     also runs when the branch is taken, which is harmless because BUFFER is
     not referenced again outside the packing code in this file. */
  128. beq IS, INCY, .L10
  129. daddiu BUFFER, BUFFER, 255
  /* BUFFER = (BUFFER + 255) & -256 : 256-byte aligned start of the y copy. */
  130. li TEMP, -256
  131. and BUFFER, BUFFER, TEMP
  /* I = M / 4 groups of four; Y1 becomes the base of the packed y.
     Delay slot: YY = source cursor into the caller's y. */
  132. dsra I, M, 2
  133. move Y1, BUFFER
  134. blez I, .L07
  135. move YY, Y
  136. .align 3
  /* Copy four strided elements of y per iteration; BUFFER advances by four
     elements in the delay slot. */
  137. .L06:
  138. LD a1, 0 * SIZE(YY)
  139. daddu YY, YY, INCY
  140. LD a2, 0 * SIZE(YY)
  141. daddu YY, YY, INCY
  142. LD a3, 0 * SIZE(YY)
  143. daddu YY, YY, INCY
  144. LD a4, 0 * SIZE(YY)
  145. daddu YY, YY, INCY
  146. ST a1, 0 * SIZE(BUFFER)
  147. ST a2, 1 * SIZE(BUFFER)
  148. ST a3, 2 * SIZE(BUFFER)
  149. ST a4, 3 * SIZE(BUFFER)
  150. daddiu I, I, -1
  151. bgtz I, .L06
  152. daddiu BUFFER, BUFFER, 4 * SIZE
  153. .align 3
  /* Remaining M mod 4 elements of y. */
  154. .L07:
  155. andi I, M, 3
  156. blez I, .L10
  157. NOP
  158. .align 3
  /* Copy one element per iteration. */
  159. .L08:
  160. LD a1, 0 * SIZE(YY)
  161. daddu YY, YY, INCY
  162. ST a1, 0 * SIZE(BUFFER)
  163. daddiu I, I, -1
  164. bgtz I, .L08
  165. daddiu BUFFER, BUFFER, 1 * SIZE
  /* Start of the computation.  Columns are handled two at a time while at
     least two remain; with M < 2 control goes straight to the single-column
     code at .L20.  Delay slot: IS = 0 (first column index).  The instruction
     after each branch is its delay slot (assumes noreorder mode -- confirm
     in common.h). */
  166. .align 3
  167. .L10:
  168. slti TEMP, M, 2
  169. nop
  170. bgtz TEMP, .L20
  171. li IS, 0
  172. .align 3
  /* Per column pair (IS, IS + 1). */
  173. .L11:
  /* TEMP = &x[IS]; alpha1/alpha2 first receive x[IS] and x[IS + 1] and are
     multiplied by ALPHA a few lines further down. */
  174. dsll TEMP, IS, BASE_SHIFT
  175. daddu TEMP, X, TEMP
  176. LD alpha1, 0 * SIZE(TEMP)
  177. LD alpha2, 1 * SIZE(TEMP)
  /* AO1 -> top of column IS, AO2 -> top of column IS + 1, and A is advanced
     by two columns for the next pass.  I = IS / 8 is the number of full
     8-row blocks above the diagonal. */
  178. move AO1, A
  179. dsra I, IS, 3
  180. daddu AO2, A, LDA
  181. daddu A, AO2, LDA
  /* Clear both accumulators (MTC $0, reg presumably moves integer zero into
     the FP register -- confirm in common.h). */
  182. MTC $0, xsum1
  183. MTC $0, xsum2
  184. move XX, X
  185. MUL alpha1, ALPHA, alpha1
  186. move YY, Y1
  187. MUL alpha2, ALPHA, alpha2
  /* No full 8-row block: go to the remainders.  Delay slot: I becomes the
     number of pipelined iterations (block count minus one). */
  188. blez I, .L15
  189. daddiu I, I, -1
  /* Preload the operands of the first block.  AO1 supplies a1, a2, a5, a6
     (rows 0-3 of column IS), AO2 supplies a3, a4, a7, a8 (rows 0-3 of column
     IS + 1).  x4 and ysum4 are loaded inside .L12 / .L13. */
  190. LD x1, 0 * SIZE(XX)
  191. LD x2, 1 * SIZE(XX)
  192. LD x3, 2 * SIZE(XX)
  193. LD a1, 0 * SIZE(AO1)
  194. LD a2, 1 * SIZE(AO1)
  195. LD a5, 2 * SIZE(AO1)
  196. LD a6, 3 * SIZE(AO1)
  197. LD a3, 0 * SIZE(AO2)
  198. LD a4, 1 * SIZE(AO2)
  199. LD a7, 2 * SIZE(AO2)
  200. LD a8, 3 * SIZE(AO2)
  201. LD ysum1, 0 * SIZE(YY)
  202. LD ysum2, 1 * SIZE(YY)
  /* Only one block: skip the pipelined loop and run the tail at .L13.
     Delay slot: last preload. */
  203. blez I, .L13
  204. LD ysum3, 2 * SIZE(YY)
  /* Pipelined main loop for a column pair: eight rows per iteration, with
     the loads for the following rows interleaved between the arithmetic.
     MADD d, c, a, b is taken to mean d = c + a * b (macro from common.h --
     confirm).  For each row i of the block:
         y[i]  += alpha1 * A[i, IS] + alpha2 * A[i, IS + 1]
         xsum1 += x[i] * A[i, IS]
         xsum2 += x[i] * A[i, IS + 1]
     XX, AO2, AO1 and YY are each advanced by eight elements part-way through
     the body, so the offsets used after each bump are relative to the new
     pointer value. */
  205. .align 3
  206. .L12:
  /* Rows 0-1 (a1/a2 from column IS, a3/a4 from column IS + 1); the same
     registers are reloaded with rows 4-5 as soon as they are consumed. */
  207. MADD ysum1, ysum1, alpha1, a1
  208. LD ysum4, 3 * SIZE(YY)
  209. MADD ysum2, ysum2, alpha1, a2
  210. LD x4, 3 * SIZE(XX)
  211. MADD xsum1, xsum1, x1, a1
  212. LD a1, 4 * SIZE(AO1)
  213. MADD xsum2, xsum2, x1, a3
  214. LD x1, 4 * SIZE(XX)
  215. MADD ysum1, ysum1, alpha2, a3
  216. LD a3, 4 * SIZE(AO2)
  217. MADD ysum2, ysum2, alpha2, a4
  218. daddiu I, I, -1
  219. MADD xsum1, xsum1, x2, a2
  220. LD a2, 5 * SIZE(AO1)
  221. MADD xsum2, xsum2, x2, a4
  222. LD a4, 5 * SIZE(AO2)
  /* Write back y rows 0-1 and fetch y rows 4-5. */
  223. ST ysum1, 0 * SIZE(YY)
  224. LD ysum1, 4 * SIZE(YY)
  225. ST ysum2, 1 * SIZE(YY)
  226. LD ysum2, 5 * SIZE(YY)
  /* Rows 2-3 (a5/a6 from column IS, a7/a8 from column IS + 1); reloaded with
     rows 6-7.  XX is bumped here. */
  227. MADD ysum3, ysum3, alpha1, a5
  228. nop
  229. MADD ysum4, ysum4, alpha1, a6
  230. LD x2, 5 * SIZE(XX)
  231. MADD xsum1, xsum1, x3, a5
  232. LD a5, 6 * SIZE(AO1)
  233. MADD xsum2, xsum2, x3, a7
  234. LD x3, 6 * SIZE(XX)
  235. MADD ysum3, ysum3, alpha2, a7
  236. LD a7, 6 * SIZE(AO2)
  237. MADD ysum4, ysum4, alpha2, a8
  238. daddiu XX, XX, 8 * SIZE
  239. MADD xsum1, xsum1, x4, a6
  240. LD a6, 7 * SIZE(AO1)
  241. MADD xsum2, xsum2, x4, a8
  242. LD a8, 7 * SIZE(AO2)
  /* Write back y rows 2-3 and fetch y rows 6-7. */
  243. ST ysum3, 2 * SIZE(YY)
  244. LD ysum3, 6 * SIZE(YY)
  245. ST ysum4, 3 * SIZE(YY)
  246. LD ysum4, 7 * SIZE(YY)
  /* Rows 4-5; operands for rows 0-1 of the next block are loaded meanwhile.
     AO2 is bumped here; x4 at offset -1 is row 7 of the current block because
     XX has already moved. */
  247. MADD ysum1, ysum1, alpha1, a1
  248. daddiu AO2, AO2, 8 * SIZE
  249. MADD ysum2, ysum2, alpha1, a2
  250. LD x4,-1 * SIZE(XX)
  251. MADD xsum1, xsum1, x1, a1
  252. LD a1, 8 * SIZE(AO1)
  253. MADD xsum2, xsum2, x1, a3
  254. LD x1, 0 * SIZE(XX)
  255. MADD ysum1, ysum1, alpha2, a3
  256. LD a3, 0 * SIZE(AO2)
  257. MADD ysum2, ysum2, alpha2, a4
  258. nop
  259. MADD xsum1, xsum1, x2, a2
  260. LD a2, 9 * SIZE(AO1)
  261. MADD xsum2, xsum2, x2, a4
  262. LD a4, 1 * SIZE(AO2)
  /* Write back y rows 4-5 and fetch y rows 0-1 of the next block. */
  263. ST ysum1, 4 * SIZE(YY)
  264. LD ysum1, 8 * SIZE(YY)
  265. ST ysum2, 5 * SIZE(YY)
  266. LD ysum2, 9 * SIZE(YY)
  /* Rows 6-7; operands for rows 2-3 of the next block are loaded meanwhile.
     AO1 and YY are bumped here. */
  267. MADD ysum3, ysum3, alpha1, a5
  268. daddiu AO1, AO1, 8 * SIZE
  269. MADD ysum4, ysum4, alpha1, a6
  270. LD x2, 1 * SIZE(XX)
  271. MADD xsum1, xsum1, x3, a5
  272. LD a5, 2 * SIZE(AO1)
  273. MADD xsum2, xsum2, x3, a7
  274. LD x3, 2 * SIZE(XX)
  275. MADD ysum3, ysum3, alpha2, a7
  276. LD a7, 2 * SIZE(AO2)
  277. MADD ysum4, ysum4, alpha2, a8
  278. daddiu YY, YY, 8 * SIZE
  279. MADD xsum1, xsum1, x4, a6
  280. LD a6, 3 * SIZE(AO1)
  281. MADD xsum2, xsum2, x4, a8
  282. LD a8, 3 * SIZE(AO2)
  /* Write back y rows 6-7 through negative offsets (YY already points at the
     next block); the second store sits in the branch delay slot. */
  283. ST ysum3,-2 * SIZE(YY)
  284. LD ysum3, 2 * SIZE(YY)
  285. bgtz I, .L12
  286. ST ysum4,-1 * SIZE(YY)
  /* Final 8-row block of a column pair.  Same arithmetic as the pipelined
     loop, but nothing beyond row 7 of this block is loaded.  On entry rows
     0-3 of A, x1-x3 and ysum1-ysum3 are already in registers.  MADD d, c, a, b
     is taken to mean d = c + a * b (macro from common.h -- confirm). */
  287. .align 3
  288. .L13:
  /* Rows 0-1, with rows 4-5 of A and x loaded as registers free up. */
  289. MADD ysum1, ysum1, alpha1, a1
  290. LD ysum4, 3 * SIZE(YY)
  291. MADD ysum2, ysum2, alpha1, a2
  292. LD x4, 3 * SIZE(XX)
  293. MADD xsum1, xsum1, x1, a1
  294. LD a1, 4 * SIZE(AO1)
  295. MADD xsum2, xsum2, x1, a3
  296. LD x1, 4 * SIZE(XX)
  297. MADD ysum1, ysum1, alpha2, a3
  298. LD a3, 4 * SIZE(AO2)
  299. MADD ysum2, ysum2, alpha2, a4
  300. MADD xsum1, xsum1, x2, a2
  301. LD a2, 5 * SIZE(AO1)
  302. MADD xsum2, xsum2, x2, a4
  303. LD a4, 5 * SIZE(AO2)
  304. LD x2, 5 * SIZE(XX)
  /* Write back y rows 0-1, fetch y rows 4-5. */
  305. ST ysum1, 0 * SIZE(YY)
  306. ST ysum2, 1 * SIZE(YY)
  307. LD ysum1, 4 * SIZE(YY)
  308. LD ysum2, 5 * SIZE(YY)
  /* Rows 2-3, with rows 6-7 of A and x loaded as registers free up. */
  309. MADD ysum3, ysum3, alpha1, a5
  310. MADD ysum4, ysum4, alpha1, a6
  311. MADD xsum1, xsum1, x3, a5
  312. LD a5, 6 * SIZE(AO1)
  313. MADD xsum2, xsum2, x3, a7
  314. LD x3, 6 * SIZE(XX)
  315. MADD ysum3, ysum3, alpha2, a7
  316. LD a7, 6 * SIZE(AO2)
  317. MADD ysum4, ysum4, alpha2, a8
  318. MADD xsum1, xsum1, x4, a6
  319. LD a6, 7 * SIZE(AO1)
  320. MADD xsum2, xsum2, x4, a8
  321. LD a8, 7 * SIZE(AO2)
  322. LD x4, 7 * SIZE(XX)
  /* Write back y rows 2-3, fetch y rows 6-7. */
  323. ST ysum3, 2 * SIZE(YY)
  324. ST ysum4, 3 * SIZE(YY)
  325. LD ysum3, 6 * SIZE(YY)
  326. LD ysum4, 7 * SIZE(YY)
  /* Rows 4-5. */
  327. MADD ysum1, ysum1, alpha1, a1
  328. MADD ysum2, ysum2, alpha1, a2
  329. MADD xsum1, xsum1, x1, a1
  330. MADD xsum2, xsum2, x1, a3
  331. MADD ysum1, ysum1, alpha2, a3
  332. MADD ysum2, ysum2, alpha2, a4
  333. MADD xsum1, xsum1, x2, a2
  334. MADD xsum2, xsum2, x2, a4
  /* Rows 6-7; the three pointer bumps are interleaved with the arithmetic. */
  335. MADD ysum3, ysum3, alpha1, a5
  336. MADD ysum4, ysum4, alpha1, a6
  337. MADD xsum1, xsum1, x3, a5
  338. MADD xsum2, xsum2, x3, a7
  339. MADD ysum3, ysum3, alpha2, a7
  340. daddiu XX, XX, 8 * SIZE
  341. MADD ysum4, ysum4, alpha2, a8
  342. daddiu AO1, AO1, 8 * SIZE
  343. MADD xsum1, xsum1, x4, a6
  344. daddiu AO2, AO2, 8 * SIZE
  345. MADD xsum2, xsum2, x4, a8
  /* Write back y rows 4-7 and step YY past the block. */
  346. ST ysum1, 4 * SIZE(YY)
  347. ST ysum2, 5 * SIZE(YY)
  348. ST ysum3, 6 * SIZE(YY)
  349. ST ysum4, 7 * SIZE(YY)
  350. daddiu YY, YY, 8 * SIZE
  /* Row remainders for a column pair, selected by the low bits of IS (the
     number of rows above the diagonal block).  Arithmetic per row i is the
     same as in the 8-row code:
         y[i]  += alpha1 * A[i, IS] + alpha2 * A[i, IS + 1]
         xsum1 += x[i] * A[i, IS]
         xsum2 += x[i] * A[i, IS + 1]
     MADD d, c, a, b is taken to mean d = c + a * b (macro from common.h --
     confirm).  IS is set to 0 and only ever incremented by 2 in this file,
     so there is no single-row case here. */
  351. .align 3
  352. .L15:
  /* Four leftover rows when bit 2 of IS is set. */
  353. andi I, IS, 4
  354. NOP
  355. blez I, .L16
  356. NOP
  357. LD x1, 0 * SIZE(XX)
  358. LD x2, 1 * SIZE(XX)
  359. LD x3, 2 * SIZE(XX)
  360. LD x4, 3 * SIZE(XX)
  361. daddiu XX, XX, 4 * SIZE
  362. LD a1, 0 * SIZE(AO1)
  363. LD a2, 1 * SIZE(AO1)
  364. LD a5, 2 * SIZE(AO1)
  365. LD a6, 3 * SIZE(AO1)
  366. daddiu AO1, AO1, 4 * SIZE
  367. LD a3, 0 * SIZE(AO2)
  368. LD a4, 1 * SIZE(AO2)
  369. LD a7, 2 * SIZE(AO2)
  370. LD a8, 3 * SIZE(AO2)
  371. daddiu AO2, AO2, 4 * SIZE
  372. LD ysum1, 0 * SIZE(YY)
  373. LD ysum2, 1 * SIZE(YY)
  374. LD ysum3, 2 * SIZE(YY)
  375. LD ysum4, 3 * SIZE(YY)
  376. MADD ysum1, ysum1, alpha1, a1
  377. MADD ysum2, ysum2, alpha1, a2
  378. MADD xsum1, xsum1, x1, a1
  379. MADD xsum2, xsum2, x1, a3
  380. MADD ysum1, ysum1, alpha2, a3
  381. MADD ysum2, ysum2, alpha2, a4
  382. MADD xsum1, xsum1, x2, a2
  383. MADD xsum2, xsum2, x2, a4
  384. MADD ysum3, ysum3, alpha1, a5
  385. MADD ysum4, ysum4, alpha1, a6
  386. MADD xsum1, xsum1, x3, a5
  387. MADD xsum2, xsum2, x3, a7
  388. MADD ysum3, ysum3, alpha2, a7
  389. MADD ysum4, ysum4, alpha2, a8
  390. MADD xsum1, xsum1, x4, a6
  391. MADD xsum2, xsum2, x4, a8
  392. ST ysum1, 0 * SIZE(YY)
  393. ST ysum2, 1 * SIZE(YY)
  394. ST ysum3, 2 * SIZE(YY)
  395. ST ysum4, 3 * SIZE(YY)
  396. daddiu YY, YY, 4 * SIZE
  397. .align 3
  398. .L16:
  /* Two leftover rows when bit 1 of IS is set.  AO1/AO2 are advanced so that
     they end up on row IS of their columns, which the code at .L19 reads;
     YY is not advanced because .L19 recomputes the y address from IS. */
  399. andi I, IS, 2
  400. NOP
  401. blez I, .L19
  402. NOP
  403. LD x1, 0 * SIZE(XX)
  404. LD x2, 1 * SIZE(XX)
  405. daddiu XX, XX, 2 * SIZE
  406. LD a1, 0 * SIZE(AO1)
  407. LD a2, 1 * SIZE(AO1)
  408. daddiu AO1, AO1, 2 * SIZE
  409. LD a3, 0 * SIZE(AO2)
  410. LD a4, 1 * SIZE(AO2)
  411. daddiu AO2, AO2, 2 * SIZE
  412. LD ysum1, 0 * SIZE(YY)
  413. LD ysum2, 1 * SIZE(YY)
  414. MADD ysum1, ysum1, alpha1, a1
  415. MADD ysum2, ysum2, alpha1, a2
  416. MADD xsum1, xsum1, x1, a1
  417. MADD xsum2, xsum2, x1, a3
  418. MADD ysum1, ysum1, alpha2, a3
  419. MADD ysum2, ysum2, alpha2, a4
  420. MADD xsum1, xsum1, x2, a2
  421. MADD xsum2, xsum2, x2, a4
  422. ST ysum1, 0 * SIZE(YY)
  423. ST ysum2, 1 * SIZE(YY)
  /* Diagonal 2x2 block of the column pair and loop control.  At this point
     AO1/AO2 point at row IS of columns IS and IS + 1.  With MUL d, a, b taken
     as d = a * b, MADD d, c, a, b as d = c + a * b and ADD d, a, b as
     d = a + b (macros from common.h -- confirm):
         y[IS]     += ALPHA * xsum1 + alpha1 * A[IS, IS]     + alpha2 * A[IS, IS + 1]
         y[IS + 1] += ALPHA * xsum2 + alpha1 * A[IS, IS + 1] + alpha2 * A[IS + 1, IS + 1]
     a2 (the element below the diagonal) is loaded but not used. */
  424. .align 3
  425. .L19:
  /* TEMP = &Y1[IS]. */
  426. dsll TEMP, IS, BASE_SHIFT
  427. daddu TEMP, Y1, TEMP
  428. LD ysum1, 0 * SIZE(TEMP)
  429. LD ysum2, 1 * SIZE(TEMP)
  430. LD a1, 0 * SIZE(AO1)
  431. LD a2, 1 * SIZE(AO1)
  432. LD a3, 0 * SIZE(AO2)
  433. LD a4, 1 * SIZE(AO2)
  434. MUL xsum1, ALPHA, xsum1
  435. MUL xsum2, ALPHA, xsum2
  436. MADD xsum1, xsum1, alpha1, a1
  437. MADD xsum2, xsum2, alpha1, a3
  438. MADD xsum1, xsum1, alpha2, a3
  439. MADD xsum2, xsum2, alpha2, a4
  440. ADD ysum1, ysum1, xsum1
  441. ADD ysum2, ysum2, xsum2
  442. ST ysum1, 0 * SIZE(TEMP)
  443. ST ysum2, 1 * SIZE(TEMP)
  /* Loop back to .L11 while M >= IS + 4, i.e. while another full pair fits
     after this one.  IS += 2 is written as the branch delay slot and is
     applied on both paths (assumes noreorder mode -- confirm in common.h). */
  444. daddiu TEMP, IS, 4
  445. slt TEMP, M, TEMP
  446. beqz TEMP, .L11
  447. daddiu IS, IS, 2
  /* Last column when M is odd: a single column IS is processed on its own.
     With an even M control goes straight to the write-back at .L900.  The
     instruction after each branch is its delay slot (assumes noreorder mode
     -- confirm in common.h). */
  448. .align 3
  449. .L20:
  450. andi TEMP, M, 1
  451. nop
  452. blez TEMP, .L900
  453. nop
  454. .align 3
  /* alpha1 = ALPHA * x[IS] (the MUL follows a few lines down). */
  455. dsll TEMP, IS, BASE_SHIFT
  456. daddu TEMP, X, TEMP
  457. LD alpha1, 0 * SIZE(TEMP)
  /* AO1 -> top of column IS; I = IS / 4 full 4-row blocks above the
     diagonal. */
  458. move AO1, A
  459. dsra I, IS, 2
  460. daddu A, AO1, LDA
  /* Both accumulators are cleared; in this path they collect alternate rows
     of the same column and are added together at .L29.  (MTC $0, reg
     presumably moves integer zero into the FP register -- confirm.) */
  461. MTC $0, xsum1
  462. MTC $0, xsum2
  463. move XX, X
  464. MUL alpha1, ALPHA, alpha1
  465. move YY, Y1
  /* No full block: go to the remainders.  Delay slot: I becomes the number
     of pipelined iterations (block count minus one). */
  466. blez I, .L25
  467. daddiu I, I, -1
  /* Preload x, A and y for the first four rows. */
  468. LD x1, 0 * SIZE(XX)
  469. LD x2, 1 * SIZE(XX)
  470. LD x3, 2 * SIZE(XX)
  471. LD x4, 3 * SIZE(XX)
  472. LD a1, 0 * SIZE(AO1)
  473. LD a2, 1 * SIZE(AO1)
  474. LD a3, 2 * SIZE(AO1)
  475. LD a4, 3 * SIZE(AO1)
  476. LD ysum1, 0 * SIZE(YY)
  477. LD ysum2, 1 * SIZE(YY)
  478. LD ysum3, 2 * SIZE(YY)
  /* Only one block: run the tail at .L23 directly.  Delay slot: last
     preload. */
  479. blez I, .L23
  480. LD ysum4, 3 * SIZE(YY)
  /* Pipelined loop for the single last column: four rows per iteration with
     the next block's operands loaded as registers free up.  MADD d, c, a, b
     is taken to mean d = c + a * b (macro from common.h -- confirm).  Per
     row i:
         y[i] += alpha1 * A[i, IS]
         xsum1 (rows 0 and 2) or xsum2 (rows 1 and 3) += x[i] * A[i, IS]
     AO1 is advanced in the middle of the body and XX/YY at the end, so the
     offsets used after each bump are relative to the new pointer value. */
  481. .align 3
  482. .L22:
  /* Rows 0-1; reload a1/a2/x1 with rows 4-5 of the stream. */
  483. MADD ysum1, ysum1, alpha1, a1
  484. daddiu I, I, -1
  485. MADD xsum1, xsum1, x1, a1
  486. LD a1, 4 * SIZE(AO1)
  487. MADD ysum2, ysum2, alpha1, a2
  488. LD x1, 4 * SIZE(XX)
  489. MADD xsum2, xsum2, x2, a2
  490. LD a2, 5 * SIZE(AO1)
  491. ST ysum1, 0 * SIZE(YY)
  492. LD ysum1, 4 * SIZE(YY)
  493. ST ysum2, 1 * SIZE(YY)
  494. LD ysum2, 5 * SIZE(YY)
  495. daddiu AO1, AO1, 4 * SIZE
  496. nop
  /* Rows 2-3; a3/a4 are reloaded through the already-advanced AO1. */
  497. MADD ysum3, ysum3, alpha1, a3
  498. LD x2, 5 * SIZE(XX)
  499. MADD xsum1, xsum1, x3, a3
  500. LD a3, 2 * SIZE(AO1)
  501. MADD ysum4, ysum4, alpha1, a4
  502. LD x3, 6 * SIZE(XX)
  503. MADD xsum2, xsum2, x4, a4
  504. LD a4, 3 * SIZE(AO1)
  505. ST ysum3, 2 * SIZE(YY)
  506. LD ysum3, 6 * SIZE(YY)
  507. ST ysum4, 3 * SIZE(YY)
  508. LD ysum4, 7 * SIZE(YY)
  509. daddiu XX, XX, 4 * SIZE
  510. daddiu YY, YY, 4 * SIZE
  /* Delay slot: x4 for the next block, read through the advanced XX. */
  511. bgtz I, .L22
  512. LD x4, 3 * SIZE(XX)
  513. .align 3
  /* Final 4-row block of the single column: same arithmetic, no further
     loads.  The pointers are bumped first, so the y stores use negative
     offsets. */
  514. .L23:
  515. MADD ysum1, ysum1, alpha1, a1
  516. daddiu AO1, AO1, 4 * SIZE
  517. MADD xsum1, xsum1, x1, a1
  518. daddiu XX, XX, 4 * SIZE
  519. MADD ysum2, ysum2, alpha1, a2
  520. daddiu YY, YY, 4 * SIZE
  521. MADD xsum2, xsum2, x2, a2
  522. nop
  523. MADD ysum3, ysum3, alpha1, a3
  524. ST ysum1,-4 * SIZE(YY)
  525. MADD xsum1, xsum1, x3, a3
  526. ST ysum2,-3 * SIZE(YY)
  527. MADD ysum4, ysum4, alpha1, a4
  528. ST ysum3,-2 * SIZE(YY)
  529. MADD xsum2, xsum2, x4, a4
  530. ST ysum4,-1 * SIZE(YY)
  /* Row remainders and diagonal element for the single last column.
     MADD d, c, a, b is taken to mean d = c + a * b, MUL d, a, b to mean
     d = a * b and ADD d, a, b to mean d = a + b (macros from common.h --
     confirm). */
  531. .align 3
  532. .L25:
  /* Two leftover rows when bit 1 of IS is set. */
  533. andi I, IS, 2
  534. NOP
  535. blez I, .L26
  536. NOP
  537. LD x1, 0 * SIZE(XX)
  538. LD x2, 1 * SIZE(XX)
  539. daddiu XX, XX, 2 * SIZE
  540. LD a1, 0 * SIZE(AO1)
  541. LD a2, 1 * SIZE(AO1)
  542. daddiu AO1, AO1, 2 * SIZE
  543. LD ysum1, 0 * SIZE(YY)
  544. LD ysum2, 1 * SIZE(YY)
  545. MADD ysum1, ysum1, alpha1, a1
  546. MADD xsum1, xsum1, x1, a1
  547. MADD ysum2, ysum2, alpha1, a2
  548. MADD xsum2, xsum2, x2, a2
  549. ST ysum1, 0 * SIZE(YY)
  550. ST ysum2, 1 * SIZE(YY)
  551. daddiu YY, YY, 2 * SIZE
  552. .align 3
  553. .L26:
  /* One leftover row when bit 0 of IS is set.
     NOTE(review): IS is initialised to 0 and only ever incremented by 2 in
     this file, so this branch looks like it is always taken -- confirm
     before relying on the single-row code below. */
  554. andi I, IS, 1
  555. NOP
  556. blez I, .L29
  557. NOP
  558. LD x1, 0 * SIZE(XX)
  559. daddiu XX, XX, 1 * SIZE
  560. LD a1, 0 * SIZE(AO1)
  561. daddiu AO1, AO1, 1* SIZE
  562. LD ysum1, 0 * SIZE(YY)
  563. MADD ysum1, ysum1, alpha1, a1
  564. MADD xsum1, xsum1, x1, a1
  565. ST ysum1, 0 * SIZE(YY)
  566. .align 3
  /* Diagonal element.  AO1 now points at row IS of column IS:
         y[IS] += ALPHA * (xsum1 + xsum2) + alpha1 * A[IS, IS] */
  567. .L29:
  568. dsll TEMP, IS, BASE_SHIFT
  569. daddu TEMP, Y1, TEMP
  570. LD ysum1, 0 * SIZE(TEMP)
  571. LD a1, 0 * SIZE(AO1)
  572. ADD xsum1, xsum1, xsum2
  573. MUL xsum1, ALPHA, xsum1
  574. MADD xsum1, xsum1, alpha1, a1
  575. ADD ysum1, ysum1, xsum1
  576. ST ysum1, 0 * SIZE(TEMP)
  /* Write-back of y.  When the scaled INCY equals SIZE the result was
     produced in place and nothing is copied; otherwise the contiguous
     working copy at Y1 is scattered back to the caller's y with stride INCY.
     IS is reloaded with SIZE for the comparison.  The instruction after each
     branch is its delay slot (assumes noreorder mode -- confirm in
     common.h). */
  577. .align 3
  578. .L900:
  579. li IS, SIZE
  580. beq INCY, IS, .L999
  581. NOP
  /* I = M / 4 groups of four. */
  582. dsra I, M, 2
  583. blez I, .L905
  584. NOP
  585. .align 3
  /* Copy four elements per iteration; Y1 advances by four elements in the
     delay slot. */
  586. .L902:
  587. LD a1, 0 * SIZE(Y1)
  588. LD a2, 1 * SIZE(Y1)
  589. LD a3, 2 * SIZE(Y1)
  590. LD a4, 3 * SIZE(Y1)
  591. ST a1, 0 * SIZE(Y)
  592. daddu Y, Y, INCY
  593. ST a2, 0 * SIZE(Y)
  594. daddu Y, Y, INCY
  595. ST a3, 0 * SIZE(Y)
  596. daddu Y, Y, INCY
  597. ST a4, 0 * SIZE(Y)
  598. daddu Y, Y, INCY
  599. daddiu I, I, -1
  600. bgtz I, .L902
  601. daddiu Y1, Y1, 4 * SIZE
  602. .align 3
  /* Remaining M mod 4 elements. */
  603. .L905:
  604. andi I, M, 3
  605. blez I, .L999
  606. NOP
  607. .align 3
  /* Copy one element per iteration; Y advances by INCY in the delay slot. */
  608. .L906:
  609. LD a1, 0 * SIZE(Y1)
  610. daddiu Y1, Y1, 1 * SIZE
  611. ST a1, 0 * SIZE(Y)
  612. daddiu I, I, -1
  613. bgtz I, .L906
  614. daddu Y, Y, INCY
  /* Common exit: restore $16-$19 from the 32-byte frame set up at entry and
     return through $31.  The stack pointer is restored by the instruction
     written after the jump, i.e. in its delay slot (assumes noreorder mode
     -- confirm in common.h). */
  615. .align 3
  616. .L999:
  617. LDARG $16, 0($sp)
  618. LDARG $17, 8($sp)
  619. LDARG $18, 16($sp)
  620. LDARG $19, 24($sp)
  621. j $31
  622. daddiu $sp, $sp, 32
  623. EPILOGUE