You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_t.S 17 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Assembler-side configuration and register aliases used by the code that follows. */
  /* NOTE(review): ASSEMBLER is presumably consumed by common.h; SIZE, LD, ST, ADD, MUL, */
  /* SXADDQ, PROLOGUE, PROFCODE and EPILOGUE are not defined in this file - confirm there. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Bytes of stack reserved on entry; eight 8-byte FP registers are saved in it. */
  40. #define STACKSIZE 64
  /* Look-ahead distance in elements (always multiplied by SIZE) for the prefetch-style loads. */
  41. #define PREFETCHSIZE 32
  /* Arguments that the code uses straight from registers, without loading them. */
  42. #define M $16
  43. #define N $17
  44. #define A $20
  45. #define LDA $21
  /* Arguments that the entry code loads from the stack. */
  46. #define X $18
  47. #define INCX $19
  48. #define Y $22
  49. #define INCY $23
  50. #define BUFFER $24
  /* Loop counters: I counts rows or 8-row blocks, J counts column groups. */
  51. #define I $25
  52. #define J $27
  /* Working pointers: X1 walks x, Y1 walks the output, A1-A4 walk up to four columns of A. */
  53. #define X1 $3
  54. #define Y1 $4
  55. #define A1 $5
  56. #define A2 $6
  57. #define A3 $7
  58. #define A4 $8
  /* Scalar multiplier; used from $f19 without being loaded. */
  59. #define alpha $f19
  /* s0-s3: running sums. */
  60. #define s0 $f0
  61. #define s1 $f1
  62. #define s2 $f10
  63. #define s3 $f11
  /* t0-t3: products that have been computed but not yet added into s0-s3. */
  64. #define t0 $f12
  65. #define t1 $f13
  66. #define t2 $f14
  67. #define t3 $f15
  /* x0-x3: elements of x held for the multiplies. */
  68. #define x0 $f16
  69. #define x1 $f17
  70. #define x2 $f18
  71. #define x3 $f21
  /* a0-a7: elements of A; also reused as staging registers for the x copy and for y values. */
  72. #define a0 $f22
  73. #define a1 $f23
  74. #define a2 $f24
  75. #define a3 $f25
  76. #define a4 $f26
  77. #define a5 $f27
  78. #define a6 $f28
  79. #define a7 $f29
  /* a8-a15 alias $f2-$f9, the registers the entry code saves and $L999 restores. */
  80. #define a8 $f2
  81. #define a9 $f3
  82. #define a10 $f4
  83. #define a11 $f5
  84. #define a12 $f6
  85. #define a13 $f7
  86. #define a14 $f8
  87. #define a15 $f9
  /* Entry sequence: reserve the frame, fetch the stack arguments, save $f2-$f9, */
  /* reject empty problems and decide whether x has to be copied. */
  88. PROLOGUE
  /* Reserve STACKSIZE bytes; the stack arguments are then read from STACKSIZE($sp) upward. */
  89. lda $sp, -STACKSIZE($sp)
  90. ldq X, 0 + STACKSIZE($sp)
  91. ldq INCX, 8 + STACKSIZE($sp)
  92. ldq Y, 16 + STACKSIZE($sp)
  93. ldq INCY, 24 + STACKSIZE($sp)
  94. ldq BUFFER, 32 + STACKSIZE($sp)
  /* Save $f2-$f9 (aliased a8-a15 in this file); $L999 reloads them before returning. */
  95. stt $f2, 0($sp)
  96. stt $f3, 8($sp)
  97. stt $f4, 16($sp)
  98. stt $f5, 24($sp)
  99. stt $f6, 32($sp)
  100. stt $f7, 40($sp)
  101. stt $f8, 48($sp)
  102. stt $f9, 56($sp)
  103. PROFCODE
  /* $0 = (M <= 0), $1 = (N <= 0); leave through $L999 if either holds. */
  /* INCX, INCY and LDA are added directly to pointers later on, so the interleaved */
  /* SXADDQ lines presumably turn element counts into byte offsets - the macro is */
  /* defined outside this file, confirm. */
  104. cmple M, 0, $0
  105. SXADDQ INCX, 0, INCX
  106. cmple N, 0, $1
  107. SXADDQ INCY, 0, INCY
  108. or $0, $1, $0
  109. bne $0, $L999
  /* If the x stride equals SIZE, x is used in place and the copy into BUFFER is skipped. */
  110. cmpeq INCX, SIZE, $0
  111. mov X, X1
  112. SXADDQ LDA, 0, LDA
  113. bne $0, $L10
  /* Copy x, read with stride INCX through X1, into consecutive elements of BUFFER. */
  /* X is redirected to BUFFER so that the column loops read the copy. */
  114. sra M, 3, I
  115. mov BUFFER, Y1
  116. mov BUFFER, X
  117. ble I, $L05
  118. .align 4
  /* Eight elements per pass, I = M >> 3 passes; Y1 is the write pointer into BUFFER. */
  119. $L02:
  /* Load whose destination is $31, so no result is kept; a prefetch-style touch ahead of X1. */
  120. ldl $31, (PREFETCHSIZE + 0) * SIZE(X1)
  121. lda I, -1(I)
  122. LD a0, 0 * SIZE(X1)
  123. addq X1, INCX, X1
  124. LD a1, 0 * SIZE(X1)
  125. addq X1, INCX, X1
  126. LD a2, 0 * SIZE(X1)
  127. addq X1, INCX, X1
  128. LD a3, 0 * SIZE(X1)
  129. addq X1, INCX, X1
  130. ST a0, 0 * SIZE(Y1)
  131. ST a1, 1 * SIZE(Y1)
  132. ST a2, 2 * SIZE(Y1)
  133. ST a3, 3 * SIZE(Y1)
  134. LD a4, 0 * SIZE(X1)
  135. addq X1, INCX, X1
  136. LD a5, 0 * SIZE(X1)
  137. addq X1, INCX, X1
  138. LD a6, 0 * SIZE(X1)
  139. addq X1, INCX, X1
  140. LD a7, 0 * SIZE(X1)
  141. addq X1, INCX, X1
  142. ST a4, 4 * SIZE(Y1)
  143. ST a5, 5 * SIZE(Y1)
  144. ST a6, 6 * SIZE(Y1)
  145. ST a7, 7 * SIZE(Y1)
  146. lda Y1, 8 * SIZE(Y1)
  147. bgt I, $L02
  148. .align 4
  /* Remaining M & 7 elements, copied one at a time. */
  149. $L05:
  150. and M, 7, I
  151. ble I, $L10
  152. .align 4
  153. $L06:
  154. LD a0, 0 * SIZE(X1)
  155. addq X1, INCX, X1
  156. ST a0, 0 * SIZE(Y1)
  157. addq Y1, SIZE, Y1
  158. lda I, -1(I)
  159. bgt I, $L06
  160. .align 4
  /* Start of the column loops. Y1 becomes the store pointer for y (Y stays the load */
  /* pointer), the pending-product registers t0-t3 are cleared, and J = N >> 2 is the */
  /* number of four-column groups; with none, control goes to $L20. */
  161. $L10:
  162. mov Y, Y1
  163. fclr t0
  164. unop
  165. fclr t1
  166. sra N, 2, J
  167. fclr t2
  168. fclr t3
  169. ble J, $L20
  170. .align 4
  /* One four-column group: A1..A4 point at four columns spaced LDA apart, s0-s3 are */
  /* cleared, and A is advanced by 4 * LDA for the next group. */
  171. $L11:
  172. mov A, A1
  173. fclr s0
  174. addq A, LDA, A2
  175. fclr s1
  176. addq A2, LDA, A3
  177. fclr s2
  178. addq A3, LDA, A4
  179. fclr s3
  180. s4addq LDA, A, A
  181. unop
  182. mov X, X1
  /* Load from y into $f31; no result is kept (prefetch-style touch of the y entries). */
  183. lds $f31, 3 * SIZE(Y)
  /* I = number of 8-row blocks; with none, only the row remainder at $L15 runs. */
  184. sra M, 3, I
  185. ble I, $L15
  /* Preload x[0..2] and rows 0..3 of all four columns for the pipelined loop. */
  186. LD x0, 0 * SIZE(X1)
  187. LD x1, 1 * SIZE(X1)
  188. LD x2, 2 * SIZE(X1)
  189. LD a0, 0 * SIZE(A1)
  190. LD a1, 0 * SIZE(A2)
  191. LD a2, 0 * SIZE(A3)
  192. LD a3, 0 * SIZE(A4)
  193. LD a4, 1 * SIZE(A1)
  194. LD a5, 1 * SIZE(A2)
  195. LD a6, 1 * SIZE(A3)
  196. LD a7, 1 * SIZE(A4)
  197. LD a8, 2 * SIZE(A1)
  198. LD a9, 2 * SIZE(A2)
  199. LD a10, 2 * SIZE(A3)
  200. LD a11, 2 * SIZE(A4)
  201. LD a12, 3 * SIZE(A1)
  202. LD a13, 3 * SIZE(A2)
  203. LD a14, 3 * SIZE(A3)
  204. LD a15, 3 * SIZE(A4)
  /* The last 8-row block is handled by $L13, so the loop runs I - 1 times. */
  205. lda I, -1(I)
  206. ble I, $L13
  207. .align 4
  /* Steady-state pass for four columns: eight rows of A1..A4 are consumed per iteration. */
  /* Each ADD folds the product left in t0-t3 by an earlier MUL into s0-s3, each MUL */
  /* starts the next product, and the loads refill registers for rows further ahead. */
  /* A1-A4 and X1 are advanced by 8 elements part-way through the body, so the offsets */
  /* that follow each advance are relative to the already-moved pointer. */
  208. $L12:
  /* Row 0: x0 times a0-a3; a0-a3 are refilled with row 4. */
  209. ADD s0, t0, s0
  210. LD x3, 3 * SIZE(X1)
  211. MUL x0, a0, t0
  212. LD a0, 4 * SIZE(A1)
  213. ADD s1, t1, s1
  214. ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
  215. MUL x0, a1, t1
  216. LD a1, 4 * SIZE(A2)
  217. ADD s2, t2, s2
  218. unop
  219. MUL x0, a2, t2
  220. LD a2, 4 * SIZE(A3)
  221. ADD s3, t3, s3
  222. unop
  223. MUL x0, a3, t3
  224. LD a3, 4 * SIZE(A4)
  /* Row 1: x1 times a4-a7; a4-a7 are refilled with row 5. A1 is advanced here. */
  225. ADD s0, t0, s0
  226. LD x0, 4 * SIZE(X1)
  227. MUL x1, a4, t0
  228. LD a4, 5 * SIZE(A1)
  229. ADD s1, t1, s1
  230. lda A1, 8 * SIZE(A1)
  231. MUL x1, a5, t1
  232. LD a5, 5 * SIZE(A2)
  233. ADD s2, t2, s2
  234. unop
  235. MUL x1, a6, t2
  236. LD a6, 5 * SIZE(A3)
  237. ADD s3, t3, s3
  238. unop
  239. MUL x1, a7, t3
  240. LD a7, 5 * SIZE(A4)
  /* Row 2: x2 times a8-a11; refilled with row 6 (-2 on the moved A1). A2 and A3 advance. */
  241. ADD s0, t0, s0
  242. LD x1, 5 * SIZE(X1)
  243. MUL x2, a8, t0
  244. LD a8, -2 * SIZE(A1)
  245. ADD s1, t1, s1
  246. ldl $31, (PREFETCHSIZE + 0) * SIZE(A2)
  247. MUL x2, a9, t1
  248. LD a9, 6 * SIZE(A2)
  249. ADD s2, t2, s2
  250. lda A2, 8 * SIZE(A2)
  251. MUL x2, a10, t2
  252. LD a10, 6 * SIZE(A3)
  253. ADD s3, t3, s3
  254. lda A3, 8 * SIZE(A3)
  255. MUL x2, a11, t3
  256. LD a11, 6 * SIZE(A4)
  /* Row 3: x3 times a12-a15; refilled with row 7 (offset -1 after the advances). A4 advances. */
  257. ADD s0, t0, s0
  258. LD x2, 6 * SIZE(X1)
  259. MUL x3, a12, t0
  260. LD a12, -1 * SIZE(A1)
  261. ADD s1, t1, s1
  262. lda A4, 8 * SIZE(A4)
  263. MUL x3, a13, t1
  264. LD a13, -1 * SIZE(A2)
  265. ADD s2, t2, s2
  266. unop
  267. MUL x3, a14, t2
  268. LD a14, -1 * SIZE(A3)
  269. ADD s3, t3, s3
  270. unop
  271. MUL x3, a15, t3
  272. LD a15, -1 * SIZE(A4)
  /* Row 4: x0 times a0-a3; a0-a3 are refilled with row 0 of the next block. */
  273. ADD s0, t0, s0
  274. LD x3, 7 * SIZE(X1)
  275. MUL x0, a0, t0
  276. LD a0, 0 * SIZE(A1)
  277. ADD s1, t1, s1
  278. ldl $31, (PREFETCHSIZE - 8) * SIZE(A3)
  279. MUL x0, a1, t1
  280. LD a1, 0 * SIZE(A2)
  281. ADD s2, t2, s2
  282. unop
  283. MUL x0, a2, t2
  284. LD a2, 0 * SIZE(A3)
  285. ADD s3, t3, s3
  286. unop
  287. MUL x0, a3, t3
  288. LD a3, 0 * SIZE(A4)
  /* Row 5: x1 times a4-a7; a4-a7 are refilled with row 1 of the next block. */
  289. ADD s0, t0, s0
  290. LD x0, 8 * SIZE(X1)
  291. MUL x1, a4, t0
  292. LD a4, 1 * SIZE(A1)
  293. ADD s1, t1, s1
  294. unop
  295. MUL x1, a5, t1
  296. LD a5, 1 * SIZE(A2)
  297. ADD s2, t2, s2
  298. unop
  299. MUL x1, a6, t2
  300. LD a6, 1 * SIZE(A3)
  301. ADD s3, t3, s3
  302. unop
  303. MUL x1, a7, t3
  304. LD a7, 1 * SIZE(A4)
  /* Row 6: x2 times a8-a11; X1 advances and the block counter is decremented here. */
  305. ADD s0, t0, s0
  306. LD x1, 9 * SIZE(X1)
  307. MUL x2, a8, t0
  308. LD a8, 2 * SIZE(A1)
  309. ADD s1, t1, s1
  310. ldl $31, (PREFETCHSIZE - 8) * SIZE(A4)
  311. MUL x2, a9, t1
  312. LD a9, 2 * SIZE(A2)
  313. ADD s2, t2, s2
  314. lda X1, 8 * SIZE(X1)
  315. MUL x2, a10, t2
  316. LD a10, 2 * SIZE(A3)
  317. ADD s3, t3, s3
  318. lda I, -1(I)
  319. MUL x2, a11, t3
  320. LD a11, 2 * SIZE(A4)
  /* Row 7: x3 times a12-a15; the four products stay pending in t0-t3 across the branch. */
  321. ADD s0, t0, s0
  322. LD x2, 2 * SIZE(X1)
  323. MUL x3, a12, t0
  324. LD a12, 3 * SIZE(A1)
  325. ADD s1, t1, s1
  326. ldl $31, (PREFETCHSIZE - 8) * SIZE(X1)
  327. MUL x3, a13, t1
  328. LD a13, 3 * SIZE(A2)
  329. ADD s2, t2, s2
  330. unop
  331. MUL x3, a14, t2
  332. LD a14, 3 * SIZE(A3)
  333. ADD s3, t3, s3
  334. MUL x3, a15, t3
  335. LD a15, 3 * SIZE(A4)
  336. bgt I, $L12
  337. .align 4
  /* Final 8-row block of a four-column group. Rows 0..3 are already in registers; */
  /* rows 4..7 are loaded while rows 0..3 are multiplied, and nothing is loaded beyond */
  /* the block. A1-A4 and X1 end up 8 elements further on, and the last four products */
  /* are left pending in t0-t3. */
  338. $L13:
  339. ADD s0, t0, s0
  340. LD x3, 3 * SIZE(X1)
  341. MUL x0, a0, t0
  342. LD a0, 4 * SIZE(A1)
  343. ADD s1, t1, s1
  344. unop
  345. MUL x0, a1, t1
  346. LD a1, 4 * SIZE(A2)
  347. ADD s2, t2, s2
  348. unop
  349. MUL x0, a2, t2
  350. LD a2, 4 * SIZE(A3)
  351. ADD s3, t3, s3
  352. unop
  353. MUL x0, a3, t3
  354. LD a3, 4 * SIZE(A4)
  355. ADD s0, t0, s0
  356. LD x0, 4 * SIZE(X1)
  357. MUL x1, a4, t0
  358. LD a4, 5 * SIZE(A1)
  359. ADD s1, t1, s1
  360. unop
  361. MUL x1, a5, t1
  362. LD a5, 5 * SIZE(A2)
  363. ADD s2, t2, s2
  364. unop
  365. MUL x1, a6, t2
  366. LD a6, 5 * SIZE(A3)
  367. ADD s3, t3, s3
  368. unop
  369. MUL x1, a7, t3
  370. LD a7, 5 * SIZE(A4)
  371. ADD s0, t0, s0
  372. LD x1, 5 * SIZE(X1)
  373. MUL x2, a8, t0
  374. LD a8, 6 * SIZE(A1)
  375. ADD s1, t1, s1
  376. unop
  377. MUL x2, a9, t1
  378. LD a9, 6 * SIZE(A2)
  379. ADD s2, t2, s2
  380. unop
  381. MUL x2, a10, t2
  382. LD a10, 6 * SIZE(A3)
  383. ADD s3, t3, s3
  384. unop
  385. MUL x2, a11, t3
  386. LD a11, 6 * SIZE(A4)
  387. ADD s0, t0, s0
  388. LD x2, 6 * SIZE(X1)
  389. MUL x3, a12, t0
  390. LD a12, 7 * SIZE(A1)
  391. ADD s1, t1, s1
  392. lda A1, 8 * SIZE(A1)
  393. MUL x3, a13, t1
  394. LD a13, 7 * SIZE(A2)
  395. ADD s2, t2, s2
  396. lda A2, 8 * SIZE(A2)
  397. MUL x3, a14, t2
  398. LD a14, 7 * SIZE(A3)
  399. ADD s3, t3, s3
  400. lda A3, 8 * SIZE(A3)
  401. MUL x3, a15, t3
  402. LD a15, 7 * SIZE(A4)
  /* Rows 4..7: every element of A is in a register already; only x[7] is still loaded. */
  403. ADD s0, t0, s0
  404. LD x3, 7 * SIZE(X1)
  405. MUL x0, a0, t0
  406. unop
  407. ADD s1, t1, s1
  408. lda X1, 8 * SIZE(X1)
  409. MUL x0, a1, t1
  410. lda A4, 8 * SIZE(A4)
  411. ADD s2, t2, s2
  412. MUL x0, a2, t2
  413. ADD s3, t3, s3
  414. MUL x0, a3, t3
  415. ADD s0, t0, s0
  416. MUL x1, a4, t0
  417. ADD s1, t1, s1
  418. MUL x1, a5, t1
  419. ADD s2, t2, s2
  420. MUL x1, a6, t2
  421. ADD s3, t3, s3
  422. MUL x1, a7, t3
  423. ADD s0, t0, s0
  424. MUL x2, a8, t0
  425. ADD s1, t1, s1
  426. MUL x2, a9, t1
  427. ADD s2, t2, s2
  428. MUL x2, a10, t2
  429. ADD s3, t3, s3
  430. MUL x2, a11, t3
  431. ADD s0, t0, s0
  432. MUL x3, a12, t0
  433. ADD s1, t1, s1
  434. MUL x3, a13, t1
  435. ADD s2, t2, s2
  436. MUL x3, a14, t2
  437. ADD s3, t3, s3
  438. MUL x3, a15, t3
  439. .align 4
  /* Row remainder for a four-column group: the last M & 7 rows, one row per pass. */
  440. $L15:
  441. and M, 7, I
  442. ble I, $L18
  /* Preload one x element and the matching element of each of the four columns. */
  443. LD x0, 0 * SIZE(X1)
  444. LD a0, 0 * SIZE(A1)
  445. LD a1, 0 * SIZE(A2)
  446. LD a2, 0 * SIZE(A3)
  447. LD a3, 0 * SIZE(A4)
  448. lda I, -1(I)
  449. ble I, $L17
  450. .align 4
  /* Add the pending products, multiply the current row, load the next row. */
  /* A4 is advanced before its load (hence offset 0); A1-A3 are advanced after theirs. */
  451. $L16:
  452. ADD s0, t0, s0
  453. lda A4, 1 * SIZE(A4)
  454. MUL x0, a0, t0
  455. LD a0, 1 * SIZE(A1)
  456. ADD s1, t1, s1
  457. lda A1, 1 * SIZE(A1)
  458. MUL x0, a1, t1
  459. LD a1, 1 * SIZE(A2)
  460. ADD s2, t2, s2
  461. lda A2, 1 * SIZE(A2)
  462. MUL x0, a2, t2
  463. LD a2, 1 * SIZE(A3)
  464. ADD s3, t3, s3
  465. lda A3, 1 * SIZE(A3)
  466. MUL x0, a3, t3
  467. LD a3, 0 * SIZE(A4)
  468. LD x0, 1 * SIZE(X1)
  469. lda X1, 1 * SIZE(X1)
  470. lda I, -1(I)
  471. bgt I, $L16
  472. .align 4
  /* Last remainder row: no further loads; its four products stay pending in t0-t3. */
  473. $L17:
  474. ADD s0, t0, s0
  475. MUL x0, a0, t0
  476. ADD s1, t1, s1
  477. MUL x0, a1, t1
  478. ADD s2, t2, s2
  479. MUL x0, a2, t2
  480. ADD s3, t3, s3
  481. MUL x0, a3, t3
  482. .align 4
  /* Finish a four-column group: read four y entries through Y (stepping by INCY), */
  /* fold the last pending products into s0-s3, scale the sums by alpha, add them to */
  /* the y entries and write the results through Y1 (also stepping by INCY). */
  483. $L18:
  484. LD a0, 0 * SIZE(Y)
  485. addq Y, INCY, Y
  486. LD a1, 0 * SIZE(Y)
  487. addq Y, INCY, Y
  488. LD a2, 0 * SIZE(Y)
  489. addq Y, INCY, Y
  490. LD a3, 0 * SIZE(Y)
  491. addq Y, INCY, Y
  492. ADD s0, t0, s0
  493. ADD s1, t1, s1
  494. ADD s2, t2, s2
  495. ADD s3, t3, s3
  496. MUL alpha, s0, s0
  497. MUL alpha, s1, s1
  498. MUL alpha, s2, s2
  499. MUL alpha, s3, s3
  /* t0-t3 are cleared so that the next group starts with nothing pending. */
  500. ADD a0, s0, a0
  501. fclr t0
  502. ADD a1, s1, a1
  503. fclr t1
  504. ADD a2, s2, a2
  505. fclr t2
  506. ADD a3, s3, a3
  507. fclr t3
  508. ST a0, 0 * SIZE(Y1)
  509. addq Y1, INCY, Y1
  510. ST a1, 0 * SIZE(Y1)
  511. addq Y1, INCY, Y1
  512. ST a2, 0 * SIZE(Y1)
  513. addq Y1, INCY, Y1
  514. ST a3, 0 * SIZE(Y1)
  515. addq Y1, INCY, Y1
  /* Next four-column group while J > 0. */
  516. lda J, -1(J)
  517. bgt J, $L11
  518. .align 4
  /* Two-column step, taken when bit 1 of N is set. A1 and A2 point at the two columns */
  /* and A is advanced past both. s0-s3 are cleared, I = M >> 3 is the number of 8-row */
  /* blocks; with none, only the row remainder at $L25 runs. */
  519. $L20:
  520. and N, 2, J
  521. ble J, $L30
  522. mov A, A1
  523. addq A, LDA, A2
  524. addq A2, LDA, A
  525. fclr s0
  526. mov X, X1
  527. fclr s1
  528. sra M, 3, I
  529. fclr s2
  530. fclr s3
  531. ble I, $L25
  /* Preload rows 0..7 of both columns: even-numbered a registers hold A1, odd ones A2. */
  532. LD a0, 0 * SIZE(A1)
  533. LD a1, 0 * SIZE(A2)
  534. LD a2, 1 * SIZE(A1)
  535. LD a3, 1 * SIZE(A2)
  536. LD a4, 2 * SIZE(A1)
  537. LD a5, 2 * SIZE(A2)
  538. LD a6, 3 * SIZE(A1)
  539. LD a7, 3 * SIZE(A2)
  540. LD a8, 4 * SIZE(A1)
  541. LD a9, 4 * SIZE(A2)
  542. LD a10, 5 * SIZE(A1)
  543. LD a11, 5 * SIZE(A2)
  544. LD a12, 6 * SIZE(A1)
  545. LD a13, 6 * SIZE(A2)
  546. LD a14, 7 * SIZE(A1)
  547. LD a15, 7 * SIZE(A2)
  548. LD x0, 0 * SIZE(X1)
  549. LD x1, 1 * SIZE(X1)
  550. LD x2, 2 * SIZE(X1)
  /* The last 8-row block is handled by $L23, so the loop runs I - 1 times. */
  551. lda I, -1(I)
  552. ble I, $L23
  553. .align 4
  /* Steady-state pass for two columns: eight rows of A1 and A2 per iteration. */
  /* s0 collects the A1 products and s1 the A2 products. Rows 0..3 alternate between */
  /* t0/t1 and t2/t3 as the pending registers; rows 4..7 use t0/t1 only, so the t2/t3 */
  /* products of row 3 stay pending until the next pass (or the drain code) adds them. */
  /* Loads fetch the next block; X1, A1 and A2 are advanced by 8 elements inside the */
  /* body, so offsets after each advance are relative to the moved pointer. */
  554. $L22:
  555. ADD s0, t0, s0
  556. LD x3, 3 * SIZE(X1)
  557. MUL x0, a0, t0
  558. LD a0, 8 * SIZE(A1)
  559. ADD s1, t1, s1
  560. ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
  561. MUL x0, a1, t1
  562. LD a1, 8 * SIZE(A2)
  563. ADD s0, t2, s0
  564. LD x0, 4 * SIZE(X1)
  565. MUL x1, a2, t2
  566. LD a2, 9 * SIZE(A1)
  567. ADD s1, t3, s1
  568. unop
  569. MUL x1, a3, t3
  570. LD a3, 9 * SIZE(A2)
  571. ADD s0, t0, s0
  572. LD x1, 5 * SIZE(X1)
  573. MUL x2, a4, t0
  574. LD a4, 10 * SIZE(A1)
  575. ADD s1, t1, s1
  576. lda I, -1(I)
  577. MUL x2, a5, t1
  578. LD a5, 10 * SIZE(A2)
  579. ADD s0, t2, s0
  580. LD x2, 6 * SIZE(X1)
  581. MUL x3, a6, t2
  582. LD a6, 11 * SIZE(A1)
  583. ADD s1, t3, s1
  584. lda X1, 8 * SIZE(X1)
  585. MUL x3, a7, t3
  586. LD a7, 11 * SIZE(A2)
  /* Rows 4..7. X1 has moved, so -1 * SIZE(X1) is element 7 of the current block. */
  587. ADD s0, t0, s0
  588. LD x3, -1 * SIZE(X1)
  589. MUL x0, a8, t0
  590. LD a8, 12 * SIZE(A1)
  591. ADD s1, t1, s1
  592. ldl $31, (PREFETCHSIZE + 0) * SIZE(A2)
  593. MUL x0, a9, t1
  594. LD a9, 12 * SIZE(A2)
  595. ADD s0, t0, s0
  596. LD x0, 0 * SIZE(X1)
  597. MUL x1, a10, t0
  598. LD a10, 13 * SIZE(A1)
  599. ADD s1, t1, s1
  600. lda A1, 8 * SIZE(A1)
  601. MUL x1, a11, t1
  602. LD a11, 13 * SIZE(A2)
  603. ADD s0, t0, s0
  604. LD x1, 1 * SIZE(X1)
  605. MUL x2, a12, t0
  606. LD a12, 6 * SIZE(A1)
  607. ADD s1, t1, s1
  608. MUL x2, a13, t1
  609. LD a13, 14 * SIZE(A2)
  610. lda A2, 8 * SIZE(A2)
  611. ADD s0, t0, s0
  612. LD x2, 2 * SIZE(X1)
  613. MUL x3, a14, t0
  614. LD a14, 7 * SIZE(A1)
  615. ADD s1, t1, s1
  616. MUL x3, a15, t1
  617. LD a15, 7 * SIZE(A2)
  618. bgt I, $L22
  619. .align 4
  /* Final 8-row block of the two-column step. All sixteen elements of A are already */
  /* in a0-a15, so only the remaining x elements are loaded. A1, A2 and X1 are moved */
  /* past the block, and products are left pending in t0-t3 on exit. */
  620. $L23:
  621. ADD s0, t0, s0
  622. LD x3, 3 * SIZE(X1)
  623. MUL x0, a0, t0
  624. lda A1, 8 * SIZE(A1)
  625. ADD s1, t1, s1
  626. unop
  627. MUL x0, a1, t1
  628. unop
  629. ADD s0, t2, s0
  630. LD x0, 4 * SIZE(X1)
  631. MUL x1, a2, t2
  632. lda A2, 8 * SIZE(A2)
  633. ADD s1, t3, s1
  634. unop
  635. MUL x1, a3, t3
  636. unop
  637. ADD s0, t0, s0
  638. LD x1, 5 * SIZE(X1)
  639. MUL x2, a4, t0
  640. unop
  641. ADD s1, t1, s1
  642. unop
  643. MUL x2, a5, t1
  644. unop
  645. ADD s0, t2, s0
  646. LD x2, 6 * SIZE(X1)
  647. MUL x3, a6, t2
  648. unop
  649. ADD s1, t3, s1
  650. unop
  651. MUL x3, a7, t3
  652. unop
  653. ADD s0, t0, s0
  654. LD x3, 7 * SIZE(X1)
  655. MUL x0, a8, t0
  656. lda X1, 8 * SIZE(X1)
  657. ADD s1, t1, s1
  658. unop
  659. MUL x0, a9, t1
  660. unop
  661. ADD s0, t0, s0
  662. MUL x1, a10, t0
  663. ADD s1, t1, s1
  664. MUL x1, a11, t1
  665. ADD s0, t0, s0
  666. MUL x2, a12, t0
  667. ADD s1, t1, s1
  668. MUL x2, a13, t1
  669. ADD s0, t0, s0
  670. MUL x3, a14, t0
  671. ADD s1, t1, s1
  672. MUL x3, a15, t1
  673. .align 4
  /* Row remainder for the two-column step: the last M & 7 rows, one row per pass. */
  674. $L25:
  675. and M, 7, I
  676. ble I, $L28
  677. LD a0, 0 * SIZE(A1)
  678. LD a1, 0 * SIZE(A2)
  679. LD x0, 0 * SIZE(X1)
  680. lda I, -1(I)
  681. ble I, $L27
  682. .align 4
  /* A2 is advanced before its load (hence offset 0); A1 is advanced after its load. */
  683. $L26:
  684. ADD s0, t0, s0
  685. lda A2, 1 * SIZE(A2)
  686. MUL x0, a0, t0
  687. LD a0, 1 * SIZE(A1)
  688. ADD s1, t1, s1
  689. lda A1, 1 * SIZE(A1)
  690. MUL x0, a1, t1
  691. LD a1, 0 * SIZE(A2)
  692. LD x0, 1 * SIZE(X1)
  693. lda X1, 1 * SIZE(X1)
  694. lda I, -1(I)
  695. bgt I, $L26
  696. .align 4
  /* Last remainder row: no further loads; its two products stay pending in t0/t1. */
  697. $L27:
  698. ADD s0, t0, s0
  699. MUL x0, a0, t0
  700. ADD s1, t1, s1
  701. MUL x0, a1, t1
  702. .align 4
  /* Finish the two columns: read two y entries through Y, fold every pending product */
  /* (t2/t3 go through s2/s3, which are then merged into s0/s1), scale by alpha, add */
  /* to y and store through Y1. t0-t3 are cleared for the code that follows. */
  703. $L28:
  704. LD a0, 0 * SIZE(Y)
  705. addq Y, INCY, Y
  706. LD a1, 0 * SIZE(Y)
  707. addq Y, INCY, Y
  708. ADD s0, t0, s0
  709. ADD s1, t1, s1
  710. ADD s2, t2, s2
  711. ADD s3, t3, s3
  712. ADD s0, s2, s0
  713. ADD s1, s3, s1
  714. MUL alpha, s0, s0
  715. MUL alpha, s1, s1
  716. ADD a0, s0, a0
  717. ADD a1, s1, a1
  718. ST a0, 0 * SIZE(Y1)
  719. fclr t0
  720. addq Y1, INCY, Y1
  721. fclr t1
  722. ST a1, 0 * SIZE(Y1)
  723. fclr t2
  724. addq Y1, INCY, Y1
  725. fclr t3
  726. .align 4
  /* Single-column step, skipped (branch to $L999) when the low bit of N is clear. */
  /* A1 points at the column, s0-s3 are cleared and I = M >> 3 is the number of 8-row */
  /* blocks; with none, only the row remainder at $L35 runs. */
  727. $L30:
  728. blbc N, $L999
  729. mov A, A1
  730. fclr s0
  731. mov X, X1
  732. fclr s1
  733. sra M, 3, I
  734. fclr s2
  735. fclr s3
  736. ble I, $L35
  /* Preload rows 0..7 of the column into a0-a7 and x[0..6] into a8-a14. */
  /* Here a8-a15 hold x elements rather than elements of A; x[7] is loaded in the loop. */
  737. LD a0, 0 * SIZE(A1)
  738. LD a1, 1 * SIZE(A1)
  739. LD a8, 0 * SIZE(X1)
  740. LD a9, 1 * SIZE(X1)
  741. LD a2, 2 * SIZE(A1)
  742. LD a3, 3 * SIZE(A1)
  743. LD a10, 2 * SIZE(X1)
  744. LD a11, 3 * SIZE(X1)
  745. LD a4, 4 * SIZE(A1)
  746. LD a5, 5 * SIZE(A1)
  747. LD a12, 4 * SIZE(X1)
  748. LD a13, 5 * SIZE(X1)
  749. LD a6, 6 * SIZE(A1)
  750. LD a7, 7 * SIZE(A1)
  751. LD a14, 6 * SIZE(X1)
  /* The last 8-row block is handled by $L33, so the loop runs I - 1 times. */
  752. lda I, -1(I)
  753. ble I, $L33
  754. .align 4
  /* Steady-state pass for one column: eight rows per iteration, spread over the four */
  /* partial sums s0-s3 (two rows each). Each ADD folds the product left pending by an */
  /* earlier MUL; the loads at offsets 8..15 fetch the next block before A1 and X1 are */
  /* advanced at the bottom of the body. */
  755. $L32:
  756. ADD s0, t0, s0
  757. LD a15, 7 * SIZE(X1)
  758. MUL a0, a8, t0
  759. LD a0, 8 * SIZE(A1)
  760. ADD s1, t1, s1
  761. LD a8, 8 * SIZE(X1)
  762. MUL a1, a9, t1
  763. LD a1, 9 * SIZE(A1)
  764. ADD s2, t2, s2
  765. LD a9, 9 * SIZE(X1)
  766. MUL a2, a10, t2
  767. LD a2, 10 * SIZE(A1)
  768. ADD s3, t3, s3
  769. LD a10, 10 * SIZE(X1)
  770. MUL a3, a11, t3
  771. LD a3, 11 * SIZE(A1)
  772. ADD s0, t0, s0
  773. LD a11, 11 * SIZE(X1)
  774. MUL a4, a12, t0
  775. LD a4, 12 * SIZE(A1)
  776. ADD s1, t1, s1
  777. LD a12, 12 * SIZE(X1)
  778. MUL a5, a13, t1
  779. LD a5, 13 * SIZE(A1)
  780. ADD s2, t2, s2
  781. LD a13, 13 * SIZE(X1)
  782. MUL a6, a14, t2
  783. LD a6, 14 * SIZE(A1)
  784. ADD s3, t3, s3
  785. LD a14, 14 * SIZE(X1)
  786. MUL a7, a15, t3
  787. LD a7, 15 * SIZE(A1)
  788. lda A1, 8 * SIZE(A1)
  789. lda I, -1(I)
  790. lda X1, 8 * SIZE(X1)
  791. bgt I, $L32
  792. .align 4
  /* Final 8-row block: only x[7] is still loaded; A1 and X1 are moved past the block */
  /* and the last four products are left pending in t0-t3. */
  793. $L33:
  794. ADD s0, t0, s0
  795. LD a15, 7 * SIZE(X1)
  796. MUL a0, a8, t0
  797. lda A1, 8 * SIZE(A1)
  798. ADD s1, t1, s1
  799. unop
  800. MUL a1, a9, t1
  801. lda X1, 8 * SIZE(X1)
  802. ADD s2, t2, s2
  803. MUL a2, a10, t2
  804. ADD s3, t3, s3
  805. MUL a3, a11, t3
  806. ADD s0, t0, s0
  807. MUL a4, a12, t0
  808. ADD s1, t1, s1
  809. MUL a5, a13, t1
  810. ADD s2, t2, s2
  811. MUL a6, a14, t2
  812. ADD s3, t3, s3
  813. MUL a7, a15, t3
  814. .align 4
  /* Row remainder for the single column: the last M & 7 rows, one row per pass. */
  815. $L35:
  816. and M, 7, I
  817. ble I, $L38
  818. LD a0, 0 * SIZE(A1)
  819. LD x0, 0 * SIZE(X1)
  820. lda I, -1(I)
  821. ble I, $L37
  822. .align 4
  /* Add the pending product, multiply the current row, load the next row. */
  823. $L36:
  824. ADD s0, t0, s0
  825. MUL x0, a0, t0
  826. LD a0, 1 * SIZE(A1)
  827. LD x0, 1 * SIZE(X1)
  828. lda A1, 1 * SIZE(A1)
  829. lda X1, 1 * SIZE(X1)
  830. lda I, -1(I)
  831. bgt I, $L36
  832. .align 4
  /* Last remainder row: no further loads; its product stays pending in t0. */
  833. $L37:
  834. ADD s0, t0, s0
  835. MUL x0, a0, t0
  836. .align 4
  /* Finish the single column: fold the pending products, reduce s0-s3 to one sum in */
  /* s0, scale it by alpha, add it to the y entry read through Y and store via Y1. */
  837. $L38:
  838. LD a0, 0 * SIZE(Y)
  839. ADD s0, t0, s0
  840. ADD s1, t1, s1
  841. ADD s2, t2, s2
  842. ADD s3, t3, s3
  843. ADD s0, s2, s0
  844. ADD s1, s3, s1
  845. ADD s0, s1, s0
  846. MUL alpha, s0, s0
  847. ADD a0, s0, a0
  848. ST a0, 0 * SIZE(Y1)
  849. .align 4
  /* Common exit: reload $f2-$f9 from the slots they were stored to on entry, */
  /* release the STACKSIZE-byte frame and return. */
  850. $L999:
  851. ldt $f2, 0($sp)
  852. ldt $f3, 8($sp)
  853. ldt $f4, 16($sp)
  854. ldt $f5, 24($sp)
  855. ldt $f6, 32($sp)
  856. ldt $f7, 40($sp)
  857. ldt $f8, 48($sp)
  858. ldt $f9, 56($sp)
  859. lda $sp, STACKSIZE($sp)
  860. ret
  861. EPILOGUE