You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_n.S 25 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Integer register aliases for the kernel below.                     */
  /* M and N are read from %i0 / %i1. A, LDA, X and INCX map to         */
  /* different %i registers depending on DOUBLE and __64BIT__; the      */
  /* prologue loads from the stack whichever of them (and Y, INCY,      */
  /* BUFFER) are not already in these registers on entry.               */
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define M %i0
  41. #define N %i1
  42. #if defined(DOUBLE) && !defined(__64BIT__)
  43. #define A %i5
  44. #define LDA %i2
  45. #define X %i3
  46. #define INCX %i4
  47. #else
  48. #define A %i4
  49. #define LDA %i5
  50. #define X %i2
  51. #define INCX %i3
  52. #endif
  /* Y / INCY / BUFFER: output vector, its stride, and a scratch area   */
  /* that is used as the accumulation target when INCY != SIZE.         */
  53. #define Y %l0
  54. #define INCY %l1
  55. #define BUFFER %l2
  /* I: inner (row-block) counter; J: outer (column-group) counter.     */
  56. #define I %l3
  57. #define J %l5
  /* A1..A4: pointers to up to four consecutive columns of A.           */
  58. #define A1 %o0
  59. #define A2 %o1
  60. #define A3 %o2
  61. #define A4 %o3
  /* Y1: running pointer into the accumulation target; YY: its base     */
  /* (either Y itself or BUFFER, chosen in the prologue).               */
  62. #define Y1 %l4
  63. #define YY %l6
  /* Floating-point register aliases.                                   */
  /*   t1..t4  : products waiting to be added into y                    */
  /*   y1..y8  : up to eight elements of the accumulation target        */
  /*   a1..a16 : matrix elements                                        */
  /*   x1..x4  : elements of x, pre-multiplied by ALPHA in the kernel   */
  /* FZERO and ALPHA deliberately reuse the registers of a15 and a16,   */
  /* so their values do not survive the compute loops; the kernel       */
  /* reloads ALPHA from STACK_ALPHA at the start of each column group.  */
  64. #ifdef DOUBLE
  65. #define t1 %f0
  66. #define t2 %f2
  67. #define t3 %f4
  68. #define t4 %f6
  69. #define y1 %f8
  70. #define y2 %f10
  71. #define y3 %f12
  72. #define y4 %f14
  73. #define y5 %f16
  74. #define y6 %f18
  75. #define y7 %f20
  76. #define y8 %f22
  77. #define a1 %f24
  78. #define a2 %f26
  79. #define a3 %f28
  80. #define a4 %f30
  81. #define a5 %f32
  82. #define a6 %f34
  83. #define a7 %f36
  84. #define a8 %f38
  85. #define a9 %f40
  86. #define a10 %f42
  87. #define a11 %f44
  88. #define a12 %f46
  89. #define a13 %f48
  90. #define a14 %f50
  91. #define a15 %f52
  92. #define a16 %f54
  93. #define x1 %f56
  94. #define x2 %f58
  95. #define x3 %f60
  96. #define x4 %f62
  /* Same registers as a15 / a16 above. */
  97. #define FZERO %f52
  98. #define ALPHA %f54
  99. #else
  100. #define t1 %f0
  101. #define t2 %f1
  102. #define t3 %f2
  103. #define t4 %f3
  104. #define y1 %f4
  105. #define y2 %f5
  106. #define y3 %f6
  107. #define y4 %f7
  108. #define y5 %f8
  109. #define y6 %f9
  110. #define y7 %f10
  111. #define y8 %f11
  112. #define a1 %f12
  113. #define a2 %f13
  114. #define a3 %f14
  115. #define a4 %f15
  116. #define a5 %f16
  117. #define a6 %f17
  118. #define a7 %f18
  119. #define a8 %f19
  120. #define a9 %f20
  121. #define a10 %f21
  122. #define a11 %f22
  123. #define a12 %f23
  124. #define a13 %f24
  125. #define a14 %f25
  126. #define a15 %f26
  127. #define a16 %f27
  128. #define x1 %f28
  129. #define x2 %f29
  130. #define x3 %f30
  131. #define x4 %f31
  /* Same registers as a15 / a16 above. */
  132. #define FZERO %f26
  133. #define ALPHA %f27
  134. #endif
  /* Stack slots, relative to %sp + STACK_START (STACK_START comes from */
  /* an included header). STACK_ALPHA holds the spilled alpha that the  */
  /* kernel reloads per column group. STACK_FZERO is defined here but   */
  /* is not referenced anywhere in the code visible in this file.       */
  135. #ifndef __64BIT__
  136. #define STACK_FZERO [%sp + STACK_START + 8]
  137. #define STACK_ALPHA [%sp + STACK_START + 16]
  138. #else
  139. #define STACK_FZERO [%sp + STACK_START + 32]
  140. #define STACK_ALPHA [%sp + STACK_START + 40]
  141. #endif
  /* Kernel entry: y += alpha * A * x, processed column by column.      */
  /* This first part collects the arguments, scales the strides,        */
  /* returns early for empty problems, and picks the accumulation       */
  /* target (Y itself when INCY == SIZE, otherwise a zeroed BUFFER).    */
  /* PROLOGUE / SAVESP are macros from an included header.              */
  142. PROLOGUE
  143. SAVESP
  144. #ifndef __64BIT__
  /* 32-bit build: alpha arrives in integer registers (%i3, plus %i4    */
  /* for DOUBLE). It is stored to the stack slot at STACK_START + 16    */
  /* (the same address as STACK_ALPHA) and then loaded into ALPHA.      */
  /* The remaining arguments are fetched from the stack.                */
  145. #ifdef DOUBLE
  146. st %i3, [%sp + STACK_START + 16] /* ALPHA */
  147. st %i4, [%sp + STACK_START + 20]
  148. ld [%sp + STACK_START + 28], LDA
  149. ld [%sp + STACK_START + 32], X
  150. ld [%sp + STACK_START + 36], INCX
  151. ld [%sp + STACK_START + 40], Y
  152. ld [%sp + STACK_START + 44], INCY
  153. ld [%sp + STACK_START + 48], BUFFER
  154. #else
  155. st %i3, [%sp + STACK_START + 16] /* ALPHA */
  156. ld [%sp + STACK_START + 28], X
  157. ld [%sp + STACK_START + 32], INCX
  158. ld [%sp + STACK_START + 36], Y
  159. ld [%sp + STACK_START + 40], INCY
  160. ld [%sp + STACK_START + 44], BUFFER
  161. #endif
  162. LDF [%sp + STACK_START + 16], ALPHA
  163. #else
  /* 64-bit build: X, INCX, Y, INCY and BUFFER come from the stack;     */
  /* alpha arrives in %f6 (DOUBLE) or %f7 and is copied to ALPHA and    */
  /* spilled to STACK_ALPHA for later reloads.                          */
  164. ldx [%sp + STACK_START + 56], X
  165. ldx [%sp + STACK_START + 64], INCX
  166. ldx [%sp + STACK_START + 72], Y
  167. ldx [%sp + STACK_START + 80], INCY
  168. ldx [%sp + STACK_START + 88], BUFFER
  169. #ifdef DOUBLE
  170. FMOV %f6, ALPHA
  171. STF %f6, STACK_ALPHA
  172. #else
  173. FMOV %f7, ALPHA
  174. STF %f7, STACK_ALPHA
  175. #endif
  176. #endif
  /* Scale LDA / INCX / INCY by BASE_SHIFT (presumably elements to      */
  /* bytes; BASE_SHIFT is defined elsewhere). Return immediately when   */
  /* M <= 0 or N <= 0. The two sll instructions after the branches sit  */
  /* in their delay slots and execute whether or not the branch is      */
  /* taken.                                                             */
  177. sll LDA, BASE_SHIFT, LDA
  178. cmp M, 0
  179. ble %icc, .LL999
  180. sll INCX, BASE_SHIFT, INCX
  181. cmp N, 0
  182. ble %icc, .LL999
  183. sll INCY, BASE_SHIFT, INCY
  /* FCLR comes from an included header; presumably it clears FZERO     */
  /* (the argument looks like an encoded register number) - TODO        */
  /* confirm against the macro definition.                              */
  184. #ifdef DOUBLE
  185. FCLR(21)
  186. #else
  187. FCLR(26)
  188. #endif
  /* Unit-stride y: accumulate straight into Y (YY = Y is set in the    */
  /* delay slot) and skip the buffer setup. Otherwise YY = BUFFER and   */
  /* J = (M + 7) >> 3 blocks of eight elements are zeroed below.        */
  189. cmp INCY, SIZE
  190. be %icc, .LL10
  191. mov Y, YY
  192. add M, 7, J
  193. sra J, 3, J
  194. mov BUFFER, YY
  195. mov BUFFER, Y1
  /* Zero eight buffer elements per pass; the pointer bump is in the    */
  /* delay slot of the loop branch.                                     */
  196. .LL01:
  197. STF FZERO, [Y1 + 0 * SIZE]
  198. STF FZERO, [Y1 + 1 * SIZE]
  199. STF FZERO, [Y1 + 2 * SIZE]
  200. STF FZERO, [Y1 + 3 * SIZE]
  201. STF FZERO, [Y1 + 4 * SIZE]
  202. STF FZERO, [Y1 + 5 * SIZE]
  203. STF FZERO, [Y1 + 6 * SIZE]
  204. deccc J
  205. STF FZERO, [Y1 + 7 * SIZE]
  206. bg,pn %icc, .LL01
  207. add Y1, 8 * SIZE, Y1
  /* Four-columns-at-a-time pass. J = N >> 2 column groups; skip to     */
  /* .LL20 when there are none.                                         */
  208. .LL10:
  209. sra N, 2, J
  210. cmp J, 0
  211. ble,pn %icc, .LL20
  212. nop
  /* Per-group setup: restart at the top of the accumulation target,    */
  /* point A1..A4 at four consecutive columns and advance A by 4 * LDA. */
  213. .LL11:
  214. mov YY, Y1
  215. mov A, A1
  216. add A, LDA, A2
  217. add A2, LDA, A3
  218. add A3, LDA, A4
  219. add A4, LDA, A
  /* ALPHA shares its register with a16, which the previous group       */
  /* overwrote, so reload it from the stack slot.                       */
  220. LDF STACK_ALPHA, ALPHA
  /* Fetch four x elements (stride INCX) and pre-scale them by alpha.   */
  221. LDF [X], x1
  222. add X, INCX, X
  223. LDF [X], x2
  224. add X, INCX, X
  225. LDF [X], x3
  226. add X, INCX, X
  227. LDF [X], x4
  228. add X, INCX, X
  229. FMUL ALPHA, x1, x1
  230. FMUL ALPHA, x2, x2
  231. FMUL ALPHA, x3, x3
  232. FMUL ALPHA, x4, x4
  /* I = M >> 3 blocks of eight rows; with none, go to the remainder    */
  /* handling at .LL16.                                                 */
  233. sra M, 3, I
  234. cmp I, 0
  235. ble,pn %icc, .LL16
  236. nop
  /* Preload the first eight rows of columns A1 (a1..a8) and A2         */
  /* (a9..a16).                                                         */
  237. LDF [A1 + 0 * SIZE], a1
  238. LDF [A1 + 1 * SIZE], a2
  239. LDF [A1 + 2 * SIZE], a3
  240. LDF [A1 + 3 * SIZE], a4
  241. LDF [A1 + 4 * SIZE], a5
  242. LDF [A1 + 5 * SIZE], a6
  243. LDF [A1 + 6 * SIZE], a7
  244. LDF [A1 + 7 * SIZE], a8
  245. LDF [A2 + 0 * SIZE], a9
  246. LDF [A2 + 1 * SIZE], a10
  247. LDF [A2 + 2 * SIZE], a11
  248. LDF [A2 + 3 * SIZE], a12
  249. LDF [A2 + 4 * SIZE], a13
  250. LDF [A2 + 5 * SIZE], a14
  251. LDF [A2 + 6 * SIZE], a15
  252. LDF [A2 + 7 * SIZE], a16
  /* Start the first four products and reuse a1..a4 for rows 0..3 of    */
  /* column A3.                                                         */
  253. FMUL a1, x1, t1
  254. LDF [A3 + 0 * SIZE], a1
  255. FMUL a2, x1, t2
  256. LDF [A3 + 1 * SIZE], a2
  257. FMUL a3, x1, t3
  258. LDF [A3 + 2 * SIZE], a3
  259. FMUL a4, x1, t4
  260. LDF [A3 + 3 * SIZE], a4
  /* With only one 8-row block, skip the loop and go straight to the    */
  /* drain at .LL13. The first nop fills the delay slot; the other      */
  /* three are presumably alignment padding for the loop head.          */
  261. deccc I
  262. ble,pn %icc, .LL13
  263. nop
  264. nop
  265. nop
  266. nop
  /* Prefetch distance in elements, used by the prefetch instructions   */
  /* in the loops that follow.                                          */
  267. #ifdef DOUBLE
  268. #define PREFETCHSIZE 20
  269. #else
  270. #define PREFETCHSIZE 40
  271. #endif
  /* Main loop of the four-column pass: one pass updates eight          */
  /* elements at Y1 with contributions from columns A1..A4.             */
  /* On entry a1..a8 hold A1[0..7] with the x1 products of a1..a4       */
  /* already in t1..t4, a1..a4 have been refilled with A3[0..3], and    */
  /* a9..a16 hold A2[0..7]. Each FMUL is followed by a load that        */
  /* refills the register it just consumed, so the instruction order    */
  /* here is significant.                                               */
  272. .LL12:
  273. LDF [Y1 + 0 * SIZE], y1
  274. LDF [Y1 + 1 * SIZE], y2
  275. LDF [Y1 + 2 * SIZE], y3
  276. LDF [Y1 + 3 * SIZE], y4
  277. LDF [Y1 + 4 * SIZE], y5
  278. LDF [Y1 + 5 * SIZE], y6
  279. LDF [Y1 + 6 * SIZE], y7
  280. LDF [Y1 + 7 * SIZE], y8
  /* Column A1 (x1), rows 4..7; a5..a8 are refilled from A3[4..7].      */
  281. FADD y1, t1, y1
  282. prefetch [A1 + PREFETCHSIZE * SIZE], 1
  283. FMUL a5, x1, t1
  284. LDF [A3 + 4 * SIZE], a5
  285. FADD y2, t2, y2
  286. nop
  287. FMUL a6, x1, t2
  288. LDF [A3 + 5 * SIZE], a6
  289. FADD y3, t3, y3
  290. nop
  291. FMUL a7, x1, t3
  292. LDF [A3 + 6 * SIZE], a7
  293. FADD y4, t4, y4
  294. nop
  295. FMUL a8, x1, t4
  296. LDF [A3 + 7 * SIZE], a8
  /* Column A2 (x2), rows 0..7; a9..a16 are refilled from A4[0..7].     */
  297. FADD y5, t1, y5
  298. nop
  299. FMUL a9, x2, t1
  300. LDF [A4 + 0 * SIZE], a9
  301. FADD y6, t2, y6
  302. nop
  303. FMUL a10, x2, t2
  304. LDF [A4 + 1 * SIZE], a10
  305. FADD y7, t3, y7
  306. nop
  307. FMUL a11, x2, t3
  308. LDF [A4 + 2 * SIZE], a11
  309. FADD y8, t4, y8
  310. nop
  311. FMUL a12, x2, t4
  312. LDF [A4 + 3 * SIZE], a12
  313. FADD y1, t1, y1
  314. prefetch [A2 + PREFETCHSIZE * SIZE], 1
  315. FMUL a13, x2, t1
  316. LDF [A4 + 4 * SIZE], a13
  317. FADD y2, t2, y2
  318. nop
  319. FMUL a14, x2, t2
  320. LDF [A4 + 5 * SIZE], a14
  321. FADD y3, t3, y3
  322. nop
  323. FMUL a15, x2, t3
  324. LDF [A4 + 6 * SIZE], a15
  325. FADD y4, t4, y4
  326. nop
  327. FMUL a16, x2, t4
  328. LDF [A4 + 7 * SIZE], a16
  /* Column A3 (x3), rows 0..7; a1..a8 are refilled with the next       */
  /* block of column A1 (offsets 8..15).                                */
  329. FADD y5, t1, y5
  330. nop
  331. FMUL a1, x3, t1
  332. LDF [A1 + 8 * SIZE], a1
  333. FADD y6, t2, y6
  334. nop
  335. FMUL a2, x3, t2
  336. LDF [A1 + 9 * SIZE], a2
  337. FADD y7, t3, y7
  338. nop
  339. FMUL a3, x3, t3
  340. LDF [A1 + 10 * SIZE], a3
  341. FADD y8, t4, y8
  342. nop
  343. FMUL a4, x3, t4
  344. LDF [A1 + 11 * SIZE], a4
  345. FADD y1, t1, y1
  346. prefetch [A3 + PREFETCHSIZE * SIZE], 1
  347. FMUL a5, x3, t1
  348. LDF [A1 + 12 * SIZE], a5
  349. FADD y2, t2, y2
  350. nop
  351. FMUL a6, x3, t2
  352. LDF [A1 + 13 * SIZE], a6
  353. FADD y3, t3, y3
  354. nop
  355. FMUL a7, x3, t3
  356. LDF [A1 + 14 * SIZE], a7
  357. FADD y4, t4, y4
  358. nop
  359. FMUL a8, x3, t4
  360. LDF [A1 + 15 * SIZE], a8
  /* Column A4 (x4), rows 0..7; a9..a16 are refilled with the next      */
  /* block of column A2 (offsets 8..15). Pointer bumps and the loop     */
  /* counter decrement are interleaved with the arithmetic.             */
  361. FADD y5, t1, y5
  362. nop
  363. FMUL a9, x4, t1
  364. LDF [A2 + 8 * SIZE], a9
  365. FADD y6, t2, y6
  366. nop
  367. FMUL a10, x4, t2
  368. LDF [A2 + 9 * SIZE], a10
  369. FADD y7, t3, y7
  370. nop
  371. FMUL a11, x4, t3
  372. LDF [A2 + 10 * SIZE], a11
  373. FADD y8, t4, y8
  374. nop
  375. FMUL a12, x4, t4
  376. LDF [A2 + 11 * SIZE], a12
  377. FADD y1, t1, y1
  378. prefetch [A4 + PREFETCHSIZE * SIZE], 1
  379. FMUL a13, x4, t1
  380. LDF [A2 + 12 * SIZE], a13
  381. FADD y2, t2, y2
  382. add A3, 8 * SIZE, A3
  383. FMUL a14, x4, t2
  384. LDF [A2 + 13 * SIZE], a14
  385. FADD y3, t3, y3
  386. add Y1, 8 * SIZE, Y1
  387. FMUL a15, x4, t3
  388. LDF [A2 + 14 * SIZE], a15
  389. FADD y4, t4, y4
  390. deccc I
  391. FMUL a16, x4, t4
  392. LDF [A2 + 15 * SIZE], a16
  /* Begin the next block: x1 products of the new a1..a4, then refill   */
  /* a1..a4 from A3, which has already been advanced above.             */
  393. FADD y5, t1, y5
  394. add A1, 8 * SIZE, A1
  395. FMUL a1, x1, t1
  396. LDF [A3 + 0 * SIZE], a1
  397. FADD y6, t2, y6
  398. add A2, 8 * SIZE, A2
  399. FMUL a2, x1, t2
  400. LDF [A3 + 1 * SIZE], a2
  401. FADD y7, t3, y7
  402. add A4, 8 * SIZE, A4
  403. FMUL a3, x1, t3
  404. LDF [A3 + 2 * SIZE], a3
  405. FADD y8, t4, y8
  406. nop
  407. FMUL a4, x1, t4
  408. LDF [A3 + 3 * SIZE], a4
  /* Y1 was already advanced, hence the negative store offsets. The     */
  /* branch uses the condition codes set by deccc I above; the last     */
  /* store sits in its delay slot.                                      */
  409. STF y1, [Y1 - 8 * SIZE]
  410. STF y2, [Y1 - 7 * SIZE]
  411. STF y3, [Y1 - 6 * SIZE]
  412. STF y4, [Y1 - 5 * SIZE]
  413. STF y5, [Y1 - 4 * SIZE]
  414. STF y6, [Y1 - 3 * SIZE]
  415. STF y7, [Y1 - 2 * SIZE]
  416. bg,pn %icc, .LL12
  417. STF y8, [Y1 - 1 * SIZE]
  /* Drain of the four-column loop: handles the last 8-row block        */
  /* without loading a following block of A1 / A2.                      */
  /* Entry state matches .LL12: t1..t4 hold the x1 products of rows     */
  /* 0..3, a5..a8 hold A1[4..7], a9..a16 hold A2[0..7].                 */
  /* NOTE(review): a1..a4 already hold A3[0..3] on both paths into      */
  /* this label (loaded just before the branch ahead of .LL12 and at    */
  /* the bottom of .LL12), so the four reloads of a1..a4 below appear   */
  /* to be redundant.                                                   */
  418. .LL13:
  419. LDF [Y1 + 0 * SIZE], y1
  420. LDF [Y1 + 1 * SIZE], y2
  421. LDF [Y1 + 2 * SIZE], y3
  422. LDF [Y1 + 3 * SIZE], y4
  423. LDF [Y1 + 4 * SIZE], y5
  424. LDF [Y1 + 5 * SIZE], y6
  425. LDF [Y1 + 6 * SIZE], y7
  426. LDF [Y1 + 7 * SIZE], y8
  /* Columns A1 / A2 products, interleaved with the loads of            */
  /* A3[0..7] into a1..a8 and A4[0..7] into a9..a16.                    */
  427. FADD y1, t1, y1
  428. FMUL a5, x1, t1
  429. LDF [A3 + 0 * SIZE], a1
  430. FADD y2, t2, y2
  431. FMUL a6, x1, t2
  432. LDF [A3 + 1 * SIZE], a2
  433. FADD y3, t3, y3
  434. FMUL a7, x1, t3
  435. LDF [A3 + 2 * SIZE], a3
  436. FADD y4, t4, y4
  437. FMUL a8, x1, t4
  438. LDF [A3 + 3 * SIZE], a4
  439. FADD y5, t1, y5
  440. FMUL a9, x2, t1
  441. LDF [A3 + 4 * SIZE], a5
  442. FADD y6, t2, y6
  443. FMUL a10, x2, t2
  444. LDF [A3 + 5 * SIZE], a6
  445. FADD y7, t3, y7
  446. FMUL a11, x2, t3
  447. LDF [A3 + 6 * SIZE], a7
  448. FADD y8, t4, y8
  449. FMUL a12, x2, t4
  450. LDF [A3 + 7 * SIZE], a8
  451. FADD y1, t1, y1
  452. FMUL a13, x2, t1
  453. LDF [A4 + 0 * SIZE], a9
  454. FADD y2, t2, y2
  455. FMUL a14, x2, t2
  456. LDF [A4 + 1 * SIZE], a10
  457. FADD y3, t3, y3
  458. FMUL a15, x2, t3
  459. LDF [A4 + 2 * SIZE], a11
  460. FADD y4, t4, y4
  461. FMUL a16, x2, t4
  462. LDF [A4 + 3 * SIZE], a12
  463. FADD y5, t1, y5
  464. FMUL a1, x3, t1
  465. LDF [A4 + 4 * SIZE], a13
  466. FADD y6, t2, y6
  467. FMUL a2, x3, t2
  468. LDF [A4 + 5 * SIZE], a14
  469. FADD y7, t3, y7
  470. FMUL a3, x3, t3
  471. LDF [A4 + 6 * SIZE], a15
  472. FADD y8, t4, y8
  473. FMUL a4, x3, t4
  474. LDF [A4 + 7 * SIZE], a16
  /* Remaining column A3 (x3) and column A4 (x4) products; no more      */
  /* loads are needed.                                                  */
  475. FADD y1, t1, y1
  476. FMUL a5, x3, t1
  477. FADD y2, t2, y2
  478. FMUL a6, x3, t2
  479. FADD y3, t3, y3
  480. FMUL a7, x3, t3
  481. FADD y4, t4, y4
  482. FMUL a8, x3, t4
  483. FADD y5, t1, y5
  484. FMUL a9, x4, t1
  485. FADD y6, t2, y6
  486. FMUL a10, x4, t2
  487. FADD y7, t3, y7
  488. FMUL a11, x4, t3
  489. FADD y8, t4, y8
  490. FMUL a12, x4, t4
  491. FADD y1, t1, y1
  492. FMUL a13, x4, t1
  493. FADD y2, t2, y2
  494. FMUL a14, x4, t2
  495. FADD y3, t3, y3
  496. FMUL a15, x4, t3
  497. FADD y4, t4, y4
  498. FMUL a16, x4, t4
  /* Final adds for y5..y8 interleaved with the stores; all four        */
  /* column pointers and Y1 move past this block so the remainder       */
  /* code at .LL16 continues from the right place.                      */
  499. add A4, 8 * SIZE, A4
  500. STF y1, [Y1 + 0 * SIZE]
  501. FADD y5, t1, y5
  502. STF y2, [Y1 + 1 * SIZE]
  503. FADD y6, t2, y6
  504. STF y3, [Y1 + 2 * SIZE]
  505. FADD y7, t3, y7
  506. STF y4, [Y1 + 3 * SIZE]
  507. FADD y8, t4, y8
  508. STF y5, [Y1 + 4 * SIZE]
  509. add A1, 8 * SIZE, A1
  510. STF y6, [Y1 + 5 * SIZE]
  511. add A2, 8 * SIZE, A2
  512. STF y7, [Y1 + 6 * SIZE]
  513. add A3, 8 * SIZE, A3
  514. STF y8, [Y1 + 7 * SIZE]
  515. add Y1, 8 * SIZE, Y1
  /* Row remainders of the four-column pass (M & 4, M & 2, M & 1),      */
  /* followed by the column-group loop control at .LL19.                */
  /* After andcc with a small positive mask, ble is taken exactly       */
  /* when the tested bit is clear (result zero).                        */
  /* Four leftover rows: a1..a4 / a5..a8 / a9..a12 / a13..a16 hold      */
  /* rows 0..3 of columns A1 / A2 / A3 / A4.                            */
  516. .LL16:
  517. andcc M, 4, I
  518. ble,pn %icc, .LL17
  519. nop
  520. LDF [A1 + 0 * SIZE], a1
  521. LDF [A1 + 1 * SIZE], a2
  522. LDF [A1 + 2 * SIZE], a3
  523. LDF [A1 + 3 * SIZE], a4
  524. LDF [A2 + 0 * SIZE], a5
  525. LDF [A2 + 1 * SIZE], a6
  526. LDF [A2 + 2 * SIZE], a7
  527. LDF [A2 + 3 * SIZE], a8
  528. LDF [A3 + 0 * SIZE], a9
  529. LDF [A3 + 1 * SIZE], a10
  530. LDF [A3 + 2 * SIZE], a11
  531. LDF [A3 + 3 * SIZE], a12
  532. LDF [A4 + 0 * SIZE], a13
  533. LDF [A4 + 1 * SIZE], a14
  534. LDF [A4 + 2 * SIZE], a15
  535. LDF [A4 + 3 * SIZE], a16
  536. LDF [Y1 + 0 * SIZE], y1
  537. add A1, 4 * SIZE, A1
  538. LDF [Y1 + 1 * SIZE], y2
  539. add A2, 4 * SIZE, A2
  540. LDF [Y1 + 2 * SIZE], y3
  541. add A3, 4 * SIZE, A3
  542. LDF [Y1 + 3 * SIZE], y4
  543. add A4, 4 * SIZE, A4
  544. FMUL a1, x1, t1
  545. FMUL a2, x1, t2
  546. FMUL a3, x1, t3
  547. FMUL a4, x1, t4
  548. FADD y1, t1, y1
  549. FMUL a5, x2, t1
  550. FADD y2, t2, y2
  551. FMUL a6, x2, t2
  552. FADD y3, t3, y3
  553. FMUL a7, x2, t3
  554. FADD y4, t4, y4
  555. FMUL a8, x2, t4
  556. FADD y1, t1, y1
  557. FMUL a9, x3, t1
  558. FADD y2, t2, y2
  559. FMUL a10, x3, t2
  560. FADD y3, t3, y3
  561. FMUL a11, x3, t3
  562. FADD y4, t4, y4
  563. FMUL a12, x3, t4
  564. FADD y1, t1, y1
  565. FMUL a13, x4, t1
  566. FADD y2, t2, y2
  567. FMUL a14, x4, t2
  568. FADD y3, t3, y3
  569. FMUL a15, x4, t3
  570. FADD y4, t4, y4
  571. FMUL a16, x4, t4
  572. FADD y1, t1, y1
  573. FADD y2, t2, y2
  574. FADD y3, t3, y3
  575. FADD y4, t4, y4
  576. STF y1, [Y1 + 0 * SIZE]
  577. STF y2, [Y1 + 1 * SIZE]
  578. STF y3, [Y1 + 2 * SIZE]
  579. STF y4, [Y1 + 3 * SIZE]
  580. add Y1, 4 * SIZE, Y1
  /* Two leftover rows. The register layout differs from above:         */
  /* a1..a4 are row 0 of columns A1..A4 and a5..a8 are row 1.           */
  581. .LL17:
  582. andcc M, 2, I
  583. ble,pn %icc, .LL18
  584. nop
  585. LDF [A1 + 0 * SIZE], a1
  586. LDF [A2 + 0 * SIZE], a2
  587. LDF [A3 + 0 * SIZE], a3
  588. LDF [A4 + 0 * SIZE], a4
  589. LDF [Y1 + 0 * SIZE], y1
  590. LDF [A1 + 1 * SIZE], a5
  591. LDF [A2 + 1 * SIZE], a6
  592. LDF [A3 + 1 * SIZE], a7
  593. LDF [A4 + 1 * SIZE], a8
  594. LDF [Y1 + 1 * SIZE], y2
  595. add A1, 2 * SIZE, A1
  596. add A2, 2 * SIZE, A2
  597. add A3, 2 * SIZE, A3
  598. add A4, 2 * SIZE, A4
  599. FMUL a1, x1, t1
  600. FMUL a2, x2, t2
  601. FMUL a3, x3, t3
  602. FMUL a4, x4, t4
  603. FADD y1, t1, y1
  604. FMUL a5, x1, t1
  605. FADD y1, t2, y1
  606. FMUL a6, x2, t2
  607. FADD y1, t3, y1
  608. FMUL a7, x3, t3
  609. FADD y1, t4, y1
  610. FMUL a8, x4, t4
  611. FADD y2, t1, y2
  612. FADD y2, t2, y2
  613. FADD y2, t3, y2
  614. FADD y2, t4, y2
  615. STF y1, [Y1 + 0 * SIZE]
  616. STF y2, [Y1 + 1 * SIZE]
  617. add Y1, 2 * SIZE, Y1
  /* One leftover row: a1..a4 are that row of columns A1..A4.           */
  618. .LL18:
  619. andcc M, 1, I
  620. ble,pn %icc, .LL19
  621. nop
  622. LDF [A1 + 0 * SIZE], a1
  623. LDF [A2 + 0 * SIZE], a2
  624. LDF [A3 + 0 * SIZE], a3
  625. LDF [A4 + 0 * SIZE], a4
  626. LDF [Y1 + 0 * SIZE], y1
  627. FMUL a1, x1, t1
  628. FMUL a2, x2, t2
  629. FMUL a3, x3, t3
  630. FMUL a4, x4, t4
  631. FADD y1, t1, y1
  632. FADD y1, t2, y1
  633. FADD y1, t3, y1
  634. FADD y1, t4, y1
  635. STF y1, [Y1]
  /* Next group of four columns, if any remain.                         */
  636. .LL19:
  637. deccc J
  638. bg %icc, .LL11
  639. nop
  /* Two-column pass, executed once when bit 1 of N is set.             */
  640. .LL20:
  641. andcc N, 2, J
  642. ble,pn %icc, .LL30
  643. nop
  /* Setup: A1 / A2 are the next two columns; A advances by 2 * LDA.    */
  /* ALPHA is reloaded because its register is also used as a16.        */
  644. .LL21:
  645. mov YY, Y1
  646. mov A, A1
  647. add A, LDA, A2
  648. add A2, LDA, A
  649. LDF STACK_ALPHA, ALPHA
  650. LDF [X], x1
  651. add X, INCX, X
  652. LDF [X], x2
  653. add X, INCX, X
  654. FMUL ALPHA, x1, x1
  655. FMUL ALPHA, x2, x2
  656. sra M, 3, I
  657. cmp I, 0
  658. ble,pn %icc, .LL26
  659. nop
  /* Preload the first eight y values and rows 0..7 of both columns.    */
  660. LDF [Y1 + 0 * SIZE], y1
  661. LDF [Y1 + 1 * SIZE], y2
  662. LDF [Y1 + 2 * SIZE], y3
  663. LDF [Y1 + 3 * SIZE], y4
  664. LDF [Y1 + 4 * SIZE], y5
  665. LDF [Y1 + 5 * SIZE], y6
  666. LDF [Y1 + 6 * SIZE], y7
  667. LDF [Y1 + 7 * SIZE], y8
  668. LDF [A1 + 0 * SIZE], a1
  669. LDF [A1 + 1 * SIZE], a2
  670. LDF [A1 + 2 * SIZE], a3
  671. LDF [A1 + 3 * SIZE], a4
  672. LDF [A1 + 4 * SIZE], a5
  673. LDF [A1 + 5 * SIZE], a6
  674. LDF [A1 + 6 * SIZE], a7
  675. LDF [A1 + 7 * SIZE], a8
  676. LDF [A2 + 0 * SIZE], a9
  677. LDF [A2 + 1 * SIZE], a10
  678. LDF [A2 + 2 * SIZE], a11
  679. LDF [A2 + 3 * SIZE], a12
  680. LDF [A2 + 4 * SIZE], a13
  681. LDF [A2 + 5 * SIZE], a14
  682. LDF [A2 + 6 * SIZE], a15
  683. LDF [A2 + 7 * SIZE], a16
  /* Start the first four products and refill a1..a4 with rows 8..11    */
  /* of column A1. The last load sits in the delay slot of the branch.  */
  /* NOTE(review): these A1[8..11] loads are issued before the branch   */
  /* decides whether a second 8-row block exists; when M >> 3 == 1      */
  /* the loaded values are never used by .LL23 - confirm that reading   */
  /* those addresses is always safe for the caller's A.                 */
  684. FMUL a1, x1, t1
  685. deccc I
  686. LDF [A1 + 8 * SIZE], a1
  687. FMUL a2, x1, t2
  688. LDF [A1 + 9 * SIZE], a2
  689. FMUL a3, x1, t3
  690. LDF [A1 + 10 * SIZE], a3
  691. FMUL a4, x1, t4
  692. ble,pn %icc, .LL23
  693. LDF [A1 + 11 * SIZE], a4
  /* Main loop of the two-column pass: one pass finishes eight y        */
  /* elements using columns A1 (x1) and A2 (x2), stores them, and       */
  /* loads the next eight y values. Each FMUL is followed by a load     */
  /* that refills the register it just consumed with the matching       */
  /* element of the following block.                                    */
  /* NOTE(review): on the final pass (I reaches 0) the A1[16..19]       */
  /* loads below fetch values that .LL23 never uses - confirm those     */
  /* addresses are always readable.                                     */
  694. .LL22:
  695. FADD y1, t1, y1
  696. prefetch [A1 + PREFETCHSIZE * SIZE], 1
  697. FMUL a5, x1, t1
  698. LDF [A1 + 12 * SIZE], a5
  699. FADD y2, t2, y2
  700. FMUL a6, x1, t2
  701. LDF [A1 + 13 * SIZE], a6
  702. FADD y3, t3, y3
  703. FMUL a7, x1, t3
  704. LDF [A1 + 14 * SIZE], a7
  705. FADD y4, t4, y4
  706. FMUL a8, x1, t4
  707. LDF [A1 + 15 * SIZE], a8
  708. FADD y5, t1, y5
  709. FMUL a9, x2, t1
  710. LDF [A2 + 8 * SIZE], a9
  711. FADD y6, t2, y6
  712. FMUL a10, x2, t2
  713. LDF [A2 + 9 * SIZE], a10
  714. FADD y7, t3, y7
  715. FMUL a11, x2, t3
  716. LDF [A2 + 10 * SIZE], a11
  717. FADD y8, t4, y8
  718. FMUL a12, x2, t4
  719. LDF [A2 + 11 * SIZE], a12
  720. FADD y1, t1, y1
  721. prefetch [A2 + PREFETCHSIZE * SIZE], 1
  722. FMUL a13, x2, t1
  723. LDF [A2 + 12 * SIZE], a13
  724. FADD y2, t2, y2
  725. FMUL a14, x2, t2
  726. LDF [A2 + 13 * SIZE], a14
  727. FADD y3, t3, y3
  728. FMUL a15, x2, t3
  729. LDF [A2 + 14 * SIZE], a15
  730. FADD y4, t4, y4
  731. FMUL a16, x2, t4
  732. LDF [A2 + 15 * SIZE], a16
  /* Start the x1 products for the next block (a1..a4 already hold      */
  /* its rows 0..3) and look one further block ahead in column A1.      */
  733. FADD y5, t1, y5
  734. FMUL a1, x1, t1
  735. LDF [A1 + 16 * SIZE], a1
  736. FADD y6, t2, y6
  737. FMUL a2, x1, t2
  738. LDF [A1 + 17 * SIZE], a2
  739. FADD y7, t3, y7
  740. FMUL a3, x1, t3
  741. LDF [A1 + 18 * SIZE], a3
  742. FADD y8, t4, y8
  743. FMUL a4, x1, t4
  744. LDF [A1 + 19 * SIZE], a4
  /* Store this block, then fetch the next eight y values while         */
  /* advancing A1 / A2. Y1 is bumped in the delay slot of the branch.   */
  745. STF y1, [Y1 + 0 * SIZE]
  746. STF y2, [Y1 + 1 * SIZE]
  747. STF y3, [Y1 + 2 * SIZE]
  748. STF y4, [Y1 + 3 * SIZE]
  749. STF y5, [Y1 + 4 * SIZE]
  750. STF y6, [Y1 + 5 * SIZE]
  751. STF y7, [Y1 + 6 * SIZE]
  752. STF y8, [Y1 + 7 * SIZE]
  753. LDF [Y1 + 8 * SIZE], y1
  754. add A1, 8 * SIZE, A1
  755. LDF [Y1 + 9 * SIZE], y2
  756. add A2, 8 * SIZE, A2
  757. LDF [Y1 + 10 * SIZE], y3
  758. deccc I
  759. LDF [Y1 + 11 * SIZE], y4
  760. LDF [Y1 + 12 * SIZE], y5
  761. LDF [Y1 + 13 * SIZE], y6
  762. LDF [Y1 + 14 * SIZE], y7
  763. LDF [Y1 + 15 * SIZE], y8
  764. bg,pn %icc, .LL22
  765. add Y1, 8 * SIZE, Y1
  /* Drain of the two-column loop: finish the last 8-row block using    */
  /* values already in registers (y1..y8, t1..t4, a5..a16), with no     */
  /* further loads from A. A1, A2 and Y1 end up past the block.         */
  766. .LL23:
  767. FADD y1, t1, y1
  768. FMUL a5, x1, t1
  769. FADD y2, t2, y2
  770. FMUL a6, x1, t2
  771. FADD y3, t3, y3
  772. FMUL a7, x1, t3
  773. FADD y4, t4, y4
  774. FMUL a8, x1, t4
  775. FADD y5, t1, y5
  776. FMUL a9, x2, t1
  777. FADD y6, t2, y6
  778. FMUL a10, x2, t2
  779. FADD y7, t3, y7
  780. FMUL a11, x2, t3
  781. FADD y8, t4, y8
  782. FMUL a12, x2, t4
  783. FADD y1, t1, y1
  784. FMUL a13, x2, t1
  785. FADD y2, t2, y2
  786. FMUL a14, x2, t2
  787. FADD y3, t3, y3
  788. FMUL a15, x2, t3
  789. FADD y4, t4, y4
  790. FMUL a16, x2, t4
  /* Final adds for y5..y8 are interleaved with the stores.             */
  791. STF y1, [Y1 + 0 * SIZE]
  792. FADD y5, t1, y5
  793. STF y2, [Y1 + 1 * SIZE]
  794. FADD y6, t2, y6
  795. STF y3, [Y1 + 2 * SIZE]
  796. FADD y7, t3, y7
  797. STF y4, [Y1 + 3 * SIZE]
  798. FADD y8, t4, y8
  799. STF y5, [Y1 + 4 * SIZE]
  800. add A1, 8 * SIZE, A1
  801. STF y6, [Y1 + 5 * SIZE]
  802. add A2, 8 * SIZE, A2
  803. STF y7, [Y1 + 6 * SIZE]
  804. nop
  805. STF y8, [Y1 + 7 * SIZE]
  806. add Y1, 8 * SIZE, Y1
  /* Row remainders of the two-column pass (M & 4, M & 2, M & 1).       */
  /* Four leftover rows: a1..a4 are rows 0..3 of column A1, a5..a8      */
  /* are rows 0..3 of column A2.                                        */
  807. .LL26:
  808. andcc M, 4, I
  809. ble,pn %icc, .LL27
  810. nop
  811. LDF [A1 + 0 * SIZE], a1
  812. LDF [A1 + 1 * SIZE], a2
  813. LDF [A1 + 2 * SIZE], a3
  814. LDF [A1 + 3 * SIZE], a4
  815. LDF [A2 + 0 * SIZE], a5
  816. LDF [A2 + 1 * SIZE], a6
  817. LDF [A2 + 2 * SIZE], a7
  818. LDF [A2 + 3 * SIZE], a8
  819. LDF [Y1 + 0 * SIZE], y1
  820. add A1, 4 * SIZE, A1
  821. LDF [Y1 + 1 * SIZE], y2
  822. add A2, 4 * SIZE, A2
  823. LDF [Y1 + 2 * SIZE], y3
  824. LDF [Y1 + 3 * SIZE], y4
  825. FMUL a1, x1, t1
  826. FMUL a2, x1, t2
  827. FMUL a3, x1, t3
  828. FMUL a4, x1, t4
  829. FADD y1, t1, y1
  830. FMUL a5, x2, t1
  831. FADD y2, t2, y2
  832. FMUL a6, x2, t2
  833. FADD y3, t3, y3
  834. FMUL a7, x2, t3
  835. FADD y4, t4, y4
  836. FMUL a8, x2, t4
  837. FADD y1, t1, y1
  838. FADD y2, t2, y2
  839. FADD y3, t3, y3
  840. FADD y4, t4, y4
  841. STF y1, [Y1 + 0 * SIZE]
  842. STF y2, [Y1 + 1 * SIZE]
  843. STF y3, [Y1 + 2 * SIZE]
  844. STF y4, [Y1 + 3 * SIZE]
  845. add Y1, 4 * SIZE, Y1
  /* Two leftover rows: a1 / a2 are row 0 of columns A1 / A2,           */
  /* a5 / a6 are row 1.                                                 */
  846. .LL27:
  847. andcc M, 2, I
  848. ble,pn %icc, .LL28
  849. nop
  850. LDF [A1 + 0 * SIZE], a1
  851. LDF [A2 + 0 * SIZE], a2
  852. LDF [Y1 + 0 * SIZE], y1
  853. LDF [A1 + 1 * SIZE], a5
  854. LDF [A2 + 1 * SIZE], a6
  855. add A1, 2 * SIZE, A1
  856. LDF [Y1 + 1 * SIZE], y2
  857. add A2, 2 * SIZE, A2
  858. FMUL a1, x1, t1
  859. FMUL a2, x2, t2
  860. FADD y1, t1, y1
  861. FMUL a5, x1, t1
  862. FADD y1, t2, y1
  863. FMUL a6, x2, t2
  864. FADD y2, t1, y2
  865. FADD y2, t2, y2
  866. STF y1, [Y1 + 0 * SIZE]
  867. STF y2, [Y1 + 1 * SIZE]
  868. add Y1, 2 * SIZE, Y1
  /* One leftover row.                                                  */
  869. .LL28:
  870. andcc M, 1, I
  871. ble,pn %icc, .LL30
  872. nop
  873. LDF [A1 + 0 * SIZE], a1
  874. LDF [A2 + 0 * SIZE], a2
  875. LDF [Y1 + 0 * SIZE], y1
  876. FMUL a1, x1, t1
  877. FMUL a2, x2, t2
  878. FADD y1, t1, y1
  879. FADD y1, t2, y1
  880. STF y1, [Y1]
  /* Single-column pass, executed once when bit 0 of N is set;          */
  /* otherwise continue with the copy-back stage at .LL990.             */
  881. .LL30:
  882. andcc N, 1, J
  883. ble,pn %icc, .LL990
  884. nop
  /* Setup: A1 is the remaining column; x1 = alpha * x[j].              */
  885. .LL31:
  886. mov YY, Y1
  887. mov A, A1
  888. LDF STACK_ALPHA, ALPHA
  889. LDF [X], x1
  890. add X, INCX, X
  891. FMUL ALPHA, x1, x1
  892. sra M, 3, I
  893. cmp I, 0
  894. ble,pn %icc, .LL36
  895. nop
  /* Preload the first eight y values and rows 0..7 of the column.      */
  896. LDF [Y1 + 0 * SIZE], y1
  897. LDF [Y1 + 1 * SIZE], y2
  898. LDF [Y1 + 2 * SIZE], y3
  899. LDF [Y1 + 3 * SIZE], y4
  900. LDF [Y1 + 4 * SIZE], y5
  901. LDF [Y1 + 5 * SIZE], y6
  902. LDF [Y1 + 6 * SIZE], y7
  903. LDF [Y1 + 7 * SIZE], y8
  904. LDF [A1 + 0 * SIZE], a1
  905. LDF [A1 + 1 * SIZE], a2
  906. LDF [A1 + 2 * SIZE], a3
  907. LDF [A1 + 3 * SIZE], a4
  908. LDF [A1 + 4 * SIZE], a5
  909. LDF [A1 + 5 * SIZE], a6
  910. LDF [A1 + 6 * SIZE], a7
  911. LDF [A1 + 7 * SIZE], a8
  /* Start the first four products and refill a1..a4 with rows 8..11.   */
  /* The last load sits in the delay slot of the branch.                */
  /* NOTE(review): these A1[8..11] loads are issued before the branch   */
  /* decides whether a second 8-row block exists; when M >> 3 == 1      */
  /* the loaded values are never used by .LL33 - confirm that reading   */
  /* those addresses is always safe for the caller's A.                 */
  912. FMUL a1, x1, t1
  913. deccc I
  914. LDF [A1 + 8 * SIZE], a1
  915. FMUL a2, x1, t2
  916. LDF [A1 + 9 * SIZE], a2
  917. FMUL a3, x1, t3
  918. LDF [A1 + 10 * SIZE], a3
  919. FMUL a4, x1, t4
  920. ble,pn %icc, .LL33
  921. LDF [A1 + 11 * SIZE], a4
  /* Main loop of the single-column pass: one pass finishes eight y     */
  /* elements, stores them and loads the next eight, while refilling    */
  /* a5..a8 and a1..a4 from the following rows of the column.           */
  /* NOTE(review): on the final pass (I reaches 0) the A1[16..19]       */
  /* loads below fetch values that .LL33 never uses - confirm those     */
  /* addresses are always readable.                                     */
  922. .LL32:
  923. FADD y1, t1, y1
  924. prefetch [A1 + PREFETCHSIZE * SIZE], 1
  925. FMUL a5, x1, t1
  926. LDF [A1 + 12 * SIZE], a5
  927. FADD y2, t2, y2
  928. FMUL a6, x1, t2
  929. LDF [A1 + 13 * SIZE], a6
  930. FADD y3, t3, y3
  931. FMUL a7, x1, t3
  932. LDF [A1 + 14 * SIZE], a7
  933. FADD y4, t4, y4
  934. FMUL a8, x1, t4
  935. LDF [A1 + 15 * SIZE], a8
  /* Start the products for the next block (a1..a4 already hold its     */
  /* rows 0..3) and look one block further ahead.                       */
  936. FADD y5, t1, y5
  937. FMUL a1, x1, t1
  938. LDF [A1 + 16 * SIZE], a1
  939. FADD y6, t2, y6
  940. FMUL a2, x1, t2
  941. LDF [A1 + 17 * SIZE], a2
  942. FADD y7, t3, y7
  943. FMUL a3, x1, t3
  944. LDF [A1 + 18 * SIZE], a3
  945. FADD y8, t4, y8
  946. FMUL a4, x1, t4
  947. LDF [A1 + 19 * SIZE], a4
  948. STF y1, [Y1 + 0 * SIZE]
  949. STF y2, [Y1 + 1 * SIZE]
  950. STF y3, [Y1 + 2 * SIZE]
  951. STF y4, [Y1 + 3 * SIZE]
  952. STF y5, [Y1 + 4 * SIZE]
  953. STF y6, [Y1 + 5 * SIZE]
  954. STF y7, [Y1 + 6 * SIZE]
  955. STF y8, [Y1 + 7 * SIZE]
  956. LDF [Y1 + 8 * SIZE], y1
  957. LDF [Y1 + 9 * SIZE], y2
  958. LDF [Y1 + 10 * SIZE], y3
  959. LDF [Y1 + 11 * SIZE], y4
  960. LDF [Y1 + 12 * SIZE], y5
  961. deccc I
  962. LDF [Y1 + 13 * SIZE], y6
  963. add A1, 8 * SIZE, A1
  964. LDF [Y1 + 14 * SIZE], y7
  965. add Y1, 8 * SIZE, Y1
  /* Y1 has already been advanced by eight elements, so offset 7 in     */
  /* the delay slot addresses the same element as offset 15 would       */
  /* have before the bump.                                              */
  966. bg,pn %icc, .LL32
  967. LDF [Y1 + 7 * SIZE], y8
  /* Drain of the single-column loop: finish the last 8-row block       */
  /* from values already in registers (y1..y8, t1..t4, a5..a8), then    */
  /* move A1 and Y1 past the block.                                     */
  968. .LL33:
  969. FADD y1, t1, y1
  970. FMUL a5, x1, t1
  971. FADD y2, t2, y2
  972. FMUL a6, x1, t2
  973. FADD y3, t3, y3
  974. FMUL a7, x1, t3
  975. FADD y4, t4, y4
  976. FMUL a8, x1, t4
  977. STF y1, [Y1 + 0 * SIZE]
  978. FADD y5, t1, y5
  979. STF y2, [Y1 + 1 * SIZE]
  980. FADD y6, t2, y6
  981. STF y3, [Y1 + 2 * SIZE]
  982. FADD y7, t3, y7
  983. STF y4, [Y1 + 3 * SIZE]
  984. FADD y8, t4, y8
  985. STF y5, [Y1 + 4 * SIZE]
  986. STF y6, [Y1 + 5 * SIZE]
  987. STF y7, [Y1 + 6 * SIZE]
  988. add A1, 8 * SIZE, A1
  989. STF y8, [Y1 + 7 * SIZE]
  990. add Y1, 8 * SIZE, Y1
  /* Row remainders of the single-column pass (M & 4, M & 2, M & 1).    */
  /* Four leftover rows.                                                */
  991. .LL36:
  992. andcc M, 4, I
  993. ble,pn %icc, .LL37
  994. nop
  995. LDF [A1 + 0 * SIZE], a1
  996. LDF [A1 + 1 * SIZE], a2
  997. LDF [A1 + 2 * SIZE], a3
  998. LDF [A1 + 3 * SIZE], a4
  999. LDF [Y1 + 0 * SIZE], y1
  1000. add A1, 4 * SIZE, A1
  1001. LDF [Y1 + 1 * SIZE], y2
  1002. LDF [Y1 + 2 * SIZE], y3
  1003. LDF [Y1 + 3 * SIZE], y4
  1004. FMUL a1, x1, t1
  1005. FMUL a2, x1, t2
  1006. FMUL a3, x1, t3
  1007. FMUL a4, x1, t4
  1008. FADD y1, t1, y1
  1009. FADD y2, t2, y2
  1010. FADD y3, t3, y3
  1011. FADD y4, t4, y4
  1012. STF y1, [Y1 + 0 * SIZE]
  1013. STF y2, [Y1 + 1 * SIZE]
  1014. STF y3, [Y1 + 2 * SIZE]
  1015. STF y4, [Y1 + 3 * SIZE]
  1016. add Y1, 4 * SIZE, Y1
  /* Two leftover rows: a1 is row 0, a5 is row 1; t1 is reused for      */
  /* both products.                                                     */
  1017. .LL37:
  1018. andcc M, 2, I
  1019. ble,pn %icc, .LL38
  1020. nop
  1021. LDF [A1 + 0 * SIZE], a1
  1022. LDF [Y1 + 0 * SIZE], y1
  1023. LDF [A1 + 1 * SIZE], a5
  1024. LDF [Y1 + 1 * SIZE], y2
  1025. add A1, 2 * SIZE, A1
  1026. FMUL a1, x1, t1
  1027. FADD y1, t1, y1
  1028. FMUL a5, x1, t1
  1029. FADD y2, t1, y2
  1030. STF y1, [Y1 + 0 * SIZE]
  1031. STF y2, [Y1 + 1 * SIZE]
  1032. add Y1, 2 * SIZE, Y1
  /* One leftover row.                                                  */
  1033. .LL38:
  1034. andcc M, 1, I
  1035. ble,pn %icc, .LL990
  1036. nop
  1037. LDF [A1 + 0 * SIZE], a1
  1038. LDF [Y1 + 0 * SIZE], y1
  1039. FMUL a1, x1, t1
  1040. FADD y1, t1, y1
  1041. STF y1, [Y1]
  /* Copy-back stage. With INCY == SIZE the columns were accumulated    */
  /* directly into Y, so the routine is done. Otherwise the results     */
  /* sit in BUFFER and are added into the strided y here: Y is the      */
  /* read pointer and Y1 the write pointer (set to Y in the delay       */
  /* slot), both stepping by INCY.                                      */
  1042. .LL990:
  1043. cmp INCY, SIZE
  1044. be %icc, .LL999
  1045. mov Y, Y1
  1046. sra M, 3, I
  1047. cmp I, 0
  1048. ble,pn %icc, .LL995
  1049. nop
  /* Eight elements per pass: a1..a8 take the buffered sums, y1..y8     */
  /* the current y values. BUFFER advances in the delay slot of the     */
  /* loop branch.                                                       */
  1050. .LL991:
  1051. LDF [BUFFER + 0 * SIZE], a1
  1052. LDF [Y], y1
  1053. add Y, INCY, Y
  1054. LDF [BUFFER + 1 * SIZE], a2
  1055. LDF [Y], y2
  1056. add Y, INCY, Y
  1057. LDF [BUFFER + 2 * SIZE], a3
  1058. LDF [Y], y3
  1059. add Y, INCY, Y
  1060. LDF [BUFFER + 3 * SIZE], a4
  1061. LDF [Y], y4
  1062. add Y, INCY, Y
  1063. LDF [BUFFER + 4 * SIZE], a5
  1064. FADD y1, a1, y1
  1065. LDF [Y], y5
  1066. add Y, INCY, Y
  1067. LDF [BUFFER + 5 * SIZE], a6
  1068. FADD y2, a2, y2
  1069. LDF [Y], y6
  1070. add Y, INCY, Y
  1071. LDF [BUFFER + 6 * SIZE], a7
  1072. FADD y3, a3, y3
  1073. LDF [Y], y7
  1074. add Y, INCY, Y
  1075. LDF [BUFFER + 7 * SIZE], a8
  1076. FADD y4, a4, y4
  1077. LDF [Y], y8
  1078. add Y, INCY, Y
  1079. STF y1, [Y1]
  1080. FADD y5, a5, y5
  1081. add Y1, INCY, Y1
  1082. STF y2, [Y1]
  1083. FADD y6, a6, y6
  1084. add Y1, INCY, Y1
  1085. STF y3, [Y1]
  1086. FADD y7, a7, y7
  1087. add Y1, INCY, Y1
  1088. STF y4, [Y1]
  1089. FADD y8, a8, y8
  1090. add Y1, INCY, Y1
  1091. STF y5, [Y1]
  1092. add Y1, INCY, Y1
  1093. STF y6, [Y1]
  1094. add Y1, INCY, Y1
  1095. STF y7, [Y1]
  1096. add Y1, INCY, Y1
  1097. STF y8, [Y1]
  1098. add Y1, INCY, Y1
  1099. deccc I
  1100. bg,pn %icc, .LL991
  1101. add BUFFER, 8 * SIZE, BUFFER
  /* Copy-back remainder: handle the last M & 7 elements in chunks      */
  /* of four, two and one, then return.                                 */
  1102. .LL995:
  1103. andcc M, 7, I
  1104. ble,pn %icc, .LL999
  1105. nop
  /* Four leftover elements.                                            */
  1106. andcc M, 4, I
  1107. ble,pn %icc, .LL996
  1108. nop
  1109. LDF [BUFFER + 0 * SIZE], a1
  1110. LDF [BUFFER + 1 * SIZE], a2
  1111. LDF [BUFFER + 2 * SIZE], a3
  1112. LDF [BUFFER + 3 * SIZE], a4
  1113. add BUFFER, 4 * SIZE, BUFFER
  1114. LDF [Y], y1
  1115. add Y, INCY, Y
  1116. LDF [Y], y2
  1117. add Y, INCY, Y
  1118. LDF [Y], y3
  1119. add Y, INCY, Y
  1120. LDF [Y], y4
  1121. add Y, INCY, Y
  1122. FADD y1, a1, y1
  1123. FADD y2, a2, y2
  1124. FADD y3, a3, y3
  1125. FADD y4, a4, y4
  1126. STF y1, [Y1]
  1127. add Y1, INCY, Y1
  1128. STF y2, [Y1]
  1129. add Y1, INCY, Y1
  1130. STF y3, [Y1]
  1131. add Y1, INCY, Y1
  1132. STF y4, [Y1]
  1133. add Y1, INCY, Y1
  /* Two leftover elements.                                             */
  1134. .LL996:
  1135. andcc M, 2, I
  1136. ble,pn %icc, .LL997
  1137. nop
  1138. LDF [BUFFER + 0 * SIZE], a1
  1139. LDF [BUFFER + 1 * SIZE], a2
  1140. add BUFFER, 2 * SIZE, BUFFER
  1141. LDF [Y], y1
  1142. add Y, INCY, Y
  1143. LDF [Y], y2
  1144. add Y, INCY, Y
  1145. FADD y1, a1, y1
  1146. FADD y2, a2, y2
  1147. STF y1, [Y1]
  1148. add Y1, INCY, Y1
  1149. STF y2, [Y1]
  1150. add Y1, INCY, Y1
  /* One leftover element.                                              */
  1151. .LL997:
  1152. andcc M, 1, I
  1153. ble,pn %icc, .LL999
  1154. nop
  1155. LDF [BUFFER + 0 * SIZE], a1
  1156. LDF [Y], y1
  1157. FADD y1, a1, y1
  1158. STF y1, [Y1]
  /* Common exit: return to the caller; clr %o0 executes in the delay   */
  /* slot of the return. EPILOGUE is a macro from an included header.   */
  1159. .LL999:
  1160. return %i7 + 8
  1161. clr %o0
  1162. EPILOGUE