You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_RT_1x4.S 42 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132
  1. /*********************************************************************/
  2. /* Copyright 2005-2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Tunables and integer-register aliases used by the kernel body below.
   * All names on the right-hand side are SPARC integer registers
   * (%i = in, %o = out, %l = local).
   */

  /*
   * Software-prefetch tuning.  The inner loops issue
   *   prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
   * so APREFETCHSIZE is a distance ahead of AO counted in SIZE-sized
   * elements, and APREFETCH_CATEGORY is the second operand of the
   * prefetch instruction.
   */
  40. #define APREFETCHSIZE 24
  41. #define APREFETCH_CATEGORY 0
  /*
   * Problem dimensions.  M seeds the row counter I; N is tested with
   * "and N, 1", "and N, 2" and "sra N, 2" to drive the 1-, 2- and
   * 4-column passes; K is shifted by ZBASE_SHIFT to form byte strides.
   */
  42. #define M %i0
  43. #define N %i1
  44. #define K %i2
  /*
   * Matrix base pointers.  On 32-bit DOUBLE builds the prologue loads A, B
   * and C from the stack; in the other configurations only B and C are
   * loaded and A is used as it already sits in %i5.
   */
  45. #define A %i5
  46. #define B %i3
  47. #define C %i4
  /*
   * LDC is loaded from the stack and then shifted left by ZBASE_SHIFT so
   * it can be added to / subtracted from the C column pointers directly.
   */
  48. #define LDC %o0
  /* Running pointers: the loops load a1..a5 through AO and b1..b9 through BO. */
  49. #define AO %o1
  50. #define BO %o2
  /*
   * Loop counters: I counts down from M, J receives the N-derived
   * column test/count, L is the inner-loop trip count derived from KK
   * (LT/RN) or from K - KK (otherwise).
   */
  51. #define I %o3
  52. #define J %o4
  53. #define L %o5
  /* Pointers to consecutive columns of C, LDC apart; results are stored through them. */
  54. #define C1 %l0
  55. #define C2 %l1
  56. #define C3 %l2
  57. #define C4 %l3
  /*
   * OFFSET is loaded from the stack in the prologue; KK is initialised
   * from it in a variant-dependent way (e.g. "neg OFFSET, KK" for RN,
   * "sub N, OFFSET, KK" for RT) and adjusted after each pass.
   */
  58. #define OFFSET %l4
  59. #define KK %l5
  /* Scratch registers for scaled offsets. */
  60. #define TEMP1 %l6
  61. #define TEMP2 %l7
  /* Copy of A used as the base for computing AO when LN or RT is defined. */
  62. #define AORIG %o7
  /*
   * Floating-point register map.
   *
   * Every value has two spellings:
   *   cNN / aN / bN    - register names, used with LDF, STF, FADD, FSUB, FMUL;
   *   ccNN / aaN / bbN - plain integers, passed to the FCLR, FMADD, FNMSUB
   *                      and FMSUB macros (those macros are not defined in
   *                      the visible part of this file).
   *
   * c01..c16 are the accumulators / results.  In the multiply-accumulate
   * loops a1..a5 are loaded through AO and b1..b9 through BO; the solve
   * code after the loops reuses a* and b* as temporaries.
   */
  63. #ifdef DOUBLE
  /* DOUBLE: even-numbered registers only, so the names step by two. */
  64. #define c01 %f0
  65. #define c02 %f2
  66. #define c03 %f4
  67. #define c04 %f6
  68. #define c05 %f8
  69. #define c06 %f10
  70. #define c07 %f12
  71. #define c08 %f14
  72. #define c09 %f16
  73. #define c10 %f18
  74. #define c11 %f20
  75. #define c12 %f22
  76. #define c13 %f24
  77. #define c14 %f26
  78. #define c15 %f28
  79. #define c16 %f30
  80. #define a1 %f32
  81. #define a2 %f34
  82. #define a3 %f36
  83. #define a4 %f38
  84. #define a5 %f40
  85. #define b1 %f42
  86. #define b2 %f44
  87. #define b3 %f46
  88. #define b4 %f48
  89. #define b5 %f50
  90. #define b6 %f52
  91. #define b7 %f54
  92. #define b8 %f56
  93. #define b9 %f58
  /*
   * Numeric forms for DOUBLE.  For %f0..%f30 the value equals the register
   * number.  For %f32..%f58 the values (1, 3, ... 27) match the SPARC V9
   * 5-bit field encoding of a double register, where bit 5 of the register
   * number is moved into bit 0 (e.g. %f32 -> 1, %f42 -> 11, %f58 -> 27).
   * NOTE(review): presumably the FMADD/FCLR macros build instruction words
   * from these fields - confirm against their definitions.
   */
  94. #define cc01 0
  95. #define cc02 2
  96. #define cc03 4
  97. #define cc04 6
  98. #define cc05 8
  99. #define cc06 10
  100. #define cc07 12
  101. #define cc08 14
  102. #define cc09 16
  103. #define cc10 18
  104. #define cc11 20
  105. #define cc12 22
  106. #define cc13 24
  107. #define cc14 26
  108. #define cc15 28
  109. #define cc16 30
  110. #define aa1 1
  111. #define aa2 3
  112. #define aa3 5
  113. #define aa4 7
  114. #define aa5 9
  115. #define bb1 11
  116. #define bb2 13
  117. #define bb3 15
  118. #define bb4 17
  119. #define bb5 19
  120. #define bb6 21
  121. #define bb7 23
  122. #define bb8 25
  123. #define bb9 27
  124. #else
  /* Not DOUBLE: registers %f0..%f29 are used consecutively. */
  125. #define c01 %f0
  126. #define c02 %f1
  127. #define c03 %f2
  128. #define c04 %f3
  129. #define c05 %f4
  130. #define c06 %f5
  131. #define c07 %f6
  132. #define c08 %f7
  133. #define c09 %f8
  134. #define c10 %f9
  135. #define c11 %f10
  136. #define c12 %f11
  137. #define c13 %f12
  138. #define c14 %f13
  139. #define c15 %f14
  140. #define c16 %f15
  141. #define a1 %f16
  142. #define a2 %f17
  143. #define a3 %f18
  144. #define a4 %f19
  145. #define a5 %f20
  146. #define b1 %f21
  147. #define b2 %f22
  148. #define b3 %f23
  149. #define b4 %f24
  150. #define b5 %f25
  151. #define b6 %f26
  152. #define b7 %f27
  153. #define b8 %f28
  154. #define b9 %f29
  /* Numeric forms: here each value is simply the register number above. */
  155. #define cc01 0
  156. #define cc02 1
  157. #define cc03 2
  158. #define cc04 3
  159. #define cc05 4
  160. #define cc06 5
  161. #define cc07 6
  162. #define cc08 7
  163. #define cc09 8
  164. #define cc10 9
  165. #define cc11 10
  166. #define cc12 11
  167. #define cc13 12
  168. #define cc14 13
  169. #define cc15 14
  170. #define cc16 15
  171. #define aa1 16
  172. #define aa2 17
  173. #define aa3 18
  174. #define aa4 19
  175. #define aa5 20
  176. #define bb1 21
  177. #define bb2 22
  178. #define bb3 23
  179. #define bb4 24
  180. #define bb5 25
  181. #define bb6 26
  182. #define bb7 27
  183. #define bb8 28
  184. #define bb9 29
  185. #endif
  /*
   * FMADD1..FMADD4 pick the fused multiply-add flavour for the four
   * partial products accumulated in the inner loops, e.g.
   *   FMADD1 (aa1, bb1, cc01, cc01)    a1 * b1 into c01
   *   FMADD2 (aa2, bb1, cc02, cc02)    a2 * b1 into c02
   *   FMADD3 (aa1, bb2, cc03, cc03)    a1 * b2 into c03
   *   FMADD4 (aa2, bb2, cc04, cc04)    a2 * b2 into c04
   * After the loops the kernel combines them with c01 += c04 and
   * c02 += c03.  Exactly one of the four uses FNMSUB instead of FMADD;
   * which one depends on CONJ and on the LN/LT/RN/RT variant.
   * NOTE(review): presumably (a1, a2) and (b1, b2) are the real/imaginary
   * parts of complex operands and FNMSUB supplies the sign flip of the
   * complex product - confirm against the FMADD/FNMSUB macro definitions.
   */
  186. #ifndef CONJ
  /* No conjugation: only the a2 * b2 term (FMADD4) uses FNMSUB. */
  187. #define FMADD1 FMADD
  188. #define FMADD2 FMADD
  189. #define FMADD3 FMADD
  190. #define FMADD4 FNMSUB
  191. #else
  /*
   * CONJ: the FNMSUB term depends on the solve variant.  If CONJ is defined
   * but none of LN, LT, RN, RT is, FMADD1..FMADD4 remain undefined, so one
   * of those variant macros must be defined for the loops below to expand.
   */
  192. #if defined(LN) || defined(LT)
  /* Left-side variants: the a2 * b1 term (FMADD2) uses FNMSUB. */
  193. #define FMADD1 FMADD
  194. #define FMADD2 FNMSUB
  195. #define FMADD3 FMADD
  196. #define FMADD4 FMADD
  197. #endif
  198. #if defined(RN) || defined(RT)
  /* Right-side variants: the a1 * b2 term (FMADD3) uses FNMSUB. */
  199. #define FMADD1 FMADD
  200. #define FMADD2 FMADD
  201. #define FMADD3 FNMSUB
  202. #define FMADD4 FMADD
  203. #endif
  204. #endif
  205. .register %g2, #scratch
  206. .register %g3, #scratch
  207. PROLOGUE
  208. SAVESP
  209. #ifndef __64BIT__
  210. #ifdef DOUBLE
  211. ld [%sp + STACK_START + 32], A
  212. ld [%sp + STACK_START + 36], B
  213. ld [%sp + STACK_START + 40], C
  214. ld [%sp + STACK_START + 44], LDC
  215. ld [%sp + STACK_START + 48], OFFSET
  216. #else
  217. ld [%sp + STACK_START + 28], B
  218. ld [%sp + STACK_START + 32], C
  219. ld [%sp + STACK_START + 36], LDC
  220. ld [%sp + STACK_START + 40], OFFSET
  221. #endif
  222. #else
  223. ldx [%sp + STACK_START + 56], B
  224. ldx [%sp + STACK_START + 64], C
  225. ldx [%sp + STACK_START + 72], LDC
  226. ldx [%sp + STACK_START + 80], OFFSET
  227. #endif
  228. cmp M, 0
  229. ble,pn %icc, .LL999
  230. nop
  231. sll LDC, ZBASE_SHIFT, LDC
  232. #ifdef LN
  233. smul M, K, TEMP1
  234. sll TEMP1, ZBASE_SHIFT, TEMP1
  235. add A, TEMP1, A
  236. sll M, ZBASE_SHIFT, TEMP1
  237. add C, TEMP1, C
  238. #endif
  239. #ifdef RN
  240. neg OFFSET, KK
  241. #endif
  242. #ifdef RT
  243. smul N, K, TEMP1
  244. sll TEMP1, ZBASE_SHIFT, TEMP1
  245. add B, TEMP1, B
  246. smul N, LDC, TEMP1
  247. add C, TEMP1, C
  248. sub N, OFFSET, KK
  249. #endif
  250. and N, 1, J
  251. cmp J, 0
  252. ble,pn %icc, .LL20
  253. nop
  254. #ifdef RT
  255. sll K, ZBASE_SHIFT, TEMP1
  256. sub B, TEMP1, B
  257. #endif
  258. #ifndef RT
  259. mov C, C1
  260. add C, LDC, C
  261. #else
  262. sub C, LDC, C1
  263. sub C, LDC, C
  264. #endif
  265. #ifdef LN
  266. add M, OFFSET, KK
  267. #endif
  268. #ifdef LT
  269. mov OFFSET, KK
  270. #endif
  271. #if defined(LN) || defined(RT)
  272. mov A, AORIG
  273. #else
  274. mov A, AO
  275. #endif
  276. mov M, I
  277. .align 4
  278. .LL32:
  279. #if defined(LT) || defined(RN)
  280. mov B, BO
  281. #else
  282. #ifdef LN
  283. sll K, ZBASE_SHIFT, TEMP1
  284. sub AORIG, TEMP1, AORIG
  285. #endif
  286. sll KK, ZBASE_SHIFT + 0, TEMP1
  287. add AORIG, TEMP1, AO
  288. add B, TEMP1, BO
  289. #endif
  290. LDF [AO + 0 * SIZE], a1
  291. LDF [AO + 1 * SIZE], a2
  292. LDF [AO + 2 * SIZE], a3
  293. LDF [AO + 3 * SIZE], a4
  294. LDF [BO + 0 * SIZE], b1
  295. LDF [BO + 1 * SIZE], b2
  296. LDF [BO + 2 * SIZE], b3
  297. FCLR (cc01)
  298. LDF [BO + 3 * SIZE], b4
  299. FCLR (cc02)
  300. LDF [BO + 4 * SIZE], b5
  301. FCLR (cc03)
  302. LDF [BO + 5 * SIZE], b6
  303. FCLR (cc04)
  304. LDF [BO + 6 * SIZE], b7
  305. FCLR (cc05)
  306. LDF [BO + 7 * SIZE], b8
  307. FCLR (cc06)
  308. prefetch [C1 + 2 * SIZE], 3
  309. FCLR (cc07)
  310. #if defined(LT) || defined(RN)
  311. sra KK, 2, L
  312. #else
  313. sub K, KK, L
  314. sra L, 2, L
  315. #endif
  316. cmp L, 0
  317. ble,pn %icc, .LL35
  318. FCLR (cc08)
  319. .align 4
  320. .LL33:
  321. FMADD1 (aa1, bb1, cc01, cc01)
  322. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  323. FMADD2 (aa2, bb1, cc02, cc02)
  324. LDF [BO + 8 * SIZE], b1
  325. FMADD3 (aa1, bb2, cc03, cc03)
  326. LDF [AO + 4 * SIZE], a1
  327. FMADD4 (aa2, bb2, cc04, cc04)
  328. LDF [AO + 5 * SIZE], a2
  329. FMADD1 (aa3, bb3, cc01, cc01)
  330. LDF [BO + 9 * SIZE], b2
  331. FMADD2 (aa4, bb3, cc02, cc02)
  332. LDF [BO + 10 * SIZE], b3
  333. FMADD3 (aa3, bb4, cc03, cc03)
  334. LDF [AO + 6 * SIZE], a3
  335. FMADD4 (aa4, bb4, cc04, cc04)
  336. LDF [AO + 7 * SIZE], a4
  337. FMADD1 (aa1, bb5, cc01, cc01)
  338. LDF [BO + 11 * SIZE], b4
  339. FMADD2 (aa2, bb5, cc02, cc02)
  340. LDF [BO + 12 * SIZE], b5
  341. FMADD3 (aa1, bb6, cc03, cc03)
  342. LDF [AO + 8 * SIZE], a1
  343. FMADD4 (aa2, bb6, cc04, cc04)
  344. LDF [AO + 9 * SIZE], a2
  345. FMADD1 (aa3, bb7, cc01, cc01)
  346. LDF [BO + 13 * SIZE], b6
  347. FMADD2 (aa4, bb7, cc02, cc02)
  348. LDF [BO + 14 * SIZE], b7
  349. FMADD3 (aa3, bb8, cc03, cc03)
  350. LDF [AO + 10 * SIZE], a3
  351. FMADD4 (aa4, bb8, cc04, cc04)
  352. LDF [AO + 11 * SIZE], a4
  353. add AO, 8 * SIZE, AO
  354. add L, -1, L
  355. add BO, 8 * SIZE, BO
  356. cmp L, 0
  357. bg,pt %icc, .LL33
  358. LDF [BO + 7 * SIZE], b8
  359. .align 4
  360. .LL35:
  361. #if defined(LT) || defined(RN)
  362. and KK, 3, L
  363. #else
  364. sub K, KK, L
  365. and L, 3, L
  366. #endif
  367. cmp L, 0
  368. ble,a,pn %icc, .LL38
  369. nop
  370. .align 4
  371. .LL37:
  372. FMADD1 (aa1, bb1, cc01, cc01)
  373. add L, -1, L
  374. FMADD2 (aa2, bb1, cc02, cc02)
  375. LDF [BO + 2 * SIZE], b1
  376. FMADD3 (aa1, bb2, cc03, cc03)
  377. LDF [AO + 2 * SIZE], a1
  378. FMADD4 (aa2, bb2, cc04, cc04)
  379. LDF [AO + 3 * SIZE], a2
  380. add AO, 2 * SIZE, AO
  381. cmp L, 0
  382. add BO, 2 * SIZE, BO
  383. bg,pt %icc, .LL37
  384. LDF [BO + 1 * SIZE], b2
  385. .align 4
  386. .LL38:
  387. FADD c01, c04, c01
  388. FADD c02, c03, c02
  389. #if defined(LN) || defined(RT)
  390. sub KK, 1, TEMP1
  391. sll TEMP1, ZBASE_SHIFT, TEMP1
  392. add AORIG, TEMP1, AO
  393. add B, TEMP1, BO
  394. #endif
  395. #if defined(LN) || defined(LT)
  396. LDF [BO + 0 * SIZE], a1
  397. LDF [BO + 1 * SIZE], a2
  398. #else
  399. LDF [AO + 0 * SIZE], a1
  400. LDF [AO + 1 * SIZE], a2
  401. #endif
  402. FSUB a1, c01, c01
  403. FSUB a2, c02, c02
  404. #if defined(LN) || defined(LT)
  405. LDF [AO + 0 * SIZE], a1
  406. LDF [AO + 1 * SIZE], a2
  407. #else
  408. LDF [BO + 0 * SIZE], a1
  409. LDF [BO + 1 * SIZE], a2
  410. #endif
  411. FMUL a1, c01, b1
  412. FMUL a2, c01, b2
  413. #ifndef CONJ
  414. FNMSUB (aa2, cc02, bb1, cc01)
  415. FMADD (aa1, cc02, bb2, cc02)
  416. #else
  417. FMADD (aa2, cc02, bb1, cc01)
  418. FMSUB (aa1, cc02, bb2, cc02)
  419. #endif
  420. #ifdef LN
  421. add C1, -2 * SIZE, C1
  422. #endif
  423. #if defined(LN) || defined(LT)
  424. STF c01, [BO + 0 * SIZE]
  425. STF c02, [BO + 1 * SIZE]
  426. #else
  427. STF c01, [AO + 0 * SIZE]
  428. STF c02, [AO + 1 * SIZE]
  429. #endif
  430. STF c01, [C1 + 0 * SIZE]
  431. STF c02, [C1 + 1 * SIZE]
  432. #ifndef LN
  433. add C1, 2 * SIZE, C1
  434. #endif
  435. #ifdef RT
  436. sll K, ZBASE_SHIFT, TEMP1
  437. add AORIG, TEMP1, AORIG
  438. #endif
  439. #if defined(LT) || defined(RN)
  440. sub K, KK, TEMP1
  441. sll TEMP1, ZBASE_SHIFT, TEMP1
  442. add AO, TEMP1, AO
  443. add BO, TEMP1, BO
  444. #endif
  445. #ifdef LT
  446. add KK, 1, KK
  447. #endif
  448. #ifdef LN
  449. sub KK, 1, KK
  450. #endif
  451. add I, -1, I
  452. cmp I, 0
  453. bg,pt %icc, .LL32
  454. nop
  455. #ifdef LN
  456. sll K, ZBASE_SHIFT, TEMP1
  457. add B, TEMP1, B
  458. #endif
  459. #if defined(LT) || defined(RN)
  460. mov BO, B
  461. #endif
  462. #ifdef RN
  463. add KK, 1, KK
  464. #endif
  465. #ifdef RT
  466. sub KK, 1, KK
  467. #endif
  468. .align 4
  469. .LL20:
  470. and N, 2, J
  471. cmp J, 0
  472. ble,pn %icc, .LL30
  473. nop
  474. #ifdef RT
  475. sll K, ZBASE_SHIFT + 1, TEMP1
  476. sub B, TEMP1, B
  477. #endif
  478. #ifndef RT
  479. mov C, C1
  480. add C, LDC, C2
  481. add C2, LDC, C
  482. #else
  483. sub C, LDC, C2
  484. sub C2, LDC, C1
  485. sub C2, LDC, C
  486. #endif
  487. #ifdef LN
  488. add M, OFFSET, KK
  489. #endif
  490. #ifdef LT
  491. mov OFFSET, KK
  492. #endif
  493. #if defined(LN) || defined(RT)
  494. mov A, AORIG
  495. #else
  496. mov A, AO
  497. #endif
  498. mov M, I
  499. .align 4
  500. .LL22:
  501. #if defined(LT) || defined(RN)
  502. mov B, BO
  503. #else
  504. #ifdef LN
  505. sll K, ZBASE_SHIFT, TEMP1
  506. sub AORIG, TEMP1, AORIG
  507. #endif
  508. sll KK, ZBASE_SHIFT + 0, TEMP1
  509. sll KK, ZBASE_SHIFT + 1, TEMP2
  510. add AORIG, TEMP1, AO
  511. add B, TEMP2, BO
  512. #endif
  513. LDF [AO + 0 * SIZE], a1
  514. LDF [AO + 1 * SIZE], a2
  515. LDF [BO + 0 * SIZE], b1
  516. LDF [BO + 1 * SIZE], b2
  517. LDF [BO + 2 * SIZE], b3
  518. LDF [BO + 3 * SIZE], b4
  519. LDF [BO + 4 * SIZE], b5
  520. FCLR (cc01)
  521. LDF [BO + 5 * SIZE], b6
  522. FCLR (cc02)
  523. LDF [BO + 6 * SIZE], b7
  524. FCLR (cc03)
  525. LDF [BO + 7 * SIZE], b8
  526. FCLR (cc04)
  527. LDF [BO + 8 * SIZE], b9
  528. FCLR (cc05)
  529. prefetch [C1 + 2 * SIZE], 3
  530. FCLR (cc06)
  531. prefetch [C2 + 2 * SIZE], 3
  532. FCLR (cc07)
  533. #if defined(LT) || defined(RN)
  534. sra KK, 2, L
  535. #else
  536. sub K, KK, L
  537. sra L, 2, L
  538. #endif
  539. cmp L, 0
  540. ble,pn %icc, .LL25
  541. FCLR (cc08)
  542. .align 4
  543. .LL23:
  544. FMADD1 (aa1, bb1, cc01, cc01)
  545. LDF [AO + 2 * SIZE], a3
  546. FMADD2 (aa2, bb1, cc02, cc02)
  547. LDF [AO + 3 * SIZE], a4
  548. FMADD3 (aa1, bb2, cc03, cc03)
  549. LDF [BO + 16 * SIZE], b1
  550. FMADD4 (aa2, bb2, cc04, cc04)
  551. LDF [BO + 9 * SIZE], b2
  552. FMADD1 (aa1, bb3, cc05, cc05)
  553. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  554. FMADD2 (aa2, bb3, cc06, cc06)
  555. add L, -1, L
  556. FMADD3 (aa1, bb4, cc07, cc07)
  557. LDF [BO + 10 * SIZE], b3
  558. FMADD4 (aa2, bb4, cc08, cc08)
  559. LDF [BO + 11 * SIZE], b4
  560. FMADD1 (aa3, bb5, cc01, cc01)
  561. LDF [AO + 4 * SIZE], a1
  562. FMADD2 (aa4, bb5, cc02, cc02)
  563. LDF [AO + 5 * SIZE], a2
  564. FMADD3 (aa3, bb6, cc03, cc03)
  565. LDF [BO + 12 * SIZE], b5
  566. FMADD4 (aa4, bb6, cc04, cc04)
  567. LDF [BO + 13 * SIZE], b6
  568. FMADD1 (aa3, bb7, cc05, cc05)
  569. cmp L, 0
  570. FMADD2 (aa4, bb7, cc06, cc06)
  571. add AO, 8 * SIZE, AO
  572. FMADD3 (aa3, bb8, cc07, cc07)
  573. LDF [BO + 14 * SIZE], b7
  574. FMADD4 (aa4, bb8, cc08, cc08)
  575. LDF [BO + 15 * SIZE], b8
  576. FMADD1 (aa1, bb9, cc01, cc01)
  577. LDF [AO - 2 * SIZE], a3
  578. FMADD2 (aa2, bb9, cc02, cc02)
  579. LDF [AO - 1 * SIZE], a4
  580. FMADD3 (aa1, bb2, cc03, cc03)
  581. LDF [BO + 24 * SIZE], b9
  582. FMADD4 (aa2, bb2, cc04, cc04)
  583. LDF [BO + 17 * SIZE], b2
  584. FMADD1 (aa1, bb3, cc05, cc05)
  585. add BO, 16 * SIZE, BO
  586. FMADD2 (aa2, bb3, cc06, cc06)
  587. nop
  588. FMADD3 (aa1, bb4, cc07, cc07)
  589. LDF [BO + 2 * SIZE], b3
  590. FMADD4 (aa2, bb4, cc08, cc08)
  591. LDF [BO + 3 * SIZE], b4
  592. FMADD1 (aa3, bb5, cc01, cc01)
  593. LDF [AO + 0 * SIZE], a1
  594. FMADD2 (aa4, bb5, cc02, cc02)
  595. LDF [AO + 1 * SIZE], a2
  596. FMADD3 (aa3, bb6, cc03, cc03)
  597. LDF [BO + 4 * SIZE], b5
  598. FMADD4 (aa4, bb6, cc04, cc04)
  599. LDF [BO + 5 * SIZE], b6
  600. FMADD1 (aa3, bb7, cc05, cc05)
  601. nop
  602. FMADD2 (aa4, bb7, cc06, cc06)
  603. LDF [BO + 6 * SIZE], b7
  604. FMADD3 (aa3, bb8, cc07, cc07)
  605. FMADD4 (aa4, bb8, cc08, cc08)
  606. bg,pt %icc, .LL23
  607. LDF [BO + 7 * SIZE], b8
  608. .align 4
  609. .LL25:
  610. #if defined(LT) || defined(RN)
  611. and KK, 3, L
  612. #else
  613. sub K, KK, L
  614. and L, 3, L
  615. #endif
  616. cmp L, 0
  617. ble,a,pn %icc, .LL28
  618. nop
  619. .align 4
  620. .LL27:
  621. FMADD1 (aa1, bb1, cc01, cc01)
  622. add L, -1, L
  623. FMADD2 (aa2, bb1, cc02, cc02)
  624. LDF [BO + 4 * SIZE], b1
  625. FMADD3 (aa1, bb2, cc03, cc03)
  626. add AO, 2 * SIZE, AO
  627. FMADD4 (aa2, bb2, cc04, cc04)
  628. LDF [BO + 5 * SIZE], b2
  629. FMADD1 (aa1, bb3, cc05, cc05)
  630. cmp L, 0
  631. FMADD2 (aa2, bb3, cc06, cc06)
  632. LDF [BO + 6 * SIZE], b3
  633. FMADD3 (aa1, bb4, cc07, cc07)
  634. LDF [AO + 0 * SIZE], a1
  635. FMADD4 (aa2, bb4, cc08, cc08)
  636. LDF [AO + 1 * SIZE], a2
  637. LDF [BO + 7 * SIZE], b4
  638. bg,pt %icc, .LL27
  639. add BO, 4 * SIZE, BO
  640. .align 4
  641. .LL28:
  642. FADD c01, c04, c01
  643. FADD c02, c03, c02
  644. FADD c05, c08, c05
  645. FADD c06, c07, c06
  646. #if defined(LN) || defined(RT)
  647. #ifdef LN
  648. sub KK, 1, TEMP1
  649. #else
  650. sub KK, 2, TEMP1
  651. #endif
  652. sll TEMP1, ZBASE_SHIFT + 0, TEMP2
  653. sll TEMP1, ZBASE_SHIFT + 1, TEMP1
  654. add AORIG, TEMP2, AO
  655. add B, TEMP1, BO
  656. #endif
  657. #if defined(LN) || defined(LT)
  658. LDF [BO + 0 * SIZE], a1
  659. LDF [BO + 1 * SIZE], a2
  660. LDF [BO + 2 * SIZE], a3
  661. LDF [BO + 3 * SIZE], a4
  662. #else
  663. LDF [AO + 0 * SIZE], a1
  664. LDF [AO + 1 * SIZE], a2
  665. LDF [AO + 2 * SIZE], a3
  666. LDF [AO + 3 * SIZE], a4
  667. #endif
  668. FSUB a1, c01, c01
  669. FSUB a2, c02, c02
  670. FSUB a3, c05, c05
  671. FSUB a4, c06, c06
  672. #if defined(LN) || defined(LT)
  673. LDF [AO + 0 * SIZE], a1
  674. LDF [AO + 1 * SIZE], a2
  675. FMUL a1, c01, b1
  676. FMUL a2, c01, b2
  677. FMUL a1, c05, b3
  678. FMUL a2, c05, b4
  679. #ifndef CONJ
  680. FNMSUB (aa2, cc02, bb1, cc01)
  681. FMADD (aa1, cc02, bb2, cc02)
  682. FNMSUB (aa2, cc06, bb3, cc05)
  683. FMADD (aa1, cc06, bb4, cc06)
  684. #else
  685. FMADD (aa2, cc02, bb1, cc01)
  686. FMSUB (aa1, cc02, bb2, cc02)
  687. FMADD (aa2, cc06, bb3, cc05)
  688. FMSUB (aa1, cc06, bb4, cc06)
  689. #endif
  690. #endif
  691. #ifdef RN
  692. LDF [BO + 0 * SIZE], b1
  693. LDF [BO + 1 * SIZE], b2
  694. LDF [BO + 2 * SIZE], b3
  695. LDF [BO + 3 * SIZE], b4
  696. FMUL b1, c01, a1
  697. FMUL b2, c01, a2
  698. #ifndef CONJ
  699. FNMSUB (bb2, cc02, aa1, cc01)
  700. FMADD (bb1, cc02, aa2, cc02)
  701. #else
  702. FMADD (bb2, cc02, aa1, cc01)
  703. FMSUB (bb1, cc02, aa2, cc02)
  704. #endif
  705. FNMSUB (bb3, cc01, cc05, cc05)
  706. FNMSUB (bb3, cc02, cc06, cc06)
  707. #ifndef CONJ
  708. FMADD (bb4, cc02, cc05, cc05)
  709. FNMSUB (bb4, cc01, cc06, cc06)
  710. #else
  711. FNMSUB (bb4, cc02, cc05, cc05)
  712. FMADD (bb4, cc01, cc06, cc06)
  713. #endif
  714. LDF [BO + 6 * SIZE], b1
  715. LDF [BO + 7 * SIZE], b2
  716. FMUL b1, c05, a1
  717. FMUL b2, c05, a2
  718. #ifndef CONJ
  719. FNMSUB (bb2, cc06, aa1, cc05)
  720. FMADD (bb1, cc06, aa2, cc06)
  721. #else
  722. FMADD (bb2, cc06, aa1, cc05)
  723. FMSUB (bb1, cc06, aa2, cc06)
  724. #endif
  725. #endif
  726. #ifdef RT
  727. LDF [BO + 6 * SIZE], b1
  728. LDF [BO + 7 * SIZE], b2
  729. LDF [BO + 4 * SIZE], b3
  730. LDF [BO + 5 * SIZE], b4
  731. FMUL b1, c05, a1
  732. FMUL b2, c05, a2
  733. #ifndef CONJ
  734. FNMSUB (bb2, cc06, aa1, cc05)
  735. FMADD (bb1, cc06, aa2, cc06)
  736. #else
  737. FMADD (bb2, cc06, aa1, cc05)
  738. FMSUB (bb1, cc06, aa2, cc06)
  739. #endif
  740. FNMSUB (bb3, cc05, cc01, cc01)
  741. FNMSUB (bb3, cc06, cc02, cc02)
  742. #ifndef CONJ
  743. FMADD (bb4, cc06, cc01, cc01)
  744. FNMSUB (bb4, cc05, cc02, cc02)
  745. #else
  746. FNMSUB (bb4, cc06, cc01, cc01)
  747. FMADD (bb4, cc05, cc02, cc02)
  748. #endif
  749. LDF [BO + 0 * SIZE], b1
  750. LDF [BO + 1 * SIZE], b2
  751. FMUL b1, c01, a1
  752. FMUL b2, c01, a2
  753. #ifndef CONJ
  754. FNMSUB (bb2, cc02, aa1, cc01)
  755. FMADD (bb1, cc02, aa2, cc02)
  756. #else
  757. FMADD (bb2, cc02, aa1, cc01)
  758. FMSUB (bb1, cc02, aa2, cc02)
  759. #endif
  760. #endif
  761. #ifdef LN
  762. add C1, -2 * SIZE, C1
  763. add C2, -2 * SIZE, C2
  764. #endif
  765. #if defined(LN) || defined(LT)
  766. STF c01, [BO + 0 * SIZE]
  767. STF c02, [BO + 1 * SIZE]
  768. STF c05, [BO + 2 * SIZE]
  769. STF c06, [BO + 3 * SIZE]
  770. #else
  771. STF c01, [AO + 0 * SIZE]
  772. STF c02, [AO + 1 * SIZE]
  773. STF c05, [AO + 2 * SIZE]
  774. STF c06, [AO + 3 * SIZE]
  775. #endif
  776. STF c01, [C1 + 0 * SIZE]
  777. STF c02, [C1 + 1 * SIZE]
  778. STF c05, [C2 + 0 * SIZE]
  779. STF c06, [C2 + 1 * SIZE]
  780. #ifndef LN
  781. add C1, 2 * SIZE, C1
  782. add C2, 2 * SIZE, C2
  783. #endif
  784. #ifdef RT
  785. sll K, ZBASE_SHIFT, TEMP1
  786. add AORIG, TEMP1, AORIG
  787. #endif
  788. #if defined(LT) || defined(RN)
  789. sub K, KK, TEMP1
  790. sll TEMP1, ZBASE_SHIFT + 0, TEMP2
  791. sll TEMP1, ZBASE_SHIFT + 1, TEMP1
  792. add AO, TEMP2, AO
  793. add BO, TEMP1, BO
  794. #endif
  795. #ifdef LT
  796. add KK, 1, KK
  797. #endif
  798. #ifdef LN
  799. sub KK, 1, KK
  800. #endif
  801. add I, -1, I
  802. cmp I, 0
  803. bg,pt %icc, .LL22
  804. nop
  805. #ifdef LN
  806. sll K, ZBASE_SHIFT + 1, TEMP1
  807. add B, TEMP1, B
  808. #endif
  809. #if defined(LT) || defined(RN)
  810. mov BO, B
  811. #endif
  812. #ifdef RN
  813. add KK, 2, KK
  814. #endif
  815. #ifdef RT
  816. sub KK, 2, KK
  817. #endif
  818. .align 4
  819. .LL30:
  820. sra N, 2, J
  821. cmp J, 0
  822. ble,pn %icc, .LL999
  823. nop
  824. .align 4
  825. .LL11:
  826. #ifdef RT
  827. sll K, ZBASE_SHIFT + 2, TEMP1
  828. sub B, TEMP1, B
  829. #endif
  830. #ifndef RT
  831. mov C, C1
  832. add C, LDC, C2
  833. add C2, LDC, C3
  834. add C3, LDC, C4
  835. add C4, LDC, C
  836. #else
  837. sub C, LDC, C4
  838. sub C4, LDC, C3
  839. sub C3, LDC, C2
  840. sub C2, LDC, C1
  841. sub C2, LDC, C
  842. #endif
  843. #ifdef LN
  844. add M, OFFSET, KK
  845. #endif
  846. #ifdef LT
  847. mov OFFSET, KK
  848. #endif
  849. #if defined(LN) || defined(RT)
  850. mov A, AORIG
  851. #else
  852. mov A, AO
  853. #endif
  854. mov M, I
  855. .align 4
  856. .LL12:
  857. #if defined(LT) || defined(RN)
  858. mov B, BO
  859. #else
  860. #ifdef LN
  861. sll K, ZBASE_SHIFT, TEMP1
  862. sub AORIG, TEMP1, AORIG
  863. #endif
  864. sll KK, ZBASE_SHIFT + 0, TEMP1
  865. sll KK, ZBASE_SHIFT + 2, TEMP2
  866. add AORIG, TEMP1, AO
  867. add B, TEMP2, BO
  868. #endif
  869. LDF [AO + 0 * SIZE], a1
  870. FCLR (cc01)
  871. LDF [AO + 1 * SIZE], a2
  872. FCLR (cc05)
  873. LDF [AO + 8 * SIZE], a5
  874. FCLR (cc09)
  875. LDF [BO + 0 * SIZE], b1
  876. FCLR (cc13)
  877. LDF [BO + 1 * SIZE], b2
  878. FCLR (cc02)
  879. LDF [BO + 2 * SIZE], b3
  880. FCLR (cc06)
  881. LDF [BO + 3 * SIZE], b4
  882. FCLR (cc10)
  883. LDF [BO + 4 * SIZE], b5
  884. FCLR (cc14)
  885. LDF [BO + 5 * SIZE], b6
  886. FCLR (cc03)
  887. LDF [BO + 6 * SIZE], b7
  888. FCLR (cc07)
  889. LDF [BO + 7 * SIZE], b8
  890. FCLR (cc11)
  891. LDF [BO + 8 * SIZE], b9
  892. FCLR (cc15)
  893. prefetch [C1 + 1 * SIZE], 3
  894. FCLR (cc04)
  895. prefetch [C2 + 2 * SIZE], 3
  896. FCLR (cc08)
  897. prefetch [C3 + 1 * SIZE], 3
  898. FCLR (cc12)
  899. prefetch [C4 + 2 * SIZE], 3
  900. FCLR (cc16)
  901. #if defined(LT) || defined(RN)
  902. sra KK, 3, L
  903. #else
  904. sub K, KK, L
  905. sra L, 3, L
  906. #endif
  907. cmp L, 0
  908. ble,pn %icc, .LL15
  909. nop
  910. .align 4
  /* .LL13: main accumulation loop, software-pipelined.
     Each group of sixteen FMADD1..FMADD4 invocations uses one pair of A
     values (aa1/aa2, aa3/aa4 or aa5/aa2) and eight B values (bb1 or bb9,
     then bb2..bb8) and accumulates into cc01..cc16; the loads feeding
     later groups are interleaved between the arithmetic.
     Eight groups form one unit: the unit decrements L once and advances
     AO by 16 * SIZE and BO by 64 * SIZE.  The unit is written out twice
     per trip, with an early exit to .LL15 between the two copies.
     FMADD1..FMADD4 are macros defined outside this chunk; presumably
     they are the sign variants needed for a complex multiply-accumulate,
     and aa/bb/cc presumably name the same registers as a/b/c -- confirm
     against the definitions at the top of the file. */
  911. .LL13:
  912. FMADD1 (aa1, bb1, cc01, cc01)
  913. FMADD2 (aa2, bb1, cc02, cc02)
  914. FMADD3 (aa1, bb2, cc03, cc03)
  915. FMADD4 (aa2, bb2, cc04, cc04)
  916. FMADD1 (aa1, bb3, cc05, cc05)
  917. LDF [BO + 16 * SIZE], b1
  918. FMADD2 (aa2, bb3, cc06, cc06)
  919. LDF [BO + 9 * SIZE], b2
  920. FMADD3 (aa1, bb4, cc07, cc07)
  921. LDF [BO + 10 * SIZE], b3
  922. FMADD4 (aa2, bb4, cc08, cc08)
  923. LDF [BO + 11 * SIZE], b4
  924. FMADD1 (aa1, bb5, cc09, cc09)
  925. LDF [AO + 2 * SIZE], a3
  926. FMADD2 (aa2, bb5, cc10, cc10)
  927. LDF [AO + 3 * SIZE], a4
  928. FMADD3 (aa1, bb6, cc11, cc11)
  929. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  930. FMADD4 (aa2, bb6, cc12, cc12)
  931. nop
  932. FMADD1 (aa1, bb7, cc13, cc13)
  933. LDF [BO + 12 * SIZE], b5
  934. FMADD2 (aa2, bb7, cc14, cc14)
  935. LDF [BO + 13 * SIZE], b6
  936. FMADD3 (aa1, bb8, cc15, cc15)
  937. LDF [BO + 14 * SIZE], b7
  938. FMADD4 (aa2, bb8, cc16, cc16)
  939. LDF [BO + 15 * SIZE], b8
  940. FMADD1 (aa3, bb9, cc01, cc01)
  941. FMADD2 (aa4, bb9, cc02, cc02)
  942. FMADD3 (aa3, bb2, cc03, cc03)
  943. FMADD4 (aa4, bb2, cc04, cc04)
  944. FMADD1 (aa3, bb3, cc05, cc05)
  945. LDF [BO + 24 * SIZE], b9
  946. FMADD2 (aa4, bb3, cc06, cc06)
  947. LDF [BO + 17 * SIZE], b2
  948. FMADD3 (aa3, bb4, cc07, cc07)
  949. LDF [BO + 18 * SIZE], b3
  950. FMADD4 (aa4, bb4, cc08, cc08)
  951. LDF [BO + 19 * SIZE], b4
  952. FMADD1 (aa3, bb5, cc09, cc09)
  953. LDF [AO + 4 * SIZE], a1
  954. FMADD2 (aa4, bb5, cc10, cc10)
  955. LDF [AO + 5 * SIZE], a2
  956. FMADD3 (aa3, bb6, cc11, cc11)
  /* L -= 1: accounts for the first copy of the eight-group unit */
  957. add L, -1, L
  958. FMADD4 (aa4, bb6, cc12, cc12)
  959. nop
  960. FMADD1 (aa3, bb7, cc13, cc13)
  961. LDF [BO + 20 * SIZE], b5
  962. FMADD2 (aa4, bb7, cc14, cc14)
  963. LDF [BO + 21 * SIZE], b6
  964. FMADD3 (aa3, bb8, cc15, cc15)
  965. LDF [BO + 22 * SIZE], b7
  966. FMADD4 (aa4, bb8, cc16, cc16)
  967. LDF [BO + 23 * SIZE], b8
  968. FMADD1 (aa1, bb1, cc01, cc01)
  969. FMADD2 (aa2, bb1, cc02, cc02)
  970. FMADD3 (aa1, bb2, cc03, cc03)
  971. FMADD4 (aa2, bb2, cc04, cc04)
  972. FMADD1 (aa1, bb3, cc05, cc05)
  973. LDF [BO + 32 * SIZE], b1
  974. FMADD2 (aa2, bb3, cc06, cc06)
  975. LDF [BO + 25 * SIZE], b2
  976. FMADD3 (aa1, bb4, cc07, cc07)
  977. LDF [BO + 26 * SIZE], b3
  978. FMADD4 (aa2, bb4, cc08, cc08)
  979. LDF [BO + 27 * SIZE], b4
  980. FMADD1 (aa1, bb5, cc09, cc09)
  981. LDF [AO + 6 * SIZE], a3
  982. FMADD2 (aa2, bb5, cc10, cc10)
  983. LDF [AO + 7 * SIZE], a4
  984. FMADD3 (aa1, bb6, cc11, cc11)
  985. nop
  986. FMADD4 (aa2, bb6, cc12, cc12)
  987. nop
  988. FMADD1 (aa1, bb7, cc13, cc13)
  989. LDF [BO + 28 * SIZE], b5
  990. FMADD2 (aa2, bb7, cc14, cc14)
  991. LDF [BO + 29 * SIZE], b6
  992. FMADD3 (aa1, bb8, cc15, cc15)
  993. LDF [BO + 30 * SIZE], b7
  994. FMADD4 (aa2, bb8, cc16, cc16)
  995. LDF [BO + 31 * SIZE], b8
  996. FMADD1 (aa3, bb9, cc01, cc01)
  997. FMADD2 (aa4, bb9, cc02, cc02)
  998. FMADD3 (aa3, bb2, cc03, cc03)
  999. FMADD4 (aa4, bb2, cc04, cc04)
  1000. FMADD1 (aa3, bb3, cc05, cc05)
  1001. LDF [BO + 40 * SIZE], b9
  1002. FMADD2 (aa4, bb3, cc06, cc06)
  1003. LDF [BO + 33 * SIZE], b2
  1004. FMADD3 (aa3, bb4, cc07, cc07)
  1005. LDF [BO + 34 * SIZE], b3
  1006. FMADD4 (aa4, bb4, cc08, cc08)
  1007. LDF [BO + 35 * SIZE], b4
  1008. FMADD1 (aa3, bb5, cc09, cc09)
  /* a1 is loaded a full unit ahead (AO + 16 * SIZE, before AO advances);
     the next groups use a5, which was loaded earlier, in its place */
  1009. LDF [AO + 16 * SIZE], a1 /****/
  1010. FMADD2 (aa4, bb5, cc10, cc10)
  1011. LDF [AO + 9 * SIZE], a2
  1012. FMADD3 (aa3, bb6, cc11, cc11)
  1013. nop
  1014. FMADD4 (aa4, bb6, cc12, cc12)
  1015. nop
  1016. FMADD1 (aa3, bb7, cc13, cc13)
  1017. LDF [BO + 36 * SIZE], b5
  1018. FMADD2 (aa4, bb7, cc14, cc14)
  1019. LDF [BO + 37 * SIZE], b6
  1020. FMADD3 (aa3, bb8, cc15, cc15)
  1021. LDF [BO + 38 * SIZE], b7
  1022. FMADD4 (aa4, bb8, cc16, cc16)
  1023. LDF [BO + 39 * SIZE], b8
  1024. FMADD1 (aa5, bb1, cc01, cc01)
  1025. FMADD2 (aa2, bb1, cc02, cc02)
  1026. FMADD3 (aa5, bb2, cc03, cc03)
  1027. FMADD4 (aa2, bb2, cc04, cc04)
  1028. FMADD1 (aa5, bb3, cc05, cc05)
  1029. LDF [BO + 48 * SIZE], b1
  1030. FMADD2 (aa2, bb3, cc06, cc06)
  1031. LDF [BO + 41 * SIZE], b2
  1032. FMADD3 (aa5, bb4, cc07, cc07)
  1033. LDF [BO + 42 * SIZE], b3
  1034. FMADD4 (aa2, bb4, cc08, cc08)
  1035. LDF [BO + 43 * SIZE], b4
  1036. FMADD1 (aa5, bb5, cc09, cc09)
  1037. LDF [AO + 10 * SIZE], a3
  1038. FMADD2 (aa2, bb5, cc10, cc10)
  1039. LDF [AO + 11 * SIZE], a4
  1040. FMADD3 (aa5, bb6, cc11, cc11)
  1041. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  1042. FMADD4 (aa2, bb6, cc12, cc12)
  1043. nop
  1044. FMADD1 (aa5, bb7, cc13, cc13)
  1045. LDF [BO + 44 * SIZE], b5
  1046. FMADD2 (aa2, bb7, cc14, cc14)
  1047. LDF [BO + 45 * SIZE], b6
  1048. FMADD3 (aa5, bb8, cc15, cc15)
  1049. LDF [BO + 46 * SIZE], b7
  1050. FMADD4 (aa2, bb8, cc16, cc16)
  1051. LDF [BO + 47 * SIZE], b8
  1052. FMADD1 (aa3, bb9, cc01, cc01)
  1053. FMADD2 (aa4, bb9, cc02, cc02)
  1054. FMADD3 (aa3, bb2, cc03, cc03)
  1055. FMADD4 (aa4, bb2, cc04, cc04)
  1056. FMADD1 (aa3, bb3, cc05, cc05)
  1057. LDF [BO + 56 * SIZE], b9
  1058. FMADD2 (aa4, bb3, cc06, cc06)
  1059. LDF [BO + 49 * SIZE], b2
  1060. FMADD3 (aa3, bb4, cc07, cc07)
  1061. LDF [BO + 50 * SIZE], b3
  1062. FMADD4 (aa4, bb4, cc08, cc08)
  1063. LDF [BO + 51 * SIZE], b4
  1064. FMADD1 (aa3, bb5, cc09, cc09)
  1065. LDF [AO + 12 * SIZE], a5
  1066. FMADD2 (aa4, bb5, cc10, cc10)
  1067. LDF [AO + 13 * SIZE], a2
  1068. FMADD3 (aa3, bb6, cc11, cc11)
  /* Set %icc for the mid-loop exit branch further down; none of the
     intervening integer instructions (plain add) modify the condition
     codes */
  1069. cmp L, 0
  1070. FMADD4 (aa4, bb6, cc12, cc12)
  1071. nop
  1072. FMADD1 (aa3, bb7, cc13, cc13)
  1073. LDF [BO + 52 * SIZE], b5
  1074. FMADD2 (aa4, bb7, cc14, cc14)
  1075. LDF [BO + 53 * SIZE], b6
  1076. FMADD3 (aa3, bb8, cc15, cc15)
  1077. LDF [BO + 54 * SIZE], b7
  1078. FMADD4 (aa4, bb8, cc16, cc16)
  1079. LDF [BO + 55 * SIZE], b8
  1080. FMADD1 (aa5, bb1, cc01, cc01)
  1081. FMADD2 (aa2, bb1, cc02, cc02)
  1082. FMADD3 (aa5, bb2, cc03, cc03)
  1083. FMADD4 (aa2, bb2, cc04, cc04)
  1084. FMADD1 (aa5, bb3, cc05, cc05)
  1085. LDF [BO + 64 * SIZE], b1
  1086. FMADD2 (aa2, bb3, cc06, cc06)
  1087. LDF [BO + 57 * SIZE], b2
  1088. FMADD3 (aa5, bb4, cc07, cc07)
  1089. LDF [BO + 58 * SIZE], b3
  1090. FMADD4 (aa2, bb4, cc08, cc08)
  1091. LDF [BO + 59 * SIZE], b4
  1092. FMADD1 (aa5, bb5, cc09, cc09)
  1093. LDF [AO + 14 * SIZE], a3
  1094. FMADD2 (aa2, bb5, cc10, cc10)
  1095. LDF [AO + 15 * SIZE], a4
  1096. FMADD3 (aa5, bb6, cc11, cc11)
  /* Advance both streams by one unit; loads after this point use offsets
     relative to the new BO/AO (hence the negative BO offsets below) */
  1097. add BO, 64 * SIZE, BO
  1098. FMADD4 (aa2, bb6, cc12, cc12)
  1099. add AO, 16 * SIZE, AO
  1100. FMADD1 (aa5, bb7, cc13, cc13)
  1101. LDF [BO - 4 * SIZE], b5
  1102. FMADD2 (aa2, bb7, cc14, cc14)
  1103. LDF [BO - 3 * SIZE], b6
  1104. FMADD3 (aa5, bb8, cc15, cc15)
  1105. LDF [BO - 2 * SIZE], b7
  1106. FMADD4 (aa2, bb8, cc16, cc16)
  1107. LDF [BO - 1 * SIZE], b8
  1108. FMADD1 (aa3, bb9, cc01, cc01)
  1109. FMADD2 (aa4, bb9, cc02, cc02)
  1110. FMADD3 (aa3, bb2, cc03, cc03)
  1111. FMADD4 (aa4, bb2, cc04, cc04)
  1112. FMADD1 (aa3, bb3, cc05, cc05)
  1113. LDF [BO + 8 * SIZE], b9
  1114. FMADD2 (aa4, bb3, cc06, cc06)
  1115. LDF [BO + 1 * SIZE], b2
  1116. FMADD3 (aa3, bb4, cc07, cc07)
  1117. LDF [BO + 2 * SIZE], b3
  1118. FMADD4 (aa4, bb4, cc08, cc08)
  1119. LDF [BO + 3 * SIZE], b4
  1120. FMADD1 (aa3, bb5, cc09, cc09)
  1121. LDF [AO + 8 * SIZE], a5 /****/
  1122. FMADD2 (aa4, bb5, cc10, cc10)
  1123. LDF [AO + 1 * SIZE], a2
  1124. FMADD3 (aa3, bb6, cc11, cc11)
  1125. FMADD4 (aa4, bb6, cc12, cc12)
  1126. FMADD1 (aa3, bb7, cc13, cc13)
  1127. LDF [BO + 4 * SIZE], b5
  1128. FMADD2 (aa4, bb7, cc14, cc14)
  1129. LDF [BO + 5 * SIZE], b6
  1130. FMADD3 (aa3, bb8, cc15, cc15)
  1131. LDF [BO + 6 * SIZE], b7
  1132. FMADD4 (aa4, bb8, cc16, cc16)
  /* Mid-loop exit: leave for .LL15 when L <= 0 after the first copy.
     The b8 load sits in the delay slot and executes either way. */
  1133. ble,pn %icc, .LL15
  1134. LDF [BO + 7 * SIZE], b8
  /* Second copy of the eight-group unit (same schedule as above) */
  1135. FMADD1 (aa1, bb1, cc01, cc01)
  1136. FMADD2 (aa2, bb1, cc02, cc02)
  1137. FMADD3 (aa1, bb2, cc03, cc03)
  1138. FMADD4 (aa2, bb2, cc04, cc04)
  1139. FMADD1 (aa1, bb3, cc05, cc05)
  1140. LDF [BO + 16 * SIZE], b1
  1141. FMADD2 (aa2, bb3, cc06, cc06)
  1142. LDF [BO + 9 * SIZE], b2
  1143. FMADD3 (aa1, bb4, cc07, cc07)
  1144. LDF [BO + 10 * SIZE], b3
  1145. FMADD4 (aa2, bb4, cc08, cc08)
  1146. LDF [BO + 11 * SIZE], b4
  1147. FMADD1 (aa1, bb5, cc09, cc09)
  1148. LDF [AO + 2 * SIZE], a3
  1149. FMADD2 (aa2, bb5, cc10, cc10)
  1150. LDF [AO + 3 * SIZE], a4
  1151. FMADD3 (aa1, bb6, cc11, cc11)
  1152. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1153. FMADD4 (aa2, bb6, cc12, cc12)
  1154. nop
  1155. FMADD1 (aa1, bb7, cc13, cc13)
  1156. LDF [BO + 12 * SIZE], b5
  1157. FMADD2 (aa2, bb7, cc14, cc14)
  1158. LDF [BO + 13 * SIZE], b6
  1159. FMADD3 (aa1, bb8, cc15, cc15)
  1160. LDF [BO + 14 * SIZE], b7
  1161. FMADD4 (aa2, bb8, cc16, cc16)
  1162. LDF [BO + 15 * SIZE], b8
  1163. FMADD1 (aa3, bb9, cc01, cc01)
  1164. FMADD2 (aa4, bb9, cc02, cc02)
  1165. FMADD3 (aa3, bb2, cc03, cc03)
  1166. FMADD4 (aa4, bb2, cc04, cc04)
  1167. FMADD1 (aa3, bb3, cc05, cc05)
  1168. LDF [BO + 24 * SIZE], b9
  1169. FMADD2 (aa4, bb3, cc06, cc06)
  1170. LDF [BO + 17 * SIZE], b2
  1171. FMADD3 (aa3, bb4, cc07, cc07)
  1172. LDF [BO + 18 * SIZE], b3
  1173. FMADD4 (aa4, bb4, cc08, cc08)
  1174. LDF [BO + 19 * SIZE], b4
  1175. FMADD1 (aa3, bb5, cc09, cc09)
  1176. LDF [AO + 4 * SIZE], a1
  1177. FMADD2 (aa4, bb5, cc10, cc10)
  1178. LDF [AO + 5 * SIZE], a2
  1179. FMADD3 (aa3, bb6, cc11, cc11)
  /* L -= 1: accounts for the second copy of the eight-group unit */
  1180. add L, -1, L
  1181. FMADD4 (aa4, bb6, cc12, cc12)
  1182. nop
  1183. FMADD1 (aa3, bb7, cc13, cc13)
  1184. LDF [BO + 20 * SIZE], b5
  1185. FMADD2 (aa4, bb7, cc14, cc14)
  1186. LDF [BO + 21 * SIZE], b6
  1187. FMADD3 (aa3, bb8, cc15, cc15)
  1188. LDF [BO + 22 * SIZE], b7
  1189. FMADD4 (aa4, bb8, cc16, cc16)
  1190. LDF [BO + 23 * SIZE], b8
  1191. FMADD1 (aa1, bb1, cc01, cc01)
  1192. FMADD2 (aa2, bb1, cc02, cc02)
  1193. FMADD3 (aa1, bb2, cc03, cc03)
  1194. FMADD4 (aa2, bb2, cc04, cc04)
  1195. FMADD1 (aa1, bb3, cc05, cc05)
  1196. LDF [BO + 32 * SIZE], b1
  1197. FMADD2 (aa2, bb3, cc06, cc06)
  1198. LDF [BO + 25 * SIZE], b2
  1199. FMADD3 (aa1, bb4, cc07, cc07)
  1200. LDF [BO + 26 * SIZE], b3
  1201. FMADD4 (aa2, bb4, cc08, cc08)
  1202. LDF [BO + 27 * SIZE], b4
  1203. FMADD1 (aa1, bb5, cc09, cc09)
  1204. LDF [AO + 6 * SIZE], a3
  1205. FMADD2 (aa2, bb5, cc10, cc10)
  1206. LDF [AO + 7 * SIZE], a4
  1207. FMADD3 (aa1, bb6, cc11, cc11)
  1208. nop
  1209. FMADD4 (aa2, bb6, cc12, cc12)
  1210. nop
  1211. FMADD1 (aa1, bb7, cc13, cc13)
  1212. LDF [BO + 28 * SIZE], b5
  1213. FMADD2 (aa2, bb7, cc14, cc14)
  1214. LDF [BO + 29 * SIZE], b6
  1215. FMADD3 (aa1, bb8, cc15, cc15)
  1216. LDF [BO + 30 * SIZE], b7
  1217. FMADD4 (aa2, bb8, cc16, cc16)
  1218. LDF [BO + 31 * SIZE], b8
  1219. FMADD1 (aa3, bb9, cc01, cc01)
  1220. FMADD2 (aa4, bb9, cc02, cc02)
  1221. FMADD3 (aa3, bb2, cc03, cc03)
  1222. FMADD4 (aa4, bb2, cc04, cc04)
  1223. FMADD1 (aa3, bb3, cc05, cc05)
  1224. LDF [BO + 40 * SIZE], b9
  1225. FMADD2 (aa4, bb3, cc06, cc06)
  1226. LDF [BO + 33 * SIZE], b2
  1227. FMADD3 (aa3, bb4, cc07, cc07)
  1228. LDF [BO + 34 * SIZE], b3
  1229. FMADD4 (aa4, bb4, cc08, cc08)
  1230. LDF [BO + 35 * SIZE], b4
  1231. FMADD1 (aa3, bb5, cc09, cc09)
  1232. LDF [AO + 16 * SIZE], a1 /****/
  1233. FMADD2 (aa4, bb5, cc10, cc10)
  1234. LDF [AO + 9 * SIZE], a2
  1235. FMADD3 (aa3, bb6, cc11, cc11)
  1236. nop
  1237. FMADD4 (aa4, bb6, cc12, cc12)
  1238. nop
  1239. FMADD1 (aa3, bb7, cc13, cc13)
  1240. LDF [BO + 36 * SIZE], b5
  1241. FMADD2 (aa4, bb7, cc14, cc14)
  1242. LDF [BO + 37 * SIZE], b6
  1243. FMADD3 (aa3, bb8, cc15, cc15)
  1244. LDF [BO + 38 * SIZE], b7
  1245. FMADD4 (aa4, bb8, cc16, cc16)
  1246. LDF [BO + 39 * SIZE], b8
  1247. FMADD1 (aa5, bb1, cc01, cc01)
  1248. FMADD2 (aa2, bb1, cc02, cc02)
  1249. FMADD3 (aa5, bb2, cc03, cc03)
  1250. FMADD4 (aa2, bb2, cc04, cc04)
  1251. FMADD1 (aa5, bb3, cc05, cc05)
  1252. LDF [BO + 48 * SIZE], b1
  1253. FMADD2 (aa2, bb3, cc06, cc06)
  1254. LDF [BO + 41 * SIZE], b2
  1255. FMADD3 (aa5, bb4, cc07, cc07)
  1256. LDF [BO + 42 * SIZE], b3
  1257. FMADD4 (aa2, bb4, cc08, cc08)
  1258. LDF [BO + 43 * SIZE], b4
  1259. FMADD1 (aa5, bb5, cc09, cc09)
  1260. LDF [AO + 10 * SIZE], a3
  1261. FMADD2 (aa2, bb5, cc10, cc10)
  1262. LDF [AO + 11 * SIZE], a4
  1263. FMADD3 (aa5, bb6, cc11, cc11)
  1264. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  1265. FMADD4 (aa2, bb6, cc12, cc12)
  1266. nop
  1267. FMADD1 (aa5, bb7, cc13, cc13)
  1268. LDF [BO + 44 * SIZE], b5
  1269. FMADD2 (aa2, bb7, cc14, cc14)
  1270. LDF [BO + 45 * SIZE], b6
  1271. FMADD3 (aa5, bb8, cc15, cc15)
  1272. LDF [BO + 46 * SIZE], b7
  1273. FMADD4 (aa2, bb8, cc16, cc16)
  1274. LDF [BO + 47 * SIZE], b8
  1275. FMADD1 (aa3, bb9, cc01, cc01)
  1276. FMADD2 (aa4, bb9, cc02, cc02)
  1277. FMADD3 (aa3, bb2, cc03, cc03)
  1278. FMADD4 (aa4, bb2, cc04, cc04)
  1279. FMADD1 (aa3, bb3, cc05, cc05)
  1280. LDF [BO + 56 * SIZE], b9
  1281. FMADD2 (aa4, bb3, cc06, cc06)
  1282. LDF [BO + 49 * SIZE], b2
  1283. FMADD3 (aa3, bb4, cc07, cc07)
  1284. LDF [BO + 50 * SIZE], b3
  1285. FMADD4 (aa4, bb4, cc08, cc08)
  1286. LDF [BO + 51 * SIZE], b4
  1287. FMADD1 (aa3, bb5, cc09, cc09)
  1288. LDF [AO + 12 * SIZE], a5
  1289. FMADD2 (aa4, bb5, cc10, cc10)
  1290. LDF [AO + 13 * SIZE], a2
  1291. FMADD3 (aa3, bb6, cc11, cc11)
  /* Set %icc for the loop-back branch at the bottom of the loop */
  1292. cmp L, 0
  1293. FMADD4 (aa4, bb6, cc12, cc12)
  1294. nop
  1295. FMADD1 (aa3, bb7, cc13, cc13)
  1296. LDF [BO + 52 * SIZE], b5
  1297. FMADD2 (aa4, bb7, cc14, cc14)
  1298. LDF [BO + 53 * SIZE], b6
  1299. FMADD3 (aa3, bb8, cc15, cc15)
  1300. LDF [BO + 54 * SIZE], b7
  1301. FMADD4 (aa4, bb8, cc16, cc16)
  1302. LDF [BO + 55 * SIZE], b8
  1303. FMADD1 (aa5, bb1, cc01, cc01)
  1304. FMADD2 (aa2, bb1, cc02, cc02)
  1305. FMADD3 (aa5, bb2, cc03, cc03)
  1306. FMADD4 (aa2, bb2, cc04, cc04)
  1307. FMADD1 (aa5, bb3, cc05, cc05)
  1308. LDF [BO + 64 * SIZE], b1
  1309. FMADD2 (aa2, bb3, cc06, cc06)
  1310. LDF [BO + 57 * SIZE], b2
  1311. FMADD3 (aa5, bb4, cc07, cc07)
  1312. LDF [BO + 58 * SIZE], b3
  1313. FMADD4 (aa2, bb4, cc08, cc08)
  1314. LDF [BO + 59 * SIZE], b4
  1315. FMADD1 (aa5, bb5, cc09, cc09)
  1316. LDF [AO + 14 * SIZE], a3
  1317. FMADD2 (aa2, bb5, cc10, cc10)
  1318. LDF [AO + 15 * SIZE], a4
  1319. FMADD3 (aa5, bb6, cc11, cc11)
  /* Advance both streams by one unit, as in the first copy */
  1320. add BO, 64 * SIZE, BO
  1321. FMADD4 (aa2, bb6, cc12, cc12)
  1322. add AO, 16 * SIZE, AO
  1323. FMADD1 (aa5, bb7, cc13, cc13)
  1324. LDF [BO - 4 * SIZE], b5
  1325. FMADD2 (aa2, bb7, cc14, cc14)
  1326. LDF [BO - 3 * SIZE], b6
  1327. FMADD3 (aa5, bb8, cc15, cc15)
  1328. LDF [BO - 2 * SIZE], b7
  1329. FMADD4 (aa2, bb8, cc16, cc16)
  1330. LDF [BO - 1 * SIZE], b8
  1331. FMADD1 (aa3, bb9, cc01, cc01)
  1332. FMADD2 (aa4, bb9, cc02, cc02)
  1333. FMADD3 (aa3, bb2, cc03, cc03)
  1334. FMADD4 (aa4, bb2, cc04, cc04)
  1335. FMADD1 (aa3, bb3, cc05, cc05)
  1336. LDF [BO + 8 * SIZE], b9
  1337. FMADD2 (aa4, bb3, cc06, cc06)
  1338. LDF [BO + 1 * SIZE], b2
  1339. FMADD3 (aa3, bb4, cc07, cc07)
  1340. LDF [BO + 2 * SIZE], b3
  1341. FMADD4 (aa4, bb4, cc08, cc08)
  1342. LDF [BO + 3 * SIZE], b4
  1343. FMADD1 (aa3, bb5, cc09, cc09)
  1344. LDF [AO + 8 * SIZE], a5 /****/
  1345. FMADD2 (aa4, bb5, cc10, cc10)
  1346. LDF [AO + 1 * SIZE], a2
  1347. FMADD3 (aa3, bb6, cc11, cc11)
  1348. FMADD4 (aa4, bb6, cc12, cc12)
  1349. FMADD1 (aa3, bb7, cc13, cc13)
  1350. LDF [BO + 4 * SIZE], b5
  1351. FMADD2 (aa4, bb7, cc14, cc14)
  1352. LDF [BO + 5 * SIZE], b6
  1353. FMADD3 (aa3, bb8, cc15, cc15)
  1354. LDF [BO + 6 * SIZE], b7
  1355. FMADD4 (aa4, bb8, cc16, cc16)
  /* Loop back while L > 0; the b8 load in the delay slot executes on both
     the taken and the fall-through path (no annul bit) */
  1356. bg,pt %icc, .LL13
  1357. LDF [BO + 7 * SIZE], b8
  1358. .align 4
  /* .LL15: compute the remainder count L = (step count) & 7, using the
     same step count as before .LL13 (KK for LT/RN, K - KK otherwise),
     and skip the remainder loop when it is zero. */
  1359. .LL15:
  1360. #if defined(LT) || defined(RN)
  1361. and KK, 7, L
  1362. #else
  1363. sub K, KK, L
  1364. and L, 7, L
  1365. #endif
  1366. cmp L, 0
  /* ",a" annuls the delay slot when the branch is not taken; the slot
     only holds a nop either way */
  1367. ble,a,pn %icc, .LL18
  1368. nop
  1369. .align 4
  /* .LL17: remainder loop, one sixteen-FMADD group per iteration.
     Each pass uses the A pair (aa1, aa2) and bb1..bb8, decrements L,
     advances AO by 2 * SIZE and BO by 8 * SIZE, and reloads the operands
     for the next pass while the current one is still being accumulated. */
  1370. .LL17:
  1371. FMADD1 (aa1, bb1, cc01, cc01)
  1372. add L, -1, L
  1373. FMADD2 (aa2, bb1, cc02, cc02)
  1374. nop
  1375. FMADD3 (aa1, bb2, cc03, cc03)
  1376. LDF [BO + 8 * SIZE], b1
  1377. FMADD4 (aa2, bb2, cc04, cc04)
  1378. LDF [BO + 9 * SIZE], b2
  1379. FMADD1 (aa1, bb3, cc05, cc05)
  /* Sets %icc for the loop-back branch; the later plain adds leave the
     condition codes untouched */
  1380. cmp L, 0
  1381. FMADD2 (aa2, bb3, cc06, cc06)
  1382. nop
  1383. FMADD3 (aa1, bb4, cc07, cc07)
  1384. LDF [BO + 10 * SIZE], b3
  1385. FMADD4 (aa2, bb4, cc08, cc08)
  1386. LDF [BO + 11 * SIZE], b4
  1387. FMADD1 (aa1, bb5, cc09, cc09)
  1388. nop
  1389. FMADD2 (aa2, bb5, cc10, cc10)
  1390. nop
  1391. FMADD3 (aa1, bb6, cc11, cc11)
  1392. LDF [BO + 12 * SIZE], b5
  1393. FMADD4 (aa2, bb6, cc12, cc12)
  1394. LDF [BO + 13 * SIZE], b6
  1395. FMADD1 (aa1, bb7, cc13, cc13)
  1396. add AO, 2 * SIZE, AO
  1397. FMADD2 (aa2, bb7, cc14, cc14)
  1398. add BO, 8 * SIZE, BO
  1399. FMADD3 (aa1, bb8, cc15, cc15)
  /* From here on the offsets are relative to the advanced AO/BO */
  1400. LDF [AO + 0 * SIZE], a1
  1401. FMADD4 (aa2, bb8, cc16, cc16)
  1402. LDF [AO + 1 * SIZE], a2
  1403. LDF [BO + 6 * SIZE], b7
  /* Repeat while L > 0; the b8 load occupies the delay slot */
  1404. bg,pt %icc, .LL17
  1405. LDF [BO + 7 * SIZE], b8
  1406. nop
  1407. .align 4
  /* .LL18: accumulation finished.  Fold the sixteen partial sums into
     eight values (c01, c02, c05, c06, c09, c10, c13, c14), reposition
     AO/BO for the LN/RT variants, load eight right-hand-side values and
     replace each folded sum by (loaded value - sum). */
  1408. .LL18:
  /* c01 += c04, c02 += c03, and likewise for each following group of four */
  1409. FADD c01, c04, c01
  1410. FADD c02, c03, c02
  1411. FADD c05, c08, c05
  1412. FADD c06, c07, c06
  1413. FADD c09, c12, c09
  1414. FADD c10, c11, c10
  1415. FADD c13, c16, c13
  1416. FADD c14, c15, c14
  1417. #if defined(LN) || defined(RT)
  /* TEMP1 = KK - 1 (LN) or KK - 4 (RT) */
  1418. #ifdef LN
  1419. sub KK, 1, TEMP1
  1420. #else
  1421. sub KK, 4, TEMP1
  1422. #endif
  /* AO = AORIG + (TEMP1 << ZBASE_SHIFT);
     BO = B + (TEMP1 << (ZBASE_SHIFT + 2)).
     TEMP2 must be computed first because TEMP1 is overwritten in place. */
  1423. sll TEMP1, ZBASE_SHIFT + 0, TEMP2
  1424. sll TEMP1, ZBASE_SHIFT + 2, TEMP1
  1425. add AORIG, TEMP2, AO
  1426. add B, TEMP1, BO
  1427. #endif
  /* The eight values come from BO for LN/LT and from AO for RN/RT; the
     store section later writes the results back to the same place */
  1428. #if defined(LN) || defined(LT)
  1429. LDF [BO + 0 * SIZE], a1
  1430. LDF [BO + 1 * SIZE], a2
  1431. LDF [BO + 2 * SIZE], a3
  1432. LDF [BO + 3 * SIZE], a4
  1433. LDF [BO + 4 * SIZE], b1
  1434. LDF [BO + 5 * SIZE], b2
  1435. LDF [BO + 6 * SIZE], b3
  1436. LDF [BO + 7 * SIZE], b4
  1437. #else
  1438. LDF [AO + 0 * SIZE], a1
  1439. LDF [AO + 1 * SIZE], a2
  1440. LDF [AO + 2 * SIZE], a3
  1441. LDF [AO + 3 * SIZE], a4
  1442. LDF [AO + 4 * SIZE], b1
  1443. LDF [AO + 5 * SIZE], b2
  1444. LDF [AO + 6 * SIZE], b3
  1445. LDF [AO + 7 * SIZE], b4
  1446. #endif
  /* cXX = loaded value - cXX */
  1447. FSUB a1, c01, c01
  1448. FSUB a2, c02, c02
  1449. FSUB a3, c05, c05
  1450. FSUB a4, c06, c06
  1451. FSUB b1, c09, c09
  1452. FSUB b2, c10, c10
  1453. FSUB b3, c13, c13
  1454. FSUB b4, c14, c14
  /* LN/LT solve step: combine each of the four value pairs (c01,c02),
     (c05,c06), (c09,c10), (c13,c14) with the single pair (a1, a2) loaded
     from AO[0..1].  The FMUL results in b1..b8 are the first partial
     products; the fused macros add the cross terms, with signs selected
     by CONJ.
     FNMSUB/FMADD/FMSUB are macros defined outside this chunk; the pattern
     looks like a complex multiplication by (a1, a2), presumably the
     pre-inverted diagonal entry of the triangular matrix -- confirm
     against the macro definitions and the packing routine. */
  1455. #if defined(LN) || defined(LT)
  1456. LDF [AO + 0 * SIZE], a1
  1457. LDF [AO + 1 * SIZE], a2
  1458. FMUL a1, c01, b1
  1459. FMUL a2, c01, b2
  1460. FMUL a1, c05, b3
  1461. FMUL a2, c05, b4
  1462. FMUL a1, c09, b5
  1463. FMUL a2, c09, b6
  1464. FMUL a1, c13, b7
  1465. FMUL a2, c13, b8
  1466. #ifndef CONJ
  1467. FNMSUB (aa2, cc02, bb1, cc01)
  1468. FMADD (aa1, cc02, bb2, cc02)
  1469. FNMSUB (aa2, cc06, bb3, cc05)
  1470. FMADD (aa1, cc06, bb4, cc06)
  1471. FNMSUB (aa2, cc10, bb5, cc09)
  1472. FMADD (aa1, cc10, bb6, cc10)
  1473. FNMSUB (aa2, cc14, bb7, cc13)
  1474. FMADD (aa1, cc14, bb8, cc14)
  1475. #else
  1476. FMADD (aa2, cc02, bb1, cc01)
  1477. FMSUB (aa1, cc02, bb2, cc02)
  1478. FMADD (aa2, cc06, bb3, cc05)
  1479. FMSUB (aa1, cc06, bb4, cc06)
  1480. FMADD (aa2, cc10, bb5, cc09)
  1481. FMSUB (aa1, cc10, bb6, cc10)
  1482. FMADD (aa2, cc14, bb7, cc13)
  1483. FMSUB (aa1, cc14, bb8, cc14)
  1484. #endif
  1485. #endif
  /* RN solve step: process the four value pairs in ascending order
     (c01,c02) -> (c05,c06) -> (c09,c10) -> (c13,c14).  Each stage scales
     the current pair by the pair read from BO at offsets 0, 10, 20 and 30
     respectively, then uses the following BO entries of that row to
     update every later pair.  CONJ selects the sign pattern.
     The fused macros are defined outside this chunk; the stages look like
     forward substitution over a 4x4 complex block stored row by row in
     BO -- confirm against the macro definitions and packing layout. */
  1486. #ifdef RN
  /* Stage 0: row entries BO[0..7] */
  1487. LDF [BO + 0 * SIZE], b1
  1488. LDF [BO + 1 * SIZE], b2
  1489. LDF [BO + 2 * SIZE], b3
  1490. LDF [BO + 3 * SIZE], b4
  1491. LDF [BO + 4 * SIZE], b5
  1492. LDF [BO + 5 * SIZE], b6
  1493. LDF [BO + 6 * SIZE], b7
  1494. LDF [BO + 7 * SIZE], b8
  /* Scale (c01, c02) by (b1, b2) */
  1495. FMUL b1, c01, a1
  1496. FMUL b2, c01, a2
  1497. #ifndef CONJ
  1498. FNMSUB (bb2, cc02, aa1, cc01)
  1499. FMADD (bb1, cc02, aa2, cc02)
  1500. #else
  1501. FMADD (bb2, cc02, aa1, cc01)
  1502. FMSUB (bb1, cc02, aa2, cc02)
  1503. #endif
  /* Update the three later pairs with the finished (c01, c02) */
  1504. FNMSUB (bb3, cc01, cc05, cc05)
  1505. FNMSUB (bb3, cc02, cc06, cc06)
  1506. FNMSUB (bb5, cc01, cc09, cc09)
  1507. FNMSUB (bb5, cc02, cc10, cc10)
  1508. FNMSUB (bb7, cc01, cc13, cc13)
  1509. FNMSUB (bb7, cc02, cc14, cc14)
  1510. #ifndef CONJ
  1511. FMADD (bb4, cc02, cc05, cc05)
  1512. FNMSUB (bb4, cc01, cc06, cc06)
  1513. FMADD (bb6, cc02, cc09, cc09)
  1514. FNMSUB (bb6, cc01, cc10, cc10)
  1515. FMADD (bb8, cc02, cc13, cc13)
  1516. FNMSUB (bb8, cc01, cc14, cc14)
  1517. #else
  1518. FNMSUB (bb4, cc02, cc05, cc05)
  1519. FMADD (bb4, cc01, cc06, cc06)
  1520. FNMSUB (bb6, cc02, cc09, cc09)
  1521. FMADD (bb6, cc01, cc10, cc10)
  1522. FNMSUB (bb8, cc02, cc13, cc13)
  1523. FMADD (bb8, cc01, cc14, cc14)
  1524. #endif
  /* Stage 1: row entries BO[10..15]; scale (c05, c06), update two pairs */
  1525. LDF [BO + 10 * SIZE], b1
  1526. LDF [BO + 11 * SIZE], b2
  1527. LDF [BO + 12 * SIZE], b3
  1528. LDF [BO + 13 * SIZE], b4
  1529. LDF [BO + 14 * SIZE], b5
  1530. LDF [BO + 15 * SIZE], b6
  1531. FMUL b1, c05, a1
  1532. FMUL b2, c05, a2
  1533. #ifndef CONJ
  1534. FNMSUB (bb2, cc06, aa1, cc05)
  1535. FMADD (bb1, cc06, aa2, cc06)
  1536. #else
  1537. FMADD (bb2, cc06, aa1, cc05)
  1538. FMSUB (bb1, cc06, aa2, cc06)
  1539. #endif
  1540. FNMSUB (bb3, cc05, cc09, cc09)
  1541. FNMSUB (bb3, cc06, cc10, cc10)
  1542. FNMSUB (bb5, cc05, cc13, cc13)
  1543. FNMSUB (bb5, cc06, cc14, cc14)
  1544. #ifndef CONJ
  1545. FMADD (bb4, cc06, cc09, cc09)
  1546. FNMSUB (bb4, cc05, cc10, cc10)
  1547. FMADD (bb6, cc06, cc13, cc13)
  1548. FNMSUB (bb6, cc05, cc14, cc14)
  1549. #else
  1550. FNMSUB (bb4, cc06, cc09, cc09)
  1551. FMADD (bb4, cc05, cc10, cc10)
  1552. FNMSUB (bb6, cc06, cc13, cc13)
  1553. FMADD (bb6, cc05, cc14, cc14)
  1554. #endif
  /* Stage 2: row entries BO[20..23]; scale (c09, c10), update one pair */
  1555. LDF [BO + 20 * SIZE], b1
  1556. LDF [BO + 21 * SIZE], b2
  1557. LDF [BO + 22 * SIZE], b3
  1558. LDF [BO + 23 * SIZE], b4
  1559. FMUL b1, c09, a1
  1560. FMUL b2, c09, a2
  1561. #ifndef CONJ
  1562. FNMSUB (bb2, cc10, aa1, cc09)
  1563. FMADD (bb1, cc10, aa2, cc10)
  1564. #else
  1565. FMADD (bb2, cc10, aa1, cc09)
  1566. FMSUB (bb1, cc10, aa2, cc10)
  1567. #endif
  1568. FNMSUB (bb3, cc09, cc13, cc13)
  1569. FNMSUB (bb3, cc10, cc14, cc14)
  1570. #ifndef CONJ
  1571. FMADD (bb4, cc10, cc13, cc13)
  1572. FNMSUB (bb4, cc09, cc14, cc14)
  1573. #else
  1574. FNMSUB (bb4, cc10, cc13, cc13)
  1575. FMADD (bb4, cc09, cc14, cc14)
  1576. #endif
  /* Stage 3: entries BO[30..31]; scale (c13, c14) only */
  1577. LDF [BO + 30 * SIZE], b1
  1578. LDF [BO + 31 * SIZE], b2
  1579. FMUL b1, c13, a1
  1580. FMUL b2, c13, a2
  1581. #ifndef CONJ
  1582. FNMSUB (bb2, cc14, aa1, cc13)
  1583. FMADD (bb1, cc14, aa2, cc14)
  1584. #else
  1585. FMADD (bb2, cc14, aa1, cc13)
  1586. FMSUB (bb1, cc14, aa2, cc14)
  1587. #endif
  1588. #endif
  /* RT solve step: mirror image of the RN step.  The four value pairs are
     processed in descending order (c13,c14) -> (c09,c10) -> (c05,c06) ->
     (c01,c02).  Each stage scales the current pair by the pair read from
     BO at offsets 30, 20, 10 and 0 respectively, then uses the preceding
     BO entries of that row to update every earlier pair.  CONJ selects
     the sign pattern.
     The fused macros are defined outside this chunk; the stages look like
     backward substitution over a 4x4 complex block stored row by row in
     BO -- confirm against the macro definitions and packing layout. */
  1589. #ifdef RT
  /* Stage 3: row entries BO[24..31], read from the top down */
  1590. LDF [BO + 30 * SIZE], b1
  1591. LDF [BO + 31 * SIZE], b2
  1592. LDF [BO + 28 * SIZE], b3
  1593. LDF [BO + 29 * SIZE], b4
  1594. LDF [BO + 26 * SIZE], b5
  1595. LDF [BO + 27 * SIZE], b6
  1596. LDF [BO + 24 * SIZE], b7
  1597. LDF [BO + 25 * SIZE], b8
  /* Scale (c13, c14) by (b1, b2) */
  1598. FMUL b1, c13, a1
  1599. FMUL b2, c13, a2
  1600. #ifndef CONJ
  1601. FNMSUB (bb2, cc14, aa1, cc13)
  1602. FMADD (bb1, cc14, aa2, cc14)
  1603. #else
  1604. FMADD (bb2, cc14, aa1, cc13)
  1605. FMSUB (bb1, cc14, aa2, cc14)
  1606. #endif
  /* Update the three earlier pairs with the finished (c13, c14) */
  1607. FNMSUB (bb3, cc13, cc09, cc09)
  1608. FNMSUB (bb3, cc14, cc10, cc10)
  1609. FNMSUB (bb5, cc13, cc05, cc05)
  1610. FNMSUB (bb5, cc14, cc06, cc06)
  1611. FNMSUB (bb7, cc13, cc01, cc01)
  1612. FNMSUB (bb7, cc14, cc02, cc02)
  1613. #ifndef CONJ
  1614. FMADD (bb4, cc14, cc09, cc09)
  1615. FNMSUB (bb4, cc13, cc10, cc10)
  1616. FMADD (bb6, cc14, cc05, cc05)
  1617. FNMSUB (bb6, cc13, cc06, cc06)
  1618. FMADD (bb8, cc14, cc01, cc01)
  1619. FNMSUB (bb8, cc13, cc02, cc02)
  1620. #else
  1621. FNMSUB (bb4, cc14, cc09, cc09)
  1622. FMADD (bb4, cc13, cc10, cc10)
  1623. FNMSUB (bb6, cc14, cc05, cc05)
  1624. FMADD (bb6, cc13, cc06, cc06)
  1625. FNMSUB (bb8, cc14, cc01, cc01)
  1626. FMADD (bb8, cc13, cc02, cc02)
  1627. #endif
  /* Stage 2: row entries BO[16..21]; scale (c09, c10), update two pairs */
  1628. LDF [BO + 20 * SIZE], b1
  1629. LDF [BO + 21 * SIZE], b2
  1630. LDF [BO + 18 * SIZE], b3
  1631. LDF [BO + 19 * SIZE], b4
  1632. LDF [BO + 16 * SIZE], b5
  1633. LDF [BO + 17 * SIZE], b6
  1634. FMUL b1, c09, a1
  1635. FMUL b2, c09, a2
  1636. #ifndef CONJ
  1637. FNMSUB (bb2, cc10, aa1, cc09)
  1638. FMADD (bb1, cc10, aa2, cc10)
  1639. #else
  1640. FMADD (bb2, cc10, aa1, cc09)
  1641. FMSUB (bb1, cc10, aa2, cc10)
  1642. #endif
  1643. FNMSUB (bb3, cc09, cc05, cc05)
  1644. FNMSUB (bb3, cc10, cc06, cc06)
  1645. FNMSUB (bb5, cc09, cc01, cc01)
  1646. FNMSUB (bb5, cc10, cc02, cc02)
  1647. #ifndef CONJ
  1648. FMADD (bb4, cc10, cc05, cc05)
  1649. FNMSUB (bb4, cc09, cc06, cc06)
  1650. FMADD (bb6, cc10, cc01, cc01)
  1651. FNMSUB (bb6, cc09, cc02, cc02)
  1652. #else
  1653. FNMSUB (bb4, cc10, cc05, cc05)
  1654. FMADD (bb4, cc09, cc06, cc06)
  1655. FNMSUB (bb6, cc10, cc01, cc01)
  1656. FMADD (bb6, cc09, cc02, cc02)
  1657. #endif
  /* Stage 1: row entries BO[8..11]; scale (c05, c06), update one pair */
  1658. LDF [BO + 10 * SIZE], b1
  1659. LDF [BO + 11 * SIZE], b2
  1660. LDF [BO + 8 * SIZE], b3
  1661. LDF [BO + 9 * SIZE], b4
  1662. FMUL b1, c05, a1
  1663. FMUL b2, c05, a2
  1664. #ifndef CONJ
  1665. FNMSUB (bb2, cc06, aa1, cc05)
  1666. FMADD (bb1, cc06, aa2, cc06)
  1667. #else
  1668. FMADD (bb2, cc06, aa1, cc05)
  1669. FMSUB (bb1, cc06, aa2, cc06)
  1670. #endif
  1671. FNMSUB (bb3, cc05, cc01, cc01)
  1672. FNMSUB (bb3, cc06, cc02, cc02)
  1673. #ifndef CONJ
  1674. FMADD (bb4, cc06, cc01, cc01)
  1675. FNMSUB (bb4, cc05, cc02, cc02)
  1676. #else
  1677. FNMSUB (bb4, cc06, cc01, cc01)
  1678. FMADD (bb4, cc05, cc02, cc02)
  1679. #endif
  /* Stage 0: entries BO[0..1]; scale (c01, c02) only */
  1680. LDF [BO + 0 * SIZE], b1
  1681. LDF [BO + 1 * SIZE], b2
  1682. FMUL b1, c01, a1
  1683. FMUL b2, c01, a2
  1684. #ifndef CONJ
  1685. FNMSUB (bb2, cc02, aa1, cc01)
  1686. FMADD (bb1, cc02, aa2, cc02)
  1687. #else
  1688. FMADD (bb2, cc02, aa1, cc01)
  1689. FMSUB (bb1, cc02, aa2, cc02)
  1690. #endif
  1691. #endif
  /* Write-back and loop control.  Stores the eight solved values both to
     the packed buffer they were loaded from (BO for LN/LT, AO otherwise)
     and to the four C pointers C1..C4, advances the pointers and KK for
     the next row, and closes the I loop (.LL12) and the J loop (.LL11). */
  1692. #ifdef LN
  /* LN: step each C pointer back by 2 * SIZE before storing */
  1693. add C1, -2 * SIZE, C1
  1694. add C2, -2 * SIZE, C2
  1695. add C3, -2 * SIZE, C3
  1696. add C4, -2 * SIZE, C4
  1697. #endif
  1698. #if defined(LN) || defined(LT)
  1699. STF c01, [BO + 0 * SIZE]
  1700. STF c02, [BO + 1 * SIZE]
  1701. STF c05, [BO + 2 * SIZE]
  1702. STF c06, [BO + 3 * SIZE]
  1703. STF c09, [BO + 4 * SIZE]
  1704. STF c10, [BO + 5 * SIZE]
  1705. STF c13, [BO + 6 * SIZE]
  1706. STF c14, [BO + 7 * SIZE]
  1707. #else
  1708. STF c01, [AO + 0 * SIZE]
  1709. STF c02, [AO + 1 * SIZE]
  1710. STF c05, [AO + 2 * SIZE]
  1711. STF c06, [AO + 3 * SIZE]
  1712. STF c09, [AO + 4 * SIZE]
  1713. STF c10, [AO + 5 * SIZE]
  1714. STF c13, [AO + 6 * SIZE]
  1715. STF c14, [AO + 7 * SIZE]
  1716. #endif
  /* One value pair per C pointer */
  1717. STF c01, [C1 + 0 * SIZE]
  1718. STF c02, [C1 + 1 * SIZE]
  1719. STF c05, [C2 + 0 * SIZE]
  1720. STF c06, [C2 + 1 * SIZE]
  1721. STF c09, [C3 + 0 * SIZE]
  1722. STF c10, [C3 + 1 * SIZE]
  1723. STF c13, [C4 + 0 * SIZE]
  1724. STF c14, [C4 + 1 * SIZE]
  1725. #ifndef LN
  /* All variants except LN: step each C pointer forward by 2 * SIZE */
  1726. add C1, 2 * SIZE, C1
  1727. add C2, 2 * SIZE, C2
  1728. add C3, 2 * SIZE, C3
  1729. add C4, 2 * SIZE, C4
  1730. #endif
  1731. #ifdef RT
  /* AORIG += K << ZBASE_SHIFT */
  1732. sll K, ZBASE_SHIFT, TEMP1
  1733. add AORIG, TEMP1, AORIG
  1734. #endif
  1735. #if defined(LT) || defined(RN)
  /* Skip the K - KK steps that were not accumulated:
     AO += (K - KK) << ZBASE_SHIFT; BO += (K - KK) << (ZBASE_SHIFT + 2) */
  1736. sub K, KK, TEMP1
  1737. sll TEMP1, ZBASE_SHIFT + 0, TEMP2
  1738. sll TEMP1, ZBASE_SHIFT + 2, TEMP1
  1739. add AO, TEMP2, AO
  1740. add BO, TEMP1, BO
  1741. #endif
  1742. #ifdef LT
  1743. add KK, 1, KK
  1744. #endif
  1745. #ifdef LN
  1746. sub KK, 1, KK
  1747. #endif
  /* I -= 1; repeat the row loop while I > 0 (nop in the delay slot) */
  1748. add I, -1, I
  1749. cmp I, 0
  1750. bg,pt %icc, .LL12
  1751. nop
  /* End of the row loop: update B and KK for the next J iteration */
  1752. #ifdef LN
  /* B += K << (ZBASE_SHIFT + 2) */
  1753. sll K, ZBASE_SHIFT + 2, TEMP1
  1754. add B, TEMP1, B
  1755. #endif
  1756. #if defined(LT) || defined(RN)
  /* B = BO */
  1757. mov BO, B
  1758. #endif
  1759. #ifdef RN
  1760. add KK, 4, KK
  1761. #endif
  1762. #ifdef RT
  1763. sub KK, 4, KK
  1764. #endif
  /* J -= 1; branch back to .LL11 (defined before this chunk) while J > 0 */
  1765. add J, -1, J
  1766. cmp J, 0
  1767. bg,pt %icc, .LL11
  1768. nop
  1769. .align 4
  /* .LL999: routine exit.  "return %i7 + 8" jumps back to the caller and
     restores its register window; "clr %o0" runs in the delay slot, so
     the routine returns 0.  EPILOGUE is a macro defined outside this
     chunk. */
  1770. .LL999:
  1771. return %i7 + 8
  1772. clr %o0
  1773. EPILOGUE