You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_RT.S 37 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer register aliases used throughout the kernel. Roles below are */
  /* taken from how each name is used in the code that follows. */
  40. #define M %i0 /* extent of the I loop: I = M >> 1, then an M & 1 remainder */
  41. #define N %i1 /* extent of the J pass: N & 1 remainder first, then N >> 1 */
  42. #define K %i2 /* inner (L) loop extent; also scales A/B pointer adjustments */
  43. #define A %i5 /* base pointer copied into AO or AORIG; reloaded from the stack in 32-bit DOUBLE builds */
  44. #define B %i3 /* base pointer copied into BO; loaded from the stack in the prologue */
  45. #define C %i4 /* output base pointer; loaded from the stack in the prologue */
  46. #define LDC %o0 /* step between C1 and C2; scaled by ZBASE_SHIFT in the prologue */
  47. #define AO %o1 /* running pointer for LDF loads of a1..a5 */
  48. #define BO %o2 /* running pointer for LDF loads of b1..b5 */
  49. #define I %o3 /* loop counter derived from M */
  50. #define J %o4 /* count derived from N */
  51. #define L %o5 /* inner loop counter (unrolled-by-4 count, then the & 3 remainder) */
  52. #define C1 %l0 /* current output pointer (starts at C) */
  53. #define C2 %l1 /* second output pointer (C + LDC) in the N >> 1 pass */
  54. #define OFFSET %l2 /* loaded from the stack; used to seed KK */
  55. #define KK %l3 /* running offset used to position AO/BO and size the inner loop */
  56. #define TEMP1 %l4 /* scratch */
  57. #define TEMP2 %l5 /* scratch */
  58. #define AORIG %l6 /* saved A base for LN/RT, from which AO is recomputed */
  /* Floating-point register aliases. */
  /* c01..c16: accumulators; t1..t4: FMUL products folded into the */
  /* accumulators by the following FADDn; a1..a5 / b1..b5: operands loaded */
  /* from AO / BO; FZERO: source register for the "FMOV FZERO, x" clears. */
  59. #ifdef DOUBLE /* DOUBLE build: even-numbered registers %f0..%f62 */
  60. #define c01 %f0
  61. #define c02 %f2
  62. #define c03 %f4
  63. #define c04 %f6
  64. #define c05 %f8
  65. #define c06 %f10
  66. #define c07 %f12
  67. #define c08 %f14
  68. #define c09 %f16
  69. #define c10 %f18
  70. #define c11 %f20
  71. #define c12 %f22
  72. #define c13 %f24
  73. #define c14 %f26
  74. #define c15 %f28
  75. #define c16 %f30
  76. #define t1 %f32
  77. #define t2 %f34
  78. #define t3 %f36
  79. #define t4 %f38
  80. #define a1 %f40
  81. #define a2 %f42
  82. #define a3 %f44
  83. #define a4 %f46
  84. #define a5 %f62
  85. #define b1 %f48
  86. #define b2 %f50
  87. #define b3 %f52
  88. #define b4 %f54
  89. #define b5 %f56
  90. #define FZERO %f58 /* NOTE(review): presumably the register cleared by FCLR(27) in the prologue - confirm against FCLR's definition */
  91. #else /* non-DOUBLE build: consecutive registers %f0..%f31 */
  92. #define c01 %f0
  93. #define c02 %f1
  94. #define c03 %f2
  95. #define c04 %f3
  96. #define c05 %f4
  97. #define c06 %f5
  98. #define c07 %f6
  99. #define c08 %f7
  100. #define c09 %f8
  101. #define c10 %f9
  102. #define c11 %f10
  103. #define c12 %f11
  104. #define c13 %f12
  105. #define c14 %f13
  106. #define c15 %f14
  107. #define c16 %f15
  108. #define t1 %f16
  109. #define t2 %f17
  110. #define t3 %f18
  111. #define t4 %f19
  112. #define a1 %f20
  113. #define a2 %f21
  114. #define a3 %f22
  115. #define a4 %f23
  116. #define a5 %f31
  117. #define b1 %f24
  118. #define b2 %f25
  119. #define b3 %f26
  120. #define b4 %f27
  121. #define b5 %f28
  122. #define FZERO %f29 /* NOTE(review): presumably the register cleared by FCLR(29) in the prologue - confirm against FCLR's definition */
  123. #endif
  /* t5..t8 reuse c13..c16 as extra product temporaries in the solve code */
  /* of the N & 1 pass, which does not accumulate into c13..c16. */
  124. #define t5 c13
  125. #define t6 c14
  126. #define t7 c15
  127. #define t8 c16
  /* FADD1..FADD4 pick add or subtract for the four accumulate/combine */
  /* steps, depending on CONJ and on which of LN/LT/RN/RT is defined: */
  /*   no CONJ          : FADD4 subtracts, the others add */
  /*   CONJ + LN or LT  : FADD2 subtracts, the others add */
  /*   CONJ + RN or RT  : FADD3 subtracts, the others add */
  /* NOTE(review): with CONJ defined and none of LN/LT/RN/RT defined, */
  /* FADD1..FADD4 stay undefined - presumably the build always defines */
  /* exactly one of them; confirm. */
  128. #ifndef CONJ
  129. #define FADD1 FADD
  130. #define FADD2 FADD
  131. #define FADD3 FADD
  132. #define FADD4 FSUB
  133. #else
  134. #if defined(LN) || defined(LT)
  135. #define FADD1 FADD
  136. #define FADD2 FSUB
  137. #define FADD3 FADD
  138. #define FADD4 FADD
  139. #endif
  140. #if defined(RN) || defined(RT)
  141. #define FADD1 FADD
  142. #define FADD2 FADD
  143. #define FADD3 FSUB
  144. #define FADD4 FADD
  145. #endif
  146. #endif
  /* Prefetch tuning. The *SIZE values are look-ahead distances in units of */
  /* SIZE, added to AO / BO in the prefetch address; the *_CATEGORY values */
  /* are passed as the second operand of the prefetch instruction. */
  /* Note: the .LL122 and .LL152 loops pass a literal 0 instead of */
  /* APREFETCH_CATEGORY, so changing the category here does not affect them. */
  147. #define APREFETCHSIZE 40
  148. #define BPREFETCHSIZE 40
  149. #define APREFETCH_CATEGORY 0
  150. #define BPREFETCH_CATEGORY 0
  151. PROLOGUE
  152. SAVESP
  153. #ifndef __64BIT__
  154. #ifdef DOUBLE
  155. ld [%sp + STACK_START + 32], A
  156. ld [%sp + STACK_START + 36], B
  157. ld [%sp + STACK_START + 40], C
  158. ld [%sp + STACK_START + 44], LDC
  159. ld [%sp + STACK_START + 48], OFFSET
  160. #else
  161. ld [%sp + STACK_START + 28], B
  162. ld [%sp + STACK_START + 32], C
  163. ld [%sp + STACK_START + 36], LDC
  164. ld [%sp + STACK_START + 40], OFFSET
  165. #endif
  166. #else
  167. ldx [%sp+ STACK_START + 56], B
  168. ldx [%sp+ STACK_START + 64], C
  169. ldx [%sp+ STACK_START + 72], LDC
  170. ldx [%sp+ STACK_START + 80], OFFSET
  171. #endif
  172. #ifdef DOUBLE
  173. FCLR(27)
  174. #else
  175. FCLR(29)
  176. #endif
  177. sll LDC, ZBASE_SHIFT, LDC
  178. #ifdef LN
  179. smul M, K, TEMP1
  180. sll TEMP1, ZBASE_SHIFT, TEMP1
  181. add A, TEMP1, A
  182. sll M, ZBASE_SHIFT, TEMP1
  183. add C, TEMP1, C
  184. #endif
  185. #ifdef RN
  186. neg OFFSET, KK
  187. #endif
  188. #ifdef RT
  189. smul N, K, TEMP1
  190. sll TEMP1, ZBASE_SHIFT, TEMP1
  191. add B, TEMP1, B
  192. smul N, LDC, TEMP1
  193. add C, TEMP1, C
  194. sub N, OFFSET, KK
  195. #endif
  196. and N, 1, J
  197. cmp J, 0
  198. ble,pn %icc, .LL100
  199. nop
  200. #ifdef RT
  201. sll K, 0 + ZBASE_SHIFT, TEMP1
  202. sub B, TEMP1, B
  203. sub C, LDC, C
  204. #endif
  205. mov C, C1
  206. #ifdef LN
  207. add M, OFFSET, KK
  208. #endif
  209. #ifdef LT
  210. mov OFFSET, KK
  211. #endif
  212. #if defined(LN) || defined(RT)
  213. mov A, AORIG
  214. #else
  215. mov A, AO
  216. #endif
  217. #ifndef RT
  218. add C, LDC, C
  219. #endif
  220. sra M, 1, I
  221. cmp I, 0
  222. ble,pn %icc, .LL150
  223. FMOV FZERO, c03
  224. .LL121:
  225. #if defined(LT) || defined(RN)
  226. sra KK, 2, L
  227. mov B, BO
  228. cmp L, 0
  229. #else
  230. #ifdef LN
  231. sll K, 1 + ZBASE_SHIFT, TEMP1
  232. sub AORIG, TEMP1, AORIG
  233. #endif
  234. sll KK, 1 + ZBASE_SHIFT, TEMP1
  235. sll KK, 0 + ZBASE_SHIFT, TEMP2
  236. add AORIG, TEMP1, AO
  237. add B, TEMP2, BO
  238. sub K, KK, TEMP1
  239. sra TEMP1, 2, L
  240. cmp L, 0
  241. #endif
  242. FMOV FZERO, c03
  243. LDF [AO + 0 * SIZE], a1
  244. FMOV FZERO, t1
  245. LDF [BO + 0 * SIZE], b1
  246. FMOV FZERO, c07
  247. LDF [AO + 1 * SIZE], a2
  248. FMOV FZERO, t2
  249. LDF [BO + 1 * SIZE], b2
  250. FMOV FZERO, c04
  251. LDF [AO + 2 * SIZE], a3
  252. FMOV FZERO, t3
  253. LDF [BO + 2 * SIZE], b3
  254. FMOV FZERO, c08
  255. LDF [AO + 3 * SIZE], a4
  256. FMOV FZERO, t4
  257. LDF [BO + 3 * SIZE], b4
  258. FMOV FZERO, c01
  259. prefetch [C1 + 3 * SIZE], 3
  260. FMOV FZERO, c05
  261. FMOV FZERO, c02
  262. ble,pn %icc, .LL125
  263. FMOV FZERO, c06
  264. .LL122:
  265. FADD1 c03, t1, c03
  266. add L, -1, L
  267. FMUL a1, b1, t1
  268. prefetch [AO + APREFETCHSIZE * SIZE], 0
  269. FADD3 c07, t2, c07
  270. add BO, 8 * SIZE, BO
  271. FMUL a1, b2, t2
  272. LDF [AO + 4 * SIZE], a1
  273. FADD2 c04, t3, c04
  274. add AO, 16 * SIZE, AO
  275. FMUL a2, b1, t3
  276. cmp L, 0
  277. FADD4 c08, t4, c08
  278. nop
  279. FMUL a2, b2, t4
  280. LDF [AO - 11 * SIZE], a2
  281. FADD1 c01, t1, c01
  282. nop
  283. FMUL a3, b1, t1
  284. nop
  285. FADD3 c05, t2, c05
  286. nop
  287. FMUL a3, b2, t2
  288. LDF [AO - 10 * SIZE], a3
  289. FADD2 c02, t3, c02
  290. nop
  291. FMUL a4, b1, t3
  292. LDF [BO - 4 * SIZE], b1
  293. FADD4 c06, t4, c06
  294. nop
  295. FMUL a4, b2, t4
  296. LDF [BO - 3 * SIZE], b2
  297. FADD1 c03, t1, c03
  298. nop
  299. FMUL a1, b3, t1
  300. LDF [AO - 9 * SIZE], a4
  301. FADD3 c07, t2, c07
  302. nop
  303. FMUL a1, b4, t2
  304. LDF [AO - 8 * SIZE], a1
  305. FADD2 c04, t3, c04
  306. nop
  307. FMUL a2, b3, t3
  308. nop
  309. FADD4 c08, t4, c08
  310. nop
  311. FMUL a2, b4, t4
  312. LDF [AO - 7 * SIZE], a2
  313. FADD1 c01, t1, c01
  314. nop
  315. FMUL a3, b3, t1
  316. nop
  317. FADD3 c05, t2, c05
  318. nop
  319. FMUL a3, b4, t2
  320. LDF [AO - 6 * SIZE], a3
  321. FADD2 c02, t3, c02
  322. nop
  323. FMUL a4, b3, t3
  324. LDF [BO - 2 * SIZE], b3
  325. FADD4 c06, t4, c06
  326. nop
  327. FMUL a4, b4, t4
  328. LDF [BO - 1 * SIZE], b4
  329. FADD1 c03, t1, c03
  330. nop
  331. FMUL a1, b1, t1
  332. LDF [AO - 5 * SIZE], a4
  333. FADD3 c07, t2, c07
  334. nop
  335. FMUL a1, b2, t2
  336. LDF [AO - 4 * SIZE], a1
  337. FADD2 c04, t3, c04
  338. nop
  339. FMUL a2, b1, t3
  340. nop
  341. FADD4 c08, t4, c08
  342. nop
  343. FMUL a2, b2, t4
  344. LDF [AO - 3 * SIZE], a2
  345. FADD1 c01, t1, c01
  346. nop
  347. FMUL a3, b1, t1
  348. nop
  349. FADD3 c05, t2, c05
  350. nop
  351. FMUL a3, b2, t2
  352. LDF [AO - 2 * SIZE], a3
  353. FADD2 c02, t3, c02
  354. nop
  355. FMUL a4, b1, t3
  356. LDF [BO + 0 * SIZE], b1
  357. FADD4 c06, t4, c06
  358. nop
  359. FMUL a4, b2, t4
  360. LDF [BO + 1 * SIZE], b2
  361. FADD1 c03, t1, c03
  362. nop
  363. FMUL a1, b3, t1
  364. LDF [AO - 1 * SIZE], a4
  365. FADD3 c07, t2, c07
  366. nop
  367. FMUL a1, b4, t2
  368. LDF [AO + 0 * SIZE], a1
  369. FADD2 c04, t3, c04
  370. nop
  371. FMUL a2, b3, t3
  372. nop
  373. FADD4 c08, t4, c08
  374. nop
  375. FMUL a2, b4, t4
  376. LDF [AO + 1 * SIZE], a2
  377. FADD1 c01, t1, c01
  378. nop
  379. FMUL a3, b3, t1
  380. nop
  381. FADD3 c05, t2, c05
  382. nop
  383. FMUL a3, b4, t2
  384. LDF [AO + 2 * SIZE], a3
  385. FADD2 c02, t3, c02
  386. nop
  387. FMUL a4, b3, t3
  388. LDF [BO + 2 * SIZE], b3
  389. FADD4 c06, t4, c06
  390. FMUL a4, b4, t4
  391. LDF [AO + 3 * SIZE], a4
  392. bg,pt %icc, .LL122
  393. LDF [BO + 3 * SIZE], b4
  394. .LL125:
  395. #if defined(LT) || defined(RN)
  396. and KK, 3, L
  397. #else
  398. and TEMP1, 3, L
  399. #endif
  400. cmp L, 0
  401. ble,a,pn %icc, .LL129
  402. nop
  403. .LL126:
  404. FADD1 c03, t1, c03
  405. add AO, 4 * SIZE, AO
  406. FMUL a1, b1, t1
  407. add BO, 2 * SIZE, BO
  408. FADD3 c07, t2, c07
  409. add L, -1, L
  410. FMUL a1, b2, t2
  411. LDF [AO + 0 * SIZE], a1
  412. FADD2 c04, t3, c04
  413. cmp L, 0
  414. FMUL a2, b1, t3
  415. FADD4 c08, t4, c08
  416. FMUL a2, b2, t4
  417. LDF [AO + 1 * SIZE], a2
  418. FADD1 c01, t1, c01
  419. FMUL a3, b1, t1
  420. FADD3 c05, t2, c05
  421. FMUL a3, b2, t2
  422. LDF [AO + 2 * SIZE], a3
  423. FADD2 c02, t3, c02
  424. FMUL a4, b1, t3
  425. LDF [BO + 0 * SIZE], b1
  426. FADD4 c06, t4, c06
  427. FMUL a4, b2, t4
  428. LDF [BO + 1 * SIZE], b2
  429. bg,pt %icc, .LL126
  430. LDF [AO + 3 * SIZE], a4
  431. .LL129:
  432. FADD1 c03, t1, c03
  433. FADD3 c07, t2, c07
  434. FADD2 c04, t3, c04
  435. FADD4 c08, t4, c08
  436. FADD c01, c06, c01
  437. FADD c02, c05, c02
  438. FADD c03, c08, c03
  439. FADD c04, c07, c04
  440. #if defined(LN) || defined(RT)
  441. #ifdef LN
  442. sub KK, 2, TEMP1
  443. #else
  444. sub KK, 1, TEMP1
  445. #endif
  446. sll TEMP1, 1 + ZBASE_SHIFT, TEMP2
  447. sll TEMP1, 0 + ZBASE_SHIFT, TEMP1
  448. add AORIG, TEMP2, AO
  449. add B, TEMP1, BO
  450. #endif
  451. #if defined(LN) || defined(LT)
  452. LDF [BO + 0 * SIZE], a1
  453. LDF [BO + 1 * SIZE], a2
  454. LDF [BO + 2 * SIZE], a3
  455. LDF [BO + 3 * SIZE], a4
  456. FSUB a1, c01, c01
  457. FSUB a2, c02, c02
  458. FSUB a3, c03, c03
  459. FSUB a4, c04, c04
  460. #else
  461. LDF [AO + 0 * SIZE], a1
  462. LDF [AO + 1 * SIZE], a2
  463. LDF [AO + 2 * SIZE], a3
  464. LDF [AO + 3 * SIZE], a4
  465. FSUB a1, c01, c01
  466. FSUB a2, c02, c02
  467. FSUB a3, c03, c03
  468. FSUB a4, c04, c04
  469. #endif
  470. #ifdef LN
  471. LDF [AO + 6 * SIZE], a1
  472. LDF [AO + 7 * SIZE], a2
  473. LDF [AO + 4 * SIZE], a3
  474. LDF [AO + 5 * SIZE], a4
  475. LDF [AO + 0 * SIZE], b1
  476. LDF [AO + 1 * SIZE], b2
  477. FMUL a1, c03, t1
  478. FMUL a2, c04, t2
  479. FMUL a1, c04, t3
  480. FMUL a2, c03, t4
  481. FADD4 t1, t2, c03
  482. FADD2 t3, t4, c04
  483. FMUL a3, c03, t1
  484. FMUL a3, c04, t2
  485. FMUL a4, c04, t5
  486. FMUL a4, c03, t6
  487. FSUB c01, t1, c01
  488. FSUB c02, t2, c02
  489. FADD2 c01, t5, c01
  490. FADD4 c02, t6, c02
  491. FMUL b1, c01, t1
  492. FMUL b2, c02, t2
  493. FMUL b1, c02, t3
  494. FMUL b2, c01, t4
  495. FADD4 t1, t2, c01
  496. FADD2 t3, t4, c02
  497. #endif
  498. #ifdef LT
  499. LDF [AO + 0 * SIZE], a1
  500. LDF [AO + 1 * SIZE], a2
  501. LDF [AO + 2 * SIZE], a3
  502. LDF [AO + 3 * SIZE], a4
  503. LDF [AO + 6 * SIZE], b1
  504. LDF [AO + 7 * SIZE], b2
  505. FMUL a1, c01, t1
  506. FMUL a2, c02, t2
  507. FMUL a1, c02, t3
  508. FMUL a2, c01, t4
  509. FADD4 t1, t2, c01
  510. FADD2 t3, t4, c02
  511. FMUL a3, c01, t1
  512. FMUL a3, c02, t2
  513. FMUL a4, c02, t5
  514. FMUL a4, c01, t6
  515. FSUB c03, t1, c03
  516. FSUB c04, t2, c04
  517. FADD2 c03, t5, c03
  518. FADD4 c04, t6, c04
  519. FMUL b1, c03, t1
  520. FMUL b2, c04, t2
  521. FMUL b1, c04, t3
  522. FMUL b2, c03, t4
  523. FADD4 t1, t2, c03
  524. FADD2 t3, t4, c04
  525. #endif
  526. #ifdef RN
  527. LDF [BO + 0 * SIZE], a1
  528. LDF [BO + 1 * SIZE], a2
  529. FMUL a1, c01, t1
  530. FMUL a2, c02, t2
  531. FMUL a1, c02, t3
  532. FMUL a2, c01, t4
  533. FMUL a1, c03, t5
  534. FMUL a2, c04, t6
  535. FMUL a1, c04, t7
  536. FMUL a2, c03, t8
  537. FADD4 t1, t2, c01
  538. FADD3 t3, t4, c02
  539. FADD4 t5, t6, c03
  540. FADD3 t7, t8, c04
  541. #endif
  542. #ifdef RT
  543. LDF [BO + 0 * SIZE], a1
  544. LDF [BO + 1 * SIZE], a2
  545. FMUL a1, c01, t1
  546. FMUL a2, c02, t2
  547. FMUL a1, c02, t3
  548. FMUL a2, c01, t4
  549. FMUL a1, c03, t5
  550. FMUL a2, c04, t6
  551. FMUL a1, c04, t7
  552. FMUL a2, c03, t8
  553. FADD4 t1, t2, c01
  554. FADD3 t3, t4, c02
  555. FADD4 t5, t6, c03
  556. FADD3 t7, t8, c04
  557. #endif
  558. #ifdef LN
  559. add C1, -4 * SIZE, C1
  560. #endif
  561. #if defined(LN) || defined(LT)
  562. STF c01, [BO + 0 * SIZE]
  563. STF c02, [BO + 1 * SIZE]
  564. STF c03, [BO + 2 * SIZE]
  565. STF c04, [BO + 3 * SIZE]
  566. #else
  567. STF c01, [AO + 0 * SIZE]
  568. STF c02, [AO + 1 * SIZE]
  569. STF c03, [AO + 2 * SIZE]
  570. STF c04, [AO + 3 * SIZE]
  571. #endif
  572. STF c01, [C1 + 0 * SIZE]
  573. STF c02, [C1 + 1 * SIZE]
  574. STF c03, [C1 + 2 * SIZE]
  575. STF c04, [C1 + 3 * SIZE]
  576. FMOV FZERO, t1
  577. FMOV FZERO, t2
  578. FMOV FZERO, t3
  579. FMOV FZERO, t4
  580. #ifndef LN
  581. add C1, 4 * SIZE, C1
  582. #endif
  583. #ifdef RT
  584. sll K, 1 + ZBASE_SHIFT, TEMP1
  585. add AORIG, TEMP1, AORIG
  586. #endif
  587. #if defined(LT) || defined(RN)
  588. sub K, KK, TEMP1
  589. sll TEMP1, 1 + ZBASE_SHIFT, TEMP2
  590. sll TEMP1, 0 + ZBASE_SHIFT, TEMP1
  591. add AO, TEMP2, AO
  592. add BO, TEMP1, BO
  593. #endif
  594. #ifdef LT
  595. add KK, 2, KK
  596. #endif
  597. #ifdef LN
  598. sub KK, 2, KK
  599. #endif
  600. add I, -1, I
  601. cmp I, 0
  602. bg,pt %icc, .LL121
  603. FMOV FZERO, c03
  604. .LL150:
  605. and M, 1, I
  606. cmp I, 0
  607. ble,pn %icc, .LL199
  608. nop
  609. #if defined(LT) || defined(RN)
  610. sra KK, 2, L
  611. mov B, BO
  612. cmp L, 0
  613. #else
  614. #ifdef LN
  615. sll K, 0 + ZBASE_SHIFT, TEMP1
  616. sub AORIG, TEMP1, AORIG
  617. #endif
  618. sll KK, 0 + ZBASE_SHIFT, TEMP1
  619. add AORIG, TEMP1, AO
  620. add B, TEMP1, BO
  621. sub K, KK, TEMP1
  622. sra TEMP1, 2, L
  623. cmp L, 0
  624. #endif
  625. LDF [AO + 0 * SIZE], a1
  626. FMOV FZERO, c01
  627. LDF [BO + 0 * SIZE], b1
  628. FMOV FZERO, t1
  629. LDF [AO + 1 * SIZE], a2
  630. FMOV FZERO, c02
  631. LDF [BO + 1 * SIZE], b2
  632. FMOV FZERO, t2
  633. LDF [AO + 2 * SIZE], a3
  634. FMOV FZERO, c03
  635. LDF [BO + 2 * SIZE], b3
  636. FMOV FZERO, t3
  637. LDF [AO + 3 * SIZE], a4
  638. FMOV FZERO, c04
  639. LDF [BO + 3 * SIZE], b4
  640. FMOV FZERO, t4
  641. ble,pn %icc, .LL155
  642. nop
  643. .LL152:
  644. FADD1 c01, t1, c01
  645. add L, -1, L
  646. FMUL a1, b1, t1
  647. prefetch [AO + APREFETCHSIZE * SIZE], 0
  648. FADD3 c02, t2, c02
  649. add BO, 8 * SIZE, BO
  650. FMUL a1, b2, t2
  651. LDF [AO + 4 * SIZE], a1
  652. FADD2 c03, t3, c03
  653. cmp L, 0
  654. FMUL a2, b1, t3
  655. LDF [BO - 4 * SIZE], b1
  656. FADD4 c04, t4, c04
  657. nop
  658. FMUL a2, b2, t4
  659. LDF [AO + 5 * SIZE], a2
  660. FADD1 c01, t1, c01
  661. nop
  662. FMUL a3, b3, t1
  663. LDF [BO - 3 * SIZE], b2
  664. FADD3 c02, t2, c02
  665. nop
  666. FMUL a3, b4, t2
  667. LDF [AO + 6 * SIZE], a3
  668. FADD2 c03, t3, c03
  669. nop
  670. FMUL a4, b3, t3
  671. LDF [BO - 2 * SIZE], b3
  672. FADD4 c04, t4, c04
  673. nop
  674. FMUL a4, b4, t4
  675. LDF [AO + 7 * SIZE], a4
  676. FADD1 c01, t1, c01
  677. nop
  678. FMUL a1, b1, t1
  679. LDF [BO - 1 * SIZE], b4
  680. FADD3 c02, t2, c02
  681. FMUL a1, b2, t2
  682. LDF [AO + 8 * SIZE], a1
  683. FADD2 c03, t3, c03
  684. FMUL a2, b1, t3
  685. LDF [BO + 0 * SIZE], b1
  686. FADD4 c04, t4, c04
  687. FMUL a2, b2, t4
  688. LDF [AO + 9 * SIZE], a2
  689. FADD1 c01, t1, c01
  690. FMUL a3, b3, t1
  691. LDF [BO + 1 * SIZE], b2
  692. FADD3 c02, t2, c02
  693. FMUL a3, b4, t2
  694. LDF [AO + 10 * SIZE], a3
  695. FADD2 c03, t3, c03
  696. FMUL a4, b3, t3
  697. LDF [BO + 2 * SIZE], b3
  698. FADD4 c04, t4, c04
  699. FMUL a4, b4, t4
  700. LDF [AO + 11 * SIZE], a4
  701. add AO, 8 * SIZE, AO
  702. bg,pt %icc, .LL152
  703. LDF [BO + 3 * SIZE], b4
  704. .LL155:
  705. #if defined(LT) || defined(RN)
  706. and KK, 3, L
  707. #else
  708. and TEMP1, 3, L
  709. #endif
  710. cmp L, 0
  711. ble,a,pn %icc, .LL159
  712. nop
  713. .LL156:
  714. FADD1 c01, t1, c01
  715. add AO, 2 * SIZE, AO
  716. FMUL a1, b1, t1
  717. add BO, 2 * SIZE, BO
  718. FADD3 c02, t2, c02
  719. add L, -1, L
  720. FMUL a1, b2, t2
  721. LDF [AO + 0 * SIZE], a1
  722. FADD2 c03, t3, c03
  723. FMUL a2, b1, t3
  724. LDF [BO + 0 * SIZE], b1
  725. cmp L, 0
  726. FADD4 c04, t4, c04
  727. FMUL a2, b2, t4
  728. LDF [BO + 1 * SIZE], b2
  729. bg,pt %icc, .LL156
  730. LDF [AO + 1 * SIZE], a2
  731. .LL159:
  732. FADD1 c01, t1, c01
  733. FADD3 c02, t2, c02
  734. FADD2 c03, t3, c03
  735. FADD4 c04, t4, c04
  736. FADD c01, c04, c01
  737. FADD c02, c03, c02
  738. #if defined(LN) || defined(RT)
  739. sub KK, 1, TEMP1
  740. sll TEMP1, 0 + ZBASE_SHIFT, TEMP1
  741. add AORIG, TEMP1, AO
  742. add B, TEMP1, BO
  743. #endif
  744. #if defined(LN) || defined(LT)
  745. LDF [BO + 0 * SIZE], a1
  746. LDF [BO + 1 * SIZE], a2
  747. FSUB a1, c01, c01
  748. FSUB a2, c02, c02
  749. #else
  750. LDF [AO + 0 * SIZE], a1
  751. LDF [AO + 1 * SIZE], a2
  752. FSUB a1, c01, c01
  753. FSUB a2, c02, c02
  754. #endif
  755. #ifdef LN
  756. LDF [AO + 0 * SIZE], a1
  757. LDF [AO + 1 * SIZE], a2
  758. FMUL a1, c01, t1
  759. FMUL a2, c02, t2
  760. FMUL a1, c02, t3
  761. FMUL a2, c01, t4
  762. FADD4 t1, t2, c01
  763. FADD2 t3, t4, c02
  764. #endif
  765. #ifdef LT
  766. LDF [AO + 0 * SIZE], a1
  767. LDF [AO + 1 * SIZE], a2
  768. FMUL a1, c01, t1
  769. FMUL a2, c02, t2
  770. FMUL a1, c02, t3
  771. FMUL a2, c01, t4
  772. FADD4 t1, t2, c01
  773. FADD2 t3, t4, c02
  774. #endif
  775. #ifdef RN
  776. LDF [BO + 0 * SIZE], a1
  777. LDF [BO + 1 * SIZE], a2
  778. FMUL a1, c01, t1
  779. FMUL a2, c02, t2
  780. FMUL a1, c02, t3
  781. FMUL a2, c01, t4
  782. FADD4 t1, t2, c01
  783. FADD3 t3, t4, c02
  784. #endif
  785. #ifdef RT
  786. LDF [BO + 0 * SIZE], a1
  787. LDF [BO + 1 * SIZE], a2
  788. FMUL a1, c01, t1
  789. FMUL a2, c02, t2
  790. FMUL a1, c02, t3
  791. FMUL a2, c01, t4
  792. FADD4 t1, t2, c01
  793. FADD3 t3, t4, c02
  794. #endif
  795. #ifdef LN
  796. add C1, -2 * SIZE, C1
  797. #endif
  798. #if defined(LN) || defined(LT)
  799. STF c01, [BO + 0 * SIZE]
  800. STF c02, [BO + 1 * SIZE]
  801. #else
  802. STF c01, [AO + 0 * SIZE]
  803. STF c02, [AO + 1 * SIZE]
  804. #endif
  805. STF c01, [C1 + 0 * SIZE]
  806. STF c02, [C1 + 1 * SIZE]
  807. FMOV FZERO, t1
  808. FMOV FZERO, t2
  809. FMOV FZERO, t3
  810. FMOV FZERO, t4
  811. #ifndef LN
  812. add C1, 2 * SIZE, C1
  813. #endif
  814. #ifdef RT
  815. sll K, 0 + ZBASE_SHIFT, TEMP1
  816. add AORIG, TEMP1, AORIG
  817. #endif
  818. #if defined(LT) || defined(RN)
  819. sub K, KK, TEMP1
  820. sll TEMP1, 0 + ZBASE_SHIFT, TEMP1
  821. add AO, TEMP1, AO
  822. add BO, TEMP1, BO
  823. #endif
  824. #ifdef LT
  825. add KK, 1, KK
  826. #endif
  827. #ifdef LN
  828. sub KK, 1, KK
  829. #endif
  830. .LL199:
  831. #ifdef LN
  832. sll K, 0 + ZBASE_SHIFT, TEMP1
  833. add B, TEMP1, B
  834. #endif
  835. #if defined(LT) || defined(RN)
  836. mov BO, B
  837. #endif
  838. #ifdef RN
  839. add KK, 1, KK
  840. #endif
  841. #ifdef RT
  842. sub KK, 1, KK
  843. #endif
  844. .LL100:
  845. sra N, 1, J
  846. cmp J, 0
  847. ble,pn %icc, .LL999
  848. nop
  849. .LL11:
  850. #ifdef RT
  851. sll K, 1 + ZBASE_SHIFT, TEMP1
  852. sub B, TEMP1, B
  853. add LDC, LDC, TEMP1
  854. sub C, TEMP1, C
  855. #endif
  856. FMOV FZERO, t1
  857. FMOV FZERO, t2
  858. FMOV FZERO, t3
  859. sra M, 1, I
  860. mov C, C1
  861. add C, LDC, C2
  862. #ifdef LN
  863. add M, OFFSET, KK
  864. #endif
  865. #ifdef LT
  866. mov OFFSET, KK
  867. #endif
  868. #if defined(LN) || defined(RT)
  869. mov A, AORIG
  870. #else
  871. mov A, AO
  872. #endif
  873. cmp I, 0
  874. #ifndef RT
  875. add C2, LDC, C
  876. #endif
  877. ble,pn %icc, .LL50
  878. FMOV FZERO, t4
  879. .LL21:
  880. #if defined(LT) || defined(RN)
  881. sra KK, 2, L
  882. mov B, BO
  883. cmp L, 0
  884. #else
  885. #ifdef LN
  886. sll K, 1 + ZBASE_SHIFT, TEMP1
  887. sub AORIG, TEMP1, AORIG
  888. #endif
  889. sll KK, 1 + ZBASE_SHIFT, TEMP1
  890. add AORIG, TEMP1, AO
  891. add B, TEMP1, BO
  892. sub K, KK, TEMP1
  893. sra TEMP1, 2, L
  894. cmp L, 0
  895. #endif
  896. FMOV FZERO, t1
  897. FMOV FZERO, t2
  898. FMOV FZERO, t3
  899. FMOV FZERO, t4
  900. FMOV FZERO, c01
  901. FMOV FZERO, c02
  902. LDF [AO + 0 * SIZE], a1
  903. FMOV FZERO, c03
  904. LDF [BO + 0 * SIZE], b1
  905. FMOV FZERO, c04
  906. LDF [AO + 1 * SIZE], a2
  907. FMOV FZERO, c05
  908. LDF [BO + 1 * SIZE], b2
  909. FMOV FZERO, c06
  910. LDF [AO + 2 * SIZE], a3
  911. FMOV FZERO, c07
  912. LDF [BO + 2 * SIZE], b3
  913. FMOV FZERO, c08
  914. LDF [AO + 3 * SIZE], a4
  915. FMOV FZERO, c09
  916. LDF [BO + 3 * SIZE], b4
  917. FMOV FZERO, c10
  918. LDF [BO + 4 * SIZE], b5
  919. FMOV FZERO, c11
  920. LDF [AO + 4 * SIZE], a5
  921. FMOV FZERO, c12
  922. prefetch [C1 + 3 * SIZE], 3
  923. FMOV FZERO, c13
  924. prefetch [C2 + 3 * SIZE], 3
  925. FMOV FZERO, c14
  926. FMOV FZERO, c15
  927. ble,pn %icc, .LL25
  928. FMOV FZERO, c16
  /* Unrolled k-loop for the two-row tile. Each pass advances AO and BO by
     16 * SIZE, i.e. four times the 4 * SIZE step of the remainder loop .LL26,
     and L counts passes. Every FMUL result goes into t1..t4 and is folded
     into an accumulator by the FADDn that follows one group later, so the
     last four products are still pending in t1..t4 when the loop exits.
     NOTE(review): FADD1..FADD4 are macros defined outside this chunk;
     presumably they select add or subtract per conjugation variant - confirm.
     The nops appear to be issue-slot padding; do not reorder without
     re-checking the schedule. */
  929. .LL22:
  930. FADD2 c04, t1, c04
  931. prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY
  932. FMUL a1, b1, t1
  933. nop
  934. FADD4 c08, t2, c08
  935. prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY
  936. FMUL a1, b2, t2
  /* AO and BO are bumped early; the loads below use negative offsets
     relative to the already-advanced pointers. */
  937. add AO, 16 * SIZE, AO
  938. FADD2 c12, t3, c12
  939. LDF [AO - 13 * SIZE], a4
  940. FMUL a1, b3, t3
  941. add BO, 16 * SIZE, BO
  942. FADD4 c16, t4, c16
  943. nop
  944. FMUL a1, b4, t4
  945. LDF [AO - 8 * SIZE], a1
  946. FADD1 c01, t1, c01
  947. nop
  948. FMUL a2, b1, t1
  949. nop
  950. FADD3 c05, t2, c05
  951. nop
  952. FMUL a2, b2, t2
  953. nop
  954. FADD1 c09, t3, c09
  955. nop
  956. FMUL a2, b3, t3
  957. nop
  958. FADD3 c13, t4, c13
  /* Trip counter is decremented here; it is tested near the end of the pass. */
  959. add L, -1, L
  960. FMUL a2, b4, t4
  961. LDF [AO - 11 * SIZE], a2
  962. FADD2 c02, t1, c02
  963. nop
  964. FMUL a3, b1, t1
  965. nop
  966. FADD4 c06, t2, c06
  967. nop
  968. FMUL a3, b2, t2
  969. nop
  970. FADD2 c10, t3, c10
  971. nop
  972. FMUL a3, b3, t3
  973. nop
  974. FADD4 c14, t4, c14
  975. nop
  976. FMUL a3, b4, t4
  977. LDF [AO - 10 * SIZE], a3
  978. FADD1 c03, t1, c03
  979. nop
  980. FMUL a4, b1, t1
  981. LDF [BO - 8 * SIZE], b1
  982. FADD3 c07, t2, c07
  983. nop
  984. FMUL a4, b2, t2
  985. LDF [BO - 11 * SIZE], b2
  986. FADD1 c11, t3, c11
  987. nop
  988. FMUL a4, b3, t3
  989. LDF [BO - 10 * SIZE], b3
  990. FADD3 c15, t4, c15
  991. nop
  992. FMUL a4, b4, t4
  993. LDF [BO - 9 * SIZE], b4
  /* Second group of the pass: a5/b5 take the place of a1/b1 as the leading
     operands. */
  994. FADD2 c04, t1, c04
  995. nop
  996. FMUL a5, b5, t1
  997. LDF [AO - 9 * SIZE], a4
  998. FADD4 c08, t2, c08
  999. nop
  1000. FMUL a5, b2, t2
  1001. nop
  1002. FADD2 c12, t3, c12
  1003. nop
  1004. FMUL a5, b3, t3
  1005. nop
  1006. FADD4 c16, t4, c16
  1007. nop
  1008. FMUL a5, b4, t4
  1009. LDF [AO - 4 * SIZE], a5
  1010. FADD1 c01, t1, c01
  1011. nop
  1012. FMUL a2, b5, t1
  1013. nop
  1014. FADD3 c05, t2, c05
  1015. nop
  1016. FMUL a2, b2, t2
  1017. nop
  1018. FADD1 c09, t3, c09
  1019. nop
  1020. FMUL a2, b3, t3
  1021. nop
  1022. FADD3 c13, t4, c13
  1023. nop
  1024. FMUL a2, b4, t4
  1025. LDF [AO - 7 * SIZE], a2
  1026. FADD2 c02, t1, c02
  1027. nop
  1028. FMUL a3, b5, t1
  1029. nop
  1030. FADD4 c06, t2, c06
  1031. nop
  1032. FMUL a3, b2, t2
  1033. nop
  1034. FADD2 c10, t3, c10
  1035. nop
  1036. FMUL a3, b3, t3
  1037. nop
  1038. FADD4 c14, t4, c14
  1039. nop
  1040. FMUL a3, b4, t4
  1041. LDF [AO - 6 * SIZE], a3
  1042. FADD1 c03, t1, c03
  1043. nop
  1044. FMUL a4, b5, t1
  1045. LDF [BO - 4 * SIZE], b5
  1046. FADD3 c07, t2, c07
  1047. nop
  1048. FMUL a4, b2, t2
  1049. LDF [BO - 7 * SIZE], b2
  1050. FADD1 c11, t3, c11
  1051. nop
  1052. FMUL a4, b3, t3
  1053. LDF [BO - 6 * SIZE], b3
  1054. FADD3 c15, t4, c15
  1055. nop
  1056. FMUL a4, b4, t4
  1057. LDF [BO - 5 * SIZE], b4
  /* Third group: back to a1/b1 as the leading operands. */
  1058. FADD2 c04, t1, c04
  1059. nop
  1060. FMUL a1, b1, t1
  1061. LDF [AO - 5 * SIZE], a4
  1062. FADD4 c08, t2, c08
  1063. nop
  1064. FMUL a1, b2, t2
  1065. nop
  1066. FADD2 c12, t3, c12
  1067. nop
  1068. FMUL a1, b3, t3
  1069. nop
  1070. FADD4 c16, t4, c16
  1071. nop
  1072. FMUL a1, b4, t4
  1073. LDF [AO - 0 * SIZE], a1
  1074. FADD1 c01, t1, c01
  1075. nop
  1076. FMUL a2, b1, t1
  1077. nop
  /* DOUBLE builds issue a second A prefetch per pass; other builds keep the
     slot as a nop. */
  1078. #ifdef DOUBLE
  1079. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  1080. #else
  1081. nop
  1082. #endif
  1083. FADD3 c05, t2, c05
  1084. nop
  1085. FMUL a2, b2, t2
  1086. FADD1 c09, t3, c09
  1087. nop
  1088. FMUL a2, b3, t3
  1089. nop
  1090. FADD3 c13, t4, c13
  1091. nop
  1092. FMUL a2, b4, t4
  1093. nop
  1094. FADD2 c02, t1, c02
  1095. nop
  1096. FMUL a3, b1, t1
  1097. LDF [AO - 3 * SIZE], a2
  1098. FADD4 c06, t2, c06
  /* Same arrangement for the second B prefetch. */
  1099. #ifdef DOUBLE
  1100. prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY
  1101. #else
  1102. nop
  1103. #endif
  1104. FMUL a3, b2, t2
  1105. nop
  1106. FADD2 c10, t3, c10
  1107. nop
  1108. FMUL a3, b3, t3
  1109. nop
  1110. FADD4 c14, t4, c14
  1111. nop
  1112. FMUL a3, b4, t4
  1113. LDF [AO - 2 * SIZE], a3
  1114. FADD1 c03, t1, c03
  1115. nop
  1116. FMUL a4, b1, t1
  1117. LDF [BO - 0 * SIZE], b1
  1118. FADD3 c07, t2, c07
  1119. nop
  1120. FMUL a4, b2, t2
  1121. LDF [BO - 3 * SIZE], b2
  1122. FADD1 c11, t3, c11
  1123. nop
  1124. FMUL a4, b3, t3
  1125. LDF [BO - 2 * SIZE], b3
  1126. FADD3 c15, t4, c15
  1127. nop
  1128. FMUL a4, b4, t4
  1129. LDF [BO - 1 * SIZE], b4
  /* Fourth group: a5/b5 lead again. The loads from here on use non-negative
     offsets and fetch the operands for the next pass (or for .LL26). */
  1130. FADD2 c04, t1, c04
  1131. nop
  1132. FMUL a5, b5, t1
  1133. LDF [AO - 1 * SIZE], a4
  1134. FADD4 c08, t2, c08
  1135. FMUL a5, b2, t2
  1136. FADD2 c12, t3, c12
  1137. FMUL a5, b3, t3
  1138. FADD4 c16, t4, c16
  1139. nop
  1140. FMUL a5, b4, t4
  1141. LDF [AO + 4 * SIZE], a5
  1142. FADD1 c01, t1, c01
  1143. nop
  1144. FMUL a2, b5, t1
  1145. nop
  1146. FADD3 c05, t2, c05
  1147. nop
  1148. FMUL a2, b2, t2
  1149. nop
  1150. FADD1 c09, t3, c09
  1151. nop
  1152. FMUL a2, b3, t3
  1153. nop
  1154. FADD3 c13, t4, c13
  1155. nop
  1156. FMUL a2, b4, t4
  1157. LDF [AO + 1 * SIZE], a2
  1158. FADD2 c02, t1, c02
  1159. nop
  1160. FMUL a3, b5, t1
  1161. nop
  1162. FADD4 c06, t2, c06
  1163. nop
  1164. FMUL a3, b2, t2
  1165. nop
  1166. FADD2 c10, t3, c10
  1167. nop
  1168. FMUL a3, b3, t3
  1169. nop
  1170. FADD4 c14, t4, c14
  1171. nop
  1172. FMUL a3, b4, t4
  1173. LDF [AO + 2 * SIZE], a3
  1174. FADD1 c03, t1, c03
  /* Loop test: the condition codes set here feed the bg at the bottom. */
  1175. cmp L, 0
  1176. FMUL a4, b5, t1
  1177. LDF [BO + 4 * SIZE], b5
  1178. FADD3 c07, t2, c07
  1179. nop
  1180. FMUL a4, b2, t2
  1181. LDF [BO + 1 * SIZE], b2
  1182. FADD1 c11, t3, c11
  1183. nop
  1184. FMUL a4, b3, t3
  1185. LDF [BO + 2 * SIZE], b3
  1186. FADD3 c15, t4, c15
  1187. FMUL a4, b4, t4
  /* Repeat while L > 0; the b4 reload sits in the branch delay slot. */
  1188. bg,pt %icc, .LL22
  1189. LDF [BO + 3 * SIZE], b4
  /* k remainder for the two-row tile: the low two bits of the k extent that
     the 4x-unrolled loop did not cover. The extent is KK for LT/RN; for
     LN/RT it is the K - KK value left in TEMP1 by the .LL21 set-up. */
  1190. .LL25:
  1191. #if defined(LT) || defined(RN)
  1192. and KK, 3, L
  1193. #else
  1194. and TEMP1, 3, L
  1195. #endif
  1196. cmp L, 0
  1197. ble,pn %icc, .LL29
  1198. nop
  /* One k step per pass: AO and BO advance by 4 * SIZE. Same multiply and
     accumulate pattern as .LL22, with the products pipelined through t1..t4. */
  1199. .LL26:
  1200. FADD2 c04, t1, c04
  /* a4 is read before AO is bumped, so this offset is relative to the
     current step. */
  1201. LDF [AO + 3 * SIZE], a4
  1202. FMUL a1, b1, t1
  1203. add AO, 4 * SIZE, AO
  1204. FADD4 c08, t2, c08
  1205. add BO, 4 * SIZE, BO
  1206. FMUL a1, b2, t2
  1207. add L, -1, L
  1208. FADD2 c12, t3, c12
  1209. nop
  1210. FMUL a1, b3, t3
  /* Loop test; consumed by the bg at the bottom of the pass. */
  1211. cmp L, 0
  1212. FADD4 c16, t4, c16
  1213. nop
  1214. FMUL a1, b4, t4
  1215. LDF [AO + 0 * SIZE], a1
  1216. FADD1 c01, t1, c01
  1217. nop
  1218. FMUL a2, b1, t1
  1219. nop
  1220. FADD3 c05, t2, c05
  1221. nop
  1222. FMUL a2, b2, t2
  1223. nop
  1224. FADD1 c09, t3, c09
  1225. nop
  1226. FMUL a2, b3, t3
  1227. nop
  1228. FADD3 c13, t4, c13
  1229. nop
  1230. FMUL a2, b4, t4
  1231. LDF [AO + 1 * SIZE], a2
  1232. FADD2 c02, t1, c02
  1233. nop
  1234. FMUL a3, b1, t1
  1235. nop
  1236. FADD4 c06, t2, c06
  1237. nop
  1238. FMUL a3, b2, t2
  1239. nop
  1240. FADD2 c10, t3, c10
  1241. nop
  1242. FMUL a3, b3, t3
  1243. nop
  1244. FADD4 c14, t4, c14
  1245. nop
  1246. FMUL a3, b4, t4
  1247. LDF [AO + 2 * SIZE], a3
  1248. FADD1 c03, t1, c03
  1249. nop
  1250. FMUL a4, b1, t1
  1251. LDF [BO + 0 * SIZE], b1
  1252. FADD3 c07, t2, c07
  1253. nop
  1254. FMUL a4, b2, t2
  1255. LDF [BO + 1 * SIZE], b2
  1256. FADD1 c11, t3, c11
  1257. nop
  1258. FMUL a4, b3, t3
  1259. LDF [BO + 2 * SIZE], b3
  1260. FADD3 c15, t4, c15
  1261. FMUL a4, b4, t4
  /* Repeat while L > 0; the b4 reload sits in the branch delay slot. */
  1262. bg,pt %icc, .LL26
  1263. LDF [BO + 3 * SIZE], b4
  /* End of the k accumulation for the two-row tile: reposition AO/BO where
     needed, fold the pending products, combine partial sums, and subtract
     the result from the packed right-hand-side values. */
  1264. .LL29:
  /* LN/RT: AO = AORIG + ((KK - 2) << (1 + ZBASE_SHIFT)), BO = B + the same
     offset. */
  1265. #if defined(LN) || defined(RT)
  1266. sub KK, 2, TEMP1
  1267. sll TEMP1, 1 + ZBASE_SHIFT, TEMP1
  1268. add AORIG, TEMP1, AO
  1269. add B, TEMP1, BO
  1270. #endif
  /* Fold in the last four products still held in t1..t4. */
  1271. FADD2 c04, t1, c04
  1272. FADD4 c08, t2, c08
  1273. FADD2 c12, t3, c12
  1274. FADD4 c16, t4, c16
  /* Collapse the sixteen partial sums into eight results:
     c01..c04 and c09..c12. */
  1275. FADD c01, c06, c01
  1276. FADD c02, c05, c02
  1277. FADD c03, c08, c03
  1278. FADD c04, c07, c04
  1279. FADD c09, c14, c09
  1280. FADD c10, c13, c10
  1281. FADD c11, c16, c11
  1282. FADD c12, c15, c12
  /* LN/LT: the eight values come from BO, in the order
     c01 c02 c09 c10 c03 c04 c11 c12. Each result becomes loaded - accumulated.
     NOTE(review): this assumes FSUB x, y, z computes z = x - y, the usual
     SPARC operand order; the macro is defined outside this chunk. */
  1283. #if defined(LN) || defined(LT)
  1284. LDF [BO + 0 * SIZE], a1
  1285. LDF [BO + 1 * SIZE], a2
  1286. LDF [BO + 2 * SIZE], a3
  1287. LDF [BO + 3 * SIZE], a4
  1288. LDF [BO + 4 * SIZE], b1
  1289. LDF [BO + 5 * SIZE], b2
  1290. LDF [BO + 6 * SIZE], b3
  1291. LDF [BO + 7 * SIZE], b4
  1292. FSUB a1, c01, c01
  1293. FSUB a2, c02, c02
  1294. FSUB a3, c09, c09
  1295. FSUB a4, c10, c10
  1296. FSUB b1, c03, c03
  1297. FSUB b2, c04, c04
  1298. FSUB b3, c11, c11
  1299. FSUB b4, c12, c12
  1300. #else
  /* RN/RT: the eight values come from AO, in the order
     c01 c02 c03 c04 c09 c10 c11 c12. */
  1301. LDF [AO + 0 * SIZE], a1
  1302. LDF [AO + 1 * SIZE], a2
  1303. LDF [AO + 2 * SIZE], a3
  1304. LDF [AO + 3 * SIZE], a4
  1305. LDF [AO + 4 * SIZE], b1
  1306. LDF [AO + 5 * SIZE], b2
  1307. LDF [AO + 6 * SIZE], b3
  1308. LDF [AO + 7 * SIZE], b4
  1309. FSUB a1, c01, c01
  1310. FSUB a2, c02, c02
  1311. FSUB a3, c03, c03
  1312. FSUB a4, c04, c04
  1313. FSUB b1, c09, c09
  1314. FSUB b2, c10, c10
  1315. FSUB b3, c11, c11
  1316. FSUB b4, c12, c12
  1317. #endif
  /* Triangular solve for the two-row tile; exactly one of LN/LT/RN/RT is
     expected to be defined. Every variant has the same three stages:
       1. scale one pair of results by a (re, im) factor pair,
       2. subtract that pair, times an off-diagonal pair, from the other pair,
       3. scale the updated pair by a second factor pair.
     Stages 1 and 3 multiply rather than divide, so the diagonal factors are
     presumably stored pre-inverted by the packing code - confirm there.
     NOTE(review): FADD2/FADD3/FADD4 are macros defined outside this chunk;
     the signs of the cross terms depend on them. */
  /* LN: factors come from AO. Solves c03/c04 and c11/c12 first, then
     eliminates into c01/c02 and c09/c10. */
  1318. #ifdef LN
  1319. LDF [AO + 6 * SIZE], a1
  1320. LDF [AO + 7 * SIZE], a2
  1321. LDF [AO + 4 * SIZE], a3
  1322. LDF [AO + 5 * SIZE], a4
  1323. LDF [AO + 0 * SIZE], b1
  1324. LDF [AO + 1 * SIZE], b2
  1325. FMUL a1, c03, t1
  1326. FMUL a2, c04, t2
  1327. FMUL a1, c04, t3
  1328. FMUL a2, c03, t4
  1329. FMUL a1, c11, t5
  1330. FMUL a2, c12, t6
  1331. FMUL a1, c12, t7
  1332. FMUL a2, c11, t8
  1333. FADD4 t1, t2, c03
  1334. FADD2 t3, t4, c04
  1335. FADD4 t5, t6, c11
  1336. FADD2 t7, t8, c12
  1337. FMUL a3, c03, t1
  1338. FMUL a3, c04, t2
  1339. FMUL a3, c11, t3
  1340. FMUL a3, c12, t4
  1341. FMUL a4, c04, t5
  1342. FMUL a4, c03, t6
  1343. FMUL a4, c12, t7
  1344. FMUL a4, c11, t8
  1345. FSUB c01, t1, c01
  1346. FSUB c02, t2, c02
  1347. FSUB c09, t3, c09
  1348. FSUB c10, t4, c10
  1349. FADD2 c01, t5, c01
  1350. FADD4 c02, t6, c02
  1351. FADD2 c09, t7, c09
  1352. FADD4 c10, t8, c10
  1353. FMUL b1, c01, t1
  1354. FMUL b2, c02, t2
  1355. FMUL b1, c02, t3
  1356. FMUL b2, c01, t4
  1357. FMUL b1, c09, t5
  1358. FMUL b2, c10, t6
  1359. FMUL b1, c10, t7
  1360. FMUL b2, c09, t8
  1361. FADD4 t1, t2, c01
  1362. FADD2 t3, t4, c02
  1363. FADD4 t5, t6, c09
  1364. FADD2 t7, t8, c10
  1365. #endif
  /* LT: factors come from AO. Solves c01/c02 and c09/c10 first, then
     eliminates into c03/c04 and c11/c12. */
  1366. #ifdef LT
  1367. LDF [AO + 0 * SIZE], a1
  1368. LDF [AO + 1 * SIZE], a2
  1369. LDF [AO + 2 * SIZE], a3
  1370. LDF [AO + 3 * SIZE], a4
  1371. LDF [AO + 6 * SIZE], b1
  1372. LDF [AO + 7 * SIZE], b2
  1373. FMUL a1, c01, t1
  1374. FMUL a2, c02, t2
  1375. FMUL a1, c02, t3
  1376. FMUL a2, c01, t4
  1377. FMUL a1, c09, t5
  1378. FMUL a2, c10, t6
  1379. FMUL a1, c10, t7
  1380. FMUL a2, c09, t8
  1381. FADD4 t1, t2, c01
  1382. FADD2 t3, t4, c02
  1383. FADD4 t5, t6, c09
  1384. FADD2 t7, t8, c10
  1385. FMUL a3, c01, t1
  1386. FMUL a3, c02, t2
  1387. FMUL a3, c09, t3
  1388. FMUL a3, c10, t4
  1389. FMUL a4, c02, t5
  1390. FMUL a4, c01, t6
  1391. FMUL a4, c10, t7
  1392. FMUL a4, c09, t8
  1393. FSUB c03, t1, c03
  1394. FSUB c04, t2, c04
  1395. FSUB c11, t3, c11
  1396. FSUB c12, t4, c12
  1397. FADD2 c03, t5, c03
  1398. FADD4 c04, t6, c04
  1399. FADD2 c11, t7, c11
  1400. FADD4 c12, t8, c12
  1401. FMUL b1, c03, t1
  1402. FMUL b2, c04, t2
  1403. FMUL b1, c04, t3
  1404. FMUL b2, c03, t4
  1405. FMUL b1, c11, t5
  1406. FMUL b2, c12, t6
  1407. FMUL b1, c12, t7
  1408. FMUL b2, c11, t8
  1409. FADD4 t1, t2, c03
  1410. FADD2 t3, t4, c04
  1411. FADD4 t5, t6, c11
  1412. FADD2 t7, t8, c12
  1413. #endif
  /* RN: factors come from BO. Solves c01..c04 first, then eliminates into
     c09..c12. */
  1414. #ifdef RN
  1415. LDF [BO + 0 * SIZE], a1
  1416. LDF [BO + 1 * SIZE], a2
  1417. LDF [BO + 2 * SIZE], a3
  1418. LDF [BO + 3 * SIZE], a4
  1419. LDF [BO + 6 * SIZE], b1
  1420. LDF [BO + 7 * SIZE], b2
  1421. FMUL a1, c01, t1
  1422. FMUL a2, c02, t2
  1423. FMUL a1, c02, t3
  1424. FMUL a2, c01, t4
  1425. FMUL a1, c03, t5
  1426. FMUL a2, c04, t6
  1427. FMUL a1, c04, t7
  1428. FMUL a2, c03, t8
  1429. FADD4 t1, t2, c01
  1430. FADD3 t3, t4, c02
  1431. FADD4 t5, t6, c03
  1432. FADD3 t7, t8, c04
  1433. FMUL a3, c01, t1
  1434. FMUL a3, c02, t2
  1435. FMUL a3, c03, t3
  1436. FMUL a3, c04, t4
  1437. FMUL a4, c02, t5
  1438. FMUL a4, c01, t6
  1439. FMUL a4, c04, t7
  1440. FMUL a4, c03, t8
  1441. FSUB c09, t1, c09
  1442. FSUB c10, t2, c10
  1443. FSUB c11, t3, c11
  1444. FSUB c12, t4, c12
  1445. FADD3 c09, t5, c09
  1446. FADD4 c10, t6, c10
  1447. FADD3 c11, t7, c11
  1448. FADD4 c12, t8, c12
  1449. FMUL b1, c09, t1
  1450. FMUL b2, c10, t2
  1451. FMUL b1, c10, t3
  1452. FMUL b2, c09, t4
  1453. FMUL b1, c11, t5
  1454. FMUL b2, c12, t6
  1455. FMUL b1, c12, t7
  1456. FMUL b2, c11, t8
  1457. FADD4 t1, t2, c09
  1458. FADD3 t3, t4, c10
  1459. FADD4 t5, t6, c11
  1460. FADD3 t7, t8, c12
  1461. #endif
  /* RT: factors come from BO. Solves c09..c12 first, then eliminates into
     c01..c04. */
  1462. #ifdef RT
  1463. LDF [BO + 6 * SIZE], a1
  1464. LDF [BO + 7 * SIZE], a2
  1465. LDF [BO + 4 * SIZE], a3
  1466. LDF [BO + 5 * SIZE], a4
  1467. LDF [BO + 0 * SIZE], b1
  1468. LDF [BO + 1 * SIZE], b2
  1469. FMUL a1, c09, t1
  1470. FMUL a2, c10, t2
  1471. FMUL a1, c10, t3
  1472. FMUL a2, c09, t4
  1473. FMUL a1, c11, t5
  1474. FMUL a2, c12, t6
  1475. FMUL a1, c12, t7
  1476. FMUL a2, c11, t8
  1477. FADD4 t1, t2, c09
  1478. FADD3 t3, t4, c10
  1479. FADD4 t5, t6, c11
  1480. FADD3 t7, t8, c12
  1481. FMUL a3, c09, t1
  1482. FMUL a3, c10, t2
  1483. FMUL a3, c11, t3
  1484. FMUL a3, c12, t4
  1485. FMUL a4, c10, t5
  1486. FMUL a4, c09, t6
  1487. FMUL a4, c12, t7
  1488. FMUL a4, c11, t8
  1489. FSUB c01, t1, c01
  1490. FSUB c02, t2, c02
  1491. FSUB c03, t3, c03
  1492. FSUB c04, t4, c04
  1493. FADD3 c01, t5, c01
  1494. FADD4 c02, t6, c02
  1495. FADD3 c03, t7, c03
  1496. FADD4 c04, t8, c04
  1497. FMUL b1, c01, t1
  1498. FMUL b2, c02, t2
  1499. FMUL b1, c02, t3
  1500. FMUL b2, c01, t4
  1501. FMUL b1, c03, t5
  1502. FMUL b2, c04, t6
  1503. FMUL b1, c04, t7
  1504. FMUL b2, c03, t8
  1505. FADD4 t1, t2, c01
  1506. FADD3 t3, t4, c02
  1507. FADD4 t5, t6, c03
  1508. FADD3 t7, t8, c04
  1509. #endif
  /* Write back the solved two-row tile: once into the packed buffer it was
     read from and once into C, then advance the pointers and KK for the next
     tile. */
  /* LN walks C1/C2 downwards: pre-decrement by 4 * SIZE before storing. */
  1510. #ifdef LN
  1511. add C1, -4 * SIZE, C1
  1512. add C2, -4 * SIZE, C2
  1513. #endif
  /* LN/LT store back to BO, RN/RT store back to AO, in the same element
     order used when the values were loaded after .LL29. */
  1514. #if defined(LN) || defined(LT)
  1515. STF c01, [BO + 0 * SIZE]
  1516. STF c02, [BO + 1 * SIZE]
  1517. STF c09, [BO + 2 * SIZE]
  1518. STF c10, [BO + 3 * SIZE]
  1519. STF c03, [BO + 4 * SIZE]
  1520. STF c04, [BO + 5 * SIZE]
  1521. STF c11, [BO + 6 * SIZE]
  1522. STF c12, [BO + 7 * SIZE]
  1523. #else
  1524. STF c01, [AO + 0 * SIZE]
  1525. STF c02, [AO + 1 * SIZE]
  1526. STF c03, [AO + 2 * SIZE]
  1527. STF c04, [AO + 3 * SIZE]
  1528. STF c09, [AO + 4 * SIZE]
  1529. STF c10, [AO + 5 * SIZE]
  1530. STF c11, [AO + 6 * SIZE]
  1531. STF c12, [AO + 7 * SIZE]
  1532. #endif
  /* Results to C: c01..c04 through C1, c09..c12 through C2. */
  1533. STF c01, [C1 + 0 * SIZE]
  1534. STF c02, [C1 + 1 * SIZE]
  1535. STF c03, [C1 + 2 * SIZE]
  1536. STF c04, [C1 + 3 * SIZE]
  1537. STF c09, [C2 + 0 * SIZE]
  1538. STF c10, [C2 + 1 * SIZE]
  1539. STF c11, [C2 + 2 * SIZE]
  1540. STF c12, [C2 + 3 * SIZE]
  /* The solve used t1..t4 as scratch; clear them so the next accumulation
     starts with zero pending products. */
  1541. FMOV FZERO, t1
  1542. FMOV FZERO, t2
  1543. FMOV FZERO, t3
  1544. FMOV FZERO, t4
  /* All variants except LN walk C1/C2 upwards. */
  1545. #ifndef LN
  1546. add C1, 4 * SIZE, C1
  1547. add C2, 4 * SIZE, C2
  1548. #endif
  /* RT: advance AORIG by K << (1 + ZBASE_SHIFT) for the next tile. */
  1549. #ifdef RT
  1550. sll K, 1 + ZBASE_SHIFT, TEMP1
  1551. add AORIG, TEMP1, AORIG
  1552. #endif
  /* LT/RN: skip the remaining K - KK steps in both AO and BO. */
  1553. #if defined(LT) || defined(RN)
  1554. sub K, KK, TEMP1
  1555. sll TEMP1, 1 + ZBASE_SHIFT, TEMP1
  1556. add AO, TEMP1, AO
  1557. add BO, TEMP1, BO
  1558. #endif
  /* KK moves by the tile height of 2: up for LT, down for LN. */
  1559. #ifdef LT
  1560. add KK, 2, KK
  1561. #endif
  1562. #ifdef LN
  1563. sub KK, 2, KK
  1564. #endif
  /* Next two-row tile while I > 0; c01 is cleared in the delay slot on both
     paths. */
  1565. add I, -1, I
  1566. cmp I, 0
  1567. bg,pt %icc, .LL21
  1568. FMOV FZERO, c01
  /* Single-row tail: runs once when M is odd (I = M & 1), otherwise skips
     to .LL99. The accumulator clears are interleaved with the set-up. */
  1569. .LL50:
  1570. and M, 1, I
  1571. FMOV FZERO, c02
  1572. cmp I, 0
  1573. FMOV FZERO, t1
  /* c04 is cleared in the delay slot on both paths. */
  1574. ble,pn %icc, .LL99
  1575. FMOV FZERO, c04
  /* LT/RN: BO restarts at B; L = KK >> 2 passes of the unrolled loop. */
  1576. #if defined(LT) || defined(RN)
  1577. sra KK, 2, L
  1578. mov B, BO
  1579. cmp L, 0
  1580. #else
  /* LN: move AORIG back by K << ZBASE_SHIFT, half the two-row step. */
  1581. #ifdef LN
  1582. sll K, 0 + ZBASE_SHIFT, TEMP1
  1583. sub AORIG, TEMP1, AORIG
  1584. #endif
  /* LN/RT: the A offset uses shift ZBASE_SHIFT and the B offset uses
     1 + ZBASE_SHIFT. TEMP1 is then reused for the k extent K - KK, which
     .LL55 needs, and L = (K - KK) >> 2. */
  1585. sll KK, 0 + ZBASE_SHIFT, TEMP1
  1586. sll KK, 1 + ZBASE_SHIFT, TEMP2
  1587. add AORIG, TEMP1, AO
  1588. add B, TEMP2, BO
  1589. sub K, KK, TEMP1
  1590. sra TEMP1, 2, L
  1591. cmp L, 0
  1592. #endif
  /* Preload a1..a4 and b1..b4 and clear the remaining accumulators. The
     cmp L, 0 result above is consumed by the ble at the end of this block.
     NOTE(review): this relies on LDF/FMOV (macros defined outside this chunk)
     not touching the integer condition codes - confirm. */
  1593. LDF [AO + 0 * SIZE], a1
  1594. FMOV FZERO, t2
  1595. LDF [BO + 0 * SIZE], b1
  1596. FMOV FZERO, c06
  1597. LDF [AO + 1 * SIZE], a2
  1598. FMOV FZERO, t3
  1599. LDF [BO + 1 * SIZE], b2
  1600. FMOV FZERO, c08
  1601. LDF [AO + 2 * SIZE], a3
  1602. FMOV FZERO, t4
  1603. LDF [BO + 2 * SIZE], b3
  1604. FMOV FZERO, c01
  1605. LDF [AO + 3 * SIZE], a4
  1606. FMOV FZERO, c03
  1607. LDF [BO + 3 * SIZE], b4
  1608. FMOV FZERO, c05
  /* L <= 0: skip the unrolled loop; c07 is cleared in the delay slot. */
  1609. ble,pn %icc, .LL55
  1610. FMOV FZERO, c07
  /* Unrolled k-loop for the single-row tile. Each pass advances AO by
     8 * SIZE and BO by 16 * SIZE, four times the step of the remainder loop
     .LL56. Products are pipelined through t1..t4 into c01..c08; the last four
     are still pending in t1..t4 on exit.
     NOTE(review): FADD1..FADD4 are macros defined outside this chunk. */
  1611. .LL52:
  1612. FADD2 c02, t1, c02
  /* Pointers are bumped first, so the loads below use offsets relative to
     the advanced AO/BO. */
  1613. add AO, 8 * SIZE, AO
  1614. prefetch [AO + APREFETCHSIZE * SIZE], 0
  1615. FMUL a1, b1, t1
  1616. add BO, 16 * SIZE, BO
  1617. FADD4 c04, t2, c04
  1618. add L, -1, L
  1619. FMUL a1, b2, t2
  1620. FADD2 c06, t3, c06
  /* Loop test; consumed by the bg at the bottom of the pass. */
  1621. cmp L, 0
  1622. FMUL a1, b3, t3
  1623. FADD4 c08, t4, c08
  1624. FMUL a1, b4, t4
  1625. LDF [AO - 4 * SIZE], a1
  1626. FADD1 c01, t1, c01
  1627. FMUL a2, b1, t1
  1628. LDF [BO - 12 * SIZE], b1
  1629. FADD3 c03, t2, c03
  1630. FMUL a2, b2, t2
  1631. LDF [BO - 11 * SIZE], b2
  1632. FADD1 c05, t3, c05
  1633. FMUL a2, b3, t3
  1634. LDF [BO - 10 * SIZE], b3
  1635. FADD3 c07, t4, c07
  1636. FMUL a2, b4, t4
  1637. LDF [BO - 9 * SIZE], b4
  /* Second k step of the pass: a3/a4 against the reloaded b1..b4. */
  1638. FADD2 c02, t1, c02
  1639. FMUL a3, b1, t1
  1640. LDF [AO - 3 * SIZE], a2
  1641. FADD4 c04, t2, c04
  1642. FMUL a3, b2, t2
  1643. FADD2 c06, t3, c06
  1644. FMUL a3, b3, t3
  1645. FADD4 c08, t4, c08
  1646. FMUL a3, b4, t4
  1647. LDF [AO - 2 * SIZE], a3
  1648. FADD1 c01, t1, c01
  1649. FMUL a4, b1, t1
  1650. LDF [BO - 8 * SIZE], b1
  1651. FADD3 c03, t2, c03
  1652. FMUL a4, b2, t2
  1653. LDF [BO - 7 * SIZE], b2
  1654. FADD1 c05, t3, c05
  1655. FMUL a4, b3, t3
  1656. LDF [BO - 6 * SIZE], b3
  1657. FADD3 c07, t4, c07
  1658. FMUL a4, b4, t4
  1659. LDF [BO - 5 * SIZE], b4
  /* Third k step: a1/a2 again. */
  1660. FADD2 c02, t1, c02
  1661. FMUL a1, b1, t1
  1662. LDF [AO - 1 * SIZE], a4
  1663. FADD4 c04, t2, c04
  1664. FMUL a1, b2, t2
  1665. FADD2 c06, t3, c06
  1666. FMUL a1, b3, t3
  1667. FADD4 c08, t4, c08
  1668. FMUL a1, b4, t4
  1669. LDF [AO + 0 * SIZE], a1
  1670. FADD1 c01, t1, c01
  1671. FMUL a2, b1, t1
  1672. LDF [BO - 4 * SIZE], b1
  1673. FADD3 c03, t2, c03
  1674. FMUL a2, b2, t2
  1675. LDF [BO - 3 * SIZE], b2
  1676. FADD1 c05, t3, c05
  1677. FMUL a2, b3, t3
  1678. LDF [BO - 2 * SIZE], b3
  1679. FADD3 c07, t4, c07
  1680. FMUL a2, b4, t4
  1681. LDF [BO - 1 * SIZE], b4
  /* Fourth k step: a3/a4; the loads now fetch operands for the next pass
     (or for .LL56). */
  1682. FADD2 c02, t1, c02
  1683. FMUL a3, b1, t1
  1684. LDF [AO + 1 * SIZE], a2
  1685. FADD4 c04, t2, c04
  1686. FMUL a3, b2, t2
  1687. FADD2 c06, t3, c06
  1688. FMUL a3, b3, t3
  1689. FADD4 c08, t4, c08
  1690. FMUL a3, b4, t4
  1691. LDF [AO + 2 * SIZE], a3
  1692. FADD1 c01, t1, c01
  1693. FMUL a4, b1, t1
  1694. LDF [BO + 0 * SIZE], b1
  1695. FADD3 c03, t2, c03
  1696. FMUL a4, b2, t2
  1697. LDF [BO + 1 * SIZE], b2
  1698. FADD1 c05, t3, c05
  1699. FMUL a4, b3, t3
  1700. LDF [BO + 2 * SIZE], b3
  1701. FADD3 c07, t4, c07
  1702. FMUL a4, b4, t4
  1703. LDF [BO + 3 * SIZE], b4
  /* Repeat while L > 0; the a4 reload sits in the branch delay slot. */
  1704. bg,pt %icc, .LL52
  1705. LDF [AO + 3 * SIZE], a4
  /* k remainder for the single-row tile: the low two bits of the k extent,
     which is KK for LT/RN and the K - KK value left in TEMP1 by the .LL50
     set-up for LN/RT. */
  1706. .LL55:
  1707. #if defined(LT) || defined(RN)
  1708. and KK, 3, L
  1709. #else
  1710. and TEMP1, 3, L
  1711. #endif
  1712. cmp L, 0
  /* This branch carries the annul bit, unlike the matching branch before
     .LL26; the delay slot holds only a nop, so the difference has no effect. */
  1713. ble,a,pn %icc, .LL59
  1714. nop
  /* One k step per pass: AO advances by 2 * SIZE and BO by 4 * SIZE. */
  1715. .LL56:
  1716. FADD2 c02, t1, c02
  1717. add AO, 2 * SIZE, AO
  1718. FMUL a1, b1, t1
  1719. add L, -1, L
  1720. add BO, 4 * SIZE, BO
  1721. FADD4 c04, t2, c04
  /* Loop test; consumed by the bg at the bottom of the pass. */
  1722. cmp L, 0
  1723. FMUL a1, b2, t2
  1724. FADD2 c06, t3, c06
  1725. FMUL a1, b3, t3
  1726. FADD4 c08, t4, c08
  1727. FMUL a1, b4, t4
  1728. LDF [AO + 0 * SIZE], a1
  1729. FADD1 c01, t1, c01
  1730. FMUL a2, b1, t1
  1731. LDF [BO + 0 * SIZE], b1
  1732. FADD3 c03, t2, c03
  1733. FMUL a2, b2, t2
  1734. LDF [BO + 1 * SIZE], b2
  1735. FADD1 c05, t3, c05
  1736. FMUL a2, b3, t3
  1737. LDF [BO + 2 * SIZE], b3
  1738. FADD3 c07, t4, c07
  1739. FMUL a2, b4, t4
  1740. LDF [BO + 3 * SIZE], b4
  /* Repeat while L > 0; the a2 reload sits in the branch delay slot. */
  1741. bg,pt %icc, .LL56
  1742. LDF [AO + 1 * SIZE], a2
  /* End of the k accumulation for the single-row tile: reposition AO/BO
     where needed, fold the pending products, combine partial sums, and
     subtract the result from the packed right-hand-side values. */
  1743. .LL59:
  /* LN steps back by the tile height of 1, RT by 2. The A offset uses shift
     ZBASE_SHIFT and the B offset uses 1 + ZBASE_SHIFT. */
  1744. #if defined(LN) || defined(RT)
  1745. #ifdef LN
  1746. sub KK, 1, TEMP1
  1747. #else
  1748. sub KK, 2, TEMP1
  1749. #endif
  1750. sll TEMP1, 0 + ZBASE_SHIFT, TEMP2
  1751. sll TEMP1, 1 + ZBASE_SHIFT, TEMP1
  1752. add AORIG, TEMP2, AO
  1753. add B, TEMP1, BO
  1754. #endif
  /* Fold in the last four products still held in t1..t4. */
  1755. FADD2 c02, t1, c02
  1756. FADD4 c04, t2, c04
  1757. FADD2 c06, t3, c06
  1758. FADD4 c08, t4, c08
  /* Collapse the eight partial sums into four results: c01, c02, c05, c06. */
  1759. FADD c01, c04, c01
  1760. FADD c02, c03, c02
  1761. FADD c05, c08, c05
  1762. FADD c06, c07, c06
  /* LN/LT read the four values from BO, RN/RT from AO; each result becomes
     loaded - accumulated.
     NOTE(review): this assumes FSUB x, y, z computes z = x - y, the usual
     SPARC operand order; the macro is defined outside this chunk. */
  1763. #if defined(LN) || defined(LT)
  1764. LDF [BO + 0 * SIZE], a1
  1765. LDF [BO + 1 * SIZE], a2
  1766. LDF [BO + 2 * SIZE], a3
  1767. LDF [BO + 3 * SIZE], a4
  1768. FSUB a1, c01, c01
  1769. FSUB a2, c02, c02
  1770. FSUB a3, c05, c05
  1771. FSUB a4, c06, c06
  1772. #else
  1773. LDF [AO + 0 * SIZE], a1
  1774. LDF [AO + 1 * SIZE], a2
  1775. LDF [AO + 2 * SIZE], a3
  1776. LDF [AO + 3 * SIZE], a4
  1777. FSUB a1, c01, c01
  1778. FSUB a2, c02, c02
  1779. FSUB a3, c05, c05
  1780. FSUB a4, c06, c06
  1781. #endif
  /* Triangular solve for the single-row tile; exactly one of LN/LT/RN/RT is
     expected to be defined. The factors are multiplied in, not divided, so
     the diagonal entries are presumably stored pre-inverted - confirm against
     the packing code.
     NOTE(review): FADD2/FADD3/FADD4 are macros defined outside this chunk;
     the signs of the cross terms depend on them. */
  /* LN: with a single row there is only one factor pair (AO[0], AO[1]);
     both c01/c02 and c05/c06 are scaled by it. */
  1782. #ifdef LN
  1783. LDF [AO + 0 * SIZE], a1
  1784. LDF [AO + 1 * SIZE], a2
  1785. FMUL a1, c01, t1
  1786. FMUL a2, c02, t2
  1787. FMUL a1, c02, t3
  1788. FMUL a2, c01, t4
  1789. FMUL a1, c05, t5
  1790. FMUL a2, c06, t6
  1791. FMUL a1, c06, t7
  1792. FMUL a2, c05, t8
  1793. FADD4 t1, t2, c01
  1794. FADD2 t3, t4, c02
  1795. FADD4 t5, t6, c05
  1796. FADD2 t7, t8, c06
  1797. #endif
  /* LT: same instruction sequence as the LN case above. */
  1798. #ifdef LT
  1799. LDF [AO + 0 * SIZE], a1
  1800. LDF [AO + 1 * SIZE], a2
  1801. FMUL a1, c01, t1
  1802. FMUL a2, c02, t2
  1803. FMUL a1, c02, t3
  1804. FMUL a2, c01, t4
  1805. FMUL a1, c05, t5
  1806. FMUL a2, c06, t6
  1807. FMUL a1, c06, t7
  1808. FMUL a2, c05, t8
  1809. FADD4 t1, t2, c01
  1810. FADD2 t3, t4, c02
  1811. FADD4 t5, t6, c05
  1812. FADD2 t7, t8, c06
  1813. #endif
  /* RN: scale c01/c02 by BO[0..1], eliminate into c05/c06 with BO[2..3],
     then scale c05/c06 by BO[6..7]. */
  1814. #ifdef RN
  1815. LDF [BO + 0 * SIZE], a1
  1816. LDF [BO + 1 * SIZE], a2
  1817. LDF [BO + 2 * SIZE], a3
  1818. LDF [BO + 3 * SIZE], a4
  1819. LDF [BO + 6 * SIZE], b1
  1820. LDF [BO + 7 * SIZE], b2
  1821. FMUL a1, c01, t1
  1822. FMUL a2, c02, t2
  1823. FMUL a1, c02, t3
  1824. FMUL a2, c01, t4
  1825. FADD4 t1, t2, c01
  1826. FADD3 t3, t4, c02
  1827. FMUL a3, c01, t1
  1828. FMUL a3, c02, t2
  1829. FMUL a4, c02, t3
  1830. FMUL a4, c01, t4
  1831. FSUB c05, t1, c05
  1832. FSUB c06, t2, c06
  1833. FADD3 c05, t3, c05
  1834. FADD4 c06, t4, c06
  1835. FMUL b1, c05, t1
  1836. FMUL b2, c06, t2
  1837. FMUL b1, c06, t3
  1838. FMUL b2, c05, t4
  1839. FADD4 t1, t2, c05
  1840. FADD3 t3, t4, c06
  1841. #endif
  /* RT: the reverse order - scale c05/c06 by BO[6..7], eliminate into
     c01/c02 with BO[4..5], then scale c01/c02 by BO[0..1]. */
  1842. #ifdef RT
  1843. LDF [BO + 6 * SIZE], a1
  1844. LDF [BO + 7 * SIZE], a2
  1845. LDF [BO + 4 * SIZE], a3
  1846. LDF [BO + 5 * SIZE], a4
  1847. LDF [BO + 0 * SIZE], b1
  1848. LDF [BO + 1 * SIZE], b2
  1849. FMUL a1, c05, t1
  1850. FMUL a2, c06, t2
  1851. FMUL a1, c06, t3
  1852. FMUL a2, c05, t4
  1853. FADD4 t1, t2, c05
  1854. FADD3 t3, t4, c06
  1855. FMUL a3, c05, t1
  1856. FMUL a3, c06, t2
  1857. FMUL a4, c06, t3
  1858. FMUL a4, c05, t4
  1859. FSUB c01, t1, c01
  1860. FSUB c02, t2, c02
  1861. FADD3 c01, t3, c01
  1862. FADD4 c02, t4, c02
  1863. FMUL b1, c01, t1
  1864. FMUL b2, c02, t2
  1865. FMUL b1, c02, t3
  1866. FMUL b2, c01, t4
  1867. FADD4 t1, t2, c01
  1868. FADD3 t3, t4, c02
  1869. #endif
  /* Write back the solved single-row tile: once into the packed buffer it
     was read from and once into C, then advance the pointers and KK. */
  /* LN walks C1/C2 downwards: pre-decrement by 2 * SIZE before storing. */
  1870. #ifdef LN
  1871. add C1, -2 * SIZE, C1
  1872. add C2, -2 * SIZE, C2
  1873. #endif
  /* LN/LT store back to BO, RN/RT store back to AO. */
  1874. #if defined(LN) || defined(LT)
  1875. STF c01, [BO + 0 * SIZE]
  1876. STF c02, [BO + 1 * SIZE]
  1877. STF c05, [BO + 2 * SIZE]
  1878. STF c06, [BO + 3 * SIZE]
  1879. #else
  1880. STF c01, [AO + 0 * SIZE]
  1881. STF c02, [AO + 1 * SIZE]
  1882. STF c05, [AO + 2 * SIZE]
  1883. STF c06, [AO + 3 * SIZE]
  1884. #endif
  /* Results to C: c01/c02 through C1, c05/c06 through C2. */
  1885. STF c01, [C1 + 0 * SIZE]
  1886. STF c02, [C1 + 1 * SIZE]
  1887. STF c05, [C2 + 0 * SIZE]
  1888. STF c06, [C2 + 1 * SIZE]
  /* The solve used t1..t4 as scratch; clear them again. */
  1889. FMOV FZERO, t1
  1890. FMOV FZERO, t2
  1891. FMOV FZERO, t3
  1892. FMOV FZERO, t4
  /* All variants except LN walk C1/C2 upwards. */
  1893. #ifndef LN
  1894. add C1, 2 * SIZE, C1
  1895. add C2, 2 * SIZE, C2
  1896. #endif
  /* RT: advance AORIG by K << ZBASE_SHIFT. */
  1897. #ifdef RT
  1898. sll K, 0 + ZBASE_SHIFT, TEMP1
  1899. add AORIG, TEMP1, AORIG
  1900. #endif
  /* LT/RN: skip the remaining K - KK steps; the A stride uses shift
     ZBASE_SHIFT and the B stride uses 1 + ZBASE_SHIFT. */
  1901. #if defined(LT) || defined(RN)
  1902. sub K, KK, TEMP1
  1903. sll TEMP1, 0 + ZBASE_SHIFT, TEMP2
  1904. sll TEMP1, 1 + ZBASE_SHIFT, TEMP1
  1905. add AO, TEMP2, AO
  1906. add BO, TEMP1, BO
  1907. #endif
  /* KK moves by the tile height of 1: up for LT, down for LN. */
  1908. #ifdef LT
  1909. add KK, 1, KK
  1910. #endif
  1911. #ifdef LN
  1912. sub KK, 1, KK
  1913. #endif
  /* Bottom of the outer J loop: move B and KK on for the next pass, then
     loop or fall through to the return. */
  1914. .LL99:
  /* LN: advance B by K << (1 + ZBASE_SHIFT). */
  1915. #ifdef LN
  1916. sll K, 1 + ZBASE_SHIFT, TEMP1
  1917. add B, TEMP1, B
  1918. #endif
  /* LT/RN: BO has already been walked past this panel, so it becomes the
     new B. */
  1919. #if defined(LT) || defined(RN)
  1920. mov BO, B
  1921. #endif
  /* KK moves by 2 per J pass: up for RN, down for RT. */
  1922. #ifdef RN
  1923. add KK, 2, KK
  1924. #endif
  1925. #ifdef RT
  1926. sub KK, 2, KK
  1927. #endif
  /* Next J pass while J > 0. */
  1928. add J, -1, J
  1929. cmp J, 0
  1930. bg,pt %icc, .LL11
  1931. nop
  /* Common exit. The clr %o0 executes in the delay slot of return, after the
     register window has been restored, so it sets the caller-visible %o0
     (the return value) to 0. */
  1932. .LL999:
  1933. return %i7 + 8
  1934. clr %o0
  1935. EPILOGUE
  1935. EPILOGUE