You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_LT.S 37 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is included; presumably it selects the
     assembler-only parts of that header -- TODO confirm against common.h. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer-register aliases used by the kernel body that follows.
     M, N and K are never loaded in the visible code; B, C, LDC and OFFSET are
     loaded from the stack frame right after SAVESP in every build variant, and
     A is loaded from the stack only in the 32-bit DOUBLE build. */
  40. #define M %i0 /* size M: I = M >> 1 passes of .LL21, then (M & 1) is handled at .LL50 */
  41. #define N %i1 /* size N: J = N >> 1 is computed just before .LL11 */
  42. #define K %i2 /* size K: used for strides (K << (1 + ZBASE_SHIFT)) and the K - KK trip count */
  43. #define A %i5 /* base pointer of the A operand; source of AO (or AORIG for LN/RT) */
  44. #define B %i3 /* base pointer of the B operand; source of BO */
  45. #define C %i4 /* base pointer of the C output; source of C1 and C2 */
  46. #define LDC %o0 /* stride between C1 and C2; shifted left by ZBASE_SHIFT right after loading */
  47. #define AO %o1 /* running pointer into A, advanced inside the inner loops */
  48. #define BO %o2 /* running pointer into B, advanced inside the inner loops */
  49. #define I %o3 /* loop counter derived from M (M >> 1, later M & 1) */
  50. #define J %o4 /* loop counter derived from N (N >> 1) */
  51. #define L %o5 /* inner-loop counter: a count >> 2 for the unrolled loop, then count & 3 for the tail */
  52. #define C1 %l0 /* first output pointer, initialised as C */
  53. #define C2 %l1 /* second output pointer, initialised as C + LDC */
  54. #define OFFSET %l2 /* offset argument loaded from the stack; seeds KK */
  55. #define KK %l3 /* running offset derived from OFFSET; +2 per .LL21 pass for LT, -2 for LN */
  56. #define TEMP1 %l4 /* scratch for address and count arithmetic */
  57. #define TEMP2 %l5 /* second scratch register (used in the .LL50 address setup) */
  58. #define AORIG %l6 /* copy of A kept by the LN/RT variants; AO is recomputed from it */
  /* Floating-point register aliases.  The same symbolic names are mapped onto
     two register tables: the DOUBLE build uses even-numbered registers
     %f0..%f62, the other build uses consecutive registers %f0..%f31.
       c01..c16  accumulators, cleared from FZERO before the inner loops
       t1..t4    products written by FMUL and folded into the accumulators
       a1..a5    operands loaded through AO
       b1..b5    operands loaded through BO
       FZERO     register that the kernel clears once with FCLR and then only
                 reads as the source of FMOV FZERO, ... */
  59. #ifdef DOUBLE
  60. #define c01 %f0
  61. #define c02 %f2
  62. #define c03 %f4
  63. #define c04 %f6
  64. #define c05 %f8
  65. #define c06 %f10
  66. #define c07 %f12
  67. #define c08 %f14
  68. #define c09 %f16
  69. #define c10 %f18
  70. #define c11 %f20
  71. #define c12 %f22
  72. #define c13 %f24
  73. #define c14 %f26
  74. #define c15 %f28
  75. #define c16 %f30
  76. #define t1 %f32
  77. #define t2 %f34
  78. #define t3 %f36
  79. #define t4 %f38
  80. #define a1 %f40
  81. #define a2 %f42
  82. #define a3 %f44
  83. #define a4 %f46
  84. #define a5 %f62 /* extra A operand, preloaded from [AO + 4 * SIZE] */
  85. #define b1 %f48
  86. #define b2 %f50
  87. #define b3 %f52
  88. #define b4 %f54
  89. #define b5 %f56 /* extra B operand, preloaded from [BO + 4 * SIZE] */
  /* NOTE(review): the DOUBLE build clears FZERO with FCLR(27); presumably 27 is
     the encoded register number of %f58 -- confirm against FCLR in common.h. */
  90. #define FZERO %f58
  91. #else
  92. #define c01 %f0
  93. #define c02 %f1
  94. #define c03 %f2
  95. #define c04 %f3
  96. #define c05 %f4
  97. #define c06 %f5
  98. #define c07 %f6
  99. #define c08 %f7
  100. #define c09 %f8
  101. #define c10 %f9
  102. #define c11 %f10
  103. #define c12 %f11
  104. #define c13 %f12
  105. #define c14 %f13
  106. #define c15 %f14
  107. #define c16 %f15
  108. #define t1 %f16
  109. #define t2 %f17
  110. #define t3 %f18
  111. #define t4 %f19
  112. #define a1 %f20
  113. #define a2 %f21
  114. #define a3 %f22
  115. #define a4 %f23
  116. #define a5 %f31 /* extra A operand, preloaded from [AO + 4 * SIZE] */
  117. #define b1 %f24
  118. #define b2 %f25
  119. #define b3 %f26
  120. #define b4 %f27
  121. #define b5 %f28 /* extra B operand, preloaded from [BO + 4 * SIZE] */
  122. #define FZERO %f29 /* cleared with FCLR(29) in this build */
  123. #endif
  /* t5..t8 reuse the registers of c13..c16.  The kernel adds c13..c16 into
     c09..c12 at .LL29 before the solve step writes t5..t8, so the accumulator
     values are no longer needed by then. */
  124. #define t5 c13
  125. #define t6 c14
  126. #define t7 c15
  127. #define t8 c16
  /* FADD1..FADD4 are the four accumulate operations used by the inner loops and
     the solve step; each expands to FADD or FSUB depending on the build:
       CONJ undefined           : FADD4 is FSUB, the other three are FADD
       CONJ with LN or LT       : FADD2 is FSUB, the other three are FADD
       CONJ with RN or RT       : FADD3 is FSUB, the other three are FADD
     If CONJ is defined and none of LN/LT/RN/RT is, the macros stay undefined.
     NOTE(review): presumably CONJ selects the conjugated variant of the complex
     kernel -- confirm against the build rules that define it. */
  128. #ifndef CONJ
  129. #define FADD1 FADD
  130. #define FADD2 FADD
  131. #define FADD3 FADD
  132. #define FADD4 FSUB
  133. #else
  134. #if defined(LN) || defined(LT)
  135. #define FADD1 FADD
  136. #define FADD2 FSUB
  137. #define FADD3 FADD
  138. #define FADD4 FADD
  139. #endif
  140. #if defined(RN) || defined(RT)
  141. #define FADD1 FADD
  142. #define FADD2 FADD
  143. #define FADD3 FSUB
  144. #define FADD4 FADD
  145. #endif
  146. #endif
  /* Prefetch tuning.  The *SIZE values are look-ahead distances in elements:
     the inner loop issues prefetch [AO + APREFETCHSIZE * SIZE] and
     prefetch [BO + BPREFETCHSIZE * SIZE] (plus a second pair at +8 elements in
     the DOUBLE build).  The *_CATEGORY values are passed as the second operand
     of those prefetch instructions. */
  147. #define APREFETCHSIZE 40
  148. #define BPREFETCHSIZE 40
  149. #define APREFETCH_CATEGORY 0
  150. #define BPREFETCH_CATEGORY 0
  151. PROLOGUE
  152. SAVESP
  153. #ifndef __64BIT__
  154. #ifdef DOUBLE
  155. ld [%sp + STACK_START + 32], A
  156. ld [%sp + STACK_START + 36], B
  157. ld [%sp + STACK_START + 40], C
  158. ld [%sp + STACK_START + 44], LDC
  159. ld [%sp + STACK_START + 48], OFFSET
  160. #else
  161. ld [%sp + STACK_START + 28], B
  162. ld [%sp + STACK_START + 32], C
  163. ld [%sp + STACK_START + 36], LDC
  164. ld [%sp + STACK_START + 40], OFFSET
  165. #endif
  166. #else
  167. ldx [%sp+ STACK_START + 56], B
  168. ldx [%sp+ STACK_START + 64], C
  169. ldx [%sp+ STACK_START + 72], LDC
  170. ldx [%sp+ STACK_START + 80], OFFSET
  171. #endif
  172. #ifdef DOUBLE
  173. FCLR(27)
  174. #else
  175. FCLR(29)
  176. #endif
  177. sll LDC, ZBASE_SHIFT, LDC
  178. #ifdef LN
  179. smul M, K, TEMP1
  180. sll TEMP1, ZBASE_SHIFT, TEMP1
  181. add A, TEMP1, A
  182. sll M, ZBASE_SHIFT, TEMP1
  183. add C, TEMP1, C
  184. #endif
  185. #ifdef RN
  186. neg OFFSET, KK
  187. #endif
  188. #ifdef RT
  189. smul N, K, TEMP1
  190. sll TEMP1, ZBASE_SHIFT, TEMP1
  191. add B, TEMP1, B
  192. smul N, LDC, TEMP1
  193. add C, TEMP1, C
  194. sub N, OFFSET, KK
  195. #endif
  196. sra N, 1, J
  197. cmp J, 0
  198. ble,pn %icc, .LL100
  199. nop
  200. .LL11:
  201. #ifdef RT
  202. sll K, 1 + ZBASE_SHIFT, TEMP1
  203. sub B, TEMP1, B
  204. add LDC, LDC, TEMP1
  205. sub C, TEMP1, C
  206. #endif
  207. FMOV FZERO, t1
  208. FMOV FZERO, t2
  209. FMOV FZERO, t3
  210. sra M, 1, I
  211. mov C, C1
  212. add C, LDC, C2
  213. #ifdef LN
  214. add M, OFFSET, KK
  215. #endif
  216. #ifdef LT
  217. mov OFFSET, KK
  218. #endif
  219. #if defined(LN) || defined(RT)
  220. mov A, AORIG
  221. #else
  222. mov A, AO
  223. #endif
  224. cmp I, 0
  225. #ifndef RT
  226. add C2, LDC, C
  227. #endif
  228. ble,pn %icc, .LL50
  229. FMOV FZERO, t4
  230. .LL21:
  231. #if defined(LT) || defined(RN)
  232. sra KK, 2, L
  233. mov B, BO
  234. cmp L, 0
  235. #else
  236. #ifdef LN
  237. sll K, 1 + ZBASE_SHIFT, TEMP1
  238. sub AORIG, TEMP1, AORIG
  239. #endif
  240. sll KK, 1 + ZBASE_SHIFT, TEMP1
  241. add AORIG, TEMP1, AO
  242. add B, TEMP1, BO
  243. sub K, KK, TEMP1
  244. sra TEMP1, 2, L
  245. cmp L, 0
  246. #endif
  247. FMOV FZERO, t1
  248. FMOV FZERO, t2
  249. FMOV FZERO, t3
  250. FMOV FZERO, t4
  251. FMOV FZERO, c01
  252. FMOV FZERO, c02
  253. LDF [AO + 0 * SIZE], a1
  254. FMOV FZERO, c03
  255. LDF [BO + 0 * SIZE], b1
  256. FMOV FZERO, c04
  257. LDF [AO + 1 * SIZE], a2
  258. FMOV FZERO, c05
  259. LDF [BO + 1 * SIZE], b2
  260. FMOV FZERO, c06
  261. LDF [AO + 2 * SIZE], a3
  262. FMOV FZERO, c07
  263. LDF [BO + 2 * SIZE], b3
  264. FMOV FZERO, c08
  265. LDF [AO + 3 * SIZE], a4
  266. FMOV FZERO, c09
  267. LDF [BO + 3 * SIZE], b4
  268. FMOV FZERO, c10
  269. LDF [BO + 4 * SIZE], b5
  270. FMOV FZERO, c11
  271. LDF [AO + 4 * SIZE], a5
  272. FMOV FZERO, c12
  273. prefetch [C1 + 3 * SIZE], 3
  274. FMOV FZERO, c13
  275. prefetch [C2 + 3 * SIZE], 3
  276. FMOV FZERO, c14
  277. FMOV FZERO, c15
  278. ble,pn %icc, .LL25
  279. FMOV FZERO, c16
  280. .LL22:
  281. FADD2 c04, t1, c04
  282. prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY
  283. FMUL a1, b1, t1
  284. nop
  285. FADD4 c08, t2, c08
  286. prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY
  287. FMUL a1, b2, t2
  288. add AO, 16 * SIZE, AO
  289. FADD2 c12, t3, c12
  290. LDF [AO - 13 * SIZE], a4
  291. FMUL a1, b3, t3
  292. add BO, 16 * SIZE, BO
  293. FADD4 c16, t4, c16
  294. nop
  295. FMUL a1, b4, t4
  296. LDF [AO - 8 * SIZE], a1
  297. FADD1 c01, t1, c01
  298. nop
  299. FMUL a2, b1, t1
  300. nop
  301. FADD3 c05, t2, c05
  302. nop
  303. FMUL a2, b2, t2
  304. nop
  305. FADD1 c09, t3, c09
  306. nop
  307. FMUL a2, b3, t3
  308. nop
  309. FADD3 c13, t4, c13
  310. add L, -1, L
  311. FMUL a2, b4, t4
  312. LDF [AO - 11 * SIZE], a2
  313. FADD2 c02, t1, c02
  314. nop
  315. FMUL a3, b1, t1
  316. nop
  317. FADD4 c06, t2, c06
  318. nop
  319. FMUL a3, b2, t2
  320. nop
  321. FADD2 c10, t3, c10
  322. nop
  323. FMUL a3, b3, t3
  324. nop
  325. FADD4 c14, t4, c14
  326. nop
  327. FMUL a3, b4, t4
  328. LDF [AO - 10 * SIZE], a3
  329. FADD1 c03, t1, c03
  330. nop
  331. FMUL a4, b1, t1
  332. LDF [BO - 8 * SIZE], b1
  333. FADD3 c07, t2, c07
  334. nop
  335. FMUL a4, b2, t2
  336. LDF [BO - 11 * SIZE], b2
  337. FADD1 c11, t3, c11
  338. nop
  339. FMUL a4, b3, t3
  340. LDF [BO - 10 * SIZE], b3
  341. FADD3 c15, t4, c15
  342. nop
  343. FMUL a4, b4, t4
  344. LDF [BO - 9 * SIZE], b4
  345. FADD2 c04, t1, c04
  346. nop
  347. FMUL a5, b5, t1
  348. LDF [AO - 9 * SIZE], a4
  349. FADD4 c08, t2, c08
  350. nop
  351. FMUL a5, b2, t2
  352. nop
  353. FADD2 c12, t3, c12
  354. nop
  355. FMUL a5, b3, t3
  356. nop
  357. FADD4 c16, t4, c16
  358. nop
  359. FMUL a5, b4, t4
  360. LDF [AO - 4 * SIZE], a5
  361. FADD1 c01, t1, c01
  362. nop
  363. FMUL a2, b5, t1
  364. nop
  365. FADD3 c05, t2, c05
  366. nop
  367. FMUL a2, b2, t2
  368. nop
  369. FADD1 c09, t3, c09
  370. nop
  371. FMUL a2, b3, t3
  372. nop
  373. FADD3 c13, t4, c13
  374. nop
  375. FMUL a2, b4, t4
  376. LDF [AO - 7 * SIZE], a2
  377. FADD2 c02, t1, c02
  378. nop
  379. FMUL a3, b5, t1
  380. nop
  381. FADD4 c06, t2, c06
  382. nop
  383. FMUL a3, b2, t2
  384. nop
  385. FADD2 c10, t3, c10
  386. nop
  387. FMUL a3, b3, t3
  388. nop
  389. FADD4 c14, t4, c14
  390. nop
  391. FMUL a3, b4, t4
  392. LDF [AO - 6 * SIZE], a3
  393. FADD1 c03, t1, c03
  394. nop
  395. FMUL a4, b5, t1
  396. LDF [BO - 4 * SIZE], b5
  397. FADD3 c07, t2, c07
  398. nop
  399. FMUL a4, b2, t2
  400. LDF [BO - 7 * SIZE], b2
  401. FADD1 c11, t3, c11
  402. nop
  403. FMUL a4, b3, t3
  404. LDF [BO - 6 * SIZE], b3
  405. FADD3 c15, t4, c15
  406. nop
  407. FMUL a4, b4, t4
  408. LDF [BO - 5 * SIZE], b4
  409. FADD2 c04, t1, c04
  410. nop
  411. FMUL a1, b1, t1
  412. LDF [AO - 5 * SIZE], a4
  413. FADD4 c08, t2, c08
  414. nop
  415. FMUL a1, b2, t2
  416. nop
  417. FADD2 c12, t3, c12
  418. nop
  419. FMUL a1, b3, t3
  420. nop
  421. FADD4 c16, t4, c16
  422. nop
  423. FMUL a1, b4, t4
  424. LDF [AO - 0 * SIZE], a1
  425. FADD1 c01, t1, c01
  426. nop
  427. FMUL a2, b1, t1
  428. nop
  429. #ifdef DOUBLE
  430. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  431. #else
  432. nop
  433. #endif
  434. FADD3 c05, t2, c05
  435. nop
  436. FMUL a2, b2, t2
  437. FADD1 c09, t3, c09
  438. nop
  439. FMUL a2, b3, t3
  440. nop
  441. FADD3 c13, t4, c13
  442. nop
  443. FMUL a2, b4, t4
  444. nop
  445. FADD2 c02, t1, c02
  446. nop
  447. FMUL a3, b1, t1
  448. LDF [AO - 3 * SIZE], a2
  449. FADD4 c06, t2, c06
  450. #ifdef DOUBLE
  451. prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY
  452. #else
  453. nop
  454. #endif
  455. FMUL a3, b2, t2
  456. nop
  457. FADD2 c10, t3, c10
  458. nop
  459. FMUL a3, b3, t3
  460. nop
  461. FADD4 c14, t4, c14
  462. nop
  463. FMUL a3, b4, t4
  464. LDF [AO - 2 * SIZE], a3
  465. FADD1 c03, t1, c03
  466. nop
  467. FMUL a4, b1, t1
  468. LDF [BO - 0 * SIZE], b1
  469. FADD3 c07, t2, c07
  470. nop
  471. FMUL a4, b2, t2
  472. LDF [BO - 3 * SIZE], b2
  473. FADD1 c11, t3, c11
  474. nop
  475. FMUL a4, b3, t3
  476. LDF [BO - 2 * SIZE], b3
  477. FADD3 c15, t4, c15
  478. nop
  479. FMUL a4, b4, t4
  480. LDF [BO - 1 * SIZE], b4
  481. FADD2 c04, t1, c04
  482. nop
  483. FMUL a5, b5, t1
  484. LDF [AO - 1 * SIZE], a4
  485. FADD4 c08, t2, c08
  486. FMUL a5, b2, t2
  487. FADD2 c12, t3, c12
  488. FMUL a5, b3, t3
  489. FADD4 c16, t4, c16
  490. nop
  491. FMUL a5, b4, t4
  492. LDF [AO + 4 * SIZE], a5
  493. FADD1 c01, t1, c01
  494. nop
  495. FMUL a2, b5, t1
  496. nop
  497. FADD3 c05, t2, c05
  498. nop
  499. FMUL a2, b2, t2
  500. nop
  501. FADD1 c09, t3, c09
  502. nop
  503. FMUL a2, b3, t3
  504. nop
  505. FADD3 c13, t4, c13
  506. nop
  507. FMUL a2, b4, t4
  508. LDF [AO + 1 * SIZE], a2
  509. FADD2 c02, t1, c02
  510. nop
  511. FMUL a3, b5, t1
  512. nop
  513. FADD4 c06, t2, c06
  514. nop
  515. FMUL a3, b2, t2
  516. nop
  517. FADD2 c10, t3, c10
  518. nop
  519. FMUL a3, b3, t3
  520. nop
  521. FADD4 c14, t4, c14
  522. nop
  523. FMUL a3, b4, t4
  524. LDF [AO + 2 * SIZE], a3
  525. FADD1 c03, t1, c03
  526. cmp L, 0
  527. FMUL a4, b5, t1
  528. LDF [BO + 4 * SIZE], b5
  529. FADD3 c07, t2, c07
  530. nop
  531. FMUL a4, b2, t2
  532. LDF [BO + 1 * SIZE], b2
  533. FADD1 c11, t3, c11
  534. nop
  535. FMUL a4, b3, t3
  536. LDF [BO + 2 * SIZE], b3
  537. FADD3 c15, t4, c15
  538. FMUL a4, b4, t4
  539. bg,pt %icc, .LL22
  540. LDF [BO + 3 * SIZE], b4
  541. .LL25:
  542. #if defined(LT) || defined(RN)
  543. and KK, 3, L
  544. #else
  545. and TEMP1, 3, L
  546. #endif
  547. cmp L, 0
  548. ble,pn %icc, .LL29
  549. nop
  550. .LL26:
  551. FADD2 c04, t1, c04
  552. LDF [AO + 3 * SIZE], a4
  553. FMUL a1, b1, t1
  554. add AO, 4 * SIZE, AO
  555. FADD4 c08, t2, c08
  556. add BO, 4 * SIZE, BO
  557. FMUL a1, b2, t2
  558. add L, -1, L
  559. FADD2 c12, t3, c12
  560. nop
  561. FMUL a1, b3, t3
  562. cmp L, 0
  563. FADD4 c16, t4, c16
  564. nop
  565. FMUL a1, b4, t4
  566. LDF [AO + 0 * SIZE], a1
  567. FADD1 c01, t1, c01
  568. nop
  569. FMUL a2, b1, t1
  570. nop
  571. FADD3 c05, t2, c05
  572. nop
  573. FMUL a2, b2, t2
  574. nop
  575. FADD1 c09, t3, c09
  576. nop
  577. FMUL a2, b3, t3
  578. nop
  579. FADD3 c13, t4, c13
  580. nop
  581. FMUL a2, b4, t4
  582. LDF [AO + 1 * SIZE], a2
  583. FADD2 c02, t1, c02
  584. nop
  585. FMUL a3, b1, t1
  586. nop
  587. FADD4 c06, t2, c06
  588. nop
  589. FMUL a3, b2, t2
  590. nop
  591. FADD2 c10, t3, c10
  592. nop
  593. FMUL a3, b3, t3
  594. nop
  595. FADD4 c14, t4, c14
  596. nop
  597. FMUL a3, b4, t4
  598. LDF [AO + 2 * SIZE], a3
  599. FADD1 c03, t1, c03
  600. nop
  601. FMUL a4, b1, t1
  602. LDF [BO + 0 * SIZE], b1
  603. FADD3 c07, t2, c07
  604. nop
  605. FMUL a4, b2, t2
  606. LDF [BO + 1 * SIZE], b2
  607. FADD1 c11, t3, c11
  608. nop
  609. FMUL a4, b3, t3
  610. LDF [BO + 2 * SIZE], b3
  611. FADD3 c15, t4, c15
  612. FMUL a4, b4, t4
  613. bg,pt %icc, .LL26
  614. LDF [BO + 3 * SIZE], b4
  615. .LL29:
  616. #if defined(LN) || defined(RT)
  617. sub KK, 2, TEMP1
  618. sll TEMP1, 1 + ZBASE_SHIFT, TEMP1
  619. add AORIG, TEMP1, AO
  620. add B, TEMP1, BO
  621. #endif
  622. FADD2 c04, t1, c04
  623. FADD4 c08, t2, c08
  624. FADD2 c12, t3, c12
  625. FADD4 c16, t4, c16
  626. FADD c01, c06, c01
  627. FADD c02, c05, c02
  628. FADD c03, c08, c03
  629. FADD c04, c07, c04
  630. FADD c09, c14, c09
  631. FADD c10, c13, c10
  632. FADD c11, c16, c11
  633. FADD c12, c15, c12
  634. #if defined(LN) || defined(LT)
  635. LDF [BO + 0 * SIZE], a1
  636. LDF [BO + 1 * SIZE], a2
  637. LDF [BO + 2 * SIZE], a3
  638. LDF [BO + 3 * SIZE], a4
  639. LDF [BO + 4 * SIZE], b1
  640. LDF [BO + 5 * SIZE], b2
  641. LDF [BO + 6 * SIZE], b3
  642. LDF [BO + 7 * SIZE], b4
  643. FSUB a1, c01, c01
  644. FSUB a2, c02, c02
  645. FSUB a3, c09, c09
  646. FSUB a4, c10, c10
  647. FSUB b1, c03, c03
  648. FSUB b2, c04, c04
  649. FSUB b3, c11, c11
  650. FSUB b4, c12, c12
  651. #else
  652. LDF [AO + 0 * SIZE], a1
  653. LDF [AO + 1 * SIZE], a2
  654. LDF [AO + 2 * SIZE], a3
  655. LDF [AO + 3 * SIZE], a4
  656. LDF [AO + 4 * SIZE], b1
  657. LDF [AO + 5 * SIZE], b2
  658. LDF [AO + 6 * SIZE], b3
  659. LDF [AO + 7 * SIZE], b4
  660. FSUB a1, c01, c01
  661. FSUB a2, c02, c02
  662. FSUB a3, c03, c03
  663. FSUB a4, c04, c04
  664. FSUB b1, c09, c09
  665. FSUB b2, c10, c10
  666. FSUB b3, c11, c11
  667. FSUB b4, c12, c12
  668. #endif
  669. #ifdef LN
  670. LDF [AO + 6 * SIZE], a1
  671. LDF [AO + 7 * SIZE], a2
  672. LDF [AO + 4 * SIZE], a3
  673. LDF [AO + 5 * SIZE], a4
  674. LDF [AO + 0 * SIZE], b1
  675. LDF [AO + 1 * SIZE], b2
  676. FMUL a1, c03, t1
  677. FMUL a2, c04, t2
  678. FMUL a1, c04, t3
  679. FMUL a2, c03, t4
  680. FMUL a1, c11, t5
  681. FMUL a2, c12, t6
  682. FMUL a1, c12, t7
  683. FMUL a2, c11, t8
  684. FADD4 t1, t2, c03
  685. FADD2 t3, t4, c04
  686. FADD4 t5, t6, c11
  687. FADD2 t7, t8, c12
  688. FMUL a3, c03, t1
  689. FMUL a3, c04, t2
  690. FMUL a3, c11, t3
  691. FMUL a3, c12, t4
  692. FMUL a4, c04, t5
  693. FMUL a4, c03, t6
  694. FMUL a4, c12, t7
  695. FMUL a4, c11, t8
  696. FSUB c01, t1, c01
  697. FSUB c02, t2, c02
  698. FSUB c09, t3, c09
  699. FSUB c10, t4, c10
  700. FADD2 c01, t5, c01
  701. FADD4 c02, t6, c02
  702. FADD2 c09, t7, c09
  703. FADD4 c10, t8, c10
  704. FMUL b1, c01, t1
  705. FMUL b2, c02, t2
  706. FMUL b1, c02, t3
  707. FMUL b2, c01, t4
  708. FMUL b1, c09, t5
  709. FMUL b2, c10, t6
  710. FMUL b1, c10, t7
  711. FMUL b2, c09, t8
  712. FADD4 t1, t2, c01
  713. FADD2 t3, t4, c02
  714. FADD4 t5, t6, c09
  715. FADD2 t7, t8, c10
  716. #endif
  717. #ifdef LT
  718. LDF [AO + 0 * SIZE], a1
  719. LDF [AO + 1 * SIZE], a2
  720. LDF [AO + 2 * SIZE], a3
  721. LDF [AO + 3 * SIZE], a4
  722. LDF [AO + 6 * SIZE], b1
  723. LDF [AO + 7 * SIZE], b2
  724. FMUL a1, c01, t1
  725. FMUL a2, c02, t2
  726. FMUL a1, c02, t3
  727. FMUL a2, c01, t4
  728. FMUL a1, c09, t5
  729. FMUL a2, c10, t6
  730. FMUL a1, c10, t7
  731. FMUL a2, c09, t8
  732. FADD4 t1, t2, c01
  733. FADD2 t3, t4, c02
  734. FADD4 t5, t6, c09
  735. FADD2 t7, t8, c10
  736. FMUL a3, c01, t1
  737. FMUL a3, c02, t2
  738. FMUL a3, c09, t3
  739. FMUL a3, c10, t4
  740. FMUL a4, c02, t5
  741. FMUL a4, c01, t6
  742. FMUL a4, c10, t7
  743. FMUL a4, c09, t8
  744. FSUB c03, t1, c03
  745. FSUB c04, t2, c04
  746. FSUB c11, t3, c11
  747. FSUB c12, t4, c12
  748. FADD2 c03, t5, c03
  749. FADD4 c04, t6, c04
  750. FADD2 c11, t7, c11
  751. FADD4 c12, t8, c12
  752. FMUL b1, c03, t1
  753. FMUL b2, c04, t2
  754. FMUL b1, c04, t3
  755. FMUL b2, c03, t4
  756. FMUL b1, c11, t5
  757. FMUL b2, c12, t6
  758. FMUL b1, c12, t7
  759. FMUL b2, c11, t8
  760. FADD4 t1, t2, c03
  761. FADD2 t3, t4, c04
  762. FADD4 t5, t6, c11
  763. FADD2 t7, t8, c12
  764. #endif
  765. #ifdef RN
  766. LDF [BO + 0 * SIZE], a1
  767. LDF [BO + 1 * SIZE], a2
  768. LDF [BO + 2 * SIZE], a3
  769. LDF [BO + 3 * SIZE], a4
  770. LDF [BO + 6 * SIZE], b1
  771. LDF [BO + 7 * SIZE], b2
  772. FMUL a1, c01, t1
  773. FMUL a2, c02, t2
  774. FMUL a1, c02, t3
  775. FMUL a2, c01, t4
  776. FMUL a1, c03, t5
  777. FMUL a2, c04, t6
  778. FMUL a1, c04, t7
  779. FMUL a2, c03, t8
  780. FADD4 t1, t2, c01
  781. FADD3 t3, t4, c02
  782. FADD4 t5, t6, c03
  783. FADD3 t7, t8, c04
  784. FMUL a3, c01, t1
  785. FMUL a3, c02, t2
  786. FMUL a3, c03, t3
  787. FMUL a3, c04, t4
  788. FMUL a4, c02, t5
  789. FMUL a4, c01, t6
  790. FMUL a4, c04, t7
  791. FMUL a4, c03, t8
  792. FSUB c09, t1, c09
  793. FSUB c10, t2, c10
  794. FSUB c11, t3, c11
  795. FSUB c12, t4, c12
  796. FADD3 c09, t5, c09
  797. FADD4 c10, t6, c10
  798. FADD3 c11, t7, c11
  799. FADD4 c12, t8, c12
  800. FMUL b1, c09, t1
  801. FMUL b2, c10, t2
  802. FMUL b1, c10, t3
  803. FMUL b2, c09, t4
  804. FMUL b1, c11, t5
  805. FMUL b2, c12, t6
  806. FMUL b1, c12, t7
  807. FMUL b2, c11, t8
  808. FADD4 t1, t2, c09
  809. FADD3 t3, t4, c10
  810. FADD4 t5, t6, c11
  811. FADD3 t7, t8, c12
  812. #endif
  813. #ifdef RT
  814. LDF [BO + 6 * SIZE], a1
  815. LDF [BO + 7 * SIZE], a2
  816. LDF [BO + 4 * SIZE], a3
  817. LDF [BO + 5 * SIZE], a4
  818. LDF [BO + 0 * SIZE], b1
  819. LDF [BO + 1 * SIZE], b2
  820. FMUL a1, c09, t1
  821. FMUL a2, c10, t2
  822. FMUL a1, c10, t3
  823. FMUL a2, c09, t4
  824. FMUL a1, c11, t5
  825. FMUL a2, c12, t6
  826. FMUL a1, c12, t7
  827. FMUL a2, c11, t8
  828. FADD4 t1, t2, c09
  829. FADD3 t3, t4, c10
  830. FADD4 t5, t6, c11
  831. FADD3 t7, t8, c12
  832. FMUL a3, c09, t1
  833. FMUL a3, c10, t2
  834. FMUL a3, c11, t3
  835. FMUL a3, c12, t4
  836. FMUL a4, c10, t5
  837. FMUL a4, c09, t6
  838. FMUL a4, c12, t7
  839. FMUL a4, c11, t8
  840. FSUB c01, t1, c01
  841. FSUB c02, t2, c02
  842. FSUB c03, t3, c03
  843. FSUB c04, t4, c04
  844. FADD3 c01, t5, c01
  845. FADD4 c02, t6, c02
  846. FADD3 c03, t7, c03
  847. FADD4 c04, t8, c04
  848. FMUL b1, c01, t1
  849. FMUL b2, c02, t2
  850. FMUL b1, c02, t3
  851. FMUL b2, c01, t4
  852. FMUL b1, c03, t5
  853. FMUL b2, c04, t6
  854. FMUL b1, c04, t7
  855. FMUL b2, c03, t8
  856. FADD4 t1, t2, c01
  857. FADD3 t3, t4, c02
  858. FADD4 t5, t6, c03
  859. FADD3 t7, t8, c04
  860. #endif
  861. #ifdef LN
  862. add C1, -4 * SIZE, C1
  863. add C2, -4 * SIZE, C2
  864. #endif
  865. #if defined(LN) || defined(LT)
  866. STF c01, [BO + 0 * SIZE]
  867. STF c02, [BO + 1 * SIZE]
  868. STF c09, [BO + 2 * SIZE]
  869. STF c10, [BO + 3 * SIZE]
  870. STF c03, [BO + 4 * SIZE]
  871. STF c04, [BO + 5 * SIZE]
  872. STF c11, [BO + 6 * SIZE]
  873. STF c12, [BO + 7 * SIZE]
  874. #else
  875. STF c01, [AO + 0 * SIZE]
  876. STF c02, [AO + 1 * SIZE]
  877. STF c03, [AO + 2 * SIZE]
  878. STF c04, [AO + 3 * SIZE]
  879. STF c09, [AO + 4 * SIZE]
  880. STF c10, [AO + 5 * SIZE]
  881. STF c11, [AO + 6 * SIZE]
  882. STF c12, [AO + 7 * SIZE]
  883. #endif
  884. STF c01, [C1 + 0 * SIZE]
  885. STF c02, [C1 + 1 * SIZE]
  886. STF c03, [C1 + 2 * SIZE]
  887. STF c04, [C1 + 3 * SIZE]
  888. STF c09, [C2 + 0 * SIZE]
  889. STF c10, [C2 + 1 * SIZE]
  890. STF c11, [C2 + 2 * SIZE]
  891. STF c12, [C2 + 3 * SIZE]
  892. FMOV FZERO, t1
  893. FMOV FZERO, t2
  894. FMOV FZERO, t3
  895. FMOV FZERO, t4
  896. #ifndef LN
  897. add C1, 4 * SIZE, C1
  898. add C2, 4 * SIZE, C2
  899. #endif
  900. #ifdef RT
  901. sll K, 1 + ZBASE_SHIFT, TEMP1
  902. add AORIG, TEMP1, AORIG
  903. #endif
  904. #if defined(LT) || defined(RN)
  905. sub K, KK, TEMP1
  906. sll TEMP1, 1 + ZBASE_SHIFT, TEMP1
  907. add AO, TEMP1, AO
  908. add BO, TEMP1, BO
  909. #endif
  910. #ifdef LT
  911. add KK, 2, KK
  912. #endif
  913. #ifdef LN
  914. sub KK, 2, KK
  915. #endif
  916. add I, -1, I
  917. cmp I, 0
  918. bg,pt %icc, .LL21
  919. FMOV FZERO, c01
  /*
   * .LL50: leftover row of M for the current pair of columns.
   * I = M & 1; when I <= 0 control goes to .LL99.  Otherwise AO/BO and the
   * trip count L are set up, the first operands a1-a4 / b1-b4 are preloaded
   * and the accumulators c01-c08 and temporaries t1-t4 are cleared to FZERO.
   * On SPARC the instruction following a branch sits in its delay slot, so
   * the FMOVs placed right after the branches below are part of the clearing.
   */
  920. .LL50:
  921. and M, 1, I
  922. FMOV FZERO, c02
  923. cmp I, 0
  924. FMOV FZERO, t1
  925. ble,pn %icc, .LL99
  926. FMOV FZERO, c04
  927. #if defined(LT) || defined(RN)
  /* LT/RN: L = KK >> 2 passes of the unrolled loop; BO restarts at B. */
  928. sra KK, 2, L
  929. mov B, BO
  930. cmp L, 0
  931. #else
  932. #ifdef LN
  /* LN only: step AORIG back by K << ZBASE_SHIFT bytes. */
  933. sll K, 0 + ZBASE_SHIFT, TEMP1
  934. sub AORIG, TEMP1, AORIG
  935. #endif
  /*
   * AO = AORIG + (KK << ZBASE_SHIFT), BO = B + (KK << (1 + ZBASE_SHIFT)),
   * L = (K - KK) >> 2.  TEMP1 is left holding K - KK; .LL55 reads it again
   * to derive the remainder count.
   */
  936. sll KK, 0 + ZBASE_SHIFT, TEMP1
  937. sll KK, 1 + ZBASE_SHIFT, TEMP2
  938. add AORIG, TEMP1, AO
  939. add B, TEMP2, BO
  940. sub K, KK, TEMP1
  941. sra TEMP1, 2, L
  942. cmp L, 0
  943. #endif
  /* Preload operands, interleaved with clearing the remaining registers. */
  944. LDF [AO + 0 * SIZE], a1
  945. FMOV FZERO, t2
  946. LDF [BO + 0 * SIZE], b1
  947. FMOV FZERO, c06
  948. LDF [AO + 1 * SIZE], a2
  949. FMOV FZERO, t3
  950. LDF [BO + 1 * SIZE], b2
  951. FMOV FZERO, c08
  952. LDF [AO + 2 * SIZE], a3
  953. FMOV FZERO, t4
  954. LDF [BO + 2 * SIZE], b3
  955. FMOV FZERO, c01
  956. LDF [AO + 3 * SIZE], a4
  957. FMOV FZERO, c03
  958. LDF [BO + 3 * SIZE], b4
  959. FMOV FZERO, c05
  /* L <= 0 (flags from the cmp above): skip the unrolled loop. */
  960. ble,pn %icc, .LL55
  961. FMOV FZERO, c07
  /*
   * .LL52: unrolled multiply-accumulate loop, repeated while L > 0.
   * One pass advances AO by 8 * SIZE and BO by 16 * SIZE and issues 32 FMULs
   * (a1, a2, a3, a4 each against b1-b4, twice), i.e. four steps of the
   * remainder loop at .LL56.  Products are staged in t1-t4 and folded into
   * c01-c08 by the FADD1-FADD4 macros one group later, so each FADD consumes
   * a product written by an earlier FMUL.  FADD1-FADD4 are defined outside
   * this chunk (presumably add/subtract variants chosen by the conjugation
   * mode -- confirm).  Loads refill a1-a4 / b1-b4 for the following group.
   */
  962. .LL52:
  963. FADD2 c02, t1, c02
  964. add AO, 8 * SIZE, AO
  965. prefetch [AO + APREFETCHSIZE * SIZE], 0
  966. FMUL a1, b1, t1
  967. add BO, 16 * SIZE, BO
  968. FADD4 c04, t2, c04
  969. add L, -1, L
  970. FMUL a1, b2, t2
  971. FADD2 c06, t3, c06
  /* Sets the condition codes tested by the bg at the bottom of the loop. */
  972. cmp L, 0
  973. FMUL a1, b3, t3
  974. FADD4 c08, t4, c08
  975. FMUL a1, b4, t4
  976. LDF [AO - 4 * SIZE], a1
  977. FADD1 c01, t1, c01
  978. FMUL a2, b1, t1
  979. LDF [BO - 12 * SIZE], b1
  980. FADD3 c03, t2, c03
  981. FMUL a2, b2, t2
  982. LDF [BO - 11 * SIZE], b2
  983. FADD1 c05, t3, c05
  984. FMUL a2, b3, t3
  985. LDF [BO - 10 * SIZE], b3
  986. FADD3 c07, t4, c07
  987. FMUL a2, b4, t4
  988. LDF [BO - 9 * SIZE], b4
  989. FADD2 c02, t1, c02
  990. FMUL a3, b1, t1
  991. LDF [AO - 3 * SIZE], a2
  992. FADD4 c04, t2, c04
  993. FMUL a3, b2, t2
  994. FADD2 c06, t3, c06
  995. FMUL a3, b3, t3
  996. FADD4 c08, t4, c08
  997. FMUL a3, b4, t4
  998. LDF [AO - 2 * SIZE], a3
  999. FADD1 c01, t1, c01
  1000. FMUL a4, b1, t1
  1001. LDF [BO - 8 * SIZE], b1
  1002. FADD3 c03, t2, c03
  1003. FMUL a4, b2, t2
  1004. LDF [BO - 7 * SIZE], b2
  1005. FADD1 c05, t3, c05
  1006. FMUL a4, b3, t3
  1007. LDF [BO - 6 * SIZE], b3
  1008. FADD3 c07, t4, c07
  1009. FMUL a4, b4, t4
  1010. LDF [BO - 5 * SIZE], b4
  1011. FADD2 c02, t1, c02
  1012. FMUL a1, b1, t1
  1013. LDF [AO - 1 * SIZE], a4
  1014. FADD4 c04, t2, c04
  1015. FMUL a1, b2, t2
  1016. FADD2 c06, t3, c06
  1017. FMUL a1, b3, t3
  1018. FADD4 c08, t4, c08
  1019. FMUL a1, b4, t4
  1020. LDF [AO + 0 * SIZE], a1
  1021. FADD1 c01, t1, c01
  1022. FMUL a2, b1, t1
  1023. LDF [BO - 4 * SIZE], b1
  1024. FADD3 c03, t2, c03
  1025. FMUL a2, b2, t2
  1026. LDF [BO - 3 * SIZE], b2
  1027. FADD1 c05, t3, c05
  1028. FMUL a2, b3, t3
  1029. LDF [BO - 2 * SIZE], b3
  1030. FADD3 c07, t4, c07
  1031. FMUL a2, b4, t4
  1032. LDF [BO - 1 * SIZE], b4
  1033. FADD2 c02, t1, c02
  1034. FMUL a3, b1, t1
  1035. LDF [AO + 1 * SIZE], a2
  1036. FADD4 c04, t2, c04
  1037. FMUL a3, b2, t2
  1038. FADD2 c06, t3, c06
  1039. FMUL a3, b3, t3
  1040. FADD4 c08, t4, c08
  1041. FMUL a3, b4, t4
  1042. LDF [AO + 2 * SIZE], a3
  1043. FADD1 c01, t1, c01
  1044. FMUL a4, b1, t1
  1045. LDF [BO + 0 * SIZE], b1
  1046. FADD3 c03, t2, c03
  1047. FMUL a4, b2, t2
  1048. LDF [BO + 1 * SIZE], b2
  1049. FADD1 c05, t3, c05
  1050. FMUL a4, b3, t3
  1051. LDF [BO + 2 * SIZE], b3
  1052. FADD3 c07, t4, c07
  1053. FMUL a4, b4, t4
  1054. LDF [BO + 3 * SIZE], b4
  /* Loop while L > 0; the a4 reload below sits in the branch delay slot. */
  1055. bg,pt %icc, .LL52
  1056. LDF [AO + 3 * SIZE], a4
  /*
   * .LL55: remainder count after the unrolled loop.
   * L = KK & 3 for LT/RN, otherwise TEMP1 & 3, where TEMP1 still holds the
   * K - KK value computed before the unrolled loop.  L <= 0 goes straight
   * to .LL59.
   */
  1057. .LL55:
  1058. #if defined(LT) || defined(RN)
  1059. and KK, 3, L
  1060. #else
  1061. and TEMP1, 3, L
  1062. #endif
  1063. cmp L, 0
  1064. ble,a,pn %icc, .LL59
  1065. nop
  /*
   * .LL56: one step per pass (AO += 2 * SIZE, BO += 4 * SIZE): a1 and a2
   * are each multiplied by b1-b4, using the same staged t1-t4 / FADDn
   * pattern as the unrolled loop, and the operands are reloaded.
   */
  1066. .LL56:
  1067. FADD2 c02, t1, c02
  1068. add AO, 2 * SIZE, AO
  1069. FMUL a1, b1, t1
  1070. add L, -1, L
  1071. add BO, 4 * SIZE, BO
  1072. FADD4 c04, t2, c04
  1073. cmp L, 0
  1074. FMUL a1, b2, t2
  1075. FADD2 c06, t3, c06
  1076. FMUL a1, b3, t3
  1077. FADD4 c08, t4, c08
  1078. FMUL a1, b4, t4
  1079. LDF [AO + 0 * SIZE], a1
  1080. FADD1 c01, t1, c01
  1081. FMUL a2, b1, t1
  1082. LDF [BO + 0 * SIZE], b1
  1083. FADD3 c03, t2, c03
  1084. FMUL a2, b2, t2
  1085. LDF [BO + 1 * SIZE], b2
  1086. FADD1 c05, t3, c05
  1087. FMUL a2, b3, t3
  1088. LDF [BO + 2 * SIZE], b3
  1089. FADD3 c07, t4, c07
  1090. FMUL a2, b4, t4
  1091. LDF [BO + 3 * SIZE], b4
  /* Loop while L > 0; the a2 reload sits in the branch delay slot. */
  1092. bg,pt %icc, .LL56
  1093. LDF [AO + 1 * SIZE], a2
  /*
   * .LL59: finish the single-row tile of the two-column panel: fold the
   * accumulators, subtract them from the packed right-hand side, apply the
   * triangular-solve step selected by LN/LT/RN/RT, then write the result to
   * the packed buffer and to C1/C2 and advance the pointers and KK.
   */
  1094. .LL59:
  1095. #if defined(LN) || defined(RT)
  /*
   * LN/RT: re-point AO/BO.  TEMP1 = KK - 1 (LN) or KK - 2 (otherwise);
   * AO = AORIG + (TEMP1 << ZBASE_SHIFT),
   * BO = B + (TEMP1 << (1 + ZBASE_SHIFT)).
   */
  1096. #ifdef LN
  1097. sub KK, 1, TEMP1
  1098. #else
  1099. sub KK, 2, TEMP1
  1100. #endif
  1101. sll TEMP1, 0 + ZBASE_SHIFT, TEMP2
  1102. sll TEMP1, 1 + ZBASE_SHIFT, TEMP1
  1103. add AORIG, TEMP2, AO
  1104. add B, TEMP1, BO
  1105. #endif
  /* Drain the products still staged in t1-t4. */
  1106. FADD2 c02, t1, c02
  1107. FADD4 c04, t2, c04
  1108. FADD2 c06, t3, c06
  1109. FADD4 c08, t4, c08
  /* Merge the partial sums pairwise into c01/c02 and c05/c06. */
  1110. FADD c01, c04, c01
  1111. FADD c02, c03, c02
  1112. FADD c05, c08, c05
  1113. FADD c06, c07, c06
  /*
   * Load four packed values (from BO for LN/LT, from AO otherwise) and
   * replace each c with (loaded value - c), assuming FSUB x, y, z maps to the
   * SPARC fsub form z = x - y.
   */
  1114. #if defined(LN) || defined(LT)
  1115. LDF [BO + 0 * SIZE], a1
  1116. LDF [BO + 1 * SIZE], a2
  1117. LDF [BO + 2 * SIZE], a3
  1118. LDF [BO + 3 * SIZE], a4
  1119. FSUB a1, c01, c01
  1120. FSUB a2, c02, c02
  1121. FSUB a3, c05, c05
  1122. FSUB a4, c06, c06
  1123. #else
  1124. LDF [AO + 0 * SIZE], a1
  1125. LDF [AO + 1 * SIZE], a2
  1126. LDF [AO + 2 * SIZE], a3
  1127. LDF [AO + 3 * SIZE], a4
  1128. FSUB a1, c01, c01
  1129. FSUB a2, c02, c02
  1130. FSUB a3, c05, c05
  1131. FSUB a4, c06, c06
  1132. #endif
  /*
   * LN / LT: scale the pairs (c01,c02) and (c05,c06) by the pair at
   * AO[0..1], using a four-product complex-multiply pattern.
   * NOTE(review): the factor is presumably the pre-inverted diagonal entry
   * produced by the packing routine -- confirm.
   */
  1133. #ifdef LN
  1134. LDF [AO + 0 * SIZE], a1
  1135. LDF [AO + 1 * SIZE], a2
  1136. FMUL a1, c01, t1
  1137. FMUL a2, c02, t2
  1138. FMUL a1, c02, t3
  1139. FMUL a2, c01, t4
  1140. FMUL a1, c05, t5
  1141. FMUL a2, c06, t6
  1142. FMUL a1, c06, t7
  1143. FMUL a2, c05, t8
  1144. FADD4 t1, t2, c01
  1145. FADD2 t3, t4, c02
  1146. FADD4 t5, t6, c05
  1147. FADD2 t7, t8, c06
  1148. #endif
  1149. #ifdef LT
  1150. LDF [AO + 0 * SIZE], a1
  1151. LDF [AO + 1 * SIZE], a2
  1152. FMUL a1, c01, t1
  1153. FMUL a2, c02, t2
  1154. FMUL a1, c02, t3
  1155. FMUL a2, c01, t4
  1156. FMUL a1, c05, t5
  1157. FMUL a2, c06, t6
  1158. FMUL a1, c06, t7
  1159. FMUL a2, c05, t8
  1160. FADD4 t1, t2, c01
  1161. FADD2 t3, t4, c02
  1162. FADD4 t5, t6, c05
  1163. FADD2 t7, t8, c06
  1164. #endif
  /*
   * RN: forward order.  Scale (c01,c02) by BO[0..1], remove its contribution
   * from (c05,c06) using BO[2..3], then scale (c05,c06) by BO[6..7].
   */
  1165. #ifdef RN
  1166. LDF [BO + 0 * SIZE], a1
  1167. LDF [BO + 1 * SIZE], a2
  1168. LDF [BO + 2 * SIZE], a3
  1169. LDF [BO + 3 * SIZE], a4
  1170. LDF [BO + 6 * SIZE], b1
  1171. LDF [BO + 7 * SIZE], b2
  1172. FMUL a1, c01, t1
  1173. FMUL a2, c02, t2
  1174. FMUL a1, c02, t3
  1175. FMUL a2, c01, t4
  1176. FADD4 t1, t2, c01
  1177. FADD3 t3, t4, c02
  1178. FMUL a3, c01, t1
  1179. FMUL a3, c02, t2
  1180. FMUL a4, c02, t3
  1181. FMUL a4, c01, t4
  1182. FSUB c05, t1, c05
  1183. FSUB c06, t2, c06
  1184. FADD3 c05, t3, c05
  1185. FADD4 c06, t4, c06
  1186. FMUL b1, c05, t1
  1187. FMUL b2, c06, t2
  1188. FMUL b1, c06, t3
  1189. FMUL b2, c05, t4
  1190. FADD4 t1, t2, c05
  1191. FADD3 t3, t4, c06
  1192. #endif
  /*
   * RT: reverse order.  Scale (c05,c06) by BO[6..7], remove its contribution
   * from (c01,c02) using BO[4..5], then scale (c01,c02) by BO[0..1].
   */
  1193. #ifdef RT
  1194. LDF [BO + 6 * SIZE], a1
  1195. LDF [BO + 7 * SIZE], a2
  1196. LDF [BO + 4 * SIZE], a3
  1197. LDF [BO + 5 * SIZE], a4
  1198. LDF [BO + 0 * SIZE], b1
  1199. LDF [BO + 1 * SIZE], b2
  1200. FMUL a1, c05, t1
  1201. FMUL a2, c06, t2
  1202. FMUL a1, c06, t3
  1203. FMUL a2, c05, t4
  1204. FADD4 t1, t2, c05
  1205. FADD3 t3, t4, c06
  1206. FMUL a3, c05, t1
  1207. FMUL a3, c06, t2
  1208. FMUL a4, c06, t3
  1209. FMUL a4, c05, t4
  1210. FSUB c01, t1, c01
  1211. FSUB c02, t2, c02
  1212. FADD3 c01, t3, c01
  1213. FADD4 c02, t4, c02
  1214. FMUL b1, c01, t1
  1215. FMUL b2, c02, t2
  1216. FMUL b1, c02, t3
  1217. FMUL b2, c01, t4
  1218. FADD4 t1, t2, c01
  1219. FADD3 t3, t4, c02
  1220. #endif
  /* LN: move C1/C2 back by 2 * SIZE before the stores below. */
  1221. #ifdef LN
  1222. add C1, -2 * SIZE, C1
  1223. add C2, -2 * SIZE, C2
  1224. #endif
  /* Write the result back over the packed values (BO for LN/LT, else AO). */
  1225. #if defined(LN) || defined(LT)
  1226. STF c01, [BO + 0 * SIZE]
  1227. STF c02, [BO + 1 * SIZE]
  1228. STF c05, [BO + 2 * SIZE]
  1229. STF c06, [BO + 3 * SIZE]
  1230. #else
  1231. STF c01, [AO + 0 * SIZE]
  1232. STF c02, [AO + 1 * SIZE]
  1233. STF c05, [AO + 2 * SIZE]
  1234. STF c06, [AO + 3 * SIZE]
  1235. #endif
  /* ... and to the output: c01/c02 at C1, c05/c06 at C2. */
  1236. STF c01, [C1 + 0 * SIZE]
  1237. STF c02, [C1 + 1 * SIZE]
  1238. STF c05, [C2 + 0 * SIZE]
  1239. STF c06, [C2 + 1 * SIZE]
  /* Clear the staging temporaries. */
  1240. FMOV FZERO, t1
  1241. FMOV FZERO, t2
  1242. FMOV FZERO, t3
  1243. FMOV FZERO, t4
  /* Every variant except LN advances C1/C2 past the values just stored. */
  1244. #ifndef LN
  1245. add C1, 2 * SIZE, C1
  1246. add C2, 2 * SIZE, C2
  1247. #endif
  /* RT: AORIG += K << ZBASE_SHIFT. */
  1248. #ifdef RT
  1249. sll K, 0 + ZBASE_SHIFT, TEMP1
  1250. add AORIG, TEMP1, AORIG
  1251. #endif
  /*
   * LT/RN: skip the remaining K - KK entries:
   * AO += (K - KK) << ZBASE_SHIFT, BO += (K - KK) << (1 + ZBASE_SHIFT).
   */
  1252. #if defined(LT) || defined(RN)
  1253. sub K, KK, TEMP1
  1254. sll TEMP1, 0 + ZBASE_SHIFT, TEMP2
  1255. sll TEMP1, 1 + ZBASE_SHIFT, TEMP1
  1256. add AO, TEMP2, AO
  1257. add BO, TEMP1, BO
  1258. #endif
  /* KK moves by the one row handled here: +1 for LT, -1 for LN. */
  1259. #ifdef LT
  1260. add KK, 1, KK
  1261. #endif
  1262. #ifdef LN
  1263. sub KK, 1, KK
  1264. #endif
  /*
   * .LL99: end of one pass over a pair of columns.
   * LN: B += K << (1 + ZBASE_SHIFT).  LT/RN: B takes the advanced BO.
   * RN/RT: KK moves by the two columns handled (+2 / -2).
   * J is then decremented and control returns to .LL11 (outside this chunk)
   * while J > 0.
   */
  1265. .LL99:
  1266. #ifdef LN
  1267. sll K, 1 + ZBASE_SHIFT, TEMP1
  1268. add B, TEMP1, B
  1269. #endif
  1270. #if defined(LT) || defined(RN)
  1271. mov BO, B
  1272. #endif
  1273. #ifdef RN
  1274. add KK, 2, KK
  1275. #endif
  1276. #ifdef RT
  1277. sub KK, 2, KK
  1278. #endif
  1279. add J, -1, J
  1280. cmp J, 0
  1281. bg,pt %icc, .LL11
  1282. nop
  /*
   * .LL100: leftover single column.  J = N & 1; when J <= 0 control goes to
   * .LL999.  Otherwise B, C, C1, KK and AORIG/AO are initialised for a
   * one-column pass and the two-row tile count I = M >> 1 is computed.
   */
  1283. .LL100:
  1284. and N, 1, J
  1285. cmp J, 0
  1286. ble,pn %icc, .LL999
  1287. nop
  /* RT: step B back by K << ZBASE_SHIFT and C back by LDC. */
  1288. #ifdef RT
  1289. sll K, 0 + ZBASE_SHIFT, TEMP1
  1290. sub B, TEMP1, B
  1291. sub C, LDC, C
  1292. #endif
  1293. mov C, C1
  /* Starting KK: M + OFFSET for LN, OFFSET for LT. */
  1294. #ifdef LN
  1295. add M, OFFSET, KK
  1296. #endif
  1297. #ifdef LT
  1298. mov OFFSET, KK
  1299. #endif
  /* LN/RT address A through AORIG; the other variants walk AO directly. */
  1300. #if defined(LN) || defined(RT)
  1301. mov A, AORIG
  1302. #else
  1303. mov A, AO
  1304. #endif
  /* Every variant except RT advances C by LDC after C1 was taken. */
  1305. #ifndef RT
  1306. add C, LDC, C
  1307. #endif
  /* I = M >> 1; with I <= 0 go to .LL150 (c03 is cleared in the delay slot). */
  1308. sra M, 1, I
  1309. cmp I, 0
  1310. ble,pn %icc, .LL150
  1311. FMOV FZERO, c03
  /*
   * .LL121: top of the loop over two-row tiles of the single column.
   * Sets AO/BO and the unrolled trip count L, preloads a1-a4 / b1-b4,
   * clears c01-c08 and t1-t4 to FZERO and prefetches the C1 destination.
   */
  1312. .LL121:
  1313. #if defined(LT) || defined(RN)
  /* LT/RN: L = KK >> 2; BO restarts at B. */
  1314. sra KK, 2, L
  1315. mov B, BO
  1316. cmp L, 0
  1317. #else
  1318. #ifdef LN
  /* LN only: step AORIG back by K << (1 + ZBASE_SHIFT) bytes. */
  1319. sll K, 1 + ZBASE_SHIFT, TEMP1
  1320. sub AORIG, TEMP1, AORIG
  1321. #endif
  /*
   * AO = AORIG + (KK << (1 + ZBASE_SHIFT)), BO = B + (KK << ZBASE_SHIFT),
   * L = (K - KK) >> 2.  TEMP1 is left holding K - KK; .LL125 reads it again
   * to derive the remainder count.
   */
  1322. sll KK, 1 + ZBASE_SHIFT, TEMP1
  1323. sll KK, 0 + ZBASE_SHIFT, TEMP2
  1324. add AORIG, TEMP1, AO
  1325. add B, TEMP2, BO
  1326. sub K, KK, TEMP1
  1327. sra TEMP1, 2, L
  1328. cmp L, 0
  1329. #endif
  1330. FMOV FZERO, c03
  1331. LDF [AO + 0 * SIZE], a1
  1332. FMOV FZERO, t1
  1333. LDF [BO + 0 * SIZE], b1
  1334. FMOV FZERO, c07
  1335. LDF [AO + 1 * SIZE], a2
  1336. FMOV FZERO, t2
  1337. LDF [BO + 1 * SIZE], b2
  1338. FMOV FZERO, c04
  1339. LDF [AO + 2 * SIZE], a3
  1340. FMOV FZERO, t3
  1341. LDF [BO + 2 * SIZE], b3
  1342. FMOV FZERO, c08
  1343. LDF [AO + 3 * SIZE], a4
  1344. FMOV FZERO, t4
  1345. LDF [BO + 3 * SIZE], b4
  1346. FMOV FZERO, c01
  1347. prefetch [C1 + 3 * SIZE], 3
  1348. FMOV FZERO, c05
  1349. FMOV FZERO, c02
  /* L <= 0 (flags from the cmp above): skip the unrolled loop. */
  1350. ble,pn %icc, .LL125
  1351. FMOV FZERO, c06
  /*
   * .LL122: unrolled multiply-accumulate loop, repeated while L > 0.
   * One pass advances AO by 16 * SIZE and BO by 8 * SIZE and issues 32 FMULs
   * (a1-a4 against b1/b2, then against b3/b4, twice), i.e. four steps of the
   * remainder loop at .LL126.  Products are staged in t1-t4 and folded into
   * c01-c08 by FADD1-FADD4 (macros defined outside this chunk).
   * NOTE(review): the interleaved nops are presumably there for instruction
   * grouping/scheduling on the target CPU -- confirm before removing any.
   */
  1352. .LL122:
  1353. FADD1 c03, t1, c03
  1354. add L, -1, L
  1355. FMUL a1, b1, t1
  1356. prefetch [AO + APREFETCHSIZE * SIZE], 0
  1357. FADD3 c07, t2, c07
  1358. add BO, 8 * SIZE, BO
  1359. FMUL a1, b2, t2
  1360. LDF [AO + 4 * SIZE], a1
  1361. FADD2 c04, t3, c04
  /* AO moves here; the later loads in this pass use negative offsets. */
  1362. add AO, 16 * SIZE, AO
  1363. FMUL a2, b1, t3
  /* Sets the condition codes tested by the bg at the bottom of the loop. */
  1364. cmp L, 0
  1365. FADD4 c08, t4, c08
  1366. nop
  1367. FMUL a2, b2, t4
  1368. LDF [AO - 11 * SIZE], a2
  1369. FADD1 c01, t1, c01
  1370. nop
  1371. FMUL a3, b1, t1
  1372. nop
  1373. FADD3 c05, t2, c05
  1374. nop
  1375. FMUL a3, b2, t2
  1376. LDF [AO - 10 * SIZE], a3
  1377. FADD2 c02, t3, c02
  1378. nop
  1379. FMUL a4, b1, t3
  1380. LDF [BO - 4 * SIZE], b1
  1381. FADD4 c06, t4, c06
  1382. nop
  1383. FMUL a4, b2, t4
  1384. LDF [BO - 3 * SIZE], b2
  1385. FADD1 c03, t1, c03
  1386. nop
  1387. FMUL a1, b3, t1
  1388. LDF [AO - 9 * SIZE], a4
  1389. FADD3 c07, t2, c07
  1390. nop
  1391. FMUL a1, b4, t2
  1392. LDF [AO - 8 * SIZE], a1
  1393. FADD2 c04, t3, c04
  1394. nop
  1395. FMUL a2, b3, t3
  1396. nop
  1397. FADD4 c08, t4, c08
  1398. nop
  1399. FMUL a2, b4, t4
  1400. LDF [AO - 7 * SIZE], a2
  1401. FADD1 c01, t1, c01
  1402. nop
  1403. FMUL a3, b3, t1
  1404. nop
  1405. FADD3 c05, t2, c05
  1406. nop
  1407. FMUL a3, b4, t2
  1408. LDF [AO - 6 * SIZE], a3
  1409. FADD2 c02, t3, c02
  1410. nop
  1411. FMUL a4, b3, t3
  1412. LDF [BO - 2 * SIZE], b3
  1413. FADD4 c06, t4, c06
  1414. nop
  1415. FMUL a4, b4, t4
  1416. LDF [BO - 1 * SIZE], b4
  1417. FADD1 c03, t1, c03
  1418. nop
  1419. FMUL a1, b1, t1
  1420. LDF [AO - 5 * SIZE], a4
  1421. FADD3 c07, t2, c07
  1422. nop
  1423. FMUL a1, b2, t2
  1424. LDF [AO - 4 * SIZE], a1
  1425. FADD2 c04, t3, c04
  1426. nop
  1427. FMUL a2, b1, t3
  1428. nop
  1429. FADD4 c08, t4, c08
  1430. nop
  1431. FMUL a2, b2, t4
  1432. LDF [AO - 3 * SIZE], a2
  1433. FADD1 c01, t1, c01
  1434. nop
  1435. FMUL a3, b1, t1
  1436. nop
  1437. FADD3 c05, t2, c05
  1438. nop
  1439. FMUL a3, b2, t2
  1440. LDF [AO - 2 * SIZE], a3
  1441. FADD2 c02, t3, c02
  1442. nop
  1443. FMUL a4, b1, t3
  1444. LDF [BO + 0 * SIZE], b1
  1445. FADD4 c06, t4, c06
  1446. nop
  1447. FMUL a4, b2, t4
  1448. LDF [BO + 1 * SIZE], b2
  1449. FADD1 c03, t1, c03
  1450. nop
  1451. FMUL a1, b3, t1
  1452. LDF [AO - 1 * SIZE], a4
  1453. FADD3 c07, t2, c07
  1454. nop
  1455. FMUL a1, b4, t2
  1456. LDF [AO + 0 * SIZE], a1
  1457. FADD2 c04, t3, c04
  1458. nop
  1459. FMUL a2, b3, t3
  1460. nop
  1461. FADD4 c08, t4, c08
  1462. nop
  1463. FMUL a2, b4, t4
  1464. LDF [AO + 1 * SIZE], a2
  1465. FADD1 c01, t1, c01
  1466. nop
  1467. FMUL a3, b3, t1
  1468. nop
  1469. FADD3 c05, t2, c05
  1470. nop
  1471. FMUL a3, b4, t2
  1472. LDF [AO + 2 * SIZE], a3
  1473. FADD2 c02, t3, c02
  1474. nop
  1475. FMUL a4, b3, t3
  1476. LDF [BO + 2 * SIZE], b3
  1477. FADD4 c06, t4, c06
  1478. FMUL a4, b4, t4
  1479. LDF [AO + 3 * SIZE], a4
  /* Loop while L > 0; the b4 reload below sits in the branch delay slot. */
  1480. bg,pt %icc, .LL122
  1481. LDF [BO + 3 * SIZE], b4
  /*
   * .LL125: remainder count after the unrolled loop.
   * L = KK & 3 for LT/RN, otherwise TEMP1 & 3, where TEMP1 still holds the
   * K - KK value computed before the unrolled loop.  L <= 0 goes straight
   * to .LL129.
   */
  1482. .LL125:
  1483. #if defined(LT) || defined(RN)
  1484. and KK, 3, L
  1485. #else
  1486. and TEMP1, 3, L
  1487. #endif
  1488. cmp L, 0
  1489. ble,a,pn %icc, .LL129
  1490. nop
  /*
   * .LL126: one step per pass (AO += 4 * SIZE, BO += 2 * SIZE): a1-a4 are
   * each multiplied by b1 and b2, staged through t1-t4 and accumulated
   * into c01-c08, and the operands are reloaded.
   */
  1491. .LL126:
  1492. FADD1 c03, t1, c03
  1493. add AO, 4 * SIZE, AO
  1494. FMUL a1, b1, t1
  1495. add BO, 2 * SIZE, BO
  1496. FADD3 c07, t2, c07
  1497. add L, -1, L
  1498. FMUL a1, b2, t2
  1499. LDF [AO + 0 * SIZE], a1
  1500. FADD2 c04, t3, c04
  1501. cmp L, 0
  1502. FMUL a2, b1, t3
  1503. FADD4 c08, t4, c08
  1504. FMUL a2, b2, t4
  1505. LDF [AO + 1 * SIZE], a2
  1506. FADD1 c01, t1, c01
  1507. FMUL a3, b1, t1
  1508. FADD3 c05, t2, c05
  1509. FMUL a3, b2, t2
  1510. LDF [AO + 2 * SIZE], a3
  1511. FADD2 c02, t3, c02
  1512. FMUL a4, b1, t3
  1513. LDF [BO + 0 * SIZE], b1
  1514. FADD4 c06, t4, c06
  1515. FMUL a4, b2, t4
  1516. LDF [BO + 1 * SIZE], b2
  /* Loop while L > 0; the a4 reload sits in the branch delay slot. */
  1517. bg,pt %icc, .LL126
  1518. LDF [AO + 3 * SIZE], a4
  /*
   * .LL129: finish the two-row tile of the single column: fold the
   * accumulators, subtract them from the packed right-hand side, apply the
   * triangular-solve step selected by LN/LT/RN/RT, store to the packed
   * buffer and to C1, advance pointers/KK and loop back to .LL121 while
   * tiles remain.
   */
  1519. .LL129:
  /* Drain the products still staged in t1-t4. */
  1520. FADD1 c03, t1, c03
  1521. FADD3 c07, t2, c07
  1522. FADD2 c04, t3, c04
  1523. FADD4 c08, t4, c08
  /* Merge the partial sums pairwise into c01/c02 and c03/c04. */
  1524. FADD c01, c06, c01
  1525. FADD c02, c05, c02
  1526. FADD c03, c08, c03
  1527. FADD c04, c07, c04
  1528. #if defined(LN) || defined(RT)
  /*
   * LN/RT: re-point AO/BO.  TEMP1 = KK - 2 (LN) or KK - 1 (otherwise);
   * AO = AORIG + (TEMP1 << (1 + ZBASE_SHIFT)),
   * BO = B + (TEMP1 << ZBASE_SHIFT).
   */
  1529. #ifdef LN
  1530. sub KK, 2, TEMP1
  1531. #else
  1532. sub KK, 1, TEMP1
  1533. #endif
  1534. sll TEMP1, 1 + ZBASE_SHIFT, TEMP2
  1535. sll TEMP1, 0 + ZBASE_SHIFT, TEMP1
  1536. add AORIG, TEMP2, AO
  1537. add B, TEMP1, BO
  1538. #endif
  /*
   * Load four packed values (from BO for LN/LT, from AO otherwise) and
   * replace each c with (loaded value - c), assuming FSUB x, y, z maps to the
   * SPARC fsub form z = x - y.
   */
  1539. #if defined(LN) || defined(LT)
  1540. LDF [BO + 0 * SIZE], a1
  1541. LDF [BO + 1 * SIZE], a2
  1542. LDF [BO + 2 * SIZE], a3
  1543. LDF [BO + 3 * SIZE], a4
  1544. FSUB a1, c01, c01
  1545. FSUB a2, c02, c02
  1546. FSUB a3, c03, c03
  1547. FSUB a4, c04, c04
  1548. #else
  1549. LDF [AO + 0 * SIZE], a1
  1550. LDF [AO + 1 * SIZE], a2
  1551. LDF [AO + 2 * SIZE], a3
  1552. LDF [AO + 3 * SIZE], a4
  1553. FSUB a1, c01, c01
  1554. FSUB a2, c02, c02
  1555. FSUB a3, c03, c03
  1556. FSUB a4, c04, c04
  1557. #endif
  /*
   * LN: reverse order.  Scale (c03,c04) by AO[6..7], remove its contribution
   * from (c01,c02) using AO[4..5], then scale (c01,c02) by AO[0..1].
   * NOTE(review): the scale factors are presumably pre-inverted diagonal
   * entries produced by the packing routine -- confirm.
   */
  1558. #ifdef LN
  1559. LDF [AO + 6 * SIZE], a1
  1560. LDF [AO + 7 * SIZE], a2
  1561. LDF [AO + 4 * SIZE], a3
  1562. LDF [AO + 5 * SIZE], a4
  1563. LDF [AO + 0 * SIZE], b1
  1564. LDF [AO + 1 * SIZE], b2
  1565. FMUL a1, c03, t1
  1566. FMUL a2, c04, t2
  1567. FMUL a1, c04, t3
  1568. FMUL a2, c03, t4
  1569. FADD4 t1, t2, c03
  1570. FADD2 t3, t4, c04
  1571. FMUL a3, c03, t1
  1572. FMUL a3, c04, t2
  1573. FMUL a4, c04, t5
  1574. FMUL a4, c03, t6
  1575. FSUB c01, t1, c01
  1576. FSUB c02, t2, c02
  1577. FADD2 c01, t5, c01
  1578. FADD4 c02, t6, c02
  1579. FMUL b1, c01, t1
  1580. FMUL b2, c02, t2
  1581. FMUL b1, c02, t3
  1582. FMUL b2, c01, t4
  1583. FADD4 t1, t2, c01
  1584. FADD2 t3, t4, c02
  1585. #endif
  /*
   * LT: forward order.  Scale (c01,c02) by AO[0..1], remove its contribution
   * from (c03,c04) using AO[2..3], then scale (c03,c04) by AO[6..7].
   */
  1586. #ifdef LT
  1587. LDF [AO + 0 * SIZE], a1
  1588. LDF [AO + 1 * SIZE], a2
  1589. LDF [AO + 2 * SIZE], a3
  1590. LDF [AO + 3 * SIZE], a4
  1591. LDF [AO + 6 * SIZE], b1
  1592. LDF [AO + 7 * SIZE], b2
  1593. FMUL a1, c01, t1
  1594. FMUL a2, c02, t2
  1595. FMUL a1, c02, t3
  1596. FMUL a2, c01, t4
  1597. FADD4 t1, t2, c01
  1598. FADD2 t3, t4, c02
  1599. FMUL a3, c01, t1
  1600. FMUL a3, c02, t2
  1601. FMUL a4, c02, t5
  1602. FMUL a4, c01, t6
  1603. FSUB c03, t1, c03
  1604. FSUB c04, t2, c04
  1605. FADD2 c03, t5, c03
  1606. FADD4 c04, t6, c04
  1607. FMUL b1, c03, t1
  1608. FMUL b2, c04, t2
  1609. FMUL b1, c04, t3
  1610. FMUL b2, c03, t4
  1611. FADD4 t1, t2, c03
  1612. FADD2 t3, t4, c04
  1613. #endif
  /* RN / RT: scale both pairs (c01,c02) and (c03,c04) by BO[0..1]. */
  1614. #ifdef RN
  1615. LDF [BO + 0 * SIZE], a1
  1616. LDF [BO + 1 * SIZE], a2
  1617. FMUL a1, c01, t1
  1618. FMUL a2, c02, t2
  1619. FMUL a1, c02, t3
  1620. FMUL a2, c01, t4
  1621. FMUL a1, c03, t5
  1622. FMUL a2, c04, t6
  1623. FMUL a1, c04, t7
  1624. FMUL a2, c03, t8
  1625. FADD4 t1, t2, c01
  1626. FADD3 t3, t4, c02
  1627. FADD4 t5, t6, c03
  1628. FADD3 t7, t8, c04
  1629. #endif
  1630. #ifdef RT
  1631. LDF [BO + 0 * SIZE], a1
  1632. LDF [BO + 1 * SIZE], a2
  1633. FMUL a1, c01, t1
  1634. FMUL a2, c02, t2
  1635. FMUL a1, c02, t3
  1636. FMUL a2, c01, t4
  1637. FMUL a1, c03, t5
  1638. FMUL a2, c04, t6
  1639. FMUL a1, c04, t7
  1640. FMUL a2, c03, t8
  1641. FADD4 t1, t2, c01
  1642. FADD3 t3, t4, c02
  1643. FADD4 t5, t6, c03
  1644. FADD3 t7, t8, c04
  1645. #endif
  /* LN: move C1 back by 4 * SIZE before the stores below. */
  1646. #ifdef LN
  1647. add C1, -4 * SIZE, C1
  1648. #endif
  /* Write the result back over the packed values (BO for LN/LT, else AO). */
  1649. #if defined(LN) || defined(LT)
  1650. STF c01, [BO + 0 * SIZE]
  1651. STF c02, [BO + 1 * SIZE]
  1652. STF c03, [BO + 2 * SIZE]
  1653. STF c04, [BO + 3 * SIZE]
  1654. #else
  1655. STF c01, [AO + 0 * SIZE]
  1656. STF c02, [AO + 1 * SIZE]
  1657. STF c03, [AO + 2 * SIZE]
  1658. STF c04, [AO + 3 * SIZE]
  1659. #endif
  /* ... and the four values to the output at C1. */
  1660. STF c01, [C1 + 0 * SIZE]
  1661. STF c02, [C1 + 1 * SIZE]
  1662. STF c03, [C1 + 2 * SIZE]
  1663. STF c04, [C1 + 3 * SIZE]
  /* Clear the staging temporaries. */
  1664. FMOV FZERO, t1
  1665. FMOV FZERO, t2
  1666. FMOV FZERO, t3
  1667. FMOV FZERO, t4
  /* Every variant except LN advances C1 past the values just stored. */
  1668. #ifndef LN
  1669. add C1, 4 * SIZE, C1
  1670. #endif
  /* RT: AORIG += K << (1 + ZBASE_SHIFT). */
  1671. #ifdef RT
  1672. sll K, 1 + ZBASE_SHIFT, TEMP1
  1673. add AORIG, TEMP1, AORIG
  1674. #endif
  /*
   * LT/RN: skip the remaining K - KK entries:
   * AO += (K - KK) << (1 + ZBASE_SHIFT), BO += (K - KK) << ZBASE_SHIFT.
   */
  1675. #if defined(LT) || defined(RN)
  1676. sub K, KK, TEMP1
  1677. sll TEMP1, 1 + ZBASE_SHIFT, TEMP2
  1678. sll TEMP1, 0 + ZBASE_SHIFT, TEMP1
  1679. add AO, TEMP2, AO
  1680. add BO, TEMP1, BO
  1681. #endif
  /* KK moves by the two rows handled here: +2 for LT, -2 for LN. */
  1682. #ifdef LT
  1683. add KK, 2, KK
  1684. #endif
  1685. #ifdef LN
  1686. sub KK, 2, KK
  1687. #endif
  /* Next tile while I > 0; c03 is cleared in the branch delay slot. */
  1688. add I, -1, I
  1689. cmp I, 0
  1690. bg,pt %icc, .LL121
  1691. FMOV FZERO, c03
  /*
   * .LL150: leftover row of M for the single column.  I = M & 1; when
   * I <= 0 control goes to .LL199.  Otherwise AO/BO and the unrolled trip
   * count L are set up, a1-a4 / b1-b4 are preloaded and c01-c04 / t1-t4 are
   * cleared to FZERO.
   */
  1692. .LL150:
  1693. and M, 1, I
  1694. cmp I, 0
  1695. ble,pn %icc, .LL199
  1696. nop
  1697. #if defined(LT) || defined(RN)
  /* LT/RN: L = KK >> 2; BO restarts at B. */
  1698. sra KK, 2, L
  1699. mov B, BO
  1700. cmp L, 0
  1701. #else
  1702. #ifdef LN
  /* LN only: step AORIG back by K << ZBASE_SHIFT bytes. */
  1703. sll K, 0 + ZBASE_SHIFT, TEMP1
  1704. sub AORIG, TEMP1, AORIG
  1705. #endif
  /*
   * AO and BO are both offset by KK << ZBASE_SHIFT (from AORIG and B);
   * L = (K - KK) >> 2.  TEMP1 is left holding K - KK; .LL155 reads it again
   * to derive the remainder count.
   */
  1706. sll KK, 0 + ZBASE_SHIFT, TEMP1
  1707. add AORIG, TEMP1, AO
  1708. add B, TEMP1, BO
  1709. sub K, KK, TEMP1
  1710. sra TEMP1, 2, L
  1711. cmp L, 0
  1712. #endif
  1713. LDF [AO + 0 * SIZE], a1
  1714. FMOV FZERO, c01
  1715. LDF [BO + 0 * SIZE], b1
  1716. FMOV FZERO, t1
  1717. LDF [AO + 1 * SIZE], a2
  1718. FMOV FZERO, c02
  1719. LDF [BO + 1 * SIZE], b2
  1720. FMOV FZERO, t2
  1721. LDF [AO + 2 * SIZE], a3
  1722. FMOV FZERO, c03
  1723. LDF [BO + 2 * SIZE], b3
  1724. FMOV FZERO, t3
  1725. LDF [AO + 3 * SIZE], a4
  1726. FMOV FZERO, c04
  1727. LDF [BO + 3 * SIZE], b4
  1728. FMOV FZERO, t4
  /* L <= 0 (flags from the cmp above): skip the unrolled loop. */
  1729. ble,pn %icc, .LL155
  1730. nop
  /*
   * .LL152: unrolled multiply-accumulate loop, repeated while L > 0.
   * One pass advances both AO and BO by 8 * SIZE and issues 16 FMULs:
   * (a1,a2) x (b1,b2) and (a3,a4) x (b3,b4), twice, i.e. four steps of the
   * remainder loop at .LL156.  Products are staged in t1-t4 and folded into
   * c01-c04 by FADD1-FADD4 (macros defined outside this chunk).  AO is
   * advanced only at the end of the pass, so the A loads use offsets
   * +4 .. +11 relative to the old AO.
   */
  1731. .LL152:
  1732. FADD1 c01, t1, c01
  1733. add L, -1, L
  1734. FMUL a1, b1, t1
  1735. prefetch [AO + APREFETCHSIZE * SIZE], 0
  1736. FADD3 c02, t2, c02
  1737. add BO, 8 * SIZE, BO
  1738. FMUL a1, b2, t2
  1739. LDF [AO + 4 * SIZE], a1
  1740. FADD2 c03, t3, c03
  /* Sets the condition codes tested by the bg at the bottom of the loop. */
  1741. cmp L, 0
  1742. FMUL a2, b1, t3
  1743. LDF [BO - 4 * SIZE], b1
  1744. FADD4 c04, t4, c04
  1745. nop
  1746. FMUL a2, b2, t4
  1747. LDF [AO + 5 * SIZE], a2
  1748. FADD1 c01, t1, c01
  1749. nop
  1750. FMUL a3, b3, t1
  1751. LDF [BO - 3 * SIZE], b2
  1752. FADD3 c02, t2, c02
  1753. nop
  1754. FMUL a3, b4, t2
  1755. LDF [AO + 6 * SIZE], a3
  1756. FADD2 c03, t3, c03
  1757. nop
  1758. FMUL a4, b3, t3
  1759. LDF [BO - 2 * SIZE], b3
  1760. FADD4 c04, t4, c04
  1761. nop
  1762. FMUL a4, b4, t4
  1763. LDF [AO + 7 * SIZE], a4
  1764. FADD1 c01, t1, c01
  1765. nop
  1766. FMUL a1, b1, t1
  1767. LDF [BO - 1 * SIZE], b4
  1768. FADD3 c02, t2, c02
  1769. FMUL a1, b2, t2
  1770. LDF [AO + 8 * SIZE], a1
  1771. FADD2 c03, t3, c03
  1772. FMUL a2, b1, t3
  1773. LDF [BO + 0 * SIZE], b1
  1774. FADD4 c04, t4, c04
  1775. FMUL a2, b2, t4
  1776. LDF [AO + 9 * SIZE], a2
  1777. FADD1 c01, t1, c01
  1778. FMUL a3, b3, t1
  1779. LDF [BO + 1 * SIZE], b2
  1780. FADD3 c02, t2, c02
  1781. FMUL a3, b4, t2
  1782. LDF [AO + 10 * SIZE], a3
  1783. FADD2 c03, t3, c03
  1784. FMUL a4, b3, t3
  1785. LDF [BO + 2 * SIZE], b3
  1786. FADD4 c04, t4, c04
  1787. FMUL a4, b4, t4
  1788. LDF [AO + 11 * SIZE], a4
  1789. add AO, 8 * SIZE, AO
  /* Loop while L > 0; the b4 reload below sits in the branch delay slot. */
  1790. bg,pt %icc, .LL152
  1791. LDF [BO + 3 * SIZE], b4
  /*
   * .LL155: remainder count after the unrolled loop.
   * L = KK & 3 for LT/RN, otherwise TEMP1 & 3, where TEMP1 still holds the
   * K - KK value computed before the unrolled loop.  L <= 0 goes straight
   * to .LL159.
   */
  1792. .LL155:
  1793. #if defined(LT) || defined(RN)
  1794. and KK, 3, L
  1795. #else
  1796. and TEMP1, 3, L
  1797. #endif
  1798. cmp L, 0
  1799. ble,a,pn %icc, .LL159
  1800. nop
  /*
   * .LL156: one step per pass (AO += 2 * SIZE, BO += 2 * SIZE): the four
   * products of (a1,a2) with (b1,b2) are staged through t1-t4 and
   * accumulated into c01-c04, and the operands are reloaded.
   */
  1801. .LL156:
  1802. FADD1 c01, t1, c01
  1803. add AO, 2 * SIZE, AO
  1804. FMUL a1, b1, t1
  1805. add BO, 2 * SIZE, BO
  1806. FADD3 c02, t2, c02
  1807. add L, -1, L
  1808. FMUL a1, b2, t2
  1809. LDF [AO + 0 * SIZE], a1
  1810. FADD2 c03, t3, c03
  1811. FMUL a2, b1, t3
  1812. LDF [BO + 0 * SIZE], b1
  1813. cmp L, 0
  1814. FADD4 c04, t4, c04
  1815. FMUL a2, b2, t4
  1816. LDF [BO + 1 * SIZE], b2
  /* Loop while L > 0; the a2 reload sits in the branch delay slot. */
  1817. bg,pt %icc, .LL156
  1818. LDF [AO + 1 * SIZE], a2
  /*
   * .LL159: finish the single-row tile of the single column: fold the
   * accumulators into (c01,c02), subtract them from the packed right-hand
   * side, scale by one packed pair, store to the packed buffer and to C1,
   * then advance the pointers and KK.
   */
  1819. .LL159:
  /* Drain the products still staged in t1-t4, then merge the partial sums. */
  1820. FADD1 c01, t1, c01
  1821. FADD3 c02, t2, c02
  1822. FADD2 c03, t3, c03
  1823. FADD4 c04, t4, c04
  1824. FADD c01, c04, c01
  1825. FADD c02, c03, c02
  /* LN/RT: AO and BO are both re-pointed at offset (KK - 1) << ZBASE_SHIFT. */
  1826. #if defined(LN) || defined(RT)
  1827. sub KK, 1, TEMP1
  1828. sll TEMP1, 0 + ZBASE_SHIFT, TEMP1
  1829. add AORIG, TEMP1, AO
  1830. add B, TEMP1, BO
  1831. #endif
  /*
   * Load the packed pair (from BO for LN/LT, from AO otherwise) and replace
   * c01/c02 with (loaded value - c), assuming FSUB x, y, z maps to the SPARC
   * fsub form z = x - y.
   */
  1832. #if defined(LN) || defined(LT)
  1833. LDF [BO + 0 * SIZE], a1
  1834. LDF [BO + 1 * SIZE], a2
  1835. FSUB a1, c01, c01
  1836. FSUB a2, c02, c02
  1837. #else
  1838. LDF [AO + 0 * SIZE], a1
  1839. LDF [AO + 1 * SIZE], a2
  1840. FSUB a1, c01, c01
  1841. FSUB a2, c02, c02
  1842. #endif
  /*
   * Scale (c01,c02) by one packed pair: AO[0..1] for LN/LT, BO[0..1] for
   * RN/RT.  The LN/LT paths combine with FADD4/FADD2, the RN/RT paths with
   * FADD4/FADD3.
   * NOTE(review): the factor is presumably the pre-inverted diagonal entry
   * produced by the packing routine -- confirm.
   */
  1843. #ifdef LN
  1844. LDF [AO + 0 * SIZE], a1
  1845. LDF [AO + 1 * SIZE], a2
  1846. FMUL a1, c01, t1
  1847. FMUL a2, c02, t2
  1848. FMUL a1, c02, t3
  1849. FMUL a2, c01, t4
  1850. FADD4 t1, t2, c01
  1851. FADD2 t3, t4, c02
  1852. #endif
  1853. #ifdef LT
  1854. LDF [AO + 0 * SIZE], a1
  1855. LDF [AO + 1 * SIZE], a2
  1856. FMUL a1, c01, t1
  1857. FMUL a2, c02, t2
  1858. FMUL a1, c02, t3
  1859. FMUL a2, c01, t4
  1860. FADD4 t1, t2, c01
  1861. FADD2 t3, t4, c02
  1862. #endif
  1863. #ifdef RN
  1864. LDF [BO + 0 * SIZE], a1
  1865. LDF [BO + 1 * SIZE], a2
  1866. FMUL a1, c01, t1
  1867. FMUL a2, c02, t2
  1868. FMUL a1, c02, t3
  1869. FMUL a2, c01, t4
  1870. FADD4 t1, t2, c01
  1871. FADD3 t3, t4, c02
  1872. #endif
  1873. #ifdef RT
  1874. LDF [BO + 0 * SIZE], a1
  1875. LDF [BO + 1 * SIZE], a2
  1876. FMUL a1, c01, t1
  1877. FMUL a2, c02, t2
  1878. FMUL a1, c02, t3
  1879. FMUL a2, c01, t4
  1880. FADD4 t1, t2, c01
  1881. FADD3 t3, t4, c02
  1882. #endif
  /* LN: move C1 back by 2 * SIZE before the stores below. */
  1883. #ifdef LN
  1884. add C1, -2 * SIZE, C1
  1885. #endif
  /* Write the result back over the packed values (BO for LN/LT, else AO). */
  1886. #if defined(LN) || defined(LT)
  1887. STF c01, [BO + 0 * SIZE]
  1888. STF c02, [BO + 1 * SIZE]
  1889. #else
  1890. STF c01, [AO + 0 * SIZE]
  1891. STF c02, [AO + 1 * SIZE]
  1892. #endif
  /* ... and to the output at C1. */
  1893. STF c01, [C1 + 0 * SIZE]
  1894. STF c02, [C1 + 1 * SIZE]
  /* Clear the staging temporaries. */
  1895. FMOV FZERO, t1
  1896. FMOV FZERO, t2
  1897. FMOV FZERO, t3
  1898. FMOV FZERO, t4
  /* Every variant except LN advances C1 past the values just stored. */
  1899. #ifndef LN
  1900. add C1, 2 * SIZE, C1
  1901. #endif
  /* RT: AORIG += K << ZBASE_SHIFT. */
  1902. #ifdef RT
  1903. sll K, 0 + ZBASE_SHIFT, TEMP1
  1904. add AORIG, TEMP1, AORIG
  1905. #endif
  /* LT/RN: skip the remaining K - KK entries in both AO and BO. */
  1906. #if defined(LT) || defined(RN)
  1907. sub K, KK, TEMP1
  1908. sll TEMP1, 0 + ZBASE_SHIFT, TEMP1
  1909. add AO, TEMP1, AO
  1910. add BO, TEMP1, BO
  1911. #endif
  /* KK moves by the one row handled here: +1 for LT, -1 for LN. */
  1912. #ifdef LT
  1913. add KK, 1, KK
  1914. #endif
  1915. #ifdef LN
  1916. sub KK, 1, KK
  1917. #endif
  /*
   * .LL199: bookkeeping after the single-column pass.
   * LN: B += K << ZBASE_SHIFT.  LT/RN: B takes the advanced BO.
   * RN/RT: KK moves by the one column handled (+1 / -1).
   */
  1918. .LL199:
  1919. #ifdef LN
  1920. sll K, 0 + ZBASE_SHIFT, TEMP1
  1921. add B, TEMP1, B
  1922. #endif
  1923. #if defined(LT) || defined(RN)
  1924. mov BO, B
  1925. #endif
  1926. #ifdef RN
  1927. add KK, 1, KK
  1928. #endif
  1929. #ifdef RT
  1930. sub KK, 1, KK
  1931. #endif
  /*
   * .LL999: common exit.  `return %i7 + 8` resumes the caller; the
   * `clr %o0` in its delay slot zeroes %o0, so the routine reports 0.
   */
  1932. .LL999:
  1933. return %i7 + 8
  1934. clr %o0
  1935. EPILOGUE