You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_LN.S 37 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer register aliases for the kernel below (SPARC %i = in, %o = out, */
  /* %l = local registers). Notes describe how each name is used in this file. */
  40. #define M %i0 /* source of the I counts: (M & 1), then (M >> 1) */
  41. #define N %i1 /* source of the J count: (N >> 1) */
  42. #define K %i2 /* scaled by ZBASE_SHIFT to step A/B; K - KK sets the LN/RT inner-loop length */
  43. #define A %i5 /* base for AO/AORIG; read from the stack only in the 32-bit DOUBLE build */
  44. #define B %i3 /* base for BO; read from the stack in the prologue */
  45. #define C %i4 /* base for C1/C2; read from the stack in the prologue */
  46. #define LDC %o0 /* read from the stack, then shifted left by ZBASE_SHIFT */
  47. #define AO %o1 /* running pointer, advanced inside the multiply loops */
  48. #define BO %o2 /* running pointer, advanced inside the multiply loops */
  49. #define I %o3 /* counter derived from M */
  50. #define J %o4 /* counter derived from N */
  51. #define L %o5 /* inner-loop countdown, decremented with add L, -1, L */
  52. #define C1 %l0 /* set to C; target of result stores */
  53. #define C2 %l1 /* set to C + LDC; target of result stores */
  54. #define OFFSET %l2 /* read from the stack in the prologue; seeds KK */
  55. #define KK %l3 /* set from OFFSET per variant: -OFFSET (RN), N - OFFSET (RT), M + OFFSET (LN), OFFSET (LT) */
  56. #define TEMP1 %l4 /* scratch for offset/address arithmetic */
  57. #define TEMP2 %l5 /* scratch for offset/address arithmetic */
  58. #define AORIG %l6 /* copy of A kept in LN/RT builds; AO is recomputed from it */
  /* Floating-point register aliases. The DOUBLE build uses even-numbered */
  /* registers %f0..%f62; the other build uses consecutive registers %f0..%f31. */
  /* c01..c16: accumulators (zeroed from FZERO, updated with FADD1..FADD4). */
  /* t1..t4:   hold FMUL products before they are accumulated. */
  /* a1..a5, b1..b5: operands filled by LDF (from AO and BO in the multiply loops). */
  /* FZERO:    source register for the FMOV instructions that zero other registers. */
  59. #ifdef DOUBLE
  60. #define c01 %f0
  61. #define c02 %f2
  62. #define c03 %f4
  63. #define c04 %f6
  64. #define c05 %f8
  65. #define c06 %f10
  66. #define c07 %f12
  67. #define c08 %f14
  68. #define c09 %f16
  69. #define c10 %f18
  70. #define c11 %f20
  71. #define c12 %f22
  72. #define c13 %f24
  73. #define c14 %f26
  74. #define c15 %f28
  75. #define c16 %f30
  76. #define t1 %f32
  77. #define t2 %f34
  78. #define t3 %f36
  79. #define t4 %f38
  80. #define a1 %f40
  81. #define a2 %f42
  82. #define a3 %f44
  83. #define a4 %f46
  84. #define a5 %f62 /* not contiguous with a1..a4 */
  85. #define b1 %f48
  86. #define b2 %f50
  87. #define b3 %f52
  88. #define b4 %f54
  89. #define b5 %f56
  90. #define FZERO %f58 /* NOTE(review): prologue issues FCLR(27) for DOUBLE; presumably the encoded number of %f58 -- confirm in common.h */
  91. #else
  92. #define c01 %f0
  93. #define c02 %f1
  94. #define c03 %f2
  95. #define c04 %f3
  96. #define c05 %f4
  97. #define c06 %f5
  98. #define c07 %f6
  99. #define c08 %f7
  100. #define c09 %f8
  101. #define c10 %f9
  102. #define c11 %f10
  103. #define c12 %f11
  104. #define c13 %f12
  105. #define c14 %f13
  106. #define c15 %f14
  107. #define c16 %f15
  108. #define t1 %f16
  109. #define t2 %f17
  110. #define t3 %f18
  111. #define t4 %f19
  112. #define a1 %f20
  113. #define a2 %f21
  114. #define a3 %f22
  115. #define a4 %f23
  116. #define a5 %f31 /* not contiguous with a1..a4 */
  117. #define b1 %f24
  118. #define b2 %f25
  119. #define b3 %f26
  120. #define b4 %f27
  121. #define b5 %f28
  122. #define FZERO %f29 /* matches the FCLR(29) issued in the prologue for this build */
  123. #endif
  /* t5..t8 share registers with c13..c16. They are used as extra temporaries in */
  /* the LN/LT solve of the (M & 1) path, whose multiply loop only accumulates */
  /* into c01..c08. */
  124. #define t5 c13
  125. #define t6 c14
  126. #define t7 c15
  127. #define t8 c16
  /* Select add vs. subtract for the four accumulation roles FADD1..FADD4. */
  /* Without CONJ only FADD4 subtracts. With CONJ the subtracting role is FADD2 */
  /* for LN/LT builds and FADD3 for RN/RT builds. */
  /* NOTE(review): with CONJ defined and none of LN/LT/RN/RT defined, FADD1..FADD4 */
  /* are left undefined by this block. */
  128. #ifndef CONJ
  129. #define FADD1 FADD
  130. #define FADD2 FADD
  131. #define FADD3 FADD
  132. #define FADD4 FSUB
  133. #else
  134. #if defined(LN) || defined(LT)
  135. #define FADD1 FADD
  136. #define FADD2 FSUB
  137. #define FADD3 FADD
  138. #define FADD4 FADD
  139. #endif
  140. #if defined(RN) || defined(RT)
  141. #define FADD1 FADD
  142. #define FADD2 FADD
  143. #define FADD3 FSUB
  144. #define FADD4 FADD
  145. #endif
  146. #endif
  /* Prefetch tuning. The *PREFETCHSIZE values are multiplied by SIZE and added to */
  /* AO / BO to form the prefetch address; the *_CATEGORY values are passed as the */
  /* second operand of the prefetch instruction. */
  147. #define APREFETCHSIZE 40 /* look-ahead for AO, in units of SIZE */
  148. #define BPREFETCHSIZE 40 /* look-ahead for BO, in units of SIZE */
  149. #define APREFETCH_CATEGORY 0
  150. #define BPREFETCH_CATEGORY 0
  151. PROLOGUE
  152. SAVESP
  153. #ifndef __64BIT__
  154. #ifdef DOUBLE
  155. ld [%sp + STACK_START + 32], A
  156. ld [%sp + STACK_START + 36], B
  157. ld [%sp + STACK_START + 40], C
  158. ld [%sp + STACK_START + 44], LDC
  159. ld [%sp + STACK_START + 48], OFFSET
  160. #else
  161. ld [%sp + STACK_START + 28], B
  162. ld [%sp + STACK_START + 32], C
  163. ld [%sp + STACK_START + 36], LDC
  164. ld [%sp + STACK_START + 40], OFFSET
  165. #endif
  166. #else
  167. ldx [%sp+ STACK_START + 56], B
  168. ldx [%sp+ STACK_START + 64], C
  169. ldx [%sp+ STACK_START + 72], LDC
  170. ldx [%sp+ STACK_START + 80], OFFSET
  171. #endif
  172. #ifdef DOUBLE
  173. FCLR(27)
  174. #else
  175. FCLR(29)
  176. #endif
  177. sll LDC, ZBASE_SHIFT, LDC
  178. #ifdef LN
  179. smul M, K, TEMP1
  180. sll TEMP1, ZBASE_SHIFT, TEMP1
  181. add A, TEMP1, A
  182. sll M, ZBASE_SHIFT, TEMP1
  183. add C, TEMP1, C
  184. #endif
  185. #ifdef RN
  186. neg OFFSET, KK
  187. #endif
  188. #ifdef RT
  189. smul N, K, TEMP1
  190. sll TEMP1, ZBASE_SHIFT, TEMP1
  191. add B, TEMP1, B
  192. smul N, LDC, TEMP1
  193. add C, TEMP1, C
  194. sub N, OFFSET, KK
  195. #endif
  196. sra N, 1, J
  197. cmp J, 0
  198. ble,pn %icc, .LL100
  199. nop
  200. .LL11:
  201. #ifdef RT
  202. sll K, 1 + ZBASE_SHIFT, TEMP1
  203. sub B, TEMP1, B
  204. add LDC, LDC, TEMP1
  205. sub C, TEMP1, C
  206. #endif
  207. mov C, C1
  208. add C, LDC, C2
  209. #ifdef LN
  210. add M, OFFSET, KK
  211. #endif
  212. #ifdef LT
  213. mov OFFSET, KK
  214. #endif
  215. #if defined(LN) || defined(RT)
  216. mov A, AORIG
  217. #else
  218. mov A, AO
  219. #endif
  220. #ifndef RT
  221. add C2, LDC, C
  222. #endif
  223. and M, 1, I
  224. cmp I, 0
  225. ble,pn %icc, .LL50
  226. nop
  227. #if defined(LT) || defined(RN)
  228. sra KK, 2, L
  229. mov B, BO
  230. cmp L, 0
  231. #else
  232. #ifdef LN
  233. sll K, 0 + ZBASE_SHIFT, TEMP1
  234. sub AORIG, TEMP1, AORIG
  235. #endif
  236. sll KK, 0 + ZBASE_SHIFT, TEMP1
  237. sll KK, 1 + ZBASE_SHIFT, TEMP2
  238. add AORIG, TEMP1, AO
  239. add B, TEMP2, BO
  240. sub K, KK, TEMP1
  241. sra TEMP1, 2, L
  242. cmp L, 0
  243. #endif
  244. FMOV FZERO, c02
  245. FMOV FZERO, t1
  246. FMOV FZERO, c04
  247. LDF [AO + 0 * SIZE], a1
  248. FMOV FZERO, t2
  249. LDF [BO + 0 * SIZE], b1
  250. FMOV FZERO, c06
  251. LDF [AO + 1 * SIZE], a2
  252. FMOV FZERO, t3
  253. LDF [BO + 1 * SIZE], b2
  254. FMOV FZERO, c08
  255. LDF [AO + 2 * SIZE], a3
  256. FMOV FZERO, t4
  257. LDF [BO + 2 * SIZE], b3
  258. FMOV FZERO, c01
  259. LDF [AO + 3 * SIZE], a4
  260. FMOV FZERO, c03
  261. LDF [BO + 3 * SIZE], b4
  262. FMOV FZERO, c05
  263. ble,pn %icc, .LL55
  264. FMOV FZERO, c07
  265. .LL52:
  266. FADD2 c02, t1, c02
  267. add AO, 8 * SIZE, AO
  268. prefetch [AO + APREFETCHSIZE * SIZE], 0
  269. FMUL a1, b1, t1
  270. add BO, 16 * SIZE, BO
  271. FADD4 c04, t2, c04
  272. add L, -1, L
  273. FMUL a1, b2, t2
  274. FADD2 c06, t3, c06
  275. cmp L, 0
  276. FMUL a1, b3, t3
  277. FADD4 c08, t4, c08
  278. FMUL a1, b4, t4
  279. LDF [AO - 4 * SIZE], a1
  280. FADD1 c01, t1, c01
  281. FMUL a2, b1, t1
  282. LDF [BO - 12 * SIZE], b1
  283. FADD3 c03, t2, c03
  284. FMUL a2, b2, t2
  285. LDF [BO - 11 * SIZE], b2
  286. FADD1 c05, t3, c05
  287. FMUL a2, b3, t3
  288. LDF [BO - 10 * SIZE], b3
  289. FADD3 c07, t4, c07
  290. FMUL a2, b4, t4
  291. LDF [BO - 9 * SIZE], b4
  292. FADD2 c02, t1, c02
  293. FMUL a3, b1, t1
  294. LDF [AO - 3 * SIZE], a2
  295. FADD4 c04, t2, c04
  296. FMUL a3, b2, t2
  297. FADD2 c06, t3, c06
  298. FMUL a3, b3, t3
  299. FADD4 c08, t4, c08
  300. FMUL a3, b4, t4
  301. LDF [AO - 2 * SIZE], a3
  302. FADD1 c01, t1, c01
  303. FMUL a4, b1, t1
  304. LDF [BO - 8 * SIZE], b1
  305. FADD3 c03, t2, c03
  306. FMUL a4, b2, t2
  307. LDF [BO - 7 * SIZE], b2
  308. FADD1 c05, t3, c05
  309. FMUL a4, b3, t3
  310. LDF [BO - 6 * SIZE], b3
  311. FADD3 c07, t4, c07
  312. FMUL a4, b4, t4
  313. LDF [BO - 5 * SIZE], b4
  314. FADD2 c02, t1, c02
  315. FMUL a1, b1, t1
  316. LDF [AO - 1 * SIZE], a4
  317. FADD4 c04, t2, c04
  318. FMUL a1, b2, t2
  319. FADD2 c06, t3, c06
  320. FMUL a1, b3, t3
  321. FADD4 c08, t4, c08
  322. FMUL a1, b4, t4
  323. LDF [AO + 0 * SIZE], a1
  324. FADD1 c01, t1, c01
  325. FMUL a2, b1, t1
  326. LDF [BO - 4 * SIZE], b1
  327. FADD3 c03, t2, c03
  328. FMUL a2, b2, t2
  329. LDF [BO - 3 * SIZE], b2
  330. FADD1 c05, t3, c05
  331. FMUL a2, b3, t3
  332. LDF [BO - 2 * SIZE], b3
  333. FADD3 c07, t4, c07
  334. FMUL a2, b4, t4
  335. LDF [BO - 1 * SIZE], b4
  336. FADD2 c02, t1, c02
  337. FMUL a3, b1, t1
  338. LDF [AO + 1 * SIZE], a2
  339. FADD4 c04, t2, c04
  340. FMUL a3, b2, t2
  341. FADD2 c06, t3, c06
  342. FMUL a3, b3, t3
  343. FADD4 c08, t4, c08
  344. FMUL a3, b4, t4
  345. LDF [AO + 2 * SIZE], a3
  346. FADD1 c01, t1, c01
  347. FMUL a4, b1, t1
  348. LDF [BO + 0 * SIZE], b1
  349. FADD3 c03, t2, c03
  350. FMUL a4, b2, t2
  351. LDF [BO + 1 * SIZE], b2
  352. FADD1 c05, t3, c05
  353. FMUL a4, b3, t3
  354. LDF [BO + 2 * SIZE], b3
  355. FADD3 c07, t4, c07
  356. FMUL a4, b4, t4
  357. LDF [BO + 3 * SIZE], b4
  358. bg,pt %icc, .LL52
  359. LDF [AO + 3 * SIZE], a4
  360. .LL55:
  361. #if defined(LT) || defined(RN)
  362. and KK, 3, L
  363. #else
  364. and TEMP1, 3, L
  365. #endif
  366. cmp L, 0
  367. ble,a,pn %icc, .LL59
  368. nop
  369. .LL56:
  370. FADD2 c02, t1, c02
  371. add AO, 2 * SIZE, AO
  372. FMUL a1, b1, t1
  373. add L, -1, L
  374. add BO, 4 * SIZE, BO
  375. FADD4 c04, t2, c04
  376. cmp L, 0
  377. FMUL a1, b2, t2
  378. FADD2 c06, t3, c06
  379. FMUL a1, b3, t3
  380. FADD4 c08, t4, c08
  381. FMUL a1, b4, t4
  382. LDF [AO + 0 * SIZE], a1
  383. FADD1 c01, t1, c01
  384. FMUL a2, b1, t1
  385. LDF [BO + 0 * SIZE], b1
  386. FADD3 c03, t2, c03
  387. FMUL a2, b2, t2
  388. LDF [BO + 1 * SIZE], b2
  389. FADD1 c05, t3, c05
  390. FMUL a2, b3, t3
  391. LDF [BO + 2 * SIZE], b3
  392. FADD3 c07, t4, c07
  393. FMUL a2, b4, t4
  394. LDF [BO + 3 * SIZE], b4
  395. bg,pt %icc, .LL56
  396. LDF [AO + 1 * SIZE], a2
  397. .LL59:
  398. #if defined(LN) || defined(RT)
  399. #ifdef LN
  400. sub KK, 1, TEMP1
  401. #else
  402. sub KK, 2, TEMP1
  403. #endif
  404. sll TEMP1, 0 + ZBASE_SHIFT, TEMP2
  405. sll TEMP1, 1 + ZBASE_SHIFT, TEMP1
  406. add AORIG, TEMP2, AO
  407. add B, TEMP1, BO
  408. #endif
  409. FADD2 c02, t1, c02
  410. FADD4 c04, t2, c04
  411. FADD2 c06, t3, c06
  412. FADD4 c08, t4, c08
  413. FADD c01, c04, c01
  414. FADD c02, c03, c02
  415. FADD c05, c08, c05
  416. FADD c06, c07, c06
  417. #if defined(LN) || defined(LT)
  418. LDF [BO + 0 * SIZE], a1
  419. LDF [BO + 1 * SIZE], a2
  420. LDF [BO + 2 * SIZE], a3
  421. LDF [BO + 3 * SIZE], a4
  422. FSUB a1, c01, c01
  423. FSUB a2, c02, c02
  424. FSUB a3, c05, c05
  425. FSUB a4, c06, c06
  426. #else
  427. LDF [AO + 0 * SIZE], a1
  428. LDF [AO + 1 * SIZE], a2
  429. LDF [AO + 2 * SIZE], a3
  430. LDF [AO + 3 * SIZE], a4
  431. FSUB a1, c01, c01
  432. FSUB a2, c02, c02
  433. FSUB a3, c05, c05
  434. FSUB a4, c06, c06
  435. #endif
  436. #ifdef LN
  437. LDF [AO + 0 * SIZE], a1
  438. LDF [AO + 1 * SIZE], a2
  439. FMUL a1, c01, t1
  440. FMUL a2, c02, t2
  441. FMUL a1, c02, t3
  442. FMUL a2, c01, t4
  443. FMUL a1, c05, t5
  444. FMUL a2, c06, t6
  445. FMUL a1, c06, t7
  446. FMUL a2, c05, t8
  447. FADD4 t1, t2, c01
  448. FADD2 t3, t4, c02
  449. FADD4 t5, t6, c05
  450. FADD2 t7, t8, c06
  451. #endif
  452. #ifdef LT
  453. LDF [AO + 0 * SIZE], a1
  454. LDF [AO + 1 * SIZE], a2
  455. FMUL a1, c01, t1
  456. FMUL a2, c02, t2
  457. FMUL a1, c02, t3
  458. FMUL a2, c01, t4
  459. FMUL a1, c05, t5
  460. FMUL a2, c06, t6
  461. FMUL a1, c06, t7
  462. FMUL a2, c05, t8
  463. FADD4 t1, t2, c01
  464. FADD2 t3, t4, c02
  465. FADD4 t5, t6, c05
  466. FADD2 t7, t8, c06
  467. #endif
  468. #ifdef RN
  469. LDF [BO + 0 * SIZE], a1
  470. LDF [BO + 1 * SIZE], a2
  471. LDF [BO + 2 * SIZE], a3
  472. LDF [BO + 3 * SIZE], a4
  473. LDF [BO + 6 * SIZE], b1
  474. LDF [BO + 7 * SIZE], b2
  475. FMUL a1, c01, t1
  476. FMUL a2, c02, t2
  477. FMUL a1, c02, t3
  478. FMUL a2, c01, t4
  479. FADD4 t1, t2, c01
  480. FADD3 t3, t4, c02
  481. FMUL a3, c01, t1
  482. FMUL a3, c02, t2
  483. FMUL a4, c02, t3
  484. FMUL a4, c01, t4
  485. FSUB c05, t1, c05
  486. FSUB c06, t2, c06
  487. FADD3 c05, t3, c05
  488. FADD4 c06, t4, c06
  489. FMUL b1, c05, t1
  490. FMUL b2, c06, t2
  491. FMUL b1, c06, t3
  492. FMUL b2, c05, t4
  493. FADD4 t1, t2, c05
  494. FADD3 t3, t4, c06
  495. #endif
  496. #ifdef RT
  497. LDF [BO + 6 * SIZE], a1
  498. LDF [BO + 7 * SIZE], a2
  499. LDF [BO + 4 * SIZE], a3
  500. LDF [BO + 5 * SIZE], a4
  501. LDF [BO + 0 * SIZE], b1
  502. LDF [BO + 1 * SIZE], b2
  503. FMUL a1, c05, t1
  504. FMUL a2, c06, t2
  505. FMUL a1, c06, t3
  506. FMUL a2, c05, t4
  507. FADD4 t1, t2, c05
  508. FADD3 t3, t4, c06
  509. FMUL a3, c05, t1
  510. FMUL a3, c06, t2
  511. FMUL a4, c06, t3
  512. FMUL a4, c05, t4
  513. FSUB c01, t1, c01
  514. FSUB c02, t2, c02
  515. FADD3 c01, t3, c01
  516. FADD4 c02, t4, c02
  517. FMUL b1, c01, t1
  518. FMUL b2, c02, t2
  519. FMUL b1, c02, t3
  520. FMUL b2, c01, t4
  521. FADD4 t1, t2, c01
  522. FADD3 t3, t4, c02
  523. #endif
  524. #ifdef LN
  525. add C1, -2 * SIZE, C1
  526. add C2, -2 * SIZE, C2
  527. #endif
  528. #if defined(LN) || defined(LT)
  529. STF c01, [BO + 0 * SIZE]
  530. STF c02, [BO + 1 * SIZE]
  531. STF c05, [BO + 2 * SIZE]
  532. STF c06, [BO + 3 * SIZE]
  533. #else
  534. STF c01, [AO + 0 * SIZE]
  535. STF c02, [AO + 1 * SIZE]
  536. STF c05, [AO + 2 * SIZE]
  537. STF c06, [AO + 3 * SIZE]
  538. #endif
  539. STF c01, [C1 + 0 * SIZE]
  540. STF c02, [C1 + 1 * SIZE]
  541. STF c05, [C2 + 0 * SIZE]
  542. STF c06, [C2 + 1 * SIZE]
  543. FMOV FZERO, t1
  544. FMOV FZERO, t2
  545. FMOV FZERO, t3
  546. FMOV FZERO, t4
  547. #ifndef LN
  548. add C1, 2 * SIZE, C1
  549. add C2, 2 * SIZE, C2
  550. #endif
  551. #ifdef RT
  552. sll K, 0 + ZBASE_SHIFT, TEMP1
  553. add AORIG, TEMP1, AORIG
  554. #endif
  555. #if defined(LT) || defined(RN)
  556. sub K, KK, TEMP1
  557. sll TEMP1, 0 + ZBASE_SHIFT, TEMP2
  558. sll TEMP1, 1 + ZBASE_SHIFT, TEMP1
  559. add AO, TEMP2, AO
  560. add BO, TEMP1, BO
  561. #endif
  562. #ifdef LT
  563. add KK, 1, KK
  564. #endif
  565. #ifdef LN
  566. sub KK, 1, KK
  567. #endif
  568. .LL50:
  569. sra M, 1, I
  570. cmp I, 0
  571. ble,pn %icc, .LL99
  572. nop
  573. .LL21:
  574. #if defined(LT) || defined(RN)
  575. sra KK, 2, L
  576. mov B, BO
  577. cmp L, 0
  578. #else
  579. #ifdef LN
  580. sll K, 1 + ZBASE_SHIFT, TEMP1
  581. sub AORIG, TEMP1, AORIG
  582. #endif
  583. sll KK, 1 + ZBASE_SHIFT, TEMP1
  584. add AORIG, TEMP1, AO
  585. add B, TEMP1, BO
  586. sub K, KK, TEMP1
  587. sra TEMP1, 2, L
  588. cmp L, 0
  589. #endif
  590. FMOV FZERO, t1
  591. FMOV FZERO, t2
  592. FMOV FZERO, t3
  593. FMOV FZERO, t4
  594. FMOV FZERO, c01
  595. FMOV FZERO, c02
  596. LDF [AO + 0 * SIZE], a1
  597. FMOV FZERO, c03
  598. LDF [BO + 0 * SIZE], b1
  599. FMOV FZERO, c04
  600. LDF [AO + 1 * SIZE], a2
  601. FMOV FZERO, c05
  602. LDF [BO + 1 * SIZE], b2
  603. FMOV FZERO, c06
  604. LDF [AO + 2 * SIZE], a3
  605. FMOV FZERO, c07
  606. LDF [BO + 2 * SIZE], b3
  607. FMOV FZERO, c08
  608. LDF [AO + 3 * SIZE], a4
  609. FMOV FZERO, c09
  610. LDF [BO + 3 * SIZE], b4
  611. FMOV FZERO, c10
  612. LDF [BO + 4 * SIZE], b5
  613. FMOV FZERO, c11
  614. LDF [AO + 4 * SIZE], a5
  615. FMOV FZERO, c12
  616. #ifdef LN
  617. prefetch [C1 - 3 * SIZE], 3
  618. FMOV FZERO, c13
  619. prefetch [C2 - 3 * SIZE], 3
  620. FMOV FZERO, c14
  621. #else
  622. prefetch [C1 + 3 * SIZE], 3
  623. FMOV FZERO, c13
  624. prefetch [C2 + 3 * SIZE], 3
  625. FMOV FZERO, c14
  626. #endif
  627. FMOV FZERO, c15
  628. ble,pn %icc, .LL25
  629. FMOV FZERO, c16
  630. .LL22:
  631. FADD2 c04, t1, c04
  632. prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY
  633. FMUL a1, b1, t1
  634. nop
  635. FADD4 c08, t2, c08
  636. prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY
  637. FMUL a1, b2, t2
  638. add AO, 16 * SIZE, AO
  639. FADD2 c12, t3, c12
  640. LDF [AO - 13 * SIZE], a4
  641. FMUL a1, b3, t3
  642. add BO, 16 * SIZE, BO
  643. FADD4 c16, t4, c16
  644. nop
  645. FMUL a1, b4, t4
  646. LDF [AO - 8 * SIZE], a1
  647. FADD1 c01, t1, c01
  648. nop
  649. FMUL a2, b1, t1
  650. nop
  651. FADD3 c05, t2, c05
  652. nop
  653. FMUL a2, b2, t2
  654. nop
  655. FADD1 c09, t3, c09
  656. nop
  657. FMUL a2, b3, t3
  658. nop
  659. FADD3 c13, t4, c13
  660. add L, -1, L
  661. FMUL a2, b4, t4
  662. LDF [AO - 11 * SIZE], a2
  663. FADD2 c02, t1, c02
  664. nop
  665. FMUL a3, b1, t1
  666. nop
  667. FADD4 c06, t2, c06
  668. nop
  669. FMUL a3, b2, t2
  670. nop
  671. FADD2 c10, t3, c10
  672. nop
  673. FMUL a3, b3, t3
  674. nop
  675. FADD4 c14, t4, c14
  676. nop
  677. FMUL a3, b4, t4
  678. LDF [AO - 10 * SIZE], a3
  679. FADD1 c03, t1, c03
  680. nop
  681. FMUL a4, b1, t1
  682. LDF [BO - 8 * SIZE], b1
  683. FADD3 c07, t2, c07
  684. nop
  685. FMUL a4, b2, t2
  686. LDF [BO - 11 * SIZE], b2
  687. FADD1 c11, t3, c11
  688. nop
  689. FMUL a4, b3, t3
  690. LDF [BO - 10 * SIZE], b3
  691. FADD3 c15, t4, c15
  692. nop
  693. FMUL a4, b4, t4
  694. LDF [BO - 9 * SIZE], b4
  695. FADD2 c04, t1, c04
  696. nop
  697. FMUL a5, b5, t1
  698. LDF [AO - 9 * SIZE], a4
  699. FADD4 c08, t2, c08
  700. nop
  701. FMUL a5, b2, t2
  702. nop
  703. FADD2 c12, t3, c12
  704. nop
  705. FMUL a5, b3, t3
  706. nop
  707. FADD4 c16, t4, c16
  708. nop
  709. FMUL a5, b4, t4
  710. LDF [AO - 4 * SIZE], a5
  711. FADD1 c01, t1, c01
  712. nop
  713. FMUL a2, b5, t1
  714. nop
  715. FADD3 c05, t2, c05
  716. nop
  717. FMUL a2, b2, t2
  718. nop
  719. FADD1 c09, t3, c09
  720. nop
  721. FMUL a2, b3, t3
  722. nop
  723. FADD3 c13, t4, c13
  724. nop
  725. FMUL a2, b4, t4
  726. LDF [AO - 7 * SIZE], a2
  727. FADD2 c02, t1, c02
  728. nop
  729. FMUL a3, b5, t1
  730. nop
  731. FADD4 c06, t2, c06
  732. nop
  733. FMUL a3, b2, t2
  734. nop
  735. FADD2 c10, t3, c10
  736. nop
  737. FMUL a3, b3, t3
  738. nop
  739. FADD4 c14, t4, c14
  740. nop
  741. FMUL a3, b4, t4
  742. LDF [AO - 6 * SIZE], a3
  743. FADD1 c03, t1, c03
  744. nop
  745. FMUL a4, b5, t1
  746. LDF [BO - 4 * SIZE], b5
  747. FADD3 c07, t2, c07
  748. nop
  749. FMUL a4, b2, t2
  750. LDF [BO - 7 * SIZE], b2
  751. FADD1 c11, t3, c11
  752. nop
  753. FMUL a4, b3, t3
  754. LDF [BO - 6 * SIZE], b3
  755. FADD3 c15, t4, c15
  756. nop
  757. FMUL a4, b4, t4
  758. LDF [BO - 5 * SIZE], b4
  759. FADD2 c04, t1, c04
  760. nop
  761. FMUL a1, b1, t1
  762. LDF [AO - 5 * SIZE], a4
  763. FADD4 c08, t2, c08
  764. nop
  765. FMUL a1, b2, t2
  766. nop
  767. FADD2 c12, t3, c12
  768. nop
  769. FMUL a1, b3, t3
  770. nop
  771. FADD4 c16, t4, c16
  772. nop
  773. FMUL a1, b4, t4
  774. LDF [AO - 0 * SIZE], a1
  775. FADD1 c01, t1, c01
  776. nop
  777. FMUL a2, b1, t1
  778. nop
  779. #ifdef DOUBLE
  780. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  781. #else
  782. nop
  783. #endif
  784. FADD3 c05, t2, c05
  785. nop
  786. FMUL a2, b2, t2
  787. FADD1 c09, t3, c09
  788. nop
  789. FMUL a2, b3, t3
  790. nop
  791. FADD3 c13, t4, c13
  792. nop
  793. FMUL a2, b4, t4
  794. nop
  795. FADD2 c02, t1, c02
  796. nop
  797. FMUL a3, b1, t1
  798. LDF [AO - 3 * SIZE], a2
  799. FADD4 c06, t2, c06
  800. #ifdef DOUBLE
  801. prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY
  802. #else
  803. nop
  804. #endif
  805. FMUL a3, b2, t2
  806. nop
  807. FADD2 c10, t3, c10
  808. nop
  809. FMUL a3, b3, t3
  810. nop
  811. FADD4 c14, t4, c14
  812. nop
  813. FMUL a3, b4, t4
  814. LDF [AO - 2 * SIZE], a3
  815. FADD1 c03, t1, c03
  816. nop
  817. FMUL a4, b1, t1
  818. LDF [BO - 0 * SIZE], b1
  819. FADD3 c07, t2, c07
  820. nop
  821. FMUL a4, b2, t2
  822. LDF [BO - 3 * SIZE], b2
  823. FADD1 c11, t3, c11
  824. nop
  825. FMUL a4, b3, t3
  826. LDF [BO - 2 * SIZE], b3
  827. FADD3 c15, t4, c15
  828. nop
  829. FMUL a4, b4, t4
  830. LDF [BO - 1 * SIZE], b4
  831. FADD2 c04, t1, c04
  832. nop
  833. FMUL a5, b5, t1
  834. LDF [AO - 1 * SIZE], a4
  835. FADD4 c08, t2, c08
  836. FMUL a5, b2, t2
  837. FADD2 c12, t3, c12
  838. FMUL a5, b3, t3
  839. FADD4 c16, t4, c16
  840. nop
  841. FMUL a5, b4, t4
  842. LDF [AO + 4 * SIZE], a5
  843. FADD1 c01, t1, c01
  844. nop
  845. FMUL a2, b5, t1
  846. nop
  847. FADD3 c05, t2, c05
  848. nop
  849. FMUL a2, b2, t2
  850. nop
  851. FADD1 c09, t3, c09
  852. nop
  853. FMUL a2, b3, t3
  854. nop
  855. FADD3 c13, t4, c13
  856. nop
  857. FMUL a2, b4, t4
  858. LDF [AO + 1 * SIZE], a2
  859. FADD2 c02, t1, c02
  860. nop
  861. FMUL a3, b5, t1
  862. nop
  863. FADD4 c06, t2, c06
  864. nop
  865. FMUL a3, b2, t2
  866. nop
  867. FADD2 c10, t3, c10
  868. nop
  869. FMUL a3, b3, t3
  870. nop
  871. FADD4 c14, t4, c14
  872. nop
  873. FMUL a3, b4, t4
  874. LDF [AO + 2 * SIZE], a3
  875. FADD1 c03, t1, c03
  876. cmp L, 0
  877. FMUL a4, b5, t1
  878. LDF [BO + 4 * SIZE], b5
  879. FADD3 c07, t2, c07
  880. nop
  881. FMUL a4, b2, t2
  882. LDF [BO + 1 * SIZE], b2
  883. FADD1 c11, t3, c11
  884. nop
  885. FMUL a4, b3, t3
  886. LDF [BO + 2 * SIZE], b3
  887. FADD3 c15, t4, c15
  888. FMUL a4, b4, t4
  889. bg,pt %icc, .LL22
  890. LDF [BO + 3 * SIZE], b4
  /* K-remainder loop for the two-column tile. */
  /* L = low two bits of KK (LT/RN) or of TEMP1 (other variants; TEMP1 is set before this excerpt). */
  /* LDF/FADDn/FMUL are macros defined outside this excerpt. */
  891. .LL25:
  892. #if defined(LT) || defined(RN)
  893. and KK, 3, L
  894. #else
  895. and TEMP1, 3, L
  896. #endif
  897. cmp L, 0
  898. ble,pn %icc, .LL29
  899. nop
  /* One step per iteration: AO and BO each advance by 4 * SIZE, L counts down. */
  /* 16 products a{1..4} * b{1..4} go through t1..t4 into c01..c16. */
  /* The first four FADDs consume the t1..t4 left by the previous step's a4 products. */
  900. .LL26:
  901. FADD2 c04, t1, c04
  /* a4 for this step is loaded before AO is advanced; a1..a3 and b1..b4 were loaded earlier. */
  902. LDF [AO + 3 * SIZE], a4
  903. FMUL a1, b1, t1
  904. add AO, 4 * SIZE, AO
  905. FADD4 c08, t2, c08
  906. add BO, 4 * SIZE, BO
  907. FMUL a1, b2, t2
  908. add L, -1, L
  909. FADD2 c12, t3, c12
  910. nop
  911. FMUL a1, b3, t3
  /* Sets %icc for the bg at the bottom of the loop. */
  912. cmp L, 0
  913. FADD4 c16, t4, c16
  914. nop
  915. FMUL a1, b4, t4
  916. LDF [AO + 0 * SIZE], a1
  917. FADD1 c01, t1, c01
  918. nop
  919. FMUL a2, b1, t1
  920. nop
  921. FADD3 c05, t2, c05
  922. nop
  923. FMUL a2, b2, t2
  924. nop
  925. FADD1 c09, t3, c09
  926. nop
  927. FMUL a2, b3, t3
  928. nop
  929. FADD3 c13, t4, c13
  930. nop
  931. FMUL a2, b4, t4
  932. LDF [AO + 1 * SIZE], a2
  933. FADD2 c02, t1, c02
  934. nop
  935. FMUL a3, b1, t1
  936. nop
  937. FADD4 c06, t2, c06
  938. nop
  939. FMUL a3, b2, t2
  940. nop
  941. FADD2 c10, t3, c10
  942. nop
  943. FMUL a3, b3, t3
  944. nop
  945. FADD4 c14, t4, c14
  946. nop
  947. FMUL a3, b4, t4
  948. LDF [AO + 2 * SIZE], a3
  949. FADD1 c03, t1, c03
  950. nop
  951. FMUL a4, b1, t1
  952. LDF [BO + 0 * SIZE], b1
  953. FADD3 c07, t2, c07
  954. nop
  955. FMUL a4, b2, t2
  956. LDF [BO + 1 * SIZE], b2
  957. FADD1 c11, t3, c11
  958. nop
  959. FMUL a4, b3, t3
  960. LDF [BO + 2 * SIZE], b3
  961. FADD3 c15, t4, c15
  962. FMUL a4, b4, t4
  /* Loop while L > 0; the b4 load below sits in the branch delay slot. */
  963. bg,pt %icc, .LL26
  964. LDF [BO + 3 * SIZE], b4
  /* End of the K loop for the two-column tile: drain pending products, fold the */
  /* 16 accumulators down to 8, then subtract them from stored values. */
  965. .LL29:
  966. #if defined(LN) || defined(RT)
  /* AO = AORIG + off, BO = B + off, with off = (KK - 2) << (1 + ZBASE_SHIFT) bytes. */
  967. sub KK, 2, TEMP1
  968. sll TEMP1, 1 + ZBASE_SHIFT, TEMP1
  969. add AORIG, TEMP1, AO
  970. add B, TEMP1, BO
  971. #endif
  /* Add the last four products still held in t1..t4. */
  972. FADD2 c04, t1, c04
  973. FADD4 c08, t2, c08
  974. FADD2 c12, t3, c12
  975. FADD4 c16, t4, c16
  /* Pairwise fold; the results live in c01..c04 and c09..c12. */
  /* NOTE(review): presumably real/imaginary parts of a 2x2 complex tile - confirm. */
  976. FADD c01, c06, c01
  977. FADD c02, c05, c02
  978. FADD c03, c08, c03
  979. FADD c04, c07, c04
  980. FADD c09, c14, c09
  981. FADD c10, c13, c10
  982. FADD c11, c16, c11
  983. FADD c12, c15, c12
  984. #if defined(LN) || defined(LT)
  /* LN/LT: read eight values through BO in the order c01,c02,c09,c10,c03,c04,c11,c12. */
  985. LDF [BO + 0 * SIZE], a1
  986. LDF [BO + 1 * SIZE], a2
  987. LDF [BO + 2 * SIZE], a3
  988. LDF [BO + 3 * SIZE], a4
  989. LDF [BO + 4 * SIZE], b1
  990. LDF [BO + 5 * SIZE], b2
  991. LDF [BO + 6 * SIZE], b3
  992. LDF [BO + 7 * SIZE], b4
  /* cNN = loaded - cNN, assuming FSUB x, y, z computes z = x - y (macro defined outside this excerpt). */
  993. FSUB a1, c01, c01
  994. FSUB a2, c02, c02
  995. FSUB a3, c09, c09
  996. FSUB a4, c10, c10
  997. FSUB b1, c03, c03
  998. FSUB b2, c04, c04
  999. FSUB b3, c11, c11
  1000. FSUB b4, c12, c12
  1001. #else
  /* RN/RT: read eight values through AO in the order c01..c04, c09..c12. */
  1002. LDF [AO + 0 * SIZE], a1
  1003. LDF [AO + 1 * SIZE], a2
  1004. LDF [AO + 2 * SIZE], a3
  1005. LDF [AO + 3 * SIZE], a4
  1006. LDF [AO + 4 * SIZE], b1
  1007. LDF [AO + 5 * SIZE], b2
  1008. LDF [AO + 6 * SIZE], b3
  1009. LDF [AO + 7 * SIZE], b4
  1010. FSUB a1, c01, c01
  1011. FSUB a2, c02, c02
  1012. FSUB a3, c03, c03
  1013. FSUB a4, c04, c04
  1014. FSUB b1, c09, c09
  1015. FSUB b2, c10, c10
  1016. FSUB b3, c11, c11
  1017. FSUB b4, c12, c12
  1018. #endif
  /* LN solve for the two-column tile, using six values read through AO. */
  /* NOTE(review): looks like complex back-substitution with pre-inverted diagonal */
  /* entries (multiplies only, no divide) - confirm against the packing code. */
  1019. #ifdef LN
  /* a1,a2 = AO[6],AO[7]; a3,a4 = AO[4],AO[5]; b1,b2 = AO[0],AO[1]. */
  1020. LDF [AO + 6 * SIZE], a1
  1021. LDF [AO + 7 * SIZE], a2
  1022. LDF [AO + 4 * SIZE], a3
  1023. LDF [AO + 5 * SIZE], a4
  1024. LDF [AO + 0 * SIZE], b1
  1025. LDF [AO + 1 * SIZE], b2
  /* Step 1: combine (c03,c04) and (c11,c12) with (a1,a2): four products per pair. */
  1026. FMUL a1, c03, t1
  1027. FMUL a2, c04, t2
  1028. FMUL a1, c04, t3
  1029. FMUL a2, c03, t4
  1030. FMUL a1, c11, t5
  1031. FMUL a2, c12, t6
  1032. FMUL a1, c12, t7
  1033. FMUL a2, c11, t8
  1034. FADD4 t1, t2, c03
  1035. FADD2 t3, t4, c04
  1036. FADD4 t5, t6, c11
  1037. FADD2 t7, t8, c12
  /* Step 2: update (c01,c02) and (c09,c10) with products of (a3,a4) and the new step-1 results. */
  1038. FMUL a3, c03, t1
  1039. FMUL a3, c04, t2
  1040. FMUL a3, c11, t3
  1041. FMUL a3, c12, t4
  1042. FMUL a4, c04, t5
  1043. FMUL a4, c03, t6
  1044. FMUL a4, c12, t7
  1045. FMUL a4, c11, t8
  1046. FSUB c01, t1, c01
  1047. FSUB c02, t2, c02
  1048. FSUB c09, t3, c09
  1049. FSUB c10, t4, c10
  1050. FADD2 c01, t5, c01
  1051. FADD4 c02, t6, c02
  1052. FADD2 c09, t7, c09
  1053. FADD4 c10, t8, c10
  /* Step 3: combine (c01,c02) and (c09,c10) with (b1,b2), same pattern as step 1. */
  1054. FMUL b1, c01, t1
  1055. FMUL b2, c02, t2
  1056. FMUL b1, c02, t3
  1057. FMUL b2, c01, t4
  1058. FMUL b1, c09, t5
  1059. FMUL b2, c10, t6
  1060. FMUL b1, c10, t7
  1061. FMUL b2, c09, t8
  1062. FADD4 t1, t2, c01
  1063. FADD2 t3, t4, c02
  1064. FADD4 t5, t6, c09
  1065. FADD2 t7, t8, c10
  1066. #endif
  /* LT solve for the two-column tile: same three steps as the LN variant, but */
  /* (c01,c02)/(c09,c10) are processed first and (c03,c04)/(c11,c12) last. */
  1067. #ifdef LT
  /* a1,a2 = AO[0],AO[1]; a3,a4 = AO[2],AO[3]; b1,b2 = AO[6],AO[7]. */
  1068. LDF [AO + 0 * SIZE], a1
  1069. LDF [AO + 1 * SIZE], a2
  1070. LDF [AO + 2 * SIZE], a3
  1071. LDF [AO + 3 * SIZE], a4
  1072. LDF [AO + 6 * SIZE], b1
  1073. LDF [AO + 7 * SIZE], b2
  /* Step 1: combine (c01,c02) and (c09,c10) with (a1,a2). */
  1074. FMUL a1, c01, t1
  1075. FMUL a2, c02, t2
  1076. FMUL a1, c02, t3
  1077. FMUL a2, c01, t4
  1078. FMUL a1, c09, t5
  1079. FMUL a2, c10, t6
  1080. FMUL a1, c10, t7
  1081. FMUL a2, c09, t8
  1082. FADD4 t1, t2, c01
  1083. FADD2 t3, t4, c02
  1084. FADD4 t5, t6, c09
  1085. FADD2 t7, t8, c10
  /* Step 2: update (c03,c04) and (c11,c12) with products of (a3,a4) and the step-1 results. */
  1086. FMUL a3, c01, t1
  1087. FMUL a3, c02, t2
  1088. FMUL a3, c09, t3
  1089. FMUL a3, c10, t4
  1090. FMUL a4, c02, t5
  1091. FMUL a4, c01, t6
  1092. FMUL a4, c10, t7
  1093. FMUL a4, c09, t8
  1094. FSUB c03, t1, c03
  1095. FSUB c04, t2, c04
  1096. FSUB c11, t3, c11
  1097. FSUB c12, t4, c12
  1098. FADD2 c03, t5, c03
  1099. FADD4 c04, t6, c04
  1100. FADD2 c11, t7, c11
  1101. FADD4 c12, t8, c12
  /* Step 3: combine (c03,c04) and (c11,c12) with (b1,b2). */
  1102. FMUL b1, c03, t1
  1103. FMUL b2, c04, t2
  1104. FMUL b1, c04, t3
  1105. FMUL b2, c03, t4
  1106. FMUL b1, c11, t5
  1107. FMUL b2, c12, t6
  1108. FMUL b1, c12, t7
  1109. FMUL b2, c11, t8
  1110. FADD4 t1, t2, c03
  1111. FADD2 t3, t4, c04
  1112. FADD4 t5, t6, c11
  1113. FADD2 t7, t8, c12
  1114. #endif
  /* RN solve for the two-column tile: coefficients are read through BO and the */
  /* sums use FADD4/FADD3 (the LN/LT variants use FADD4/FADD2). */
  1115. #ifdef RN
  /* a1,a2 = BO[0],BO[1]; a3,a4 = BO[2],BO[3]; b1,b2 = BO[6],BO[7]. */
  1116. LDF [BO + 0 * SIZE], a1
  1117. LDF [BO + 1 * SIZE], a2
  1118. LDF [BO + 2 * SIZE], a3
  1119. LDF [BO + 3 * SIZE], a4
  1120. LDF [BO + 6 * SIZE], b1
  1121. LDF [BO + 7 * SIZE], b2
  /* Step 1: combine (c01,c02) and (c03,c04) with (a1,a2). */
  1122. FMUL a1, c01, t1
  1123. FMUL a2, c02, t2
  1124. FMUL a1, c02, t3
  1125. FMUL a2, c01, t4
  1126. FMUL a1, c03, t5
  1127. FMUL a2, c04, t6
  1128. FMUL a1, c04, t7
  1129. FMUL a2, c03, t8
  1130. FADD4 t1, t2, c01
  1131. FADD3 t3, t4, c02
  1132. FADD4 t5, t6, c03
  1133. FADD3 t7, t8, c04
  /* Step 2: update c09..c12 with products of (a3,a4) and the step-1 results. */
  1134. FMUL a3, c01, t1
  1135. FMUL a3, c02, t2
  1136. FMUL a3, c03, t3
  1137. FMUL a3, c04, t4
  1138. FMUL a4, c02, t5
  1139. FMUL a4, c01, t6
  1140. FMUL a4, c04, t7
  1141. FMUL a4, c03, t8
  1142. FSUB c09, t1, c09
  1143. FSUB c10, t2, c10
  1144. FSUB c11, t3, c11
  1145. FSUB c12, t4, c12
  1146. FADD3 c09, t5, c09
  1147. FADD4 c10, t6, c10
  1148. FADD3 c11, t7, c11
  1149. FADD4 c12, t8, c12
  /* Step 3: combine (c09,c10) and (c11,c12) with (b1,b2). */
  1150. FMUL b1, c09, t1
  1151. FMUL b2, c10, t2
  1152. FMUL b1, c10, t3
  1153. FMUL b2, c09, t4
  1154. FMUL b1, c11, t5
  1155. FMUL b2, c12, t6
  1156. FMUL b1, c12, t7
  1157. FMUL b2, c11, t8
  1158. FADD4 t1, t2, c09
  1159. FADD3 t3, t4, c10
  1160. FADD4 t5, t6, c11
  1161. FADD3 t7, t8, c12
  1162. #endif
  /* RT solve for the two-column tile: same steps as the RN variant in the */
  /* opposite order, starting from c09..c12 and finishing with c01..c04. */
  1163. #ifdef RT
  /* a1,a2 = BO[6],BO[7]; a3,a4 = BO[4],BO[5]; b1,b2 = BO[0],BO[1]. */
  1164. LDF [BO + 6 * SIZE], a1
  1165. LDF [BO + 7 * SIZE], a2
  1166. LDF [BO + 4 * SIZE], a3
  1167. LDF [BO + 5 * SIZE], a4
  1168. LDF [BO + 0 * SIZE], b1
  1169. LDF [BO + 1 * SIZE], b2
  /* Step 1: combine (c09,c10) and (c11,c12) with (a1,a2). */
  1170. FMUL a1, c09, t1
  1171. FMUL a2, c10, t2
  1172. FMUL a1, c10, t3
  1173. FMUL a2, c09, t4
  1174. FMUL a1, c11, t5
  1175. FMUL a2, c12, t6
  1176. FMUL a1, c12, t7
  1177. FMUL a2, c11, t8
  1178. FADD4 t1, t2, c09
  1179. FADD3 t3, t4, c10
  1180. FADD4 t5, t6, c11
  1181. FADD3 t7, t8, c12
  /* Step 2: update c01..c04 with products of (a3,a4) and the step-1 results. */
  1182. FMUL a3, c09, t1
  1183. FMUL a3, c10, t2
  1184. FMUL a3, c11, t3
  1185. FMUL a3, c12, t4
  1186. FMUL a4, c10, t5
  1187. FMUL a4, c09, t6
  1188. FMUL a4, c12, t7
  1189. FMUL a4, c11, t8
  1190. FSUB c01, t1, c01
  1191. FSUB c02, t2, c02
  1192. FSUB c03, t3, c03
  1193. FSUB c04, t4, c04
  1194. FADD3 c01, t5, c01
  1195. FADD4 c02, t6, c02
  1196. FADD3 c03, t7, c03
  1197. FADD4 c04, t8, c04
  /* Step 3: combine (c01,c02) and (c03,c04) with (b1,b2). */
  1198. FMUL b1, c01, t1
  1199. FMUL b2, c02, t2
  1200. FMUL b1, c02, t3
  1201. FMUL b2, c01, t4
  1202. FMUL b1, c03, t5
  1203. FMUL b2, c04, t6
  1204. FMUL b1, c04, t7
  1205. FMUL b2, c03, t8
  1206. FADD4 t1, t2, c01
  1207. FADD3 t3, t4, c02
  1208. FADD4 t5, t6, c03
  1209. FADD3 t7, t8, c04
  1210. #endif
  /* Store the solved two-column tile, advance the pointers and KK, and loop over I. */
  1211. #ifdef LN
  /* LN: C1/C2 are moved back by 4 * SIZE before the stores (and not advanced afterwards). */
  1212. add C1, -4 * SIZE, C1
  1213. add C2, -4 * SIZE, C2
  1214. #endif
  1215. #if defined(LN) || defined(LT)
  /* LN/LT: write the eight results through BO in the order c01,c02,c09,c10,c03,c04,c11,c12. */
  1216. STF c01, [BO + 0 * SIZE]
  1217. STF c02, [BO + 1 * SIZE]
  1218. STF c09, [BO + 2 * SIZE]
  1219. STF c10, [BO + 3 * SIZE]
  1220. STF c03, [BO + 4 * SIZE]
  1221. STF c04, [BO + 5 * SIZE]
  1222. STF c11, [BO + 6 * SIZE]
  1223. STF c12, [BO + 7 * SIZE]
  1224. #else
  /* RN/RT: write the eight results through AO in the order c01..c04, c09..c12. */
  1225. STF c01, [AO + 0 * SIZE]
  1226. STF c02, [AO + 1 * SIZE]
  1227. STF c03, [AO + 2 * SIZE]
  1228. STF c04, [AO + 3 * SIZE]
  1229. STF c09, [AO + 4 * SIZE]
  1230. STF c10, [AO + 5 * SIZE]
  1231. STF c11, [AO + 6 * SIZE]
  1232. STF c12, [AO + 7 * SIZE]
  1233. #endif
  /* c01..c04 go to C1, c09..c12 go to C2. */
  1234. STF c01, [C1 + 0 * SIZE]
  1235. STF c02, [C1 + 1 * SIZE]
  1236. STF c03, [C1 + 2 * SIZE]
  1237. STF c04, [C1 + 3 * SIZE]
  1238. STF c09, [C2 + 0 * SIZE]
  1239. STF c10, [C2 + 1 * SIZE]
  1240. STF c11, [C2 + 2 * SIZE]
  1241. STF c12, [C2 + 3 * SIZE]
  1242. #ifndef LN
  1243. add C1, 4 * SIZE, C1
  1244. add C2, 4 * SIZE, C2
  1245. #endif
  1246. #ifdef RT
  /* RT: AORIG += K << (1 + ZBASE_SHIFT). */
  1247. sll K, 1 + ZBASE_SHIFT, TEMP1
  1248. add AORIG, TEMP1, AORIG
  1249. #endif
  1250. #if defined(LT) || defined(RN)
  /* LT/RN: move AO and BO forward by (K - KK) << (1 + ZBASE_SHIFT) bytes. */
  1251. sub K, KK, TEMP1
  1252. sll TEMP1, 1 + ZBASE_SHIFT, TEMP1
  1253. add AO, TEMP1, AO
  1254. add BO, TEMP1, BO
  1255. #endif
  1256. #ifdef LT
  1257. add KK, 2, KK
  1258. #endif
  1259. #ifdef LN
  1260. sub KK, 2, KK
  1261. #endif
  /* Next tile: decrement I and branch back to .LL21 (defined before this excerpt) while I > 0. */
  1262. add I, -1, I
  1263. cmp I, 0
  1264. bg,pt %icc, .LL21
  1265. nop
  /* End of one J iteration of the two-column pass: update B and KK, then loop. */
  1266. .LL99:
  1267. #ifdef LN
  /* LN: B += K << (1 + ZBASE_SHIFT). */
  1268. sll K, 1 + ZBASE_SHIFT, TEMP1
  1269. add B, TEMP1, B
  1270. #endif
  1271. #if defined(LT) || defined(RN)
  /* LT/RN: B takes over the position BO has reached. */
  1272. mov BO, B
  1273. #endif
  1274. #ifdef RN
  1275. add KK, 2, KK
  1276. #endif
  1277. #ifdef RT
  1278. sub KK, 2, KK
  1279. #endif
  /* Decrement J and branch back to .LL11 (defined before this excerpt) while J > 0. */
  1280. add J, -1, J
  1281. cmp J, 0
  1282. bg,pt %icc, .LL11
  1283. nop
  /* Single-column pass: entered after the two-column loop, runs only when N is odd. */
  1284. .LL100:
  1285. and N, 1, J
  1286. cmp J, 0
  1287. ble,pn %icc, .LL999
  1288. nop
  1289. #ifdef RT
  /* RT: step B back by K << ZBASE_SHIFT bytes and C back by LDC. */
  1290. sll K, 0 + ZBASE_SHIFT, TEMP1
  1291. sub B, TEMP1, B
  1292. sub C, LDC, C
  1293. #endif
  1294. mov C, C1
  1295. #ifdef LN
  1296. add M, OFFSET, KK
  1297. #endif
  1298. #ifdef LT
  1299. mov OFFSET, KK
  1300. #endif
  1301. #if defined(LN) || defined(RT)
  1302. mov A, AORIG
  1303. #else
  1304. mov A, AO
  1305. #endif
  1306. #ifndef RT
  1307. add C, LDC, C
  1308. #endif
  /* The odd row (M & 1) is handled first; branch to .LL150 when M is even. */
  1309. and M, 1, I
  1310. cmp I, 0
  1311. ble,pn %icc, .LL150
  1312. nop
  1313. #if defined(LT) || defined(RN)
  /* LT/RN: L = KK >> 2 (iterations of the 4-step unrolled loop), BO starts at B. */
  1314. sra KK, 2, L
  1315. mov B, BO
  1316. cmp L, 0
  1317. #else
  1318. #ifdef LN
  /* LN: AORIG -= K << ZBASE_SHIFT. */
  1319. sll K, 0 + ZBASE_SHIFT, TEMP1
  1320. sub AORIG, TEMP1, AORIG
  1321. #endif
  /* LN/RT: AO = AORIG + (KK << ZBASE_SHIFT), BO = B + (KK << ZBASE_SHIFT); */
  /* TEMP1 = K - KK is the step count and L = TEMP1 >> 2. */
  1322. sll KK, 0 + ZBASE_SHIFT, TEMP1
  1323. add AORIG, TEMP1, AO
  1324. add B, TEMP1, BO
  1325. sub K, KK, TEMP1
  1326. sra TEMP1, 2, L
  1327. cmp L, 0
  1328. #endif
  /* Preload a1..a4 and b1..b4 and zero the accumulators c01..c04 and temporaries t1..t4. */
  1329. LDF [AO + 0 * SIZE], a1
  1330. FMOV FZERO, c01
  1331. LDF [BO + 0 * SIZE], b1
  1332. FMOV FZERO, t1
  1333. LDF [AO + 1 * SIZE], a2
  1334. FMOV FZERO, c02
  1335. LDF [BO + 1 * SIZE], b2
  1336. FMOV FZERO, t2
  1337. LDF [AO + 2 * SIZE], a3
  1338. FMOV FZERO, c03
  1339. LDF [BO + 2 * SIZE], b3
  1340. FMOV FZERO, t3
  1341. LDF [AO + 3 * SIZE], a4
  1342. FMOV FZERO, c04
  1343. LDF [BO + 3 * SIZE], b4
  1344. FMOV FZERO, t4
  /* Uses the condition codes from the "cmp L, 0" above: skip the unrolled loop when L <= 0. */
  1345. ble,pn %icc, .LL155
  1346. nop
  /* Unrolled K loop for the single tile: four steps per iteration, four products per step. */
  /* Steps alternate between (a1,a2)x(b1,b2) and (a3,a4)x(b3,b4); sums go into c01..c04 via t1..t4. */
  /* Each FADD at the top of a step consumes the product left in tN by the previous step. */
  1347. .LL152:
  1348. FADD1 c01, t1, c01
  1349. add L, -1, L
  1350. FMUL a1, b1, t1
  1351. prefetch [AO + APREFETCHSIZE * SIZE], 0
  1352. FADD3 c02, t2, c02
  /* BO is advanced early, so the next B loads use negative offsets. */
  1353. add BO, 8 * SIZE, BO
  1354. FMUL a1, b2, t2
  1355. LDF [AO + 4 * SIZE], a1
  1356. FADD2 c03, t3, c03
  /* Sets %icc for the bg at the bottom of the loop. */
  1357. cmp L, 0
  1358. FMUL a2, b1, t3
  1359. LDF [BO - 4 * SIZE], b1
  1360. FADD4 c04, t4, c04
  1361. nop
  1362. FMUL a2, b2, t4
  1363. LDF [AO + 5 * SIZE], a2
  1364. FADD1 c01, t1, c01
  1365. nop
  1366. FMUL a3, b3, t1
  1367. LDF [BO - 3 * SIZE], b2
  1368. FADD3 c02, t2, c02
  1369. nop
  1370. FMUL a3, b4, t2
  1371. LDF [AO + 6 * SIZE], a3
  1372. FADD2 c03, t3, c03
  1373. nop
  1374. FMUL a4, b3, t3
  1375. LDF [BO - 2 * SIZE], b3
  1376. FADD4 c04, t4, c04
  1377. nop
  1378. FMUL a4, b4, t4
  1379. LDF [AO + 7 * SIZE], a4
  1380. FADD1 c01, t1, c01
  1381. nop
  1382. FMUL a1, b1, t1
  1383. LDF [BO - 1 * SIZE], b4
  1384. FADD3 c02, t2, c02
  1385. FMUL a1, b2, t2
  1386. LDF [AO + 8 * SIZE], a1
  1387. FADD2 c03, t3, c03
  1388. FMUL a2, b1, t3
  1389. LDF [BO + 0 * SIZE], b1
  1390. FADD4 c04, t4, c04
  1391. FMUL a2, b2, t4
  1392. LDF [AO + 9 * SIZE], a2
  1393. FADD1 c01, t1, c01
  1394. FMUL a3, b3, t1
  1395. LDF [BO + 1 * SIZE], b2
  1396. FADD3 c02, t2, c02
  1397. FMUL a3, b4, t2
  1398. LDF [AO + 10 * SIZE], a3
  1399. FADD2 c03, t3, c03
  1400. FMUL a4, b3, t3
  1401. LDF [BO + 2 * SIZE], b3
  1402. FADD4 c04, t4, c04
  1403. FMUL a4, b4, t4
  1404. LDF [AO + 11 * SIZE], a4
  /* AO is advanced only here, after all loads of this iteration. */
  1405. add AO, 8 * SIZE, AO
  /* Loop while L > 0; the b4 load below sits in the branch delay slot. */
  1406. bg,pt %icc, .LL152
  1407. LDF [BO + 3 * SIZE], b4
  /* K-remainder loop for the single tile: L = low two bits of KK (LT/RN) or TEMP1 (otherwise). */
  1408. .LL155:
  1409. #if defined(LT) || defined(RN)
  1410. and KK, 3, L
  1411. #else
  1412. and TEMP1, 3, L
  1413. #endif
  1414. cmp L, 0
  /* Annulled branch (,a): the delay-slot nop executes only when the branch is taken. */
  1415. ble,a,pn %icc, .LL159
  1416. nop
  /* One step per iteration: AO and BO advance by 2 * SIZE, four products (a1,a2)x(b1,b2). */
  1417. .LL156:
  1418. FADD1 c01, t1, c01
  1419. add AO, 2 * SIZE, AO
  1420. FMUL a1, b1, t1
  1421. add BO, 2 * SIZE, BO
  1422. FADD3 c02, t2, c02
  1423. add L, -1, L
  1424. FMUL a1, b2, t2
  1425. LDF [AO + 0 * SIZE], a1
  1426. FADD2 c03, t3, c03
  1427. FMUL a2, b1, t3
  1428. LDF [BO + 0 * SIZE], b1
  1429. cmp L, 0
  1430. FADD4 c04, t4, c04
  1431. FMUL a2, b2, t4
  1432. LDF [BO + 1 * SIZE], b2
  /* Loop while L > 0; the a2 load below sits in the branch delay slot. */
  1433. bg,pt %icc, .LL156
  1434. LDF [AO + 1 * SIZE], a2
  /* End of the K loop for the single tile: drain t1..t4, fold to (c01,c02), subtract from stored values. */
  1435. .LL159:
  1436. FADD1 c01, t1, c01
  1437. FADD3 c02, t2, c02
  1438. FADD2 c03, t3, c03
  1439. FADD4 c04, t4, c04
  /* Fold: c01 += c04, c02 += c03. */
  1440. FADD c01, c04, c01
  1441. FADD c02, c03, c02
  1442. #if defined(LN) || defined(RT)
  /* LN/RT: AO = AORIG + off, BO = B + off, with off = (KK - 1) << ZBASE_SHIFT bytes. */
  1443. sub KK, 1, TEMP1
  1444. sll TEMP1, 0 + ZBASE_SHIFT, TEMP1
  1445. add AORIG, TEMP1, AO
  1446. add B, TEMP1, BO
  1447. #endif
  1448. #if defined(LN) || defined(LT)
  /* LN/LT read the two stored values through BO, the other variants through AO. */
  /* cNN = loaded - cNN, assuming FSUB x, y, z computes z = x - y (macro defined outside this excerpt). */
  1449. LDF [BO + 0 * SIZE], a1
  1450. LDF [BO + 1 * SIZE], a2
  1451. FSUB a1, c01, c01
  1452. FSUB a2, c02, c02
  1453. #else
  1454. LDF [AO + 0 * SIZE], a1
  1455. LDF [AO + 1 * SIZE], a2
  1456. FSUB a1, c01, c01
  1457. FSUB a2, c02, c02
  1458. #endif
  /* Solve step for the single tile: (c01,c02) is combined with one coefficient pair (a1,a2). */
  /* LN and LT read the pair through AO and sum with FADD4/FADD2; */
  /* RN and RT read it through BO and sum with FADD4/FADD3. */
  /* NOTE(review): looks like a complex multiply by a pre-inverted diagonal entry - confirm. */
  1459. #ifdef LN
  1460. LDF [AO + 0 * SIZE], a1
  1461. LDF [AO + 1 * SIZE], a2
  1462. FMUL a1, c01, t1
  1463. FMUL a2, c02, t2
  1464. FMUL a1, c02, t3
  1465. FMUL a2, c01, t4
  1466. FADD4 t1, t2, c01
  1467. FADD2 t3, t4, c02
  1468. #endif
  1469. #ifdef LT
  1470. LDF [AO + 0 * SIZE], a1
  1471. LDF [AO + 1 * SIZE], a2
  1472. FMUL a1, c01, t1
  1473. FMUL a2, c02, t2
  1474. FMUL a1, c02, t3
  1475. FMUL a2, c01, t4
  1476. FADD4 t1, t2, c01
  1477. FADD2 t3, t4, c02
  1478. #endif
  1479. #ifdef RN
  1480. LDF [BO + 0 * SIZE], a1
  1481. LDF [BO + 1 * SIZE], a2
  1482. FMUL a1, c01, t1
  1483. FMUL a2, c02, t2
  1484. FMUL a1, c02, t3
  1485. FMUL a2, c01, t4
  1486. FADD4 t1, t2, c01
  1487. FADD3 t3, t4, c02
  1488. #endif
  1489. #ifdef RT
  1490. LDF [BO + 0 * SIZE], a1
  1491. LDF [BO + 1 * SIZE], a2
  1492. FMUL a1, c01, t1
  1493. FMUL a2, c02, t2
  1494. FMUL a1, c02, t3
  1495. FMUL a2, c01, t4
  1496. FADD4 t1, t2, c01
  1497. FADD3 t3, t4, c02
  1498. #endif
  /* Store the solved single tile, clear t1..t4, and advance the pointers and KK. */
  1499. #ifdef LN
  /* LN: C1 is moved back by 2 * SIZE before the stores (and not advanced afterwards). */
  1500. add C1, -2 * SIZE, C1
  1501. #endif
  1502. #if defined(LN) || defined(LT)
  /* Results are written through BO (LN/LT) or AO (otherwise), and through C1. */
  1503. STF c01, [BO + 0 * SIZE]
  1504. STF c02, [BO + 1 * SIZE]
  1505. #else
  1506. STF c01, [AO + 0 * SIZE]
  1507. STF c02, [AO + 1 * SIZE]
  1508. #endif
  1509. STF c01, [C1 + 0 * SIZE]
  1510. STF c02, [C1 + 1 * SIZE]
  /* t1..t4 are reset to zero before the code at .LL150 runs. */
  1511. FMOV FZERO, t1
  1512. FMOV FZERO, t2
  1513. FMOV FZERO, t3
  1514. FMOV FZERO, t4
  1515. #ifndef LN
  1516. add C1, 2 * SIZE, C1
  1517. #endif
  1518. #ifdef RT
  /* RT: AORIG += K << ZBASE_SHIFT. */
  1519. sll K, 0 + ZBASE_SHIFT, TEMP1
  1520. add AORIG, TEMP1, AORIG
  1521. #endif
  1522. #if defined(LT) || defined(RN)
  /* LT/RN: move AO and BO forward by (K - KK) << ZBASE_SHIFT bytes. */
  1523. sub K, KK, TEMP1
  1524. sll TEMP1, 0 + ZBASE_SHIFT, TEMP1
  1525. add AO, TEMP1, AO
  1526. add BO, TEMP1, BO
  1527. #endif
  1528. #ifdef LT
  1529. add KK, 1, KK
  1530. #endif
  1531. #ifdef LN
  1532. sub KK, 1, KK
  1533. #endif
  /* Two-row tiles of the single-column pass: I = M >> 1 tiles; skip to .LL199 if none. */
  1534. .LL150:
  1535. sra M, 1, I
  1536. cmp I, 0
  1537. ble,pn %icc, .LL199
  1538. nop
  /* Per-tile setup: position AO/BO, compute the unrolled-loop count L, preload and zero registers. */
  1539. .LL121:
  1540. #if defined(LT) || defined(RN)
  /* LT/RN: L = KK >> 2, BO starts at B. */
  1541. sra KK, 2, L
  1542. mov B, BO
  1543. cmp L, 0
  1544. #else
  1545. #ifdef LN
  /* LN: AORIG -= K << (1 + ZBASE_SHIFT). */
  1546. sll K, 1 + ZBASE_SHIFT, TEMP1
  1547. sub AORIG, TEMP1, AORIG
  1548. #endif
  /* LN/RT: AO = AORIG + (KK << (1 + ZBASE_SHIFT)), BO = B + (KK << ZBASE_SHIFT); */
  /* the A offset is twice the B offset. TEMP1 = K - KK, L = TEMP1 >> 2. */
  1549. sll KK, 1 + ZBASE_SHIFT, TEMP1
  1550. sll KK, 0 + ZBASE_SHIFT, TEMP2
  1551. add AORIG, TEMP1, AO
  1552. add B, TEMP2, BO
  1553. sub K, KK, TEMP1
  1554. sra TEMP1, 2, L
  1555. cmp L, 0
  1556. #endif
  /* Preload a1..a4 and b1..b4; zero accumulators c01..c08 and temporaries t1..t4. */
  1557. FMOV FZERO, c03
  1558. LDF [AO + 0 * SIZE], a1
  1559. FMOV FZERO, t1
  1560. LDF [BO + 0 * SIZE], b1
  1561. FMOV FZERO, c07
  1562. LDF [AO + 1 * SIZE], a2
  1563. FMOV FZERO, t2
  1564. LDF [BO + 1 * SIZE], b2
  1565. FMOV FZERO, c04
  1566. LDF [AO + 2 * SIZE], a3
  1567. FMOV FZERO, t3
  1568. LDF [BO + 2 * SIZE], b3
  1569. FMOV FZERO, c08
  1570. LDF [AO + 3 * SIZE], a4
  1571. FMOV FZERO, t4
  1572. LDF [BO + 3 * SIZE], b4
  1573. FMOV FZERO, c01
  /* Prefetch the C1 location about to be written: below C1 for LN, above it otherwise. */
  1574. #ifdef LN
  1575. prefetch [C1 - 3 * SIZE], 3
  1576. #else
  1577. prefetch [C1 + 3 * SIZE], 3
  1578. #endif
  1579. FMOV FZERO, c05
  1580. FMOV FZERO, c02
  /* Uses the "cmp L, 0" above; c06 is zeroed in the delay slot on both paths. */
  1581. ble,pn %icc, .LL125
  1582. FMOV FZERO, c06
  /* Unrolled K loop for the two-row tile: four steps per iteration, eight products per step. */
  /* Steps alternate between a1..a4 x (b1,b2) and a1..a4 x (b3,b4); sums go into c01..c08 via t1..t4. */
  /* Per iteration AO advances by 16 * SIZE and BO by 8 * SIZE. */
  1583. .LL122:
  1584. FADD1 c03, t1, c03
  1585. add L, -1, L
  1586. FMUL a1, b1, t1
  1587. prefetch [AO + APREFETCHSIZE * SIZE], 0
  1588. FADD3 c07, t2, c07
  /* BO and AO are advanced early, so most later loads use negative offsets. */
  1589. add BO, 8 * SIZE, BO
  1590. FMUL a1, b2, t2
  1591. LDF [AO + 4 * SIZE], a1
  1592. FADD2 c04, t3, c04
  1593. add AO, 16 * SIZE, AO
  1594. FMUL a2, b1, t3
  /* Sets %icc for the bg at the bottom of the loop. */
  1595. cmp L, 0
  1596. FADD4 c08, t4, c08
  1597. nop
  1598. FMUL a2, b2, t4
  1599. LDF [AO - 11 * SIZE], a2
  1600. FADD1 c01, t1, c01
  1601. nop
  1602. FMUL a3, b1, t1
  1603. nop
  1604. FADD3 c05, t2, c05
  1605. nop
  1606. FMUL a3, b2, t2
  1607. LDF [AO - 10 * SIZE], a3
  1608. FADD2 c02, t3, c02
  1609. nop
  1610. FMUL a4, b1, t3
  1611. LDF [BO - 4 * SIZE], b1
  1612. FADD4 c06, t4, c06
  1613. nop
  1614. FMUL a4, b2, t4
  1615. LDF [BO - 3 * SIZE], b2
  1616. FADD1 c03, t1, c03
  1617. nop
  1618. FMUL a1, b3, t1
  1619. LDF [AO - 9 * SIZE], a4
  1620. FADD3 c07, t2, c07
  1621. nop
  1622. FMUL a1, b4, t2
  1623. LDF [AO - 8 * SIZE], a1
  1624. FADD2 c04, t3, c04
  1625. nop
  1626. FMUL a2, b3, t3
  1627. nop
  1628. FADD4 c08, t4, c08
  1629. nop
  1630. FMUL a2, b4, t4
  1631. LDF [AO - 7 * SIZE], a2
  1632. FADD1 c01, t1, c01
  1633. nop
  1634. FMUL a3, b3, t1
  1635. nop
  1636. FADD3 c05, t2, c05
  1637. nop
  1638. FMUL a3, b4, t2
  1639. LDF [AO - 6 * SIZE], a3
  1640. FADD2 c02, t3, c02
  1641. nop
  1642. FMUL a4, b3, t3
  1643. LDF [BO - 2 * SIZE], b3
  1644. FADD4 c06, t4, c06
  1645. nop
  1646. FMUL a4, b4, t4
  1647. LDF [BO - 1 * SIZE], b4
  1648. FADD1 c03, t1, c03
  1649. nop
  1650. FMUL a1, b1, t1
  1651. LDF [AO - 5 * SIZE], a4
  1652. FADD3 c07, t2, c07
  1653. nop
  1654. FMUL a1, b2, t2
  1655. LDF [AO - 4 * SIZE], a1
  1656. FADD2 c04, t3, c04
  1657. nop
  1658. FMUL a2, b1, t3
  1659. nop
  1660. FADD4 c08, t4, c08
  1661. nop
  1662. FMUL a2, b2, t4
  1663. LDF [AO - 3 * SIZE], a2
  1664. FADD1 c01, t1, c01
  1665. nop
  1666. FMUL a3, b1, t1
  1667. nop
  1668. FADD3 c05, t2, c05
  1669. nop
  1670. FMUL a3, b2, t2
  1671. LDF [AO - 2 * SIZE], a3
  1672. FADD2 c02, t3, c02
  1673. nop
  1674. FMUL a4, b1, t3
  1675. LDF [BO + 0 * SIZE], b1
  1676. FADD4 c06, t4, c06
  1677. nop
  1678. FMUL a4, b2, t4
  1679. LDF [BO + 1 * SIZE], b2
  1680. FADD1 c03, t1, c03
  1681. nop
  1682. FMUL a1, b3, t1
  1683. LDF [AO - 1 * SIZE], a4
  1684. FADD3 c07, t2, c07
  1685. nop
  1686. FMUL a1, b4, t2
  1687. LDF [AO + 0 * SIZE], a1
  1688. FADD2 c04, t3, c04
  1689. nop
  1690. FMUL a2, b3, t3
  1691. nop
  1692. FADD4 c08, t4, c08
  1693. nop
  1694. FMUL a2, b4, t4
  1695. LDF [AO + 1 * SIZE], a2
  1696. FADD1 c01, t1, c01
  1697. nop
  1698. FMUL a3, b3, t1
  1699. nop
  1700. FADD3 c05, t2, c05
  1701. nop
  1702. FMUL a3, b4, t2
  1703. LDF [AO + 2 * SIZE], a3
  1704. FADD2 c02, t3, c02
  1705. nop
  1706. FMUL a4, b3, t3
  1707. LDF [BO + 2 * SIZE], b3
  1708. FADD4 c06, t4, c06
  1709. FMUL a4, b4, t4
  1710. LDF [AO + 3 * SIZE], a4
  /* Loop while L > 0; the b4 load below sits in the branch delay slot. */
  1711. bg,pt %icc, .LL122
  1712. LDF [BO + 3 * SIZE], b4
  /* K-remainder loop for the two-row tile: L = low two bits of KK (LT/RN) or TEMP1 (otherwise). */
  1713. .LL125:
  1714. #if defined(LT) || defined(RN)
  1715. and KK, 3, L
  1716. #else
  1717. and TEMP1, 3, L
  1718. #endif
  1719. cmp L, 0
  /* Annulled branch (,a): the delay-slot nop executes only when the branch is taken. */
  1720. ble,a,pn %icc, .LL129
  1721. nop
  /* One step per iteration: AO advances by 4 * SIZE, BO by 2 * SIZE, eight products a1..a4 x (b1,b2). */
  1722. .LL126:
  1723. FADD1 c03, t1, c03
  1724. add AO, 4 * SIZE, AO
  1725. FMUL a1, b1, t1
  1726. add BO, 2 * SIZE, BO
  1727. FADD3 c07, t2, c07
  1728. add L, -1, L
  1729. FMUL a1, b2, t2
  1730. LDF [AO + 0 * SIZE], a1
  1731. FADD2 c04, t3, c04
  1732. cmp L, 0
  1733. FMUL a2, b1, t3
  1734. FADD4 c08, t4, c08
  1735. FMUL a2, b2, t4
  1736. LDF [AO + 1 * SIZE], a2
  1737. FADD1 c01, t1, c01
  1738. FMUL a3, b1, t1
  1739. FADD3 c05, t2, c05
  1740. FMUL a3, b2, t2
  1741. LDF [AO + 2 * SIZE], a3
  1742. FADD2 c02, t3, c02
  1743. FMUL a4, b1, t3
  1744. LDF [BO + 0 * SIZE], b1
  1745. FADD4 c06, t4, c06
  1746. FMUL a4, b2, t4
  1747. LDF [BO + 1 * SIZE], b2
  /* Loop while L > 0; the a4 load below sits in the branch delay slot. */
  1748. bg,pt %icc, .LL126
  1749. LDF [AO + 3 * SIZE], a4
  /* End of the K loop for the two-row tile: drain t1..t4, fold c01..c08 to c01..c04, subtract from stored values. */
  1750. .LL129:
  1751. FADD1 c03, t1, c03
  1752. FADD3 c07, t2, c07
  1753. FADD2 c04, t3, c04
  1754. FADD4 c08, t4, c08
  /* Pairwise fold: c01 += c06, c02 += c05, c03 += c08, c04 += c07. */
  1755. FADD c01, c06, c01
  1756. FADD c02, c05, c02
  1757. FADD c03, c08, c03
  1758. FADD c04, c07, c04
  1759. #if defined(LN) || defined(RT)
  /* TEMP1 = KK - 2 for LN, KK - 1 for RT. */
  1760. #ifdef LN
  1761. sub KK, 2, TEMP1
  1762. #else
  1763. sub KK, 1, TEMP1
  1764. #endif
  /* AO = AORIG + (TEMP1 << (1 + ZBASE_SHIFT)), BO = B + (TEMP1 << ZBASE_SHIFT). */
  1765. sll TEMP1, 1 + ZBASE_SHIFT, TEMP2
  1766. sll TEMP1, 0 + ZBASE_SHIFT, TEMP1
  1767. add AORIG, TEMP2, AO
  1768. add B, TEMP1, BO
  1769. #endif
  1770. #if defined(LN) || defined(LT)
  /* LN/LT read the four stored values through BO, the other variants through AO. */
  /* cNN = loaded - cNN, assuming FSUB x, y, z computes z = x - y (macro defined outside this excerpt). */
  1771. LDF [BO + 0 * SIZE], a1
  1772. LDF [BO + 1 * SIZE], a2
  1773. LDF [BO + 2 * SIZE], a3
  1774. LDF [BO + 3 * SIZE], a4
  1775. FSUB a1, c01, c01
  1776. FSUB a2, c02, c02
  1777. FSUB a3, c03, c03
  1778. FSUB a4, c04, c04
  1779. #else
  1780. LDF [AO + 0 * SIZE], a1
  1781. LDF [AO + 1 * SIZE], a2
  1782. LDF [AO + 2 * SIZE], a3
  1783. LDF [AO + 3 * SIZE], a4
  1784. FSUB a1, c01, c01
  1785. FSUB a2, c02, c02
  1786. FSUB a3, c03, c03
  1787. FSUB a4, c04, c04
  1788. #endif
  /* LN solve for the two-row tile, using six values read through AO. */
  /* NOTE(review): looks like complex back-substitution with pre-inverted diagonal entries - confirm. */
  1789. #ifdef LN
  /* a1,a2 = AO[6],AO[7]; a3,a4 = AO[4],AO[5]; b1,b2 = AO[0],AO[1]. */
  1790. LDF [AO + 6 * SIZE], a1
  1791. LDF [AO + 7 * SIZE], a2
  1792. LDF [AO + 4 * SIZE], a3
  1793. LDF [AO + 5 * SIZE], a4
  1794. LDF [AO + 0 * SIZE], b1
  1795. LDF [AO + 1 * SIZE], b2
  /* Step 1: combine (c03,c04) with (a1,a2). */
  1796. FMUL a1, c03, t1
  1797. FMUL a2, c04, t2
  1798. FMUL a1, c04, t3
  1799. FMUL a2, c03, t4
  1800. FADD4 t1, t2, c03
  1801. FADD2 t3, t4, c04
  /* Step 2: update (c01,c02) with products of (a3,a4) and the new (c03,c04). */
  1802. FMUL a3, c03, t1
  1803. FMUL a3, c04, t2
  1804. FMUL a4, c04, t5
  1805. FMUL a4, c03, t6
  1806. FSUB c01, t1, c01
  1807. FSUB c02, t2, c02
  1808. FADD2 c01, t5, c01
  1809. FADD4 c02, t6, c02
  /* Step 3: combine (c01,c02) with (b1,b2). */
  1810. FMUL b1, c01, t1
  1811. FMUL b2, c02, t2
  1812. FMUL b1, c02, t3
  1813. FMUL b2, c01, t4
  1814. FADD4 t1, t2, c01
  1815. FADD2 t3, t4, c02
  1816. #endif
  /* LT solve for the two-row tile: same three steps as the LN variant, but */
  /* (c01,c02) is processed first and (c03,c04) last. */
  1817. #ifdef LT
  /* a1,a2 = AO[0],AO[1]; a3,a4 = AO[2],AO[3]; b1,b2 = AO[6],AO[7]. */
  1818. LDF [AO + 0 * SIZE], a1
  1819. LDF [AO + 1 * SIZE], a2
  1820. LDF [AO + 2 * SIZE], a3
  1821. LDF [AO + 3 * SIZE], a4
  1822. LDF [AO + 6 * SIZE], b1
  1823. LDF [AO + 7 * SIZE], b2
  /* Step 1: combine (c01,c02) with (a1,a2). */
  1824. FMUL a1, c01, t1
  1825. FMUL a2, c02, t2
  1826. FMUL a1, c02, t3
  1827. FMUL a2, c01, t4
  1828. FADD4 t1, t2, c01
  1829. FADD2 t3, t4, c02
  /* Step 2: update (c03,c04) with products of (a3,a4) and the new (c01,c02). */
  1830. FMUL a3, c01, t1
  1831. FMUL a3, c02, t2
  1832. FMUL a4, c02, t5
  1833. FMUL a4, c01, t6
  1834. FSUB c03, t1, c03
  1835. FSUB c04, t2, c04
  1836. FADD2 c03, t5, c03
  1837. FADD4 c04, t6, c04
  /* Step 3: combine (c03,c04) with (b1,b2). */
  1838. FMUL b1, c03, t1
  1839. FMUL b2, c04, t2
  1840. FMUL b1, c04, t3
  1841. FMUL b2, c03, t4
  1842. FADD4 t1, t2, c03
  1843. FADD2 t3, t4, c04
  1844. #endif
  /* RN / RT solve for the two-row tile: both variants run the same sequence. */
  /* One coefficient pair (a1,a2) = BO[0],BO[1] is combined with (c01,c02) and (c03,c04), */
  /* and the sums use FADD4/FADD3. */
  1845. #ifdef RN
  1846. LDF [BO + 0 * SIZE], a1
  1847. LDF [BO + 1 * SIZE], a2
  1848. FMUL a1, c01, t1
  1849. FMUL a2, c02, t2
  1850. FMUL a1, c02, t3
  1851. FMUL a2, c01, t4
  1852. FMUL a1, c03, t5
  1853. FMUL a2, c04, t6
  1854. FMUL a1, c04, t7
  1855. FMUL a2, c03, t8
  1856. FADD4 t1, t2, c01
  1857. FADD3 t3, t4, c02
  1858. FADD4 t5, t6, c03
  1859. FADD3 t7, t8, c04
  1860. #endif
  1861. #ifdef RT
  1862. LDF [BO + 0 * SIZE], a1
  1863. LDF [BO + 1 * SIZE], a2
  1864. FMUL a1, c01, t1
  1865. FMUL a2, c02, t2
  1866. FMUL a1, c02, t3
  1867. FMUL a2, c01, t4
  1868. FMUL a1, c03, t5
  1869. FMUL a2, c04, t6
  1870. FMUL a1, c04, t7
  1871. FMUL a2, c03, t8
  1872. FADD4 t1, t2, c01
  1873. FADD3 t3, t4, c02
  1874. FADD4 t5, t6, c03
  1875. FADD3 t7, t8, c04
  1876. #endif
  /* Store the solved two-row tile, clear t1..t4, advance the pointers and KK, and loop over I. */
  1877. #ifdef LN
  /* LN: C1 is moved back by 4 * SIZE before the stores (and not advanced afterwards). */
  1878. add C1, -4 * SIZE, C1
  1879. #endif
  1880. #if defined(LN) || defined(LT)
  /* Results are written through BO (LN/LT) or AO (otherwise), and through C1. */
  1881. STF c01, [BO + 0 * SIZE]
  1882. STF c02, [BO + 1 * SIZE]
  1883. STF c03, [BO + 2 * SIZE]
  1884. STF c04, [BO + 3 * SIZE]
  1885. #else
  1886. STF c01, [AO + 0 * SIZE]
  1887. STF c02, [AO + 1 * SIZE]
  1888. STF c03, [AO + 2 * SIZE]
  1889. STF c04, [AO + 3 * SIZE]
  1890. #endif
  1891. STF c01, [C1 + 0 * SIZE]
  1892. STF c02, [C1 + 1 * SIZE]
  1893. STF c03, [C1 + 2 * SIZE]
  1894. STF c04, [C1 + 3 * SIZE]
  1895. FMOV FZERO, t1
  1896. FMOV FZERO, t2
  1897. FMOV FZERO, t3
  1898. FMOV FZERO, t4
  1899. #ifndef LN
  1900. add C1, 4 * SIZE, C1
  1901. #endif
  1902. #ifdef RT
  /* RT: AORIG += K << (1 + ZBASE_SHIFT). */
  1903. sll K, 1 + ZBASE_SHIFT, TEMP1
  1904. add AORIG, TEMP1, AORIG
  1905. #endif
  1906. #if defined(LT) || defined(RN)
  /* LT/RN: AO += (K - KK) << (1 + ZBASE_SHIFT), BO += (K - KK) << ZBASE_SHIFT. */
  1907. sub K, KK, TEMP1
  1908. sll TEMP1, 1 + ZBASE_SHIFT, TEMP2
  1909. sll TEMP1, 0 + ZBASE_SHIFT, TEMP1
  1910. add AO, TEMP2, AO
  1911. add BO, TEMP1, BO
  1912. #endif
  1913. #ifdef LT
  1914. add KK, 2, KK
  1915. #endif
  1916. #ifdef LN
  1917. sub KK, 2, KK
  1918. #endif
  /* Next tile while I > 0. The delay slot zeroes c03, which the code at .LL121 also does. */
  1919. add I, -1, I
  1920. cmp I, 0
  1921. bg,pt %icc, .LL121
  1922. FMOV FZERO, c03
  /* End of the single-column pass: update B and KK, then fall through to the return. */
  1923. .LL199:
  1924. #ifdef LN
  /* LN: B += K << ZBASE_SHIFT. */
  1925. sll K, 0 + ZBASE_SHIFT, TEMP1
  1926. add B, TEMP1, B
  1927. #endif
  1928. #if defined(LT) || defined(RN)
  /* LT/RN: B takes over the position BO has reached. */
  1929. mov BO, B
  1930. #endif
  1931. #ifdef RN
  1932. add KK, 1, KK
  1933. #endif
  1934. #ifdef RT
  1935. sub KK, 1, KK
  1936. #endif
  /* Common exit: return to the caller; %o0 is cleared in the delay slot (return value 0). */
  1937. .LL999:
  1938. return %i7 + 8
  1939. clr %o0
  1940. EPILOGUE