You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_LT_1x4.S 42 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131
  1. /*********************************************************************/
  2. /* Copyright 2005-2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch tuning for the A panel.  The unrolled inner loop issues
     "prefetch [AO + (APREFETCHSIZE + k) * SIZE], APREFETCH_CATEGORY",
     so APREFETCHSIZE is the look-ahead distance in units of SIZE and
     APREFETCH_CATEGORY is the function-code operand of prefetch. */
  40. #define APREFETCHSIZE 24
  41. #define APREFETCH_CATEGORY 0
  /* Incoming arguments.  M is tested against zero on entry and seeds the
     row counter I; N is shifted right by 2 to seed the column-block
     counter J; K bounds the inner-product loop. */
  42. #define M %i0
  43. #define N %i1
  44. #define K %i2
  /* Matrix base pointers.  The prologue reloads B and C (and, in 32-bit
     DOUBLE builds, A as well) from the caller's stack area, so the
     incoming register values are overwritten there. */
  45. #define A %i5
  46. #define B %i3
  47. #define C %i4
  /* LDC is loaded from the stack and then scaled by ZBASE_SHIFT; it is
     the step between the column pointers C1..C4. */
  48. #define LDC %o0
  /* AO / BO: running pointers into the A and B panels used by the LDF
     loads of the inner loop. */
  49. #define AO %o1
  50. #define BO %o2
  /* Loop counters: I starts at M, J starts at N >> 2, L is the trip
     count of the inner loop (KK or K - KK, split into >> 3 and & 7). */
  51. #define I %o3
  52. #define J %o4
  53. #define L %o5
  /* Pointers to the four columns of C handled per J iteration. */
  54. #define C1 %l0
  55. #define C2 %l1
  56. #define C3 %l2
  57. #define C4 %l3
  /* OFFSET is loaded from the stack; KK is derived from it according to
     the LN/LT/RN/RT variant being built. */
  58. #define OFFSET %l4
  59. #define KK %l5
  /* Scratch registers for address arithmetic. */
  60. #define TEMP1 %l6
  61. #define TEMP2 %l7
  /* In the visible code AORIG is written and read only in LN/RT paths,
     as the base from which AO is recomputed.
     NOTE(review): %o7 normally holds the call return address; this
     appears safe only because SAVESP opens a new register window --
     confirm against the SAVESP definition. */
  62. #define AORIG %o7
  /* Floating-point register map.
     Each value has two names:
       - cNN / aN / bN   : the %f register, used with ordinary
                           instructions (LDF, FADD);
       - ccNN / aaN / bbN: a bare number handed to the FCLR and
                           FMADD1..FMADD4 macros.
     c01..c16 are the accumulators, a1..a5 receive loads from AO and
     b1..b9 receive loads from BO.
     NOTE(review): the numeric twins look like the 5-bit register-field
     encodings used by hand-assembled instructions; the macros that
     consume them are defined outside this file chunk -- confirm. */
  63. #ifdef DOUBLE
  /* Double precision: accumulators occupy the even registers %f0..%f30. */
  64. #define c01 %f0
  65. #define c02 %f2
  66. #define c03 %f4
  67. #define c04 %f6
  68. #define c05 %f8
  69. #define c06 %f10
  70. #define c07 %f12
  71. #define c08 %f14
  72. #define c09 %f16
  73. #define c10 %f18
  74. #define c11 %f20
  75. #define c12 %f22
  76. #define c13 %f24
  77. #define c14 %f26
  78. #define c15 %f28
  79. #define c16 %f30
  /* A operands live in %f32..%f40. */
  80. #define a1 %f32
  81. #define a2 %f34
  82. #define a3 %f36
  83. #define a4 %f38
  84. #define a5 %f40
  /* B operands live in %f42..%f58. */
  85. #define b1 %f42
  86. #define b2 %f44
  87. #define b3 %f46
  88. #define b4 %f48
  89. #define b5 %f50
  90. #define b6 %f52
  91. #define b7 %f54
  92. #define b8 %f56
  93. #define b9 %f58
  /* Numeric twins of c01..c16: equal to the register numbers 0..30. */
  94. #define cc01 0
  95. #define cc02 2
  96. #define cc03 4
  97. #define cc04 6
  98. #define cc05 8
  99. #define cc06 10
  100. #define cc07 12
  101. #define cc08 14
  102. #define cc09 16
  103. #define cc10 18
  104. #define cc11 20
  105. #define cc12 22
  106. #define cc13 24
  107. #define cc14 26
  108. #define cc15 28
  109. #define cc16 30
  /* Numeric twins of a1..a5 and b1..b9.  These are odd numbers, not the
     register numbers 32..58.
     NOTE(review): this matches the SPARC V9 rule that folds bit 5 of a
     double register number into bit 0 of the 5-bit field (%f32 -> 1,
     %f34 -> 3, ... %f58 -> 27) -- confirm against the macro encodings. */
  110. #define aa1 1
  111. #define aa2 3
  112. #define aa3 5
  113. #define aa4 7
  114. #define aa5 9
  115. #define bb1 11
  116. #define bb2 13
  117. #define bb3 15
  118. #define bb4 17
  119. #define bb5 19
  120. #define bb6 21
  121. #define bb7 23
  122. #define bb8 25
  123. #define bb9 27
  124. #else
  /* Single precision: consecutive registers, accumulators in %f0..%f15. */
  125. #define c01 %f0
  126. #define c02 %f1
  127. #define c03 %f2
  128. #define c04 %f3
  129. #define c05 %f4
  130. #define c06 %f5
  131. #define c07 %f6
  132. #define c08 %f7
  133. #define c09 %f8
  134. #define c10 %f9
  135. #define c11 %f10
  136. #define c12 %f11
  137. #define c13 %f12
  138. #define c14 %f13
  139. #define c15 %f14
  140. #define c16 %f15
  /* A operands live in %f16..%f20. */
  141. #define a1 %f16
  142. #define a2 %f17
  143. #define a3 %f18
  144. #define a4 %f19
  145. #define a5 %f20
  /* B operands live in %f21..%f29. */
  146. #define b1 %f21
  147. #define b2 %f22
  148. #define b3 %f23
  149. #define b4 %f24
  150. #define b5 %f25
  151. #define b6 %f26
  152. #define b7 %f27
  153. #define b8 %f28
  154. #define b9 %f29
  /* Numeric twins: here every number equals its %f register number. */
  155. #define cc01 0
  156. #define cc02 1
  157. #define cc03 2
  158. #define cc04 3
  159. #define cc05 4
  160. #define cc06 5
  161. #define cc07 6
  162. #define cc08 7
  163. #define cc09 8
  164. #define cc10 9
  165. #define cc11 10
  166. #define cc12 11
  167. #define cc13 12
  168. #define cc14 13
  169. #define cc15 14
  170. #define cc16 15
  171. #define aa1 16
  172. #define aa2 17
  173. #define aa3 18
  174. #define aa4 19
  175. #define aa5 20
  176. #define bb1 21
  177. #define bb2 22
  178. #define bb3 23
  179. #define bb4 24
  180. #define bb5 25
  181. #define bb6 26
  182. #define bb7 27
  183. #define bb8 28
  184. #define bb9 29
  185. #endif
  /* Select the multiply-accumulate flavour for the four partial products
     formed per (a1, a2) x (b_lo, b_hi) pair in the inner loop:
       FMADD1 (a1, b_lo) -> c01      FMADD2 (a2, b_lo) -> c02
       FMADD3 (a1, b_hi) -> c03      FMADD4 (a2, b_hi) -> c04
     After the loop the code adds c04 into c01 and c03 into c02 (and the
     same for the other three accumulator groups), so swapping one slot
     from FMADD to FNMSUB flips the sign of that term in the final sum.
     NOTE(review): a1/a2 and b_lo/b_hi are presumably the real and
     imaginary parts of complex elements, and FNMSUB presumably computes
     c - a*b; FMADD and FNMSUB are defined outside this chunk -- confirm. */
  186. #ifndef CONJ
  /* No conjugation: only the a2 * b_hi term is negated. */
  187. #define FMADD1 FMADD
  188. #define FMADD2 FMADD
  189. #define FMADD3 FMADD
  190. #define FMADD4 FNMSUB
  191. #else
  /* CONJ with a left-side variant: the a2 * b_lo term is negated. */
  192. #if defined(LN) || defined(LT)
  193. #define FMADD1 FMADD
  194. #define FMADD2 FNMSUB
  195. #define FMADD3 FMADD
  196. #define FMADD4 FMADD
  197. #endif
  /* CONJ with a right-side variant: the a1 * b_hi term is negated. */
  198. #if defined(RN) || defined(RT)
  199. #define FMADD1 FMADD
  200. #define FMADD2 FMADD
  201. #define FMADD3 FNMSUB
  202. #define FMADD4 FMADD
  203. #endif
  /* If CONJ is defined but none of LN/LT/RN/RT is, FMADD1..FMADD4 are
     left undefined by this block. */
  204. #endif
  /* Tell the assembler that %g2 and %g3 are used as scratch registers.
     NOTE(review): neither register is referenced in the part of the
     kernel visible here -- confirm they are used further down. */
  205. .register %g2, #scratch
  206. .register %g3, #scratch
  207. PROLOGUE
  208. SAVESP
  209. #ifndef __64BIT__
  210. #ifdef DOUBLE
  211. ld [%sp + STACK_START + 32], A
  212. ld [%sp + STACK_START + 36], B
  213. ld [%sp + STACK_START + 40], C
  214. ld [%sp + STACK_START + 44], LDC
  215. ld [%sp + STACK_START + 48], OFFSET
  216. #else
  217. ld [%sp + STACK_START + 28], B
  218. ld [%sp + STACK_START + 32], C
  219. ld [%sp + STACK_START + 36], LDC
  220. ld [%sp + STACK_START + 40], OFFSET
  221. #endif
  222. #else
  223. ldx [%sp + STACK_START + 56], B
  224. ldx [%sp + STACK_START + 64], C
  225. ldx [%sp + STACK_START + 72], LDC
  226. ldx [%sp + STACK_START + 80], OFFSET
  227. #endif
  228. cmp M, 0
  229. ble,pn %icc, .LL999
  230. nop
  231. sll LDC, ZBASE_SHIFT, LDC
  232. #ifdef LN
  233. smul M, K, TEMP1
  234. sll TEMP1, ZBASE_SHIFT, TEMP1
  235. add A, TEMP1, A
  236. sll M, ZBASE_SHIFT, TEMP1
  237. add C, TEMP1, C
  238. #endif
  239. #ifdef RN
  240. neg OFFSET, KK
  241. #endif
  242. #ifdef RT
  243. smul N, K, TEMP1
  244. sll TEMP1, ZBASE_SHIFT, TEMP1
  245. add B, TEMP1, B
  246. smul N, LDC, TEMP1
  247. add C, TEMP1, C
  248. sub N, OFFSET, KK
  249. #endif
  250. sra N, 2, J
  251. cmp J, 0
  252. ble,pn %icc, .LL20
  253. nop
  254. .align 4
  255. .LL11:
  256. #ifdef RT
  257. sll K, ZBASE_SHIFT + 2, TEMP1
  258. sub B, TEMP1, B
  259. #endif
  260. #ifndef RT
  261. mov C, C1
  262. add C, LDC, C2
  263. add C2, LDC, C3
  264. add C3, LDC, C4
  265. add C4, LDC, C
  266. #else
  267. sub C, LDC, C4
  268. sub C4, LDC, C3
  269. sub C3, LDC, C2
  270. sub C2, LDC, C1
  271. sub C2, LDC, C
  272. #endif
  273. #ifdef LN
  274. add M, OFFSET, KK
  275. #endif
  276. #ifdef LT
  277. mov OFFSET, KK
  278. #endif
  279. #if defined(LN) || defined(RT)
  280. mov A, AORIG
  281. #else
  282. mov A, AO
  283. #endif
  284. mov M, I
  285. .align 4
  286. .LL12:
  287. #if defined(LT) || defined(RN)
  288. mov B, BO
  289. #else
  290. #ifdef LN
  291. sll K, ZBASE_SHIFT, TEMP1
  292. sub AORIG, TEMP1, AORIG
  293. #endif
  294. sll KK, ZBASE_SHIFT + 0, TEMP1
  295. sll KK, ZBASE_SHIFT + 2, TEMP2
  296. add AORIG, TEMP1, AO
  297. add B, TEMP2, BO
  298. #endif
  299. LDF [AO + 0 * SIZE], a1
  300. FCLR (cc01)
  301. LDF [AO + 1 * SIZE], a2
  302. FCLR (cc05)
  303. LDF [AO + 8 * SIZE], a5
  304. FCLR (cc09)
  305. LDF [BO + 0 * SIZE], b1
  306. FCLR (cc13)
  307. LDF [BO + 1 * SIZE], b2
  308. FCLR (cc02)
  309. LDF [BO + 2 * SIZE], b3
  310. FCLR (cc06)
  311. LDF [BO + 3 * SIZE], b4
  312. FCLR (cc10)
  313. LDF [BO + 4 * SIZE], b5
  314. FCLR (cc14)
  315. LDF [BO + 5 * SIZE], b6
  316. FCLR (cc03)
  317. LDF [BO + 6 * SIZE], b7
  318. FCLR (cc07)
  319. LDF [BO + 7 * SIZE], b8
  320. FCLR (cc11)
  321. LDF [BO + 8 * SIZE], b9
  322. FCLR (cc15)
  323. prefetch [C1 + 1 * SIZE], 3
  324. FCLR (cc04)
  325. prefetch [C2 + 2 * SIZE], 3
  326. FCLR (cc08)
  327. prefetch [C3 + 1 * SIZE], 3
  328. FCLR (cc12)
  329. prefetch [C4 + 2 * SIZE], 3
  330. FCLR (cc16)
  331. #if defined(LT) || defined(RN)
  332. sra KK, 3, L
  333. #else
  334. sub K, KK, L
  335. sra L, 3, L
  336. #endif
  337. cmp L, 0
  338. ble,pn %icc, .LL15
  339. nop
  340. .align 4
  341. .LL13:
  342. FMADD1 (aa1, bb1, cc01, cc01)
  343. FMADD2 (aa2, bb1, cc02, cc02)
  344. FMADD3 (aa1, bb2, cc03, cc03)
  345. FMADD4 (aa2, bb2, cc04, cc04)
  346. FMADD1 (aa1, bb3, cc05, cc05)
  347. LDF [BO + 16 * SIZE], b1
  348. FMADD2 (aa2, bb3, cc06, cc06)
  349. LDF [BO + 9 * SIZE], b2
  350. FMADD3 (aa1, bb4, cc07, cc07)
  351. LDF [BO + 10 * SIZE], b3
  352. FMADD4 (aa2, bb4, cc08, cc08)
  353. LDF [BO + 11 * SIZE], b4
  354. FMADD1 (aa1, bb5, cc09, cc09)
  355. LDF [AO + 2 * SIZE], a3
  356. FMADD2 (aa2, bb5, cc10, cc10)
  357. LDF [AO + 3 * SIZE], a4
  358. FMADD3 (aa1, bb6, cc11, cc11)
  359. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  360. FMADD4 (aa2, bb6, cc12, cc12)
  361. nop
  362. FMADD1 (aa1, bb7, cc13, cc13)
  363. LDF [BO + 12 * SIZE], b5
  364. FMADD2 (aa2, bb7, cc14, cc14)
  365. LDF [BO + 13 * SIZE], b6
  366. FMADD3 (aa1, bb8, cc15, cc15)
  367. LDF [BO + 14 * SIZE], b7
  368. FMADD4 (aa2, bb8, cc16, cc16)
  369. LDF [BO + 15 * SIZE], b8
  370. FMADD1 (aa3, bb9, cc01, cc01)
  371. FMADD2 (aa4, bb9, cc02, cc02)
  372. FMADD3 (aa3, bb2, cc03, cc03)
  373. FMADD4 (aa4, bb2, cc04, cc04)
  374. FMADD1 (aa3, bb3, cc05, cc05)
  375. LDF [BO + 24 * SIZE], b9
  376. FMADD2 (aa4, bb3, cc06, cc06)
  377. LDF [BO + 17 * SIZE], b2
  378. FMADD3 (aa3, bb4, cc07, cc07)
  379. LDF [BO + 18 * SIZE], b3
  380. FMADD4 (aa4, bb4, cc08, cc08)
  381. LDF [BO + 19 * SIZE], b4
  382. FMADD1 (aa3, bb5, cc09, cc09)
  383. LDF [AO + 4 * SIZE], a1
  384. FMADD2 (aa4, bb5, cc10, cc10)
  385. LDF [AO + 5 * SIZE], a2
  386. FMADD3 (aa3, bb6, cc11, cc11)
  387. add L, -1, L
  388. FMADD4 (aa4, bb6, cc12, cc12)
  389. nop
  390. FMADD1 (aa3, bb7, cc13, cc13)
  391. LDF [BO + 20 * SIZE], b5
  392. FMADD2 (aa4, bb7, cc14, cc14)
  393. LDF [BO + 21 * SIZE], b6
  394. FMADD3 (aa3, bb8, cc15, cc15)
  395. LDF [BO + 22 * SIZE], b7
  396. FMADD4 (aa4, bb8, cc16, cc16)
  397. LDF [BO + 23 * SIZE], b8
  398. FMADD1 (aa1, bb1, cc01, cc01)
  399. FMADD2 (aa2, bb1, cc02, cc02)
  400. FMADD3 (aa1, bb2, cc03, cc03)
  401. FMADD4 (aa2, bb2, cc04, cc04)
  402. FMADD1 (aa1, bb3, cc05, cc05)
  403. LDF [BO + 32 * SIZE], b1
  404. FMADD2 (aa2, bb3, cc06, cc06)
  405. LDF [BO + 25 * SIZE], b2
  406. FMADD3 (aa1, bb4, cc07, cc07)
  407. LDF [BO + 26 * SIZE], b3
  408. FMADD4 (aa2, bb4, cc08, cc08)
  409. LDF [BO + 27 * SIZE], b4
  410. FMADD1 (aa1, bb5, cc09, cc09)
  411. LDF [AO + 6 * SIZE], a3
  412. FMADD2 (aa2, bb5, cc10, cc10)
  413. LDF [AO + 7 * SIZE], a4
  414. FMADD3 (aa1, bb6, cc11, cc11)
  415. nop
  416. FMADD4 (aa2, bb6, cc12, cc12)
  417. nop
  418. FMADD1 (aa1, bb7, cc13, cc13)
  419. LDF [BO + 28 * SIZE], b5
  420. FMADD2 (aa2, bb7, cc14, cc14)
  421. LDF [BO + 29 * SIZE], b6
  422. FMADD3 (aa1, bb8, cc15, cc15)
  423. LDF [BO + 30 * SIZE], b7
  424. FMADD4 (aa2, bb8, cc16, cc16)
  425. LDF [BO + 31 * SIZE], b8
  426. FMADD1 (aa3, bb9, cc01, cc01)
  427. FMADD2 (aa4, bb9, cc02, cc02)
  428. FMADD3 (aa3, bb2, cc03, cc03)
  429. FMADD4 (aa4, bb2, cc04, cc04)
  430. FMADD1 (aa3, bb3, cc05, cc05)
  431. LDF [BO + 40 * SIZE], b9
  432. FMADD2 (aa4, bb3, cc06, cc06)
  433. LDF [BO + 33 * SIZE], b2
  434. FMADD3 (aa3, bb4, cc07, cc07)
  435. LDF [BO + 34 * SIZE], b3
  436. FMADD4 (aa4, bb4, cc08, cc08)
  437. LDF [BO + 35 * SIZE], b4
  438. FMADD1 (aa3, bb5, cc09, cc09)
  439. LDF [AO + 16 * SIZE], a1 /****/
  440. FMADD2 (aa4, bb5, cc10, cc10)
  441. LDF [AO + 9 * SIZE], a2
  442. FMADD3 (aa3, bb6, cc11, cc11)
  443. nop
  444. FMADD4 (aa4, bb6, cc12, cc12)
  445. nop
  446. FMADD1 (aa3, bb7, cc13, cc13)
  447. LDF [BO + 36 * SIZE], b5
  448. FMADD2 (aa4, bb7, cc14, cc14)
  449. LDF [BO + 37 * SIZE], b6
  450. FMADD3 (aa3, bb8, cc15, cc15)
  451. LDF [BO + 38 * SIZE], b7
  452. FMADD4 (aa4, bb8, cc16, cc16)
  453. LDF [BO + 39 * SIZE], b8
  454. FMADD1 (aa5, bb1, cc01, cc01)
  455. FMADD2 (aa2, bb1, cc02, cc02)
  456. FMADD3 (aa5, bb2, cc03, cc03)
  457. FMADD4 (aa2, bb2, cc04, cc04)
  458. FMADD1 (aa5, bb3, cc05, cc05)
  459. LDF [BO + 48 * SIZE], b1
  460. FMADD2 (aa2, bb3, cc06, cc06)
  461. LDF [BO + 41 * SIZE], b2
  462. FMADD3 (aa5, bb4, cc07, cc07)
  463. LDF [BO + 42 * SIZE], b3
  464. FMADD4 (aa2, bb4, cc08, cc08)
  465. LDF [BO + 43 * SIZE], b4
  466. FMADD1 (aa5, bb5, cc09, cc09)
  467. LDF [AO + 10 * SIZE], a3
  468. FMADD2 (aa2, bb5, cc10, cc10)
  469. LDF [AO + 11 * SIZE], a4
  470. FMADD3 (aa5, bb6, cc11, cc11)
  471. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  472. FMADD4 (aa2, bb6, cc12, cc12)
  473. nop
  474. FMADD1 (aa5, bb7, cc13, cc13)
  475. LDF [BO + 44 * SIZE], b5
  476. FMADD2 (aa2, bb7, cc14, cc14)
  477. LDF [BO + 45 * SIZE], b6
  478. FMADD3 (aa5, bb8, cc15, cc15)
  479. LDF [BO + 46 * SIZE], b7
  480. FMADD4 (aa2, bb8, cc16, cc16)
  481. LDF [BO + 47 * SIZE], b8
  482. FMADD1 (aa3, bb9, cc01, cc01)
  483. FMADD2 (aa4, bb9, cc02, cc02)
  484. FMADD3 (aa3, bb2, cc03, cc03)
  485. FMADD4 (aa4, bb2, cc04, cc04)
  486. FMADD1 (aa3, bb3, cc05, cc05)
  487. LDF [BO + 56 * SIZE], b9
  488. FMADD2 (aa4, bb3, cc06, cc06)
  489. LDF [BO + 49 * SIZE], b2
  490. FMADD3 (aa3, bb4, cc07, cc07)
  491. LDF [BO + 50 * SIZE], b3
  492. FMADD4 (aa4, bb4, cc08, cc08)
  493. LDF [BO + 51 * SIZE], b4
  494. FMADD1 (aa3, bb5, cc09, cc09)
  495. LDF [AO + 12 * SIZE], a5
  496. FMADD2 (aa4, bb5, cc10, cc10)
  497. LDF [AO + 13 * SIZE], a2
  498. FMADD3 (aa3, bb6, cc11, cc11)
  499. cmp L, 0
  500. FMADD4 (aa4, bb6, cc12, cc12)
  501. nop
  502. FMADD1 (aa3, bb7, cc13, cc13)
  503. LDF [BO + 52 * SIZE], b5
  504. FMADD2 (aa4, bb7, cc14, cc14)
  505. LDF [BO + 53 * SIZE], b6
  506. FMADD3 (aa3, bb8, cc15, cc15)
  507. LDF [BO + 54 * SIZE], b7
  508. FMADD4 (aa4, bb8, cc16, cc16)
  509. LDF [BO + 55 * SIZE], b8
  510. FMADD1 (aa5, bb1, cc01, cc01)
  511. FMADD2 (aa2, bb1, cc02, cc02)
  512. FMADD3 (aa5, bb2, cc03, cc03)
  513. FMADD4 (aa2, bb2, cc04, cc04)
  514. FMADD1 (aa5, bb3, cc05, cc05)
  515. LDF [BO + 64 * SIZE], b1
  516. FMADD2 (aa2, bb3, cc06, cc06)
  517. LDF [BO + 57 * SIZE], b2
  518. FMADD3 (aa5, bb4, cc07, cc07)
  519. LDF [BO + 58 * SIZE], b3
  520. FMADD4 (aa2, bb4, cc08, cc08)
  521. LDF [BO + 59 * SIZE], b4
  522. FMADD1 (aa5, bb5, cc09, cc09)
  523. LDF [AO + 14 * SIZE], a3
  524. FMADD2 (aa2, bb5, cc10, cc10)
  525. LDF [AO + 15 * SIZE], a4
  526. FMADD3 (aa5, bb6, cc11, cc11)
  527. add BO, 64 * SIZE, BO
  528. FMADD4 (aa2, bb6, cc12, cc12)
  529. add AO, 16 * SIZE, AO
  530. FMADD1 (aa5, bb7, cc13, cc13)
  531. LDF [BO - 4 * SIZE], b5
  532. FMADD2 (aa2, bb7, cc14, cc14)
  533. LDF [BO - 3 * SIZE], b6
  534. FMADD3 (aa5, bb8, cc15, cc15)
  535. LDF [BO - 2 * SIZE], b7
  536. FMADD4 (aa2, bb8, cc16, cc16)
  537. LDF [BO - 1 * SIZE], b8
  538. FMADD1 (aa3, bb9, cc01, cc01)
  539. FMADD2 (aa4, bb9, cc02, cc02)
  540. FMADD3 (aa3, bb2, cc03, cc03)
  541. FMADD4 (aa4, bb2, cc04, cc04)
  542. FMADD1 (aa3, bb3, cc05, cc05)
  543. LDF [BO + 8 * SIZE], b9
  544. FMADD2 (aa4, bb3, cc06, cc06)
  545. LDF [BO + 1 * SIZE], b2
  546. FMADD3 (aa3, bb4, cc07, cc07)
  547. LDF [BO + 2 * SIZE], b3
  548. FMADD4 (aa4, bb4, cc08, cc08)
  549. LDF [BO + 3 * SIZE], b4
  550. FMADD1 (aa3, bb5, cc09, cc09)
  551. LDF [AO + 8 * SIZE], a5 /****/
  552. FMADD2 (aa4, bb5, cc10, cc10)
  553. LDF [AO + 1 * SIZE], a2
  554. FMADD3 (aa3, bb6, cc11, cc11)
  555. FMADD4 (aa4, bb6, cc12, cc12)
  556. FMADD1 (aa3, bb7, cc13, cc13)
  557. LDF [BO + 4 * SIZE], b5
  558. FMADD2 (aa4, bb7, cc14, cc14)
  559. LDF [BO + 5 * SIZE], b6
  560. FMADD3 (aa3, bb8, cc15, cc15)
  561. LDF [BO + 6 * SIZE], b7
  562. FMADD4 (aa4, bb8, cc16, cc16)
  563. ble,pn %icc, .LL15
  564. LDF [BO + 7 * SIZE], b8
  565. FMADD1 (aa1, bb1, cc01, cc01)
  566. FMADD2 (aa2, bb1, cc02, cc02)
  567. FMADD3 (aa1, bb2, cc03, cc03)
  568. FMADD4 (aa2, bb2, cc04, cc04)
  569. FMADD1 (aa1, bb3, cc05, cc05)
  570. LDF [BO + 16 * SIZE], b1
  571. FMADD2 (aa2, bb3, cc06, cc06)
  572. LDF [BO + 9 * SIZE], b2
  573. FMADD3 (aa1, bb4, cc07, cc07)
  574. LDF [BO + 10 * SIZE], b3
  575. FMADD4 (aa2, bb4, cc08, cc08)
  576. LDF [BO + 11 * SIZE], b4
  577. FMADD1 (aa1, bb5, cc09, cc09)
  578. LDF [AO + 2 * SIZE], a3
  579. FMADD2 (aa2, bb5, cc10, cc10)
  580. LDF [AO + 3 * SIZE], a4
  581. FMADD3 (aa1, bb6, cc11, cc11)
  582. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  583. FMADD4 (aa2, bb6, cc12, cc12)
  584. nop
  585. FMADD1 (aa1, bb7, cc13, cc13)
  586. LDF [BO + 12 * SIZE], b5
  587. FMADD2 (aa2, bb7, cc14, cc14)
  588. LDF [BO + 13 * SIZE], b6
  589. FMADD3 (aa1, bb8, cc15, cc15)
  590. LDF [BO + 14 * SIZE], b7
  591. FMADD4 (aa2, bb8, cc16, cc16)
  592. LDF [BO + 15 * SIZE], b8
  593. FMADD1 (aa3, bb9, cc01, cc01)
  594. FMADD2 (aa4, bb9, cc02, cc02)
  595. FMADD3 (aa3, bb2, cc03, cc03)
  596. FMADD4 (aa4, bb2, cc04, cc04)
  597. FMADD1 (aa3, bb3, cc05, cc05)
  598. LDF [BO + 24 * SIZE], b9
  599. FMADD2 (aa4, bb3, cc06, cc06)
  600. LDF [BO + 17 * SIZE], b2
  601. FMADD3 (aa3, bb4, cc07, cc07)
  602. LDF [BO + 18 * SIZE], b3
  603. FMADD4 (aa4, bb4, cc08, cc08)
  604. LDF [BO + 19 * SIZE], b4
  605. FMADD1 (aa3, bb5, cc09, cc09)
  606. LDF [AO + 4 * SIZE], a1
  607. FMADD2 (aa4, bb5, cc10, cc10)
  608. LDF [AO + 5 * SIZE], a2
  609. FMADD3 (aa3, bb6, cc11, cc11)
  610. add L, -1, L
  611. FMADD4 (aa4, bb6, cc12, cc12)
  612. nop
  613. FMADD1 (aa3, bb7, cc13, cc13)
  614. LDF [BO + 20 * SIZE], b5
  615. FMADD2 (aa4, bb7, cc14, cc14)
  616. LDF [BO + 21 * SIZE], b6
  617. FMADD3 (aa3, bb8, cc15, cc15)
  618. LDF [BO + 22 * SIZE], b7
  619. FMADD4 (aa4, bb8, cc16, cc16)
  620. LDF [BO + 23 * SIZE], b8
  621. FMADD1 (aa1, bb1, cc01, cc01)
  622. FMADD2 (aa2, bb1, cc02, cc02)
  623. FMADD3 (aa1, bb2, cc03, cc03)
  624. FMADD4 (aa2, bb2, cc04, cc04)
  625. FMADD1 (aa1, bb3, cc05, cc05)
  626. LDF [BO + 32 * SIZE], b1
  627. FMADD2 (aa2, bb3, cc06, cc06)
  628. LDF [BO + 25 * SIZE], b2
  629. FMADD3 (aa1, bb4, cc07, cc07)
  630. LDF [BO + 26 * SIZE], b3
  631. FMADD4 (aa2, bb4, cc08, cc08)
  632. LDF [BO + 27 * SIZE], b4
  633. FMADD1 (aa1, bb5, cc09, cc09)
  634. LDF [AO + 6 * SIZE], a3
  635. FMADD2 (aa2, bb5, cc10, cc10)
  636. LDF [AO + 7 * SIZE], a4
  637. FMADD3 (aa1, bb6, cc11, cc11)
  638. nop
  639. FMADD4 (aa2, bb6, cc12, cc12)
  640. nop
  641. FMADD1 (aa1, bb7, cc13, cc13)
  642. LDF [BO + 28 * SIZE], b5
  643. FMADD2 (aa2, bb7, cc14, cc14)
  644. LDF [BO + 29 * SIZE], b6
  645. FMADD3 (aa1, bb8, cc15, cc15)
  646. LDF [BO + 30 * SIZE], b7
  647. FMADD4 (aa2, bb8, cc16, cc16)
  648. LDF [BO + 31 * SIZE], b8
  649. FMADD1 (aa3, bb9, cc01, cc01)
  650. FMADD2 (aa4, bb9, cc02, cc02)
  651. FMADD3 (aa3, bb2, cc03, cc03)
  652. FMADD4 (aa4, bb2, cc04, cc04)
  653. FMADD1 (aa3, bb3, cc05, cc05)
  654. LDF [BO + 40 * SIZE], b9
  655. FMADD2 (aa4, bb3, cc06, cc06)
  656. LDF [BO + 33 * SIZE], b2
  657. FMADD3 (aa3, bb4, cc07, cc07)
  658. LDF [BO + 34 * SIZE], b3
  659. FMADD4 (aa4, bb4, cc08, cc08)
  660. LDF [BO + 35 * SIZE], b4
  661. FMADD1 (aa3, bb5, cc09, cc09)
  662. LDF [AO + 16 * SIZE], a1 /****/
  663. FMADD2 (aa4, bb5, cc10, cc10)
  664. LDF [AO + 9 * SIZE], a2
  665. FMADD3 (aa3, bb6, cc11, cc11)
  666. nop
  667. FMADD4 (aa4, bb6, cc12, cc12)
  668. nop
  669. FMADD1 (aa3, bb7, cc13, cc13)
  670. LDF [BO + 36 * SIZE], b5
  671. FMADD2 (aa4, bb7, cc14, cc14)
  672. LDF [BO + 37 * SIZE], b6
  673. FMADD3 (aa3, bb8, cc15, cc15)
  674. LDF [BO + 38 * SIZE], b7
  675. FMADD4 (aa4, bb8, cc16, cc16)
  676. LDF [BO + 39 * SIZE], b8
  677. FMADD1 (aa5, bb1, cc01, cc01)
  678. FMADD2 (aa2, bb1, cc02, cc02)
  679. FMADD3 (aa5, bb2, cc03, cc03)
  680. FMADD4 (aa2, bb2, cc04, cc04)
  681. FMADD1 (aa5, bb3, cc05, cc05)
  682. LDF [BO + 48 * SIZE], b1
  683. FMADD2 (aa2, bb3, cc06, cc06)
  684. LDF [BO + 41 * SIZE], b2
  685. FMADD3 (aa5, bb4, cc07, cc07)
  686. LDF [BO + 42 * SIZE], b3
  687. FMADD4 (aa2, bb4, cc08, cc08)
  688. LDF [BO + 43 * SIZE], b4
  689. FMADD1 (aa5, bb5, cc09, cc09)
  690. LDF [AO + 10 * SIZE], a3
  691. FMADD2 (aa2, bb5, cc10, cc10)
  692. LDF [AO + 11 * SIZE], a4
  693. FMADD3 (aa5, bb6, cc11, cc11)
  694. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  695. FMADD4 (aa2, bb6, cc12, cc12)
  696. nop
  697. FMADD1 (aa5, bb7, cc13, cc13)
  698. LDF [BO + 44 * SIZE], b5
  699. FMADD2 (aa2, bb7, cc14, cc14)
  700. LDF [BO + 45 * SIZE], b6
  701. FMADD3 (aa5, bb8, cc15, cc15)
  702. LDF [BO + 46 * SIZE], b7
  703. FMADD4 (aa2, bb8, cc16, cc16)
  704. LDF [BO + 47 * SIZE], b8
  705. FMADD1 (aa3, bb9, cc01, cc01)
  706. FMADD2 (aa4, bb9, cc02, cc02)
  707. FMADD3 (aa3, bb2, cc03, cc03)
  708. FMADD4 (aa4, bb2, cc04, cc04)
  709. FMADD1 (aa3, bb3, cc05, cc05)
  710. LDF [BO + 56 * SIZE], b9
  711. FMADD2 (aa4, bb3, cc06, cc06)
  712. LDF [BO + 49 * SIZE], b2
  713. FMADD3 (aa3, bb4, cc07, cc07)
  714. LDF [BO + 50 * SIZE], b3
  715. FMADD4 (aa4, bb4, cc08, cc08)
  716. LDF [BO + 51 * SIZE], b4
  717. FMADD1 (aa3, bb5, cc09, cc09)
  718. LDF [AO + 12 * SIZE], a5
  719. FMADD2 (aa4, bb5, cc10, cc10)
  720. LDF [AO + 13 * SIZE], a2
  721. FMADD3 (aa3, bb6, cc11, cc11)
  722. cmp L, 0
  723. FMADD4 (aa4, bb6, cc12, cc12)
  724. nop
  725. FMADD1 (aa3, bb7, cc13, cc13)
  726. LDF [BO + 52 * SIZE], b5
  727. FMADD2 (aa4, bb7, cc14, cc14)
  728. LDF [BO + 53 * SIZE], b6
  729. FMADD3 (aa3, bb8, cc15, cc15)
  730. LDF [BO + 54 * SIZE], b7
  731. FMADD4 (aa4, bb8, cc16, cc16)
  732. LDF [BO + 55 * SIZE], b8
  733. FMADD1 (aa5, bb1, cc01, cc01)
  734. FMADD2 (aa2, bb1, cc02, cc02)
  735. FMADD3 (aa5, bb2, cc03, cc03)
  736. FMADD4 (aa2, bb2, cc04, cc04)
  737. FMADD1 (aa5, bb3, cc05, cc05)
  738. LDF [BO + 64 * SIZE], b1
  739. FMADD2 (aa2, bb3, cc06, cc06)
  740. LDF [BO + 57 * SIZE], b2
  741. FMADD3 (aa5, bb4, cc07, cc07)
  742. LDF [BO + 58 * SIZE], b3
  743. FMADD4 (aa2, bb4, cc08, cc08)
  744. LDF [BO + 59 * SIZE], b4
  745. FMADD1 (aa5, bb5, cc09, cc09)
  746. LDF [AO + 14 * SIZE], a3
  747. FMADD2 (aa2, bb5, cc10, cc10)
  748. LDF [AO + 15 * SIZE], a4
  749. FMADD3 (aa5, bb6, cc11, cc11)
  750. add BO, 64 * SIZE, BO
  751. FMADD4 (aa2, bb6, cc12, cc12)
  752. add AO, 16 * SIZE, AO
  753. FMADD1 (aa5, bb7, cc13, cc13)
  754. LDF [BO - 4 * SIZE], b5
  755. FMADD2 (aa2, bb7, cc14, cc14)
  756. LDF [BO - 3 * SIZE], b6
  757. FMADD3 (aa5, bb8, cc15, cc15)
  758. LDF [BO - 2 * SIZE], b7
  759. FMADD4 (aa2, bb8, cc16, cc16)
  760. LDF [BO - 1 * SIZE], b8
  761. FMADD1 (aa3, bb9, cc01, cc01)
  762. FMADD2 (aa4, bb9, cc02, cc02)
  763. FMADD3 (aa3, bb2, cc03, cc03)
  764. FMADD4 (aa4, bb2, cc04, cc04)
  765. FMADD1 (aa3, bb3, cc05, cc05)
  766. LDF [BO + 8 * SIZE], b9
  767. FMADD2 (aa4, bb3, cc06, cc06)
  768. LDF [BO + 1 * SIZE], b2
  769. FMADD3 (aa3, bb4, cc07, cc07)
  770. LDF [BO + 2 * SIZE], b3
  771. FMADD4 (aa4, bb4, cc08, cc08)
  772. LDF [BO + 3 * SIZE], b4
  773. FMADD1 (aa3, bb5, cc09, cc09)
  774. LDF [AO + 8 * SIZE], a5 /****/
  775. FMADD2 (aa4, bb5, cc10, cc10)
  776. LDF [AO + 1 * SIZE], a2
  777. FMADD3 (aa3, bb6, cc11, cc11)
  778. FMADD4 (aa4, bb6, cc12, cc12)
  779. FMADD1 (aa3, bb7, cc13, cc13)
  780. LDF [BO + 4 * SIZE], b5
  781. FMADD2 (aa4, bb7, cc14, cc14)
  782. LDF [BO + 5 * SIZE], b6
  783. FMADD3 (aa3, bb8, cc15, cc15)
  784. LDF [BO + 6 * SIZE], b7
  785. FMADD4 (aa4, bb8, cc16, cc16)
  786. bg,pt %icc, .LL13
  787. LDF [BO + 7 * SIZE], b8
  788. .align 4
  /* .LL15: set up the leftover k iterations for the 4-column panel.
     L = count & 7, where count is KK for LT/RN and K - KK otherwise.
     When L <= 0 the tail loop .LL17 is skipped and control goes to .LL18. */
  789. .LL15:
  790. #if defined(LT) || defined(RN)
  791. and KK, 7, L
  792. #else
  793. sub K, KK, L
  794. and L, 7, L
  795. #endif
  796. cmp L, 0
  /* Annulled conditional branch (",a"): the nop in the delay slot is
     executed only when the branch is taken. */
  797. ble,a,pn %icc, .LL18
  798. nop
  799. .align 4
  /* .LL17: k-loop tail for the 4-column panel, one k step per pass.
     Each pass issues 16 FMADDn operations that combine a1/a2 with b1..b8
     into the accumulators cc01..cc16, then advances AO by 2 * SIZE and BO
     by 8 * SIZE and reloads a1, a2 and b1..b8 for the next pass.
     NOTE(review): FMADD1..FMADD4 and LDF are macros defined outside this
     chunk; presumably FMADDn(a, b, c, d) computes d = +/-(a * b) + c with
     the sign chosen per conjugation mode -- confirm against the macro
     definitions. */
  800. .LL17:
  801. FMADD1 (aa1, bb1, cc01, cc01)
  802. add L, -1, L
  803. FMADD2 (aa2, bb1, cc02, cc02)
  804. nop
  805. FMADD3 (aa1, bb2, cc03, cc03)
  806. LDF [BO + 8 * SIZE], b1
  807. FMADD4 (aa2, bb2, cc04, cc04)
  808. LDF [BO + 9 * SIZE], b2
  809. FMADD1 (aa1, bb3, cc05, cc05)
  /* %icc is set here, well ahead of the bg below; the add instructions that
     follow are the non-cc forms and leave the condition codes untouched. */
  810. cmp L, 0
  811. FMADD2 (aa2, bb3, cc06, cc06)
  812. nop
  813. FMADD3 (aa1, bb4, cc07, cc07)
  814. LDF [BO + 10 * SIZE], b3
  815. FMADD4 (aa2, bb4, cc08, cc08)
  816. LDF [BO + 11 * SIZE], b4
  817. FMADD1 (aa1, bb5, cc09, cc09)
  818. nop
  819. FMADD2 (aa2, bb5, cc10, cc10)
  820. nop
  821. FMADD3 (aa1, bb6, cc11, cc11)
  822. LDF [BO + 12 * SIZE], b5
  823. FMADD4 (aa2, bb6, cc12, cc12)
  824. LDF [BO + 13 * SIZE], b6
  825. FMADD1 (aa1, bb7, cc13, cc13)
  826. add AO, 2 * SIZE, AO
  827. FMADD2 (aa2, bb7, cc14, cc14)
  /* BO is advanced here, so the b7/b8 reloads below use the new base
     (offsets 6 and 7) while b1..b6 above used the old base (8..13). */
  828. add BO, 8 * SIZE, BO
  829. FMADD3 (aa1, bb8, cc15, cc15)
  830. LDF [AO + 0 * SIZE], a1
  831. FMADD4 (aa2, bb8, cc16, cc16)
  832. LDF [AO + 1 * SIZE], a2
  833. LDF [BO + 6 * SIZE], b7
  /* The b8 reload sits in the branch delay slot (no annul bit, so it is
     executed whether or not the branch is taken). */
  834. bg,pt %icc, .LL17
  835. LDF [BO + 7 * SIZE], b8
  836. nop
  837. .align 4
  /* .LL18: finish one tile of the 4-column panel.
     Steps, in order: fold the paired accumulators, re-derive AO/BO for
     LN/RT, subtract the accumulated products from the values held in the
     packed buffer, apply the solve step selected by LN/LT/RN/RT, write the
     result back to the packed buffer and to C1..C4, advance the pointers,
     and close the I loop (.LL12) and the J loop (.LL11); both loop heads
     are outside this chunk.
     NOTE(review): FADD/FSUB/FMUL/FMADD/FMSUB/FNMSUB/LDF/STF are macros
     defined outside this chunk; the arithmetic described in the comments
     below assumes the usual SPARC operand order -- confirm. */
  838. .LL18:
  /* Fold the sixteen accumulators down to eight (one real/imaginary pair
     per column): c01 += c04, c02 += c03, and likewise for each column. */
  839. FADD c01, c04, c01
  840. FADD c02, c03, c02
  841. FADD c05, c08, c05
  842. FADD c06, c07, c06
  843. FADD c09, c12, c09
  844. FADD c10, c11, c10
  845. FADD c13, c16, c13
  846. FADD c14, c15, c14
  /* LN/RT: rebuild AO and BO from AORIG and B using KK - 1 (LN) or
     KK - 4 (RT); the B offset is scaled by an extra factor of 4. */
  847. #if defined(LN) || defined(RT)
  848. #ifdef LN
  849. sub KK, 1, TEMP1
  850. #else
  851. sub KK, 4, TEMP1
  852. #endif
  853. sll TEMP1, ZBASE_SHIFT + 0, TEMP2
  854. sll TEMP1, ZBASE_SHIFT + 2, TEMP1
  855. add AORIG, TEMP2, AO
  856. add B, TEMP1, BO
  857. #endif
  /* Load the eight values currently stored for this tile: from BO for
     LN/LT, from AO for RN/RT. */
  858. #if defined(LN) || defined(LT)
  859. LDF [BO + 0 * SIZE], a1
  860. LDF [BO + 1 * SIZE], a2
  861. LDF [BO + 2 * SIZE], a3
  862. LDF [BO + 3 * SIZE], a4
  863. LDF [BO + 4 * SIZE], b1
  864. LDF [BO + 5 * SIZE], b2
  865. LDF [BO + 6 * SIZE], b3
  866. LDF [BO + 7 * SIZE], b4
  867. #else
  868. LDF [AO + 0 * SIZE], a1
  869. LDF [AO + 1 * SIZE], a2
  870. LDF [AO + 2 * SIZE], a3
  871. LDF [AO + 3 * SIZE], a4
  872. LDF [AO + 4 * SIZE], b1
  873. LDF [AO + 5 * SIZE], b2
  874. LDF [AO + 6 * SIZE], b3
  875. LDF [AO + 7 * SIZE], b4
  876. #endif
  /* Presumably c = loaded value - accumulated sum for each of the eight
     components (first source minus second source). */
  877. FSUB a1, c01, c01
  878. FSUB a2, c02, c02
  879. FSUB a3, c05, c05
  880. FSUB a4, c06, c06
  881. FSUB b1, c09, c09
  882. FSUB b2, c10, c10
  883. FSUB b3, c13, c13
  884. FSUB b4, c14, c14
  /* LN/LT: every column pair (c01,c02), (c05,c06), (c09,c10), (c13,c14) is
     combined with the single pair (a1,a2) read from AO[0..1]; this looks
     like a complex multiply, with CONJ selecting the sign pattern.
     NOTE(review): (a1,a2) is presumably the pre-inverted diagonal element
     of the triangular matrix -- confirm against the packing routine. */
  885. #if defined(LN) || defined(LT)
  886. LDF [AO + 0 * SIZE], a1
  887. LDF [AO + 1 * SIZE], a2
  888. FMUL a1, c01, b1
  889. FMUL a2, c01, b2
  890. FMUL a1, c05, b3
  891. FMUL a2, c05, b4
  892. FMUL a1, c09, b5
  893. FMUL a2, c09, b6
  894. FMUL a1, c13, b7
  895. FMUL a2, c13, b8
  896. #ifndef CONJ
  897. FNMSUB (aa2, cc02, bb1, cc01)
  898. FMADD (aa1, cc02, bb2, cc02)
  899. FNMSUB (aa2, cc06, bb3, cc05)
  900. FMADD (aa1, cc06, bb4, cc06)
  901. FNMSUB (aa2, cc10, bb5, cc09)
  902. FMADD (aa1, cc10, bb6, cc10)
  903. FNMSUB (aa2, cc14, bb7, cc13)
  904. FMADD (aa1, cc14, bb8, cc14)
  905. #else
  906. FMADD (aa2, cc02, bb1, cc01)
  907. FMSUB (aa1, cc02, bb2, cc02)
  908. FMADD (aa2, cc06, bb3, cc05)
  909. FMSUB (aa1, cc06, bb4, cc06)
  910. FMADD (aa2, cc10, bb5, cc09)
  911. FMSUB (aa1, cc10, bb6, cc10)
  912. FMADD (aa2, cc14, bb7, cc13)
  913. FMSUB (aa1, cc14, bb8, cc14)
  914. #endif
  915. #endif
  /* RN: substitution across the four columns in ascending order.
     Column 1 is scaled by BO[0..1], then eliminated from columns 2..4 with
     BO[2..7]; the same pattern repeats for column 2 (BO[10..15]),
     column 3 (BO[20..23]) and column 4 (BO[30..31]). */
  916. #ifdef RN
  917. LDF [BO + 0 * SIZE], b1
  918. LDF [BO + 1 * SIZE], b2
  919. LDF [BO + 2 * SIZE], b3
  920. LDF [BO + 3 * SIZE], b4
  921. LDF [BO + 4 * SIZE], b5
  922. LDF [BO + 5 * SIZE], b6
  923. LDF [BO + 6 * SIZE], b7
  924. LDF [BO + 7 * SIZE], b8
  925. FMUL b1, c01, a1
  926. FMUL b2, c01, a2
  927. #ifndef CONJ
  928. FNMSUB (bb2, cc02, aa1, cc01)
  929. FMADD (bb1, cc02, aa2, cc02)
  930. #else
  931. FMADD (bb2, cc02, aa1, cc01)
  932. FMSUB (bb1, cc02, aa2, cc02)
  933. #endif
  934. FNMSUB (bb3, cc01, cc05, cc05)
  935. FNMSUB (bb3, cc02, cc06, cc06)
  936. FNMSUB (bb5, cc01, cc09, cc09)
  937. FNMSUB (bb5, cc02, cc10, cc10)
  938. FNMSUB (bb7, cc01, cc13, cc13)
  939. FNMSUB (bb7, cc02, cc14, cc14)
  940. #ifndef CONJ
  941. FMADD (bb4, cc02, cc05, cc05)
  942. FNMSUB (bb4, cc01, cc06, cc06)
  943. FMADD (bb6, cc02, cc09, cc09)
  944. FNMSUB (bb6, cc01, cc10, cc10)
  945. FMADD (bb8, cc02, cc13, cc13)
  946. FNMSUB (bb8, cc01, cc14, cc14)
  947. #else
  948. FNMSUB (bb4, cc02, cc05, cc05)
  949. FMADD (bb4, cc01, cc06, cc06)
  950. FNMSUB (bb6, cc02, cc09, cc09)
  951. FMADD (bb6, cc01, cc10, cc10)
  952. FNMSUB (bb8, cc02, cc13, cc13)
  953. FMADD (bb8, cc01, cc14, cc14)
  954. #endif
  /* Column 2: scale by BO[10..11], eliminate from columns 3 and 4. */
  955. LDF [BO + 10 * SIZE], b1
  956. LDF [BO + 11 * SIZE], b2
  957. LDF [BO + 12 * SIZE], b3
  958. LDF [BO + 13 * SIZE], b4
  959. LDF [BO + 14 * SIZE], b5
  960. LDF [BO + 15 * SIZE], b6
  961. FMUL b1, c05, a1
  962. FMUL b2, c05, a2
  963. #ifndef CONJ
  964. FNMSUB (bb2, cc06, aa1, cc05)
  965. FMADD (bb1, cc06, aa2, cc06)
  966. #else
  967. FMADD (bb2, cc06, aa1, cc05)
  968. FMSUB (bb1, cc06, aa2, cc06)
  969. #endif
  970. FNMSUB (bb3, cc05, cc09, cc09)
  971. FNMSUB (bb3, cc06, cc10, cc10)
  972. FNMSUB (bb5, cc05, cc13, cc13)
  973. FNMSUB (bb5, cc06, cc14, cc14)
  974. #ifndef CONJ
  975. FMADD (bb4, cc06, cc09, cc09)
  976. FNMSUB (bb4, cc05, cc10, cc10)
  977. FMADD (bb6, cc06, cc13, cc13)
  978. FNMSUB (bb6, cc05, cc14, cc14)
  979. #else
  980. FNMSUB (bb4, cc06, cc09, cc09)
  981. FMADD (bb4, cc05, cc10, cc10)
  982. FNMSUB (bb6, cc06, cc13, cc13)
  983. FMADD (bb6, cc05, cc14, cc14)
  984. #endif
  /* Column 3: scale by BO[20..21], eliminate from column 4. */
  985. LDF [BO + 20 * SIZE], b1
  986. LDF [BO + 21 * SIZE], b2
  987. LDF [BO + 22 * SIZE], b3
  988. LDF [BO + 23 * SIZE], b4
  989. FMUL b1, c09, a1
  990. FMUL b2, c09, a2
  991. #ifndef CONJ
  992. FNMSUB (bb2, cc10, aa1, cc09)
  993. FMADD (bb1, cc10, aa2, cc10)
  994. #else
  995. FMADD (bb2, cc10, aa1, cc09)
  996. FMSUB (bb1, cc10, aa2, cc10)
  997. #endif
  998. FNMSUB (bb3, cc09, cc13, cc13)
  999. FNMSUB (bb3, cc10, cc14, cc14)
  1000. #ifndef CONJ
  1001. FMADD (bb4, cc10, cc13, cc13)
  1002. FNMSUB (bb4, cc09, cc14, cc14)
  1003. #else
  1004. FNMSUB (bb4, cc10, cc13, cc13)
  1005. FMADD (bb4, cc09, cc14, cc14)
  1006. #endif
  /* Column 4: scale by BO[30..31]. */
  1007. LDF [BO + 30 * SIZE], b1
  1008. LDF [BO + 31 * SIZE], b2
  1009. FMUL b1, c13, a1
  1010. FMUL b2, c13, a2
  1011. #ifndef CONJ
  1012. FNMSUB (bb2, cc14, aa1, cc13)
  1013. FMADD (bb1, cc14, aa2, cc14)
  1014. #else
  1015. FMADD (bb2, cc14, aa1, cc13)
  1016. FMSUB (bb1, cc14, aa2, cc14)
  1017. #endif
  1018. #endif
  /* RT: the same substitution in descending column order.
     Column 4 is scaled by BO[30..31] and eliminated from columns 3..1 with
     BO[28..29], BO[26..27], BO[24..25]; then column 3 (BO[20..21] and
     BO[18..19], BO[16..17]), column 2 (BO[10..11] and BO[8..9]) and
     column 1 (BO[0..1]). */
  1019. #ifdef RT
  1020. LDF [BO + 30 * SIZE], b1
  1021. LDF [BO + 31 * SIZE], b2
  1022. LDF [BO + 28 * SIZE], b3
  1023. LDF [BO + 29 * SIZE], b4
  1024. LDF [BO + 26 * SIZE], b5
  1025. LDF [BO + 27 * SIZE], b6
  1026. LDF [BO + 24 * SIZE], b7
  1027. LDF [BO + 25 * SIZE], b8
  1028. FMUL b1, c13, a1
  1029. FMUL b2, c13, a2
  1030. #ifndef CONJ
  1031. FNMSUB (bb2, cc14, aa1, cc13)
  1032. FMADD (bb1, cc14, aa2, cc14)
  1033. #else
  1034. FMADD (bb2, cc14, aa1, cc13)
  1035. FMSUB (bb1, cc14, aa2, cc14)
  1036. #endif
  1037. FNMSUB (bb3, cc13, cc09, cc09)
  1038. FNMSUB (bb3, cc14, cc10, cc10)
  1039. FNMSUB (bb5, cc13, cc05, cc05)
  1040. FNMSUB (bb5, cc14, cc06, cc06)
  1041. FNMSUB (bb7, cc13, cc01, cc01)
  1042. FNMSUB (bb7, cc14, cc02, cc02)
  1043. #ifndef CONJ
  1044. FMADD (bb4, cc14, cc09, cc09)
  1045. FNMSUB (bb4, cc13, cc10, cc10)
  1046. FMADD (bb6, cc14, cc05, cc05)
  1047. FNMSUB (bb6, cc13, cc06, cc06)
  1048. FMADD (bb8, cc14, cc01, cc01)
  1049. FNMSUB (bb8, cc13, cc02, cc02)
  1050. #else
  1051. FNMSUB (bb4, cc14, cc09, cc09)
  1052. FMADD (bb4, cc13, cc10, cc10)
  1053. FNMSUB (bb6, cc14, cc05, cc05)
  1054. FMADD (bb6, cc13, cc06, cc06)
  1055. FNMSUB (bb8, cc14, cc01, cc01)
  1056. FMADD (bb8, cc13, cc02, cc02)
  1057. #endif
  /* Column 3: scale by BO[20..21], eliminate from columns 2 and 1. */
  1058. LDF [BO + 20 * SIZE], b1
  1059. LDF [BO + 21 * SIZE], b2
  1060. LDF [BO + 18 * SIZE], b3
  1061. LDF [BO + 19 * SIZE], b4
  1062. LDF [BO + 16 * SIZE], b5
  1063. LDF [BO + 17 * SIZE], b6
  1064. FMUL b1, c09, a1
  1065. FMUL b2, c09, a2
  1066. #ifndef CONJ
  1067. FNMSUB (bb2, cc10, aa1, cc09)
  1068. FMADD (bb1, cc10, aa2, cc10)
  1069. #else
  1070. FMADD (bb2, cc10, aa1, cc09)
  1071. FMSUB (bb1, cc10, aa2, cc10)
  1072. #endif
  1073. FNMSUB (bb3, cc09, cc05, cc05)
  1074. FNMSUB (bb3, cc10, cc06, cc06)
  1075. FNMSUB (bb5, cc09, cc01, cc01)
  1076. FNMSUB (bb5, cc10, cc02, cc02)
  1077. #ifndef CONJ
  1078. FMADD (bb4, cc10, cc05, cc05)
  1079. FNMSUB (bb4, cc09, cc06, cc06)
  1080. FMADD (bb6, cc10, cc01, cc01)
  1081. FNMSUB (bb6, cc09, cc02, cc02)
  1082. #else
  1083. FNMSUB (bb4, cc10, cc05, cc05)
  1084. FMADD (bb4, cc09, cc06, cc06)
  1085. FNMSUB (bb6, cc10, cc01, cc01)
  1086. FMADD (bb6, cc09, cc02, cc02)
  1087. #endif
  /* Column 2: scale by BO[10..11], eliminate from column 1. */
  1088. LDF [BO + 10 * SIZE], b1
  1089. LDF [BO + 11 * SIZE], b2
  1090. LDF [BO + 8 * SIZE], b3
  1091. LDF [BO + 9 * SIZE], b4
  1092. FMUL b1, c05, a1
  1093. FMUL b2, c05, a2
  1094. #ifndef CONJ
  1095. FNMSUB (bb2, cc06, aa1, cc05)
  1096. FMADD (bb1, cc06, aa2, cc06)
  1097. #else
  1098. FMADD (bb2, cc06, aa1, cc05)
  1099. FMSUB (bb1, cc06, aa2, cc06)
  1100. #endif
  1101. FNMSUB (bb3, cc05, cc01, cc01)
  1102. FNMSUB (bb3, cc06, cc02, cc02)
  1103. #ifndef CONJ
  1104. FMADD (bb4, cc06, cc01, cc01)
  1105. FNMSUB (bb4, cc05, cc02, cc02)
  1106. #else
  1107. FNMSUB (bb4, cc06, cc01, cc01)
  1108. FMADD (bb4, cc05, cc02, cc02)
  1109. #endif
  /* Column 1: scale by BO[0..1]. */
  1110. LDF [BO + 0 * SIZE], b1
  1111. LDF [BO + 1 * SIZE], b2
  1112. FMUL b1, c01, a1
  1113. FMUL b2, c01, a2
  1114. #ifndef CONJ
  1115. FNMSUB (bb2, cc02, aa1, cc01)
  1116. FMADD (bb1, cc02, aa2, cc02)
  1117. #else
  1118. FMADD (bb2, cc02, aa1, cc01)
  1119. FMSUB (bb1, cc02, aa2, cc02)
  1120. #endif
  1121. #endif
  /* LN moves the C pointers back by one complex element before storing;
     every other mode moves them forward after storing (see below). */
  1122. #ifdef LN
  1123. add C1, -2 * SIZE, C1
  1124. add C2, -2 * SIZE, C2
  1125. add C3, -2 * SIZE, C3
  1126. add C4, -2 * SIZE, C4
  1127. #endif
  /* Write the solved values back to the buffer they were loaded from
     (BO for LN/LT, AO for RN/RT). */
  1128. #if defined(LN) || defined(LT)
  1129. STF c01, [BO + 0 * SIZE]
  1130. STF c02, [BO + 1 * SIZE]
  1131. STF c05, [BO + 2 * SIZE]
  1132. STF c06, [BO + 3 * SIZE]
  1133. STF c09, [BO + 4 * SIZE]
  1134. STF c10, [BO + 5 * SIZE]
  1135. STF c13, [BO + 6 * SIZE]
  1136. STF c14, [BO + 7 * SIZE]
  1137. #else
  1138. STF c01, [AO + 0 * SIZE]
  1139. STF c02, [AO + 1 * SIZE]
  1140. STF c05, [AO + 2 * SIZE]
  1141. STF c06, [AO + 3 * SIZE]
  1142. STF c09, [AO + 4 * SIZE]
  1143. STF c10, [AO + 5 * SIZE]
  1144. STF c13, [AO + 6 * SIZE]
  1145. STF c14, [AO + 7 * SIZE]
  1146. #endif
  /* Store one complex element into each of the four output columns. */
  1147. STF c01, [C1 + 0 * SIZE]
  1148. STF c02, [C1 + 1 * SIZE]
  1149. STF c05, [C2 + 0 * SIZE]
  1150. STF c06, [C2 + 1 * SIZE]
  1151. STF c09, [C3 + 0 * SIZE]
  1152. STF c10, [C3 + 1 * SIZE]
  1153. STF c13, [C4 + 0 * SIZE]
  1154. STF c14, [C4 + 1 * SIZE]
  1155. #ifndef LN
  1156. add C1, 2 * SIZE, C1
  1157. add C2, 2 * SIZE, C2
  1158. add C3, 2 * SIZE, C3
  1159. add C4, 2 * SIZE, C4
  1160. #endif
  /* RT: step AORIG forward by K elements (K << ZBASE_SHIFT bytes). */
  1161. #ifdef RT
  1162. sll K, ZBASE_SHIFT, TEMP1
  1163. add AORIG, TEMP1, AORIG
  1164. #endif
  /* LT/RN: skip the remaining K - KK entries of A (1 wide) and B (4 wide). */
  1165. #if defined(LT) || defined(RN)
  1166. sub K, KK, TEMP1
  1167. sll TEMP1, ZBASE_SHIFT + 0, TEMP2
  1168. sll TEMP1, ZBASE_SHIFT + 2, TEMP1
  1169. add AO, TEMP2, AO
  1170. add BO, TEMP1, BO
  1171. #endif
  1172. #ifdef LT
  1173. add KK, 1, KK
  1174. #endif
  1175. #ifdef LN
  1176. sub KK, 1, KK
  1177. #endif
  /* I loop: repeat from .LL12 while I > 0. */
  1178. add I, -1, I
  1179. cmp I, 0
  1180. bg,pt %icc, .LL12
  1181. nop
  /* End of the I loop: move B past this 4-column panel and adjust KK for
     the right-side modes. */
  1182. #ifdef LN
  1183. sll K, ZBASE_SHIFT + 2, TEMP1
  1184. add B, TEMP1, B
  1185. #endif
  1186. #if defined(LT) || defined(RN)
  1187. mov BO, B
  1188. #endif
  1189. #ifdef RN
  1190. add KK, 4, KK
  1191. #endif
  1192. #ifdef RT
  1193. sub KK, 4, KK
  1194. #endif
  /* J loop: repeat from .LL11 while J > 0. */
  1195. add J, -1, J
  1196. cmp J, 0
  1197. bg,pt %icc, .LL11
  1198. nop
  1199. .align 4
  /* .LL20: handle a 2-column panel when bit 1 of N is set (N & 2);
     otherwise skip to .LL30. Sets up C1/C2, advances (or, for RT, rewinds)
     C, initialises KK for LN/LT, picks the A base pointer, and loads the
     row counter I from M. */
  1200. .LL20:
  1201. and N, 2, J
  1202. cmp J, 0
  1203. ble,pn %icc, .LL30
  1204. nop
  /* RT: step B back by K * 2 elements before using it. */
  1205. #ifdef RT
  1206. sll K, ZBASE_SHIFT + 1, TEMP1
  1207. sub B, TEMP1, B
  1208. #endif
  /* Non-RT: C1 = C, C2 = C + LDC, then C += 2 * LDC.
     RT:     C2 = C - LDC, C1 = C - 2 * LDC, then C -= 2 * LDC. */
  1209. #ifndef RT
  1210. mov C, C1
  1211. add C, LDC, C2
  1212. add C2, LDC, C
  1213. #else
  1214. sub C, LDC, C2
  1215. sub C2, LDC, C1
  1216. sub C2, LDC, C
  1217. #endif
  1218. #ifdef LN
  1219. add M, OFFSET, KK
  1220. #endif
  1221. #ifdef LT
  1222. mov OFFSET, KK
  1223. #endif
  1224. #if defined(LN) || defined(RT)
  1225. mov A, AORIG
  1226. #else
  1227. mov A, AO
  1228. #endif
  1229. mov M, I
  1230. .align 4
  /* .LL22: head of the I loop for the 2-column panel.
     Positions AO/BO, preloads a1, a2 and b1..b9, clears the accumulators
     cc01..cc08, prefetches the two output locations, and computes the
     unrolled trip count L = count >> 2 (count is KK for LT/RN, K - KK
     otherwise). Falls into .LL23 when L > 0, else branches to .LL25. */
  1231. .LL22:
  1232. #if defined(LT) || defined(RN)
  1233. mov B, BO
  1234. #else
  /* LN: step AORIG back by K elements first. */
  1235. #ifdef LN
  1236. sll K, ZBASE_SHIFT, TEMP1
  1237. sub AORIG, TEMP1, AORIG
  1238. #endif
  /* LN/RT: AO = AORIG + KK elements, BO = B + 2 * KK elements. */
  1239. sll KK, ZBASE_SHIFT + 0, TEMP1
  1240. sll KK, ZBASE_SHIFT + 1, TEMP2
  1241. add AORIG, TEMP1, AO
  1242. add B, TEMP2, BO
  1243. #endif
  1244. LDF [AO + 0 * SIZE], a1
  1245. LDF [AO + 1 * SIZE], a2
  1246. LDF [BO + 0 * SIZE], b1
  1247. LDF [BO + 1 * SIZE], b2
  1248. LDF [BO + 2 * SIZE], b3
  1249. LDF [BO + 3 * SIZE], b4
  1250. LDF [BO + 4 * SIZE], b5
  1251. FCLR (cc01)
  1252. LDF [BO + 5 * SIZE], b6
  1253. FCLR (cc02)
  1254. LDF [BO + 6 * SIZE], b7
  1255. FCLR (cc03)
  1256. LDF [BO + 7 * SIZE], b8
  1257. FCLR (cc04)
  /* b9 holds the B value that .LL23 uses in its third k step. */
  1258. LDF [BO + 8 * SIZE], b9
  1259. FCLR (cc05)
  1260. prefetch [C1 + 2 * SIZE], 3
  1261. FCLR (cc06)
  1262. prefetch [C2 + 2 * SIZE], 3
  1263. FCLR (cc07)
  1264. #if defined(LT) || defined(RN)
  1265. sra KK, 2, L
  1266. #else
  1267. sub K, KK, L
  1268. sra L, 2, L
  1269. #endif
  1270. cmp L, 0
  /* No annul bit: the FCLR in the delay slot runs on both paths, so cc08
     is cleared whether or not the unrolled loop is entered. */
  1271. ble,pn %icc, .LL25
  1272. FCLR (cc08)
  1273. .align 4
  /* .LL23: main k-loop for the 2-column panel, four k steps per pass.
     Steps 1 and 3 use a1/a2, steps 2 and 4 use a3/a4; the first B operand
     of the pass alternates between b1 (step 1) and b9 (step 3). Per pass
     AO advances by 8 * SIZE and BO by 16 * SIZE, and all operands are
     reloaded in between the FMADDn operations for the following pass.
     NOTE(review): FMADD1..FMADD4 are macros defined outside this chunk. */
  1274. .LL23:
  /* k step 1: a1/a2 with b1..b4. */
  1275. FMADD1 (aa1, bb1, cc01, cc01)
  1276. LDF [AO + 2 * SIZE], a3
  1277. FMADD2 (aa2, bb1, cc02, cc02)
  1278. LDF [AO + 3 * SIZE], a4
  1279. FMADD3 (aa1, bb2, cc03, cc03)
  /* b1 is reloaded for step 1 of the NEXT pass (old BO + 16 is the new
     BO + 0 after the add below). */
  1280. LDF [BO + 16 * SIZE], b1
  1281. FMADD4 (aa2, bb2, cc04, cc04)
  1282. LDF [BO + 9 * SIZE], b2
  1283. FMADD1 (aa1, bb3, cc05, cc05)
  1284. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1285. FMADD2 (aa2, bb3, cc06, cc06)
  1286. add L, -1, L
  1287. FMADD3 (aa1, bb4, cc07, cc07)
  1288. LDF [BO + 10 * SIZE], b3
  1289. FMADD4 (aa2, bb4, cc08, cc08)
  1290. LDF [BO + 11 * SIZE], b4
  /* k step 2: a3/a4 with b5..b8. */
  1291. FMADD1 (aa3, bb5, cc01, cc01)
  1292. LDF [AO + 4 * SIZE], a1
  1293. FMADD2 (aa4, bb5, cc02, cc02)
  1294. LDF [AO + 5 * SIZE], a2
  1295. FMADD3 (aa3, bb6, cc03, cc03)
  1296. LDF [BO + 12 * SIZE], b5
  1297. FMADD4 (aa4, bb6, cc04, cc04)
  1298. LDF [BO + 13 * SIZE], b6
  1299. FMADD1 (aa3, bb7, cc05, cc05)
  /* %icc is set here for the bg at the bottom; the intervening add
     instructions are the non-cc forms. */
  1300. cmp L, 0
  1301. FMADD2 (aa4, bb7, cc06, cc06)
  1302. add AO, 8 * SIZE, AO
  1303. FMADD3 (aa3, bb8, cc07, cc07)
  1304. LDF [BO + 14 * SIZE], b7
  1305. FMADD4 (aa4, bb8, cc08, cc08)
  1306. LDF [BO + 15 * SIZE], b8
  /* k step 3: a1/a2 with b9, b2, b3, b4. AO has already been advanced, so
     a3/a4 are fetched with negative offsets. */
  1307. FMADD1 (aa1, bb9, cc01, cc01)
  1308. LDF [AO - 2 * SIZE], a3
  1309. FMADD2 (aa2, bb9, cc02, cc02)
  1310. LDF [AO - 1 * SIZE], a4
  1311. FMADD3 (aa1, bb2, cc03, cc03)
  1312. LDF [BO + 24 * SIZE], b9
  1313. FMADD4 (aa2, bb2, cc04, cc04)
  1314. LDF [BO + 17 * SIZE], b2
  1315. FMADD1 (aa1, bb3, cc05, cc05)
  1316. add BO, 16 * SIZE, BO
  1317. FMADD2 (aa2, bb3, cc06, cc06)
  1318. nop
  1319. FMADD3 (aa1, bb4, cc07, cc07)
  1320. LDF [BO + 2 * SIZE], b3
  1321. FMADD4 (aa2, bb4, cc08, cc08)
  1322. LDF [BO + 3 * SIZE], b4
  /* k step 4: a3/a4 with b5..b8; reloads use the advanced AO/BO. */
  1323. FMADD1 (aa3, bb5, cc01, cc01)
  1324. LDF [AO + 0 * SIZE], a1
  1325. FMADD2 (aa4, bb5, cc02, cc02)
  1326. LDF [AO + 1 * SIZE], a2
  1327. FMADD3 (aa3, bb6, cc03, cc03)
  1328. LDF [BO + 4 * SIZE], b5
  1329. FMADD4 (aa4, bb6, cc04, cc04)
  1330. LDF [BO + 5 * SIZE], b6
  1331. FMADD1 (aa3, bb7, cc05, cc05)
  1332. nop
  1333. FMADD2 (aa4, bb7, cc06, cc06)
  1334. LDF [BO + 6 * SIZE], b7
  1335. FMADD3 (aa3, bb8, cc07, cc07)
  1336. FMADD4 (aa4, bb8, cc08, cc08)
  /* The b8 reload sits in the branch delay slot. */
  1337. bg,pt %icc, .LL23
  1338. LDF [BO + 7 * SIZE], b8
  1339. .align 4
  /* .LL25: set up the leftover k iterations for the 2-column panel.
     L = count & 3, where count is KK for LT/RN and K - KK otherwise.
     When L <= 0 the tail loop .LL27 is skipped and control goes to .LL28. */
  1340. .LL25:
  1341. #if defined(LT) || defined(RN)
  1342. and KK, 3, L
  1343. #else
  1344. sub K, KK, L
  1345. and L, 3, L
  1346. #endif
  1347. cmp L, 0
  /* Annulled conditional branch: the delay-slot nop runs only when taken. */
  1348. ble,a,pn %icc, .LL28
  1349. nop
  1350. .align 4
  /* .LL27: k-loop tail for the 2-column panel, one k step per pass.
     Combines a1/a2 with b1..b4 into cc01..cc08, advances AO by 2 * SIZE and
     BO by 4 * SIZE, and reloads a1, a2 and b1..b4 for the next pass. */
  1351. .LL27:
  1352. FMADD1 (aa1, bb1, cc01, cc01)
  1353. add L, -1, L
  1354. FMADD2 (aa2, bb1, cc02, cc02)
  1355. LDF [BO + 4 * SIZE], b1
  1356. FMADD3 (aa1, bb2, cc03, cc03)
  1357. add AO, 2 * SIZE, AO
  1358. FMADD4 (aa2, bb2, cc04, cc04)
  1359. LDF [BO + 5 * SIZE], b2
  1360. FMADD1 (aa1, bb3, cc05, cc05)
  1361. cmp L, 0
  1362. FMADD2 (aa2, bb3, cc06, cc06)
  1363. LDF [BO + 6 * SIZE], b3
  1364. FMADD3 (aa1, bb4, cc07, cc07)
  1365. LDF [AO + 0 * SIZE], a1
  1366. FMADD4 (aa2, bb4, cc08, cc08)
  1367. LDF [AO + 1 * SIZE], a2
  1368. LDF [BO + 7 * SIZE], b4
  /* BO is advanced in the branch delay slot, i.e. after all four B reloads
     above have used the old base. */
  1369. bg,pt %icc, .LL27
  1370. add BO, 4 * SIZE, BO
  1371. .align 4
  /* .LL28: finish one tile of the 2-column panel.
     Same sequence as the 4-column case, on two columns: fold the paired
     accumulators, re-derive AO/BO for LN/RT, subtract the accumulated
     products from the stored values, apply the LN/LT, RN or RT solve step,
     write back to the packed buffer and to C1/C2, advance the pointers and
     close the I loop (.LL22).
     NOTE(review): the arithmetic macros are defined outside this chunk;
     the operand-order reading in the comments below should be confirmed. */
  1372. .LL28:
  /* Fold eight accumulators down to four: c01 += c04, c02 += c03, etc. */
  1373. FADD c01, c04, c01
  1374. FADD c02, c03, c02
  1375. FADD c05, c08, c05
  1376. FADD c06, c07, c06
  /* LN/RT: rebuild AO and BO from AORIG and B using KK - 1 (LN) or
     KK - 2 (RT); the B offset is scaled by an extra factor of 2. */
  1377. #if defined(LN) || defined(RT)
  1378. #ifdef LN
  1379. sub KK, 1, TEMP1
  1380. #else
  1381. sub KK, 2, TEMP1
  1382. #endif
  1383. sll TEMP1, ZBASE_SHIFT + 0, TEMP2
  1384. sll TEMP1, ZBASE_SHIFT + 1, TEMP1
  1385. add AORIG, TEMP2, AO
  1386. add B, TEMP1, BO
  1387. #endif
  /* Load the four stored values of this tile (BO for LN/LT, AO for RN/RT). */
  1388. #if defined(LN) || defined(LT)
  1389. LDF [BO + 0 * SIZE], a1
  1390. LDF [BO + 1 * SIZE], a2
  1391. LDF [BO + 2 * SIZE], a3
  1392. LDF [BO + 3 * SIZE], a4
  1393. #else
  1394. LDF [AO + 0 * SIZE], a1
  1395. LDF [AO + 1 * SIZE], a2
  1396. LDF [AO + 2 * SIZE], a3
  1397. LDF [AO + 3 * SIZE], a4
  1398. #endif
  /* Presumably c = loaded value - accumulated sum. */
  1399. FSUB a1, c01, c01
  1400. FSUB a2, c02, c02
  1401. FSUB a3, c05, c05
  1402. FSUB a4, c06, c06
  /* LN/LT: combine both column pairs with the pair (a1,a2) read from
     AO[0..1]; CONJ selects the sign pattern.
     NOTE(review): presumably a complex multiply by the pre-inverted
     diagonal element -- confirm. */
  1403. #if defined(LN) || defined(LT)
  1404. LDF [AO + 0 * SIZE], a1
  1405. LDF [AO + 1 * SIZE], a2
  1406. FMUL a1, c01, b1
  1407. FMUL a2, c01, b2
  1408. FMUL a1, c05, b3
  1409. FMUL a2, c05, b4
  1410. #ifndef CONJ
  1411. FNMSUB (aa2, cc02, bb1, cc01)
  1412. FMADD (aa1, cc02, bb2, cc02)
  1413. FNMSUB (aa2, cc06, bb3, cc05)
  1414. FMADD (aa1, cc06, bb4, cc06)
  1415. #else
  1416. FMADD (aa2, cc02, bb1, cc01)
  1417. FMSUB (aa1, cc02, bb2, cc02)
  1418. FMADD (aa2, cc06, bb3, cc05)
  1419. FMSUB (aa1, cc06, bb4, cc06)
  1420. #endif
  1421. #endif
  /* RN: scale column 1 by BO[0..1], eliminate it from column 2 with
     BO[2..3], then scale column 2 by BO[6..7]. */
  1422. #ifdef RN
  1423. LDF [BO + 0 * SIZE], b1
  1424. LDF [BO + 1 * SIZE], b2
  1425. LDF [BO + 2 * SIZE], b3
  1426. LDF [BO + 3 * SIZE], b4
  1427. FMUL b1, c01, a1
  1428. FMUL b2, c01, a2
  1429. #ifndef CONJ
  1430. FNMSUB (bb2, cc02, aa1, cc01)
  1431. FMADD (bb1, cc02, aa2, cc02)
  1432. #else
  1433. FMADD (bb2, cc02, aa1, cc01)
  1434. FMSUB (bb1, cc02, aa2, cc02)
  1435. #endif
  1436. FNMSUB (bb3, cc01, cc05, cc05)
  1437. FNMSUB (bb3, cc02, cc06, cc06)
  1438. #ifndef CONJ
  1439. FMADD (bb4, cc02, cc05, cc05)
  1440. FNMSUB (bb4, cc01, cc06, cc06)
  1441. #else
  1442. FNMSUB (bb4, cc02, cc05, cc05)
  1443. FMADD (bb4, cc01, cc06, cc06)
  1444. #endif
  1445. LDF [BO + 6 * SIZE], b1
  1446. LDF [BO + 7 * SIZE], b2
  1447. FMUL b1, c05, a1
  1448. FMUL b2, c05, a2
  1449. #ifndef CONJ
  1450. FNMSUB (bb2, cc06, aa1, cc05)
  1451. FMADD (bb1, cc06, aa2, cc06)
  1452. #else
  1453. FMADD (bb2, cc06, aa1, cc05)
  1454. FMSUB (bb1, cc06, aa2, cc06)
  1455. #endif
  1456. #endif
  /* RT: scale column 2 by BO[6..7], eliminate it from column 1 with
     BO[4..5], then scale column 1 by BO[0..1]. */
  1457. #ifdef RT
  1458. LDF [BO + 6 * SIZE], b1
  1459. LDF [BO + 7 * SIZE], b2
  1460. LDF [BO + 4 * SIZE], b3
  1461. LDF [BO + 5 * SIZE], b4
  1462. FMUL b1, c05, a1
  1463. FMUL b2, c05, a2
  1464. #ifndef CONJ
  1465. FNMSUB (bb2, cc06, aa1, cc05)
  1466. FMADD (bb1, cc06, aa2, cc06)
  1467. #else
  1468. FMADD (bb2, cc06, aa1, cc05)
  1469. FMSUB (bb1, cc06, aa2, cc06)
  1470. #endif
  1471. FNMSUB (bb3, cc05, cc01, cc01)
  1472. FNMSUB (bb3, cc06, cc02, cc02)
  1473. #ifndef CONJ
  1474. FMADD (bb4, cc06, cc01, cc01)
  1475. FNMSUB (bb4, cc05, cc02, cc02)
  1476. #else
  1477. FNMSUB (bb4, cc06, cc01, cc01)
  1478. FMADD (bb4, cc05, cc02, cc02)
  1479. #endif
  1480. LDF [BO + 0 * SIZE], b1
  1481. LDF [BO + 1 * SIZE], b2
  1482. FMUL b1, c01, a1
  1483. FMUL b2, c01, a2
  1484. #ifndef CONJ
  1485. FNMSUB (bb2, cc02, aa1, cc01)
  1486. FMADD (bb1, cc02, aa2, cc02)
  1487. #else
  1488. FMADD (bb2, cc02, aa1, cc01)
  1489. FMSUB (bb1, cc02, aa2, cc02)
  1490. #endif
  1491. #endif
  /* LN moves the C pointers back before the store; other modes move them
     forward after it. */
  1492. #ifdef LN
  1493. add C1, -2 * SIZE, C1
  1494. add C2, -2 * SIZE, C2
  1495. #endif
  /* Write the solved values back to the buffer they were loaded from. */
  1496. #if defined(LN) || defined(LT)
  1497. STF c01, [BO + 0 * SIZE]
  1498. STF c02, [BO + 1 * SIZE]
  1499. STF c05, [BO + 2 * SIZE]
  1500. STF c06, [BO + 3 * SIZE]
  1501. #else
  1502. STF c01, [AO + 0 * SIZE]
  1503. STF c02, [AO + 1 * SIZE]
  1504. STF c05, [AO + 2 * SIZE]
  1505. STF c06, [AO + 3 * SIZE]
  1506. #endif
  /* Store one complex element into each of the two output columns. */
  1507. STF c01, [C1 + 0 * SIZE]
  1508. STF c02, [C1 + 1 * SIZE]
  1509. STF c05, [C2 + 0 * SIZE]
  1510. STF c06, [C2 + 1 * SIZE]
  1511. #ifndef LN
  1512. add C1, 2 * SIZE, C1
  1513. add C2, 2 * SIZE, C2
  1514. #endif
  /* RT: step AORIG forward by K elements. */
  1515. #ifdef RT
  1516. sll K, ZBASE_SHIFT, TEMP1
  1517. add AORIG, TEMP1, AORIG
  1518. #endif
  /* LT/RN: skip the remaining K - KK entries of A (1 wide) and B (2 wide). */
  1519. #if defined(LT) || defined(RN)
  1520. sub K, KK, TEMP1
  1521. sll TEMP1, ZBASE_SHIFT + 0, TEMP2
  1522. sll TEMP1, ZBASE_SHIFT + 1, TEMP1
  1523. add AO, TEMP2, AO
  1524. add BO, TEMP1, BO
  1525. #endif
  1526. #ifdef LT
  1527. add KK, 1, KK
  1528. #endif
  1529. #ifdef LN
  1530. sub KK, 1, KK
  1531. #endif
  /* I loop: repeat from .LL22 while I > 0. */
  1532. add I, -1, I
  1533. cmp I, 0
  1534. bg,pt %icc, .LL22
  1535. nop
  /* End of the I loop: move B past this 2-column panel and adjust KK for
     the right-side modes. There is no J loop here; the panel runs once. */
  1536. #ifdef LN
  1537. sll K, ZBASE_SHIFT + 1, TEMP1
  1538. add B, TEMP1, B
  1539. #endif
  1540. #if defined(LT) || defined(RN)
  1541. mov BO, B
  1542. #endif
  1543. #ifdef RN
  1544. add KK, 2, KK
  1545. #endif
  1546. #ifdef RT
  1547. sub KK, 2, KK
  1548. #endif
  1549. .align 4
  /* .LL30: handle a final single column when bit 0 of N is set (N & 1);
     otherwise go straight to the exit at .LL999. Sets up C1, advances (or,
     for RT, rewinds) C by LDC, initialises KK for LN/LT, picks the A base
     pointer, and loads the row counter I from M. */
  1550. .LL30:
  1551. and N, 1, J
  1552. cmp J, 0
  1553. ble,pn %icc, .LL999
  1554. nop
  /* RT: step B back by K elements before using it. */
  1555. #ifdef RT
  1556. sll K, ZBASE_SHIFT, TEMP1
  1557. sub B, TEMP1, B
  1558. #endif
  /* Non-RT: C1 = C, then C += LDC.  RT: C1 = C - LDC, then C -= LDC. */
  1559. #ifndef RT
  1560. mov C, C1
  1561. add C, LDC, C
  1562. #else
  1563. sub C, LDC, C1
  1564. sub C, LDC, C
  1565. #endif
  1566. #ifdef LN
  1567. add M, OFFSET, KK
  1568. #endif
  1569. #ifdef LT
  1570. mov OFFSET, KK
  1571. #endif
  1572. #if defined(LN) || defined(RT)
  1573. mov A, AORIG
  1574. #else
  1575. mov A, AO
  1576. #endif
  1577. mov M, I
  1578. .align 4
  /* .LL32: head of the I loop for the single-column panel.
     Positions AO/BO, preloads a1..a4 and b1..b8, clears cc01..cc08,
     prefetches the output location, and computes the unrolled trip count
     L = count >> 2 (count is KK for LT/RN, K - KK otherwise). Falls into
     .LL33 when L > 0, else branches to .LL35.
     NOTE(review): in the code visible from here on (.LL33, .LL37, .LL38)
     only accumulators 01..04 are referenced; cc05..cc08 are cleared but
     appear unused on this path. */
  1579. .LL32:
  1580. #if defined(LT) || defined(RN)
  1581. mov B, BO
  1582. #else
  /* LN: step AORIG back by K elements first. */
  1583. #ifdef LN
  1584. sll K, ZBASE_SHIFT, TEMP1
  1585. sub AORIG, TEMP1, AORIG
  1586. #endif
  /* LN/RT: A and B are both 1 wide here, so the same byte offset
     (KK << ZBASE_SHIFT) is applied to AORIG and to B. */
  1587. sll KK, ZBASE_SHIFT + 0, TEMP1
  1588. add AORIG, TEMP1, AO
  1589. add B, TEMP1, BO
  1590. #endif
  1591. LDF [AO + 0 * SIZE], a1
  1592. LDF [AO + 1 * SIZE], a2
  1593. LDF [AO + 2 * SIZE], a3
  1594. LDF [AO + 3 * SIZE], a4
  1595. LDF [BO + 0 * SIZE], b1
  1596. LDF [BO + 1 * SIZE], b2
  1597. LDF [BO + 2 * SIZE], b3
  1598. FCLR (cc01)
  1599. LDF [BO + 3 * SIZE], b4
  1600. FCLR (cc02)
  1601. LDF [BO + 4 * SIZE], b5
  1602. FCLR (cc03)
  1603. LDF [BO + 5 * SIZE], b6
  1604. FCLR (cc04)
  1605. LDF [BO + 6 * SIZE], b7
  1606. FCLR (cc05)
  1607. LDF [BO + 7 * SIZE], b8
  1608. FCLR (cc06)
  1609. prefetch [C1 + 2 * SIZE], 3
  1610. FCLR (cc07)
  1611. #if defined(LT) || defined(RN)
  1612. sra KK, 2, L
  1613. #else
  1614. sub K, KK, L
  1615. sra L, 2, L
  1616. #endif
  1617. cmp L, 0
  /* No annul bit: the FCLR in the delay slot runs on both paths. */
  1618. ble,pn %icc, .LL35
  1619. FCLR (cc08)
  1620. .align 4
  /* .LL33: main k-loop for the single-column panel, four k steps per pass.
     Steps 1 and 3 use a1/a2, steps 2 and 4 use a3/a4; each step consumes
     two B values (b1..b8 over the pass) and accumulates into cc01..cc04.
     Per pass AO and BO both advance by 8 * SIZE; operands for the next
     pass are reloaded between the FMADDn operations.
     NOTE(review): FMADD1..FMADD4 are macros defined outside this chunk. */
  1621. .LL33:
  /* k step 1: a1/a2 with b1, b2. */
  1622. FMADD1 (aa1, bb1, cc01, cc01)
  1623. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1624. FMADD2 (aa2, bb1, cc02, cc02)
  1625. LDF [BO + 8 * SIZE], b1
  1626. FMADD3 (aa1, bb2, cc03, cc03)
  1627. LDF [AO + 4 * SIZE], a1
  1628. FMADD4 (aa2, bb2, cc04, cc04)
  1629. LDF [AO + 5 * SIZE], a2
  /* k step 2: a3/a4 with b3, b4. */
  1630. FMADD1 (aa3, bb3, cc01, cc01)
  1631. LDF [BO + 9 * SIZE], b2
  1632. FMADD2 (aa4, bb3, cc02, cc02)
  1633. LDF [BO + 10 * SIZE], b3
  1634. FMADD3 (aa3, bb4, cc03, cc03)
  1635. LDF [AO + 6 * SIZE], a3
  1636. FMADD4 (aa4, bb4, cc04, cc04)
  1637. LDF [AO + 7 * SIZE], a4
  /* k step 3: a1/a2 (just reloaded from AO + 4/5) with b5, b6. */
  1638. FMADD1 (aa1, bb5, cc01, cc01)
  1639. LDF [BO + 11 * SIZE], b4
  1640. FMADD2 (aa2, bb5, cc02, cc02)
  1641. LDF [BO + 12 * SIZE], b5
  1642. FMADD3 (aa1, bb6, cc03, cc03)
  1643. LDF [AO + 8 * SIZE], a1
  1644. FMADD4 (aa2, bb6, cc04, cc04)
  1645. LDF [AO + 9 * SIZE], a2
  /* k step 4: a3/a4 (just reloaded from AO + 6/7) with b7, b8. */
  1646. FMADD1 (aa3, bb7, cc01, cc01)
  1647. LDF [BO + 13 * SIZE], b6
  1648. FMADD2 (aa4, bb7, cc02, cc02)
  1649. LDF [BO + 14 * SIZE], b7
  1650. FMADD3 (aa3, bb8, cc03, cc03)
  1651. LDF [AO + 10 * SIZE], a3
  1652. FMADD4 (aa4, bb8, cc04, cc04)
  1653. LDF [AO + 11 * SIZE], a4
  1654. add AO, 8 * SIZE, AO
  1655. add L, -1, L
  1656. add BO, 8 * SIZE, BO
  1657. cmp L, 0
  /* The b8 reload sits in the branch delay slot and uses the already
     advanced BO (offset 7 instead of 15). */
  1658. bg,pt %icc, .LL33
  1659. LDF [BO + 7 * SIZE], b8
  1660. .align 4
  /* .LL35: set up the leftover k iterations for the single-column panel.
     L = count & 3, where count is KK for LT/RN and K - KK otherwise.
     When L <= 0 the tail loop .LL37 is skipped and control goes to .LL38. */
  1661. .LL35:
  1662. #if defined(LT) || defined(RN)
  1663. and KK, 3, L
  1664. #else
  1665. sub K, KK, L
  1666. and L, 3, L
  1667. #endif
  1668. cmp L, 0
  /* Annulled conditional branch: the delay-slot nop runs only when taken. */
  1669. ble,a,pn %icc, .LL38
  1670. nop
  1671. .align 4
  /* .LL37: k-loop tail for the single-column panel, one k step per pass.
     Combines a1/a2 with b1/b2 into cc01..cc04, advances AO and BO by
     2 * SIZE each, and reloads a1, a2, b1, b2 for the next pass. */
  1672. .LL37:
  1673. FMADD1 (aa1, bb1, cc01, cc01)
  1674. add L, -1, L
  1675. FMADD2 (aa2, bb1, cc02, cc02)
  1676. LDF [BO + 2 * SIZE], b1
  1677. FMADD3 (aa1, bb2, cc03, cc03)
  1678. LDF [AO + 2 * SIZE], a1
  1679. FMADD4 (aa2, bb2, cc04, cc04)
  1680. LDF [AO + 3 * SIZE], a2
  1681. add AO, 2 * SIZE, AO
  1682. cmp L, 0
  1683. add BO, 2 * SIZE, BO
  /* b2 is reloaded in the branch delay slot, from the already advanced BO. */
  1684. bg,pt %icc, .LL37
  1685. LDF [BO + 1 * SIZE], b2
  1686. .align 4
  /* .LL38: finish one element of the single-column panel.
     Folds the four accumulators into one (c01, c02) pair, re-derives AO/BO
     for LN/RT, subtracts the accumulated products from the stored value,
     combines the result with one (a1, a2) pair, writes it back to the
     packed buffer and to C1, advances the pointers and closes the I loop
     (.LL32).
     NOTE(review): the arithmetic macros are defined outside this chunk;
     the operand-order reading in the comments below should be confirmed. */
  1687. .LL38:
  1688. FADD c01, c04, c01
  1689. FADD c02, c03, c02
  /* LN/RT: AO = AORIG + (KK - 1) elements, BO = B + (KK - 1) elements. */
  1690. #if defined(LN) || defined(RT)
  1691. sub KK, 1, TEMP1
  1692. sll TEMP1, ZBASE_SHIFT, TEMP1
  1693. add AORIG, TEMP1, AO
  1694. add B, TEMP1, BO
  1695. #endif
  /* Load the stored value of this element (BO for LN/LT, AO for RN/RT). */
  1696. #if defined(LN) || defined(LT)
  1697. LDF [BO + 0 * SIZE], a1
  1698. LDF [BO + 1 * SIZE], a2
  1699. #else
  1700. LDF [AO + 0 * SIZE], a1
  1701. LDF [AO + 1 * SIZE], a2
  1702. #endif
  /* Presumably c = loaded value - accumulated sum. */
  1703. FSUB a1, c01, c01
  1704. FSUB a2, c02, c02
  /* Fetch the factor from the other buffer: AO for LN/LT, BO for RN/RT.
     With a single column, all four modes share the multiply that follows.
     NOTE(review): presumably the pre-inverted diagonal element -- confirm. */
  1705. #if defined(LN) || defined(LT)
  1706. LDF [AO + 0 * SIZE], a1
  1707. LDF [AO + 1 * SIZE], a2
  1708. #else
  1709. LDF [BO + 0 * SIZE], a1
  1710. LDF [BO + 1 * SIZE], a2
  1711. #endif
  1712. FMUL a1, c01, b1
  1713. FMUL a2, c01, b2
  1714. #ifndef CONJ
  1715. FNMSUB (aa2, cc02, bb1, cc01)
  1716. FMADD (aa1, cc02, bb2, cc02)
  1717. #else
  1718. FMADD (aa2, cc02, bb1, cc01)
  1719. FMSUB (aa1, cc02, bb2, cc02)
  1720. #endif
  /* LN moves C1 back before the store; other modes move it forward after. */
  1721. #ifdef LN
  1722. add C1, -2 * SIZE, C1
  1723. #endif
  /* Write the result back to the buffer it was loaded from, then to C1. */
  1724. #if defined(LN) || defined(LT)
  1725. STF c01, [BO + 0 * SIZE]
  1726. STF c02, [BO + 1 * SIZE]
  1727. #else
  1728. STF c01, [AO + 0 * SIZE]
  1729. STF c02, [AO + 1 * SIZE]
  1730. #endif
  1731. STF c01, [C1 + 0 * SIZE]
  1732. STF c02, [C1 + 1 * SIZE]
  1733. #ifndef LN
  1734. add C1, 2 * SIZE, C1
  1735. #endif
  /* RT: step AORIG forward by K elements. */
  1736. #ifdef RT
  1737. sll K, ZBASE_SHIFT, TEMP1
  1738. add AORIG, TEMP1, AORIG
  1739. #endif
  /* LT/RN: skip the remaining K - KK entries of both A and B. */
  1740. #if defined(LT) || defined(RN)
  1741. sub K, KK, TEMP1
  1742. sll TEMP1, ZBASE_SHIFT, TEMP1
  1743. add AO, TEMP1, AO
  1744. add BO, TEMP1, BO
  1745. #endif
  1746. #ifdef LT
  1747. add KK, 1, KK
  1748. #endif
  1749. #ifdef LN
  1750. sub KK, 1, KK
  1751. #endif
  /* I loop: repeat from .LL32 while I > 0. */
  1752. add I, -1, I
  1753. cmp I, 0
  1754. bg,pt %icc, .LL32
  1755. nop
  /* End of the I loop: move B past this column and adjust KK for the
     right-side modes, then fall through to the exit. */
  1756. #ifdef LN
  1757. sll K, ZBASE_SHIFT, TEMP1
  1758. add B, TEMP1, B
  1759. #endif
  1760. #if defined(LT) || defined(RN)
  1761. mov BO, B
  1762. #endif
  1763. #ifdef RN
  1764. add KK, 1, KK
  1765. #endif
  1766. #ifdef RT
  1767. sub KK, 1, KK
  1768. #endif
  1769. .align 4
  /* .LL999: common exit. "return %i7 + 8" jumps back to the caller and
     restores the register window; the clr in its delay slot then zeroes
     %o0, which at that point is the caller's %o0, so the routine returns 0.
     EPILOGUE is a macro defined outside this chunk. */
  1770. .LL999:
  1771. return %i7 + 8
  1772. clr %o0
  1773. EPILOGUE