You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

zgemm_kernel.S 32 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer register aliases used throughout the kernel body. */
  /* M, N, K: problem dimensions. The body walks M and N two at a time */
  /* (sra by 1, remainder via "and ..., 1") and K four at a time */
  /* (sra by 2, remainder via "and ..., 3"). */
  40. #define M %i0
  41. #define N %i1
  42. #define K %i2
  /* A, B, C: base pointers. Depending on the build (32/64-bit, DOUBLE) */
  /* the prologue reloads some of them from the stack frame. */
  /* A is copied to AO at the top of each J iteration; B is copied to BO */
  /* and refreshed from BO at the end of each J iteration; C is copied to */
  /* C1 and then advanced past the two columns handled per J iteration. */
  43. #define A %i5
  44. #define B %i3
  45. #define C %i4
  /* LDC is loaded from the stack frame and then scaled once by */
  /* ZBASE_SHIFT so it can be added directly to the C pointers. */
  46. #define LDC %o0
  /* AO / BO: running pointers that the inner loops load from and advance. */
  47. #define AO %o1
  48. #define BO %o2
  /* I, J, L: loop counters derived from M, N and K respectively. */
  49. #define I %o3
  50. #define J %o4
  51. #define L %o5
  /* C1 / C2: output pointers for the current pair of columns */
  /* (C1 = C, C2 = C + LDC). */
  52. #define C1 %l0
  53. #define C2 %l1
  /* OFFSET / KK: only touched when TRMMKERNEL is defined. OFFSET is */
  /* loaded from the stack frame and seeds KK, which then shapes the */
  /* K-loop trip count and the AO/BO starting offsets. */
  54. #define OFFSET %l2
  55. #define KK %l3
  /* TEMP1 / TEMP2: scratch registers for TRMMKERNEL address arithmetic. */
  56. #define TEMP1 %l4
  57. #define TEMP2 %l5
  /* Floating-point register aliases. Two layouts are provided: the */
  /* DOUBLE build uses only even-numbered registers, the single-precision */
  /* build packs the same roles into consecutive registers. */
  /* NOTE: in both layouts a5 and ALPHA_I name the SAME register */
  /* (%f62 / %f31). The unrolled 2x2 inner loop loads into a5, so the */
  /* body reloads ALPHA_I from STACK_ALPHA before the write-back needs it. */
  58. #ifdef DOUBLE
  /* c01..c16: accumulators, cleared from FZERO and updated by FADD1..FADD4. */
  59. #define c01 %f0
  60. #define c02 %f2
  61. #define c03 %f4
  62. #define c04 %f6
  63. #define c05 %f8
  64. #define c06 %f10
  65. #define c07 %f12
  66. #define c08 %f14
  67. #define c09 %f16
  68. #define c10 %f18
  69. #define c11 %f20
  70. #define c12 %f22
  71. #define c13 %f24
  72. #define c14 %f26
  73. #define c15 %f28
  74. #define c16 %f30
  /* t1..t4: hold FMUL products until the next accumulate step consumes them. */
  75. #define t1 %f32
  76. #define t2 %f34
  77. #define t3 %f36
  78. #define t4 %f38
  /* a1..a5: operands loaded through AO (a1..a4 are also reused to hold */
  /* values loaded from C1 during the non-TRMM write-back). */
  79. #define a1 %f40
  80. #define a2 %f42
  81. #define a3 %f44
  82. #define a4 %f46
  /* a5 aliases ALPHA_I (see note at the top of this block). */
  83. #define a5 %f62
  /* b1..b5: operands loaded through B/BO (b1..b4 are also reused to hold */
  /* values loaded from C2 during the non-TRMM write-back). */
  84. #define b1 %f48
  85. #define b2 %f50
  86. #define b3 %f52
  87. #define b4 %f54
  88. #define b5 %f56
  /* FZERO: source operand for every FMOV that clears an accumulator. */
  89. #define FZERO %f58
  /* ALPHA_R / ALPHA_I: scaling factors applied in the non-TRMM write-back. */
  90. #define ALPHA_R %f60
  91. #define ALPHA_I %f62
  92. #else
  /* Single-precision layout: same roles, consecutive registers. */
  93. #define c01 %f0
  94. #define c02 %f1
  95. #define c03 %f2
  96. #define c04 %f3
  97. #define c05 %f4
  98. #define c06 %f5
  99. #define c07 %f6
  100. #define c08 %f7
  101. #define c09 %f8
  102. #define c10 %f9
  103. #define c11 %f10
  104. #define c12 %f11
  105. #define c13 %f12
  106. #define c14 %f13
  107. #define c15 %f14
  108. #define c16 %f15
  109. #define t1 %f16
  110. #define t2 %f17
  111. #define t3 %f18
  112. #define t4 %f19
  113. #define a1 %f20
  114. #define a2 %f21
  115. #define a3 %f22
  116. #define a4 %f23
  /* a5 aliases ALPHA_I here as well (%f31). */
  117. #define a5 %f31
  118. #define b1 %f24
  119. #define b2 %f25
  120. #define b3 %f26
  121. #define b4 %f27
  122. #define b5 %f28
  123. #define FZERO %f29
  124. #define ALPHA_R %f30
  125. #define ALPHA_I %f31
  126. #endif
  /* FADD1..FADD4 are the accumulate operations the inner loops use to */
  /* fold the t1..t4 products into the c accumulators. Which of them add */
  /* and which subtract depends on the build variant macro. FADD1 is FADD */
  /* in every variant; FADD2..FADD4 switch between FADD and FSUB. */
  /* NOTE(review): the two-letter variant names presumably encode the */
  /* transpose/conjugate handling of the two inputs; that meaning is not */
  /* defined in this file, so confirm against the build rules. */
  /* Variants NN/NT/TN/TT: only FADD4 subtracts. */
  127. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  128. #define FADD1 FADD
  129. #define FADD2 FADD
  130. #define FADD3 FADD
  131. #define FADD4 FSUB
  /* Variants NR/NC/TR/TC: only FADD3 subtracts. */
  132. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  133. #define FADD1 FADD
  134. #define FADD2 FADD
  135. #define FADD3 FSUB
  136. #define FADD4 FADD
  /* Variants RN/RT/CN/CT: only FADD2 subtracts. */
  137. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  138. #define FADD1 FADD
  139. #define FADD2 FSUB
  140. #define FADD3 FADD
  141. #define FADD4 FADD
  /* Any other variant (none of the macros above defined): FADD2, FADD3 */
  /* and FADD4 all subtract. */
  142. #else
  143. #define FADD1 FADD
  144. #define FADD2 FSUB
  145. #define FADD3 FSUB
  146. #define FADD4 FSUB
  147. #endif
  /* Prefetch tuning knobs for the inner loops. */
  /* xPREFETCHSIZE is a look-ahead distance in elements: the body */
  /* multiplies it by SIZE and adds it to AO / BO to form the prefetch */
  /* address. */
  148. #define APREFETCHSIZE 40
  149. #define BPREFETCHSIZE 40
  /* xPREFETCH_CATEGORY is passed as the second operand of the prefetch */
  /* instruction. NOTE(review): 0 is presumably the SPARC V9 function */
  /* code for a several-reads prefetch; confirm against the manual. */
  150. #define APREFETCH_CATEGORY 0
  151. #define BPREFETCH_CATEGORY 0
  152. PROLOGUE
  153. SAVESP
  154. #ifndef __64BIT__
  155. #ifdef DOUBLE
  156. #define STACK_ALPHA [%sp + STACK_START + 24]
  157. #else
  158. #define STACK_ALPHA [%sp + STACK_START + 20]
  159. #endif
  160. #else
  161. #define STACK_ALPHA [%sp + STACK_START + 40]
  162. #endif
  163. #ifndef __64BIT__
  164. #ifdef DOUBLE
  165. st %i3, [%sp + STACK_START + 16]
  166. st %i4, [%sp + STACK_START + 20]
  167. st %i5, [%sp + STACK_START + 24]
  168. ld [%sp + STACK_START + 32], A
  169. ld [%sp + STACK_START + 36], B
  170. ld [%sp + STACK_START + 40], C
  171. ld [%sp + STACK_START + 44], LDC
  172. #ifdef TRMMKERNEL
  173. ld [%sp + STACK_START + 48], OFFSET
  174. #endif
  175. ldd [%sp + STACK_START + 16], ALPHA_R
  176. ldd [%sp + STACK_START + 24], ALPHA_I
  177. #else
  178. st %i3, [%sp + STACK_START + 16]
  179. st %i4, [%sp + STACK_START + 20]
  180. ld [%sp + STACK_START + 28], B
  181. ld [%sp + STACK_START + 32], C
  182. ld [%sp + STACK_START + 36], LDC
  183. #ifdef TRMMKERNEL
  184. ld [%sp + STACK_START + 40], OFFSET
  185. #endif
  186. ld [%sp + STACK_START + 16], ALPHA_R
  187. ld [%sp + STACK_START + 20], ALPHA_I
  188. #endif
  189. #else
  190. #ifdef DOUBLE
  191. FMOV %f6, ALPHA_R
  192. FMOV %f8, ALPHA_I
  193. STF %f8, STACK_ALPHA
  194. #else
  195. FMOV %f7, ALPHA_R
  196. FMOV %f9, ALPHA_I
  197. STF %f9, STACK_ALPHA
  198. #endif
  199. ldx [%sp+ STACK_START + 56], B
  200. nop
  201. ldx [%sp+ STACK_START + 64], C
  202. nop
  203. ldx [%sp+ STACK_START + 72], LDC
  204. #ifdef TRMMKERNEL
  205. ldx [%sp+ STACK_START + 80], OFFSET
  206. #endif
  207. LDF [%sp + STACK_START + 32], FZERO
  208. #endif
  209. #ifdef DOUBLE
  210. FCLR(27)
  211. #else
  212. FCLR(29)
  213. #endif
  214. #if defined(TRMMKERNEL) && !defined(LEFT)
  215. neg OFFSET, KK
  216. #endif
  217. sra N, 1, J
  218. cmp J, 0
  219. ble,pn %icc, .LL100
  220. sll LDC, ZBASE_SHIFT, LDC
  221. .LL11:
  222. sra M, 1, I
  223. FMOV FZERO, t1
  224. add C, LDC, C2
  225. FMOV FZERO, t2
  226. mov C, C1
  227. FMOV FZERO, t3
  228. cmp I, 0
  229. #if defined(TRMMKERNEL) && defined(LEFT)
  230. mov OFFSET, KK
  231. #endif
  232. mov A, AO
  233. add C2, LDC, C
  234. nop
  235. ble,pn %icc, .LL50
  236. FMOV FZERO, t4
  237. .LL21:
  238. #if !defined(TRMMKERNEL)
  239. sra K, 2, L
  240. FMOV FZERO, c01
  241. cmp L, 0
  242. FMOV FZERO, c02
  243. LDF [AO + 0 * SIZE], a1
  244. FMOV FZERO, c03
  245. LDF [B + 0 * SIZE], b1
  246. FMOV FZERO, c04
  247. LDF [AO + 1 * SIZE], a2
  248. FMOV FZERO, c05
  249. LDF [B + 1 * SIZE], b2
  250. FMOV FZERO, c06
  251. LDF [AO + 2 * SIZE], a3
  252. FMOV FZERO, c07
  253. LDF [B + 2 * SIZE], b3
  254. FMOV FZERO, c08
  255. LDF [AO + 3 * SIZE], a4
  256. FMOV FZERO, c09
  257. LDF [B + 3 * SIZE], b4
  258. FMOV FZERO, c10
  259. LDF [B + 4 * SIZE], b5
  260. FMOV FZERO, c11
  261. LDF [AO + 4 * SIZE], a5
  262. FMOV FZERO, c12
  263. prefetch [C1 + 3 * SIZE], 3
  264. FMOV FZERO, c13
  265. prefetch [C2 + 3 * SIZE], 3
  266. FMOV FZERO, c14
  267. mov B, BO
  268. #else
  269. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  270. mov B, BO
  271. #else
  272. sll KK, 1 + ZBASE_SHIFT, TEMP1
  273. add AO, TEMP1, AO
  274. add B, TEMP1, BO
  275. #endif
  276. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  277. sub K, KK, L
  278. #elif defined(LEFT)
  279. add KK, 2, L
  280. #else
  281. add KK, 2, L
  282. #endif
  283. sra L, 2, L
  284. cmp L, 0
  285. FMOV FZERO, c01
  286. FMOV FZERO, c02
  287. LDF [AO + 0 * SIZE], a1
  288. FMOV FZERO, c03
  289. LDF [BO + 0 * SIZE], b1
  290. FMOV FZERO, c04
  291. LDF [AO + 1 * SIZE], a2
  292. FMOV FZERO, c05
  293. LDF [BO + 1 * SIZE], b2
  294. FMOV FZERO, c06
  295. LDF [AO + 2 * SIZE], a3
  296. FMOV FZERO, c07
  297. LDF [BO + 2 * SIZE], b3
  298. FMOV FZERO, c08
  299. LDF [AO + 3 * SIZE], a4
  300. FMOV FZERO, c09
  301. LDF [BO + 3 * SIZE], b4
  302. FMOV FZERO, c10
  303. LDF [BO + 4 * SIZE], b5
  304. FMOV FZERO, c11
  305. LDF [AO + 4 * SIZE], a5
  306. FMOV FZERO, c12
  307. prefetch [C1 + 3 * SIZE], 3
  308. FMOV FZERO, c13
  309. prefetch [C2 + 3 * SIZE], 3
  310. FMOV FZERO, c14
  311. #endif
  312. FMOV FZERO, c15
  313. ble,pn %icc, .LL25
  314. FMOV FZERO, c16
  315. .LL22:
  316. FADD2 c04, t1, c04
  317. prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY
  318. FMUL a1, b1, t1
  319. nop
  320. FADD4 c08, t2, c08
  321. prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY
  322. FMUL a1, b2, t2
  323. add AO, 16 * SIZE, AO
  324. FADD2 c12, t3, c12
  325. LDF [AO - 13 * SIZE], a4
  326. FMUL a1, b3, t3
  327. add BO, 16 * SIZE, BO
  328. FADD4 c16, t4, c16
  329. nop
  330. FMUL a1, b4, t4
  331. LDF [AO - 8 * SIZE], a1
  332. FADD1 c01, t1, c01
  333. nop
  334. FMUL a2, b1, t1
  335. nop
  336. FADD3 c05, t2, c05
  337. nop
  338. FMUL a2, b2, t2
  339. nop
  340. FADD1 c09, t3, c09
  341. nop
  342. FMUL a2, b3, t3
  343. nop
  344. FADD3 c13, t4, c13
  345. add L, -1, L
  346. FMUL a2, b4, t4
  347. LDF [AO - 11 * SIZE], a2
  348. FADD2 c02, t1, c02
  349. nop
  350. FMUL a3, b1, t1
  351. nop
  352. FADD4 c06, t2, c06
  353. nop
  354. FMUL a3, b2, t2
  355. nop
  356. FADD2 c10, t3, c10
  357. nop
  358. FMUL a3, b3, t3
  359. nop
  360. FADD4 c14, t4, c14
  361. nop
  362. FMUL a3, b4, t4
  363. LDF [AO - 10 * SIZE], a3
  364. FADD1 c03, t1, c03
  365. nop
  366. FMUL a4, b1, t1
  367. LDF [BO - 8 * SIZE], b1
  368. FADD3 c07, t2, c07
  369. nop
  370. FMUL a4, b2, t2
  371. LDF [BO - 11 * SIZE], b2
  372. FADD1 c11, t3, c11
  373. nop
  374. FMUL a4, b3, t3
  375. LDF [BO - 10 * SIZE], b3
  376. FADD3 c15, t4, c15
  377. nop
  378. FMUL a4, b4, t4
  379. LDF [BO - 9 * SIZE], b4
  380. FADD2 c04, t1, c04
  381. nop
  382. FMUL a5, b5, t1
  383. LDF [AO - 9 * SIZE], a4
  384. FADD4 c08, t2, c08
  385. nop
  386. FMUL a5, b2, t2
  387. nop
  388. FADD2 c12, t3, c12
  389. nop
  390. FMUL a5, b3, t3
  391. nop
  392. FADD4 c16, t4, c16
  393. nop
  394. FMUL a5, b4, t4
  395. LDF [AO - 4 * SIZE], a5
  396. FADD1 c01, t1, c01
  397. nop
  398. FMUL a2, b5, t1
  399. nop
  400. FADD3 c05, t2, c05
  401. nop
  402. FMUL a2, b2, t2
  403. nop
  404. FADD1 c09, t3, c09
  405. nop
  406. FMUL a2, b3, t3
  407. nop
  408. FADD3 c13, t4, c13
  409. nop
  410. FMUL a2, b4, t4
  411. LDF [AO - 7 * SIZE], a2
  412. FADD2 c02, t1, c02
  413. nop
  414. FMUL a3, b5, t1
  415. nop
  416. FADD4 c06, t2, c06
  417. nop
  418. FMUL a3, b2, t2
  419. nop
  420. FADD2 c10, t3, c10
  421. nop
  422. FMUL a3, b3, t3
  423. nop
  424. FADD4 c14, t4, c14
  425. nop
  426. FMUL a3, b4, t4
  427. LDF [AO - 6 * SIZE], a3
  428. FADD1 c03, t1, c03
  429. nop
  430. FMUL a4, b5, t1
  431. LDF [BO - 4 * SIZE], b5
  432. FADD3 c07, t2, c07
  433. nop
  434. FMUL a4, b2, t2
  435. LDF [BO - 7 * SIZE], b2
  436. FADD1 c11, t3, c11
  437. nop
  438. FMUL a4, b3, t3
  439. LDF [BO - 6 * SIZE], b3
  440. FADD3 c15, t4, c15
  441. nop
  442. FMUL a4, b4, t4
  443. LDF [BO - 5 * SIZE], b4
  444. FADD2 c04, t1, c04
  445. nop
  446. FMUL a1, b1, t1
  447. LDF [AO - 5 * SIZE], a4
  448. FADD4 c08, t2, c08
  449. nop
  450. FMUL a1, b2, t2
  451. nop
  452. FADD2 c12, t3, c12
  453. nop
  454. FMUL a1, b3, t3
  455. nop
  456. FADD4 c16, t4, c16
  457. nop
  458. FMUL a1, b4, t4
  459. LDF [AO - 0 * SIZE], a1
  460. FADD1 c01, t1, c01
  461. nop
  462. FMUL a2, b1, t1
  463. nop
  464. #ifdef DOUBLE
  465. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  466. #else
  467. nop
  468. #endif
  469. FADD3 c05, t2, c05
  470. nop
  471. FMUL a2, b2, t2
  472. FADD1 c09, t3, c09
  473. nop
  474. FMUL a2, b3, t3
  475. nop
  476. FADD3 c13, t4, c13
  477. nop
  478. FMUL a2, b4, t4
  479. nop
  480. FADD2 c02, t1, c02
  481. nop
  482. FMUL a3, b1, t1
  483. LDF [AO - 3 * SIZE], a2
  484. FADD4 c06, t2, c06
  485. #ifdef DOUBLE
  486. prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY
  487. #else
  488. nop
  489. #endif
  490. FMUL a3, b2, t2
  491. nop
  492. FADD2 c10, t3, c10
  493. nop
  494. FMUL a3, b3, t3
  495. nop
  496. FADD4 c14, t4, c14
  497. nop
  498. FMUL a3, b4, t4
  499. LDF [AO - 2 * SIZE], a3
  500. FADD1 c03, t1, c03
  501. nop
  502. FMUL a4, b1, t1
  503. LDF [BO - 0 * SIZE], b1
  504. FADD3 c07, t2, c07
  505. nop
  506. FMUL a4, b2, t2
  507. LDF [BO - 3 * SIZE], b2
  508. FADD1 c11, t3, c11
  509. nop
  510. FMUL a4, b3, t3
  511. LDF [BO - 2 * SIZE], b3
  512. FADD3 c15, t4, c15
  513. nop
  514. FMUL a4, b4, t4
  515. LDF [BO - 1 * SIZE], b4
  516. FADD2 c04, t1, c04
  517. nop
  518. FMUL a5, b5, t1
  519. LDF [AO - 1 * SIZE], a4
  520. FADD4 c08, t2, c08
  521. FMUL a5, b2, t2
  522. FADD2 c12, t3, c12
  523. FMUL a5, b3, t3
  524. FADD4 c16, t4, c16
  525. nop
  526. FMUL a5, b4, t4
  527. LDF [AO + 4 * SIZE], a5
  528. FADD1 c01, t1, c01
  529. nop
  530. FMUL a2, b5, t1
  531. nop
  532. FADD3 c05, t2, c05
  533. nop
  534. FMUL a2, b2, t2
  535. nop
  536. FADD1 c09, t3, c09
  537. nop
  538. FMUL a2, b3, t3
  539. nop
  540. FADD3 c13, t4, c13
  541. nop
  542. FMUL a2, b4, t4
  543. LDF [AO + 1 * SIZE], a2
  544. FADD2 c02, t1, c02
  545. nop
  546. FMUL a3, b5, t1
  547. nop
  548. FADD4 c06, t2, c06
  549. nop
  550. FMUL a3, b2, t2
  551. nop
  552. FADD2 c10, t3, c10
  553. nop
  554. FMUL a3, b3, t3
  555. nop
  556. FADD4 c14, t4, c14
  557. nop
  558. FMUL a3, b4, t4
  559. LDF [AO + 2 * SIZE], a3
  560. FADD1 c03, t1, c03
  561. cmp L, 0
  562. FMUL a4, b5, t1
  563. LDF [BO + 4 * SIZE], b5
  564. FADD3 c07, t2, c07
  565. nop
  566. FMUL a4, b2, t2
  567. LDF [BO + 1 * SIZE], b2
  568. FADD1 c11, t3, c11
  569. nop
  570. FMUL a4, b3, t3
  571. LDF [BO + 2 * SIZE], b3
  572. FADD3 c15, t4, c15
  573. FMUL a4, b4, t4
  574. bg,pt %icc, .LL22
  575. LDF [BO + 3 * SIZE], b4
  576. .LL25:
  577. #ifndef TRMMKERNEL
  578. and K, 3, L
  579. #else
  580. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  581. sub K, KK, L
  582. #elif defined(LEFT)
  583. add KK, 2, L
  584. #else
  585. add KK, 2, L
  586. #endif
  587. and L, 3, L
  588. #endif
  589. cmp L, 0
  590. ble,pn %icc, .LL29
  591. LDF STACK_ALPHA, ALPHA_I
  592. .LL26:
  593. FADD2 c04, t1, c04
  594. LDF [AO + 3 * SIZE], a4
  595. FMUL a1, b1, t1
  596. add AO, 4 * SIZE, AO
  597. FADD4 c08, t2, c08
  598. add BO, 4 * SIZE, BO
  599. FMUL a1, b2, t2
  600. add L, -1, L
  601. FADD2 c12, t3, c12
  602. nop
  603. FMUL a1, b3, t3
  604. cmp L, 0
  605. FADD4 c16, t4, c16
  606. nop
  607. FMUL a1, b4, t4
  608. LDF [AO + 0 * SIZE], a1
  609. FADD1 c01, t1, c01
  610. nop
  611. FMUL a2, b1, t1
  612. nop
  613. FADD3 c05, t2, c05
  614. nop
  615. FMUL a2, b2, t2
  616. nop
  617. FADD1 c09, t3, c09
  618. nop
  619. FMUL a2, b3, t3
  620. nop
  621. FADD3 c13, t4, c13
  622. nop
  623. FMUL a2, b4, t4
  624. LDF [AO + 1 * SIZE], a2
  625. FADD2 c02, t1, c02
  626. nop
  627. FMUL a3, b1, t1
  628. nop
  629. FADD4 c06, t2, c06
  630. nop
  631. FMUL a3, b2, t2
  632. nop
  633. FADD2 c10, t3, c10
  634. nop
  635. FMUL a3, b3, t3
  636. nop
  637. FADD4 c14, t4, c14
  638. nop
  639. FMUL a3, b4, t4
  640. LDF [AO + 2 * SIZE], a3
  641. FADD1 c03, t1, c03
  642. nop
  643. FMUL a4, b1, t1
  644. LDF [BO + 0 * SIZE], b1
  645. FADD3 c07, t2, c07
  646. nop
  647. FMUL a4, b2, t2
  648. LDF [BO + 1 * SIZE], b2
  649. FADD1 c11, t3, c11
  650. nop
  651. FMUL a4, b3, t3
  652. LDF [BO + 2 * SIZE], b3
  653. FADD3 c15, t4, c15
  654. FMUL a4, b4, t4
  655. bg,pt %icc, .LL26
  656. LDF [BO + 3 * SIZE], b4
  657. .LL29:
  658. #ifndef TRMMKERNEL
  659. FADD2 c04, t1, c04
  660. LDF [C1 + 0 * SIZE], a1
  661. FADD4 c08, t2, c08
  662. LDF [C1 + 1 * SIZE], a2
  663. FADD2 c12, t3, c12
  664. LDF [C1 + 2 * SIZE], a3
  665. FADD4 c16, t4, c16
  666. LDF [C1 + 3 * SIZE], a4
  667. FADD c01, c06, c01
  668. LDF [C2 + 0 * SIZE], b1
  669. FADD c02, c05, c02
  670. LDF [C2 + 1 * SIZE], b2
  671. FADD c03, c08, c03
  672. LDF [C2 + 2 * SIZE], b3
  673. FADD c04, c07, c04
  674. LDF [C2 + 3 * SIZE], b4
  675. FADD c09, c14, c09
  676. FMUL ALPHA_R, c01, t1
  677. FADD c10, c13, c10
  678. FMUL ALPHA_R, c02, t2
  679. FADD c11, c16, c11
  680. FMUL ALPHA_R, c03, t3
  681. FADD c12, c15, c12
  682. FMUL ALPHA_R, c04, t4
  683. FADD a1, t1, a1
  684. FMUL ALPHA_I, c02, t1
  685. FADD a2, t2, a2
  686. FMUL ALPHA_I, c01, t2
  687. FADD a3, t3, a3
  688. FMUL ALPHA_I, c04, t3
  689. FADD a4, t4, a4
  690. FMUL ALPHA_I, c03, t4
  691. FSUB a1, t1, a1
  692. FMUL ALPHA_R, c09, t1
  693. FADD a2, t2, a2
  694. FMUL ALPHA_R, c10, t2
  695. FSUB a3, t3, a3
  696. FMUL ALPHA_R, c11, t3
  697. FADD a4, t4, a4
  698. FMUL ALPHA_R, c12, t4
  699. FADD b1, t1, b1
  700. FMUL ALPHA_I, c10, t1
  701. FADD b2, t2, b2
  702. FMUL ALPHA_I, c09, t2
  703. FADD b3, t3, b3
  704. FMUL ALPHA_I, c12, t3
  705. FADD b4, t4, b4
  706. FMUL ALPHA_I, c11, t4
  707. STF a1, [C1 + 0 * SIZE]
  708. FSUB b1, t1, b1
  709. STF a2, [C1 + 1 * SIZE]
  710. FADD b2, t2, b2
  711. STF a3, [C1 + 2 * SIZE]
  712. FSUB b3, t3, b3
  713. STF a4, [C1 + 3 * SIZE]
  714. FADD b4, t4, b4
  715. STF b1, [C2 + 0 * SIZE]
  716. FMOV FZERO, t1
  717. STF b2, [C2 + 1 * SIZE]
  718. FMOV FZERO, t2
  719. STF b3, [C2 + 2 * SIZE]
  720. FMOV FZERO, t3
  721. STF b4, [C2 + 3 * SIZE]
  722. FMOV FZERO, t4
  723. #else
  724. FADD2 c04, t1, c04
  725. FADD4 c08, t2, c08
  726. FADD2 c12, t3, c12
  727. FADD4 c16, t4, c16
  728. FADD c01, c06, c01
  729. FADD c02, c05, c02
  730. FADD c03, c08, c03
  731. FADD c04, c07, c04
  732. STF c01, [C1 + 0 * SIZE]
  733. FADD c09, c14, c09
  734. STF c02, [C1 + 1 * SIZE]
  735. FADD c10, c13, c10
  736. STF c03, [C1 + 2 * SIZE]
  737. FADD c11, c16, c11
  738. STF c04, [C1 + 3 * SIZE]
  739. FADD c12, c15, c12
  740. STF c09, [C2 + 0 * SIZE]
  741. FMOV FZERO, t1
  742. STF c10, [C2 + 1 * SIZE]
  743. FMOV FZERO, t2
  744. STF c11, [C2 + 2 * SIZE]
  745. FMOV FZERO, t3
  746. STF c12, [C2 + 3 * SIZE]
  747. FMOV FZERO, t4
  748. #endif
  749. add C1, 4 * SIZE, C1
  750. add C2, 4 * SIZE, C2
  751. #ifdef TRMMKERNEL
  752. #if ( defined(LEFT) && defined(TRANSA)) || \
  753. (!defined(LEFT) && !defined(TRANSA))
  754. sub K, KK, TEMP1
  755. #ifdef LEFT
  756. add TEMP1, -2, TEMP1
  757. #else
  758. add TEMP1, -2, TEMP1
  759. #endif
  760. sll TEMP1, 1 + ZBASE_SHIFT, TEMP1
  761. add AO, TEMP1, AO
  762. add BO, TEMP1, BO
  763. #endif
  764. #ifdef LEFT
  765. add KK, 2, KK
  766. #endif
  767. #endif
  768. add I, -1, I
  769. cmp I, 0
  770. bg,pt %icc, .LL21
  771. FMOV FZERO, c01
  772. .LL50:
  773. and M, 1, I
  774. FMOV FZERO, c02
  775. cmp I, 0
  776. FMOV FZERO, t1
  777. ble,pn %icc, .LL99
  778. FMOV FZERO, c04
  779. #if !defined(TRMMKERNEL)
  780. LDF [AO + 0 * SIZE], a1
  781. sra K, 2, L
  782. FMOV FZERO, t2
  783. LDF [B + 0 * SIZE], b1
  784. mov B, BO
  785. FMOV FZERO, c06
  786. LDF [AO + 1 * SIZE], a2
  787. cmp L, 0
  788. FMOV FZERO, t3
  789. LDF [B + 1 * SIZE], b2
  790. FMOV FZERO, c08
  791. LDF [AO + 2 * SIZE], a3
  792. FMOV FZERO, t4
  793. LDF [B + 2 * SIZE], b3
  794. FMOV FZERO, c01
  795. LDF [AO + 3 * SIZE], a4
  796. FMOV FZERO, c03
  797. LDF [B + 3 * SIZE], b4
  798. FMOV FZERO, c05
  799. #else
  800. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  801. mov B, BO
  802. #else
  803. sll KK, 0 + ZBASE_SHIFT, TEMP1
  804. sll KK, 1 + ZBASE_SHIFT, TEMP2
  805. add AO, TEMP1, AO
  806. add B, TEMP2, BO
  807. #endif
  808. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  809. sub K, KK, L
  810. #elif defined(LEFT)
  811. add KK, 1, L
  812. #else
  813. add KK, 2, L
  814. #endif
  815. sra L, 2, L
  816. cmp L, 0
  817. LDF [AO + 0 * SIZE], a1
  818. FMOV FZERO, t2
  819. LDF [BO + 0 * SIZE], b1
  820. FMOV FZERO, c06
  821. LDF [AO + 1 * SIZE], a2
  822. FMOV FZERO, t3
  823. LDF [BO + 1 * SIZE], b2
  824. FMOV FZERO, c08
  825. LDF [AO + 2 * SIZE], a3
  826. FMOV FZERO, t4
  827. LDF [BO + 2 * SIZE], b3
  828. FMOV FZERO, c01
  829. LDF [AO + 3 * SIZE], a4
  830. FMOV FZERO, c03
  831. LDF [BO + 3 * SIZE], b4
  832. FMOV FZERO, c05
  833. #endif
  834. ble,pn %icc, .LL55
  835. FMOV FZERO, c07
  836. .LL52:
  837. FADD2 c02, t1, c02
  838. add AO, 8 * SIZE, AO
  839. prefetch [AO + APREFETCHSIZE * SIZE], 0
  840. FMUL a1, b1, t1
  841. add BO, 16 * SIZE, BO
  842. FADD4 c04, t2, c04
  843. add L, -1, L
  844. FMUL a1, b2, t2
  845. FADD2 c06, t3, c06
  846. cmp L, 0
  847. FMUL a1, b3, t3
  848. FADD4 c08, t4, c08
  849. FMUL a1, b4, t4
  850. LDF [AO - 4 * SIZE], a1
  851. FADD1 c01, t1, c01
  852. FMUL a2, b1, t1
  853. LDF [BO - 12 * SIZE], b1
  854. FADD3 c03, t2, c03
  855. FMUL a2, b2, t2
  856. LDF [BO - 11 * SIZE], b2
  857. FADD1 c05, t3, c05
  858. FMUL a2, b3, t3
  859. LDF [BO - 10 * SIZE], b3
  860. FADD3 c07, t4, c07
  861. FMUL a2, b4, t4
  862. LDF [BO - 9 * SIZE], b4
  863. FADD2 c02, t1, c02
  864. FMUL a3, b1, t1
  865. LDF [AO - 3 * SIZE], a2
  866. FADD4 c04, t2, c04
  867. FMUL a3, b2, t2
  868. FADD2 c06, t3, c06
  869. FMUL a3, b3, t3
  870. FADD4 c08, t4, c08
  871. FMUL a3, b4, t4
  872. LDF [AO - 2 * SIZE], a3
  873. FADD1 c01, t1, c01
  874. FMUL a4, b1, t1
  875. LDF [BO - 8 * SIZE], b1
  876. FADD3 c03, t2, c03
  877. FMUL a4, b2, t2
  878. LDF [BO - 7 * SIZE], b2
  879. FADD1 c05, t3, c05
  880. FMUL a4, b3, t3
  881. LDF [BO - 6 * SIZE], b3
  882. FADD3 c07, t4, c07
  883. FMUL a4, b4, t4
  884. LDF [BO - 5 * SIZE], b4
  885. FADD2 c02, t1, c02
  886. FMUL a1, b1, t1
  887. LDF [AO - 1 * SIZE], a4
  888. FADD4 c04, t2, c04
  889. FMUL a1, b2, t2
  890. FADD2 c06, t3, c06
  891. FMUL a1, b3, t3
  892. FADD4 c08, t4, c08
  893. FMUL a1, b4, t4
  894. LDF [AO + 0 * SIZE], a1
  895. FADD1 c01, t1, c01
  896. FMUL a2, b1, t1
  897. LDF [BO - 4 * SIZE], b1
  898. FADD3 c03, t2, c03
  899. FMUL a2, b2, t2
  900. LDF [BO - 3 * SIZE], b2
  901. FADD1 c05, t3, c05
  902. FMUL a2, b3, t3
  903. LDF [BO - 2 * SIZE], b3
  904. FADD3 c07, t4, c07
  905. FMUL a2, b4, t4
  906. LDF [BO - 1 * SIZE], b4
  907. FADD2 c02, t1, c02
  908. FMUL a3, b1, t1
  909. LDF [AO + 1 * SIZE], a2
  910. FADD4 c04, t2, c04
  911. FMUL a3, b2, t2
  912. FADD2 c06, t3, c06
  913. FMUL a3, b3, t3
  914. FADD4 c08, t4, c08
  915. FMUL a3, b4, t4
  916. LDF [AO + 2 * SIZE], a3
  917. FADD1 c01, t1, c01
  918. FMUL a4, b1, t1
  919. LDF [BO + 0 * SIZE], b1
  920. FADD3 c03, t2, c03
  921. FMUL a4, b2, t2
  922. LDF [BO + 1 * SIZE], b2
  923. FADD1 c05, t3, c05
  924. FMUL a4, b3, t3
  925. LDF [BO + 2 * SIZE], b3
  926. FADD3 c07, t4, c07
  927. FMUL a4, b4, t4
  928. LDF [BO + 3 * SIZE], b4
  929. bg,pt %icc, .LL52
  930. LDF [AO + 3 * SIZE], a4
  931. .LL55:
  932. #ifndef TRMMKERNEL
  933. and K, 3, L
  934. #else
  935. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  936. sub K, KK, L
  937. #elif defined(LEFT)
  938. add KK, 1, L
  939. #else
  940. add KK, 2, L
  941. #endif
  942. and L, 3, L
  943. #endif
  944. cmp L, 0
  945. ble,a,pn %icc, .LL59
  946. nop
  947. .LL56:
  948. FADD2 c02, t1, c02
  949. add AO, 2 * SIZE, AO
  950. FMUL a1, b1, t1
  951. add L, -1, L
  952. add BO, 4 * SIZE, BO
  953. FADD4 c04, t2, c04
  954. cmp L, 0
  955. FMUL a1, b2, t2
  956. FADD2 c06, t3, c06
  957. FMUL a1, b3, t3
  958. FADD4 c08, t4, c08
  959. FMUL a1, b4, t4
  960. LDF [AO + 0 * SIZE], a1
  961. FADD1 c01, t1, c01
  962. FMUL a2, b1, t1
  963. LDF [BO + 0 * SIZE], b1
  964. FADD3 c03, t2, c03
  965. FMUL a2, b2, t2
  966. LDF [BO + 1 * SIZE], b2
  967. FADD1 c05, t3, c05
  968. FMUL a2, b3, t3
  969. LDF [BO + 2 * SIZE], b3
  970. FADD3 c07, t4, c07
  971. FMUL a2, b4, t4
  972. LDF [BO + 3 * SIZE], b4
  973. bg,pt %icc, .LL56
  974. LDF [AO + 1 * SIZE], a2
  /* .LL59: drain the pipeline and write two results, one at C1 and one at C2
     (two floating-point values each), then advance both pointers by 2 * SIZE. */
  975. .LL59:
  976. #ifndef TRMMKERNEL
  /* Fold the last pending products into the accumulators while loading the
     current contents of C1[0..1] and C2[0..1] into a1..a4. */
  977. FADD2 c02, t1, c02
  978. LDF [C1 + 0 * SIZE], a1
  979. FADD4 c04, t2, c04
  980. LDF [C1 + 1 * SIZE], a2
  981. FADD2 c06, t3, c06
  982. LDF [C2 + 0 * SIZE], a3
  983. FADD4 c08, t4, c08
  984. LDF [C2 + 1 * SIZE], a4
  /* Merge partial sums (c01+=c04, c02+=c03, c05+=c08, c06+=c07) and start the
     scaling by ALPHA_R. */
  985. FADD c01, c04, c01
  986. FMUL ALPHA_R, c01, t1
  987. FADD c02, c03, c02
  988. FMUL ALPHA_R, c02, t2
  989. FADD c05, c08, c05
  990. FMUL ALPHA_R, c05, t3
  991. FADD c06, c07, c06
  992. FMUL ALPHA_R, c06, t4
  /* Net effect of the sequence below:
       C1[0] += ALPHA_R * c01 - ALPHA_I * c02
       C1[1] += ALPHA_R * c02 + ALPHA_I * c01
       C2[0] += ALPHA_R * c05 - ALPHA_I * c06
       C2[1] += ALPHA_R * c06 + ALPHA_I * c05 */
  993. FADD a1, t1, a1
  994. FMUL ALPHA_I, c02, t1
  995. FADD a2, t2, a2
  996. FMUL ALPHA_I, c01, t2
  997. FADD a3, t3, a3
  998. FMUL ALPHA_I, c06, t3
  999. FADD a4, t4, a4
  1000. FMUL ALPHA_I, c05, t4
  1001. FSUB a1, t1, a1
  1002. FADD a2, t2, a2
  1003. FSUB a3, t3, a3
  1004. FADD a4, t4, a4
  /* Store the results and clear t1..t4. */
  1005. STF a1, [C1 + 0 * SIZE]
  1006. FMOV FZERO, t1
  1007. STF a2, [C1 + 1 * SIZE]
  1008. FMOV FZERO, t2
  1009. STF a3, [C2 + 0 * SIZE]
  1010. FMOV FZERO, t3
  1011. STF a4, [C2 + 1 * SIZE]
  1012. FMOV FZERO, t4
  1013. #else
  /* TRMM build: the merged sums are stored directly; C is not read and no
     ALPHA scaling is applied here. */
  1014. FADD2 c02, t1, c02
  1015. FADD4 c04, t2, c04
  1016. FADD2 c06, t3, c06
  1017. FADD4 c08, t4, c08
  1018. FADD c01, c04, c01
  1019. FADD c02, c03, c02
  1020. FADD c05, c08, c05
  1021. FADD c06, c07, c06
  1022. STF c01, [C1 + 0 * SIZE]
  1023. FMOV FZERO, t1
  1024. STF c02, [C1 + 1 * SIZE]
  1025. FMOV FZERO, t2
  1026. STF c05, [C2 + 0 * SIZE]
  1027. FMOV FZERO, t3
  1028. STF c06, [C2 + 1 * SIZE]
  1029. FMOV FZERO, t4
  1030. #endif
  1031. add C1, 2 * SIZE, C1
  1032. add C2, 2 * SIZE, C2
  1033. #ifdef TRMMKERNEL
  1034. #if ( defined(LEFT) && defined(TRANSA)) || \
  1035. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP1 = K - KK - (1 if LEFT, else 2); AO advances by TEMP1 << ZBASE_SHIFT
     and BO by TEMP1 << (1 + ZBASE_SHIFT). */
  1036. sub K, KK, TEMP1
  1037. #ifdef LEFT
  1038. add TEMP1, -1, TEMP1
  1039. #else
  1040. add TEMP1, -2, TEMP1
  1041. #endif
  1042. sll TEMP1, 0 + ZBASE_SHIFT, TEMP2
  1043. sll TEMP1, 1 + ZBASE_SHIFT, TEMP1
  1044. add AO, TEMP2, AO
  1045. add BO, TEMP1, BO
  1046. #endif
  1047. #ifdef LEFT
  1048. add KK, 1, KK
  1049. #endif
  1050. #endif
  /* .LL99: bottom of the J loop. Decrement J, copy BO into B, and branch back
     to .LL11 (defined outside this chunk) while J > 0. */
  1051. .LL99:
  1052. add J, -1, J
  1053. mov BO, B
  1054. cmp J, 0
  1055. bg,pt %icc, .LL11
  /* Branch delay slot (the branch has no annul bit, so this executes whether
     or not it is taken): KK += 2 for TRMM without LEFT, otherwise a nop. */
  1056. #if defined(TRMMKERNEL) && !defined(LEFT)
  1057. add KK, 2, KK
  1058. #else
  1059. nop
  1060. #endif
  /* .LL100: entry for the N & 1 remainder. Sets I = M >> 1 and J = N & 1 and
     leaves through .LL999 when J is zero. */
  1061. .LL100:
  1062. sra M, 1, I
  1063. and N, 1, J
  1064. cmp J, 0
  1065. ble,pn %icc, .LL999
  /* Delay slot: AO = A. */
  1066. mov A, AO
  /* C1 = C, then step C forward by LDC. */
  1067. mov C, C1
  1068. add C, LDC, C
  1069. #if defined(TRMMKERNEL) && defined(LEFT)
  1070. mov OFFSET, KK
  1071. #endif
  /* No iterations for I = M >> 1: skip ahead to the M & 1 case at .LL150. */
  1072. cmp I, 0
  1073. ble,pn %icc, .LL150
  /* Delay slot: clear c03. */
  1074. FMOV FZERO, c03
  /* .LL121: top of the I loop. Preloads a1..a4 from AO[0..3] and b1..b4 from
     the first four values of the B panel, clears the accumulators and t1..t4,
     and sets L to the trip count of the 4x unrolled loop at .LL122.
     c03 is cleared in the delay slot of each branch that targets this label. */
  1075. .LL121:
  1076. #if !defined(TRMMKERNEL)
  1077. LDF [AO + 0 * SIZE], a1
  /* L = K >> 2. */
  1078. sra K, 2, L
  1079. FMOV FZERO, t1
  1080. LDF [B + 0 * SIZE], b1
  1081. mov B, BO
  1082. FMOV FZERO, c07
  1083. LDF [AO + 1 * SIZE], a2
  /* Sets %icc for the ble after the #endif; only floating-point moves, loads
     and a prefetch follow. */
  1084. cmp L, 0
  1085. FMOV FZERO, t2
  1086. LDF [B + 1 * SIZE], b2
  1087. FMOV FZERO, c04
  1088. LDF [AO + 2 * SIZE], a3
  1089. FMOV FZERO, t3
  1090. LDF [B + 2 * SIZE], b3
  1091. FMOV FZERO, c08
  1092. LDF [AO + 3 * SIZE], a4
  1093. FMOV FZERO, t4
  1094. LDF [B + 3 * SIZE], b4
  1095. FMOV FZERO, c01
  1096. prefetch [C1 + 3 * SIZE], 3
  1097. FMOV FZERO, c05
  1098. FMOV FZERO, c02
  1099. #else
  /* TRMM build: either start at the head of B, or offset AO by
     KK << (1 + ZBASE_SHIFT) and BO by KK << ZBASE_SHIFT. */
  1100. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1101. mov B, BO
  1102. #else
  1103. sll KK, 1 + ZBASE_SHIFT, TEMP1
  1104. sll KK, 0 + ZBASE_SHIFT, TEMP2
  1105. add AO, TEMP1, AO
  1106. add B, TEMP2, BO
  1107. #endif
  /* Count is K - KK, KK + 2 or KK + 1 depending on LEFT/TRANSA. */
  1108. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1109. sub K, KK, L
  1110. #elif defined(LEFT)
  1111. add KK, 2, L
  1112. #else
  1113. add KK, 1, L
  1114. #endif
  1115. sra L, 2, L
  1116. cmp L, 0
  1117. LDF [AO + 0 * SIZE], a1
  1118. FMOV FZERO, t1
  1119. LDF [BO + 0 * SIZE], b1
  1120. FMOV FZERO, c07
  1121. LDF [AO + 1 * SIZE], a2
  1122. FMOV FZERO, t2
  1123. LDF [BO + 1 * SIZE], b2
  1124. FMOV FZERO, c04
  1125. LDF [AO + 2 * SIZE], a3
  1126. FMOV FZERO, t3
  1127. LDF [BO + 2 * SIZE], b3
  1128. FMOV FZERO, c08
  1129. LDF [AO + 3 * SIZE], a4
  1130. FMOV FZERO, t4
  1131. LDF [BO + 3 * SIZE], b4
  1132. FMOV FZERO, c01
  1133. prefetch [C1 + 3 * SIZE], 3
  1134. FMOV FZERO, c05
  1135. FMOV FZERO, c02
  1136. #endif
  /* Skip the unrolled loop when L <= 0; c06 is cleared in the delay slot. */
  1137. ble,pn %icc, .LL125
  1138. FMOV FZERO, c06
  /* .LL122: main loop, four steps per pass. Each pass advances AO by 16 * SIZE
     and BO by 8 * SIZE and decrements L. Both pointers are bumped early, so
     loads with negative offsets that follow address data relative to the
     already-advanced pointer. Every FADDn folds the product left in t1..t4 by
     an earlier FMUL into c01..c08 before that t register is overwritten.
     NOTE(review): the interleaved nops look like issue-slot padding for
     instruction scheduling -- confirm against the target CPU before removing. */
  1139. .LL122:
  1140. FADD1 c03, t1, c03
  1141. add L, -1, L
  1142. FMUL a1, b1, t1
  1143. prefetch [AO + APREFETCHSIZE * SIZE], 0
  1144. FADD3 c07, t2, c07
  1145. add BO, 8 * SIZE, BO
  1146. FMUL a1, b2, t2
  1147. LDF [AO + 4 * SIZE], a1
  1148. FADD2 c04, t3, c04
  1149. add AO, 16 * SIZE, AO
  1150. FMUL a2, b1, t3
  /* Sets %icc for the bg at the bottom of the loop. */
  1151. cmp L, 0
  1152. FADD4 c08, t4, c08
  1153. nop
  1154. FMUL a2, b2, t4
  1155. LDF [AO - 11 * SIZE], a2
  1156. FADD1 c01, t1, c01
  1157. nop
  1158. FMUL a3, b1, t1
  1159. nop
  1160. FADD3 c05, t2, c05
  1161. nop
  1162. FMUL a3, b2, t2
  1163. LDF [AO - 10 * SIZE], a3
  1164. FADD2 c02, t3, c02
  1165. nop
  1166. FMUL a4, b1, t3
  1167. LDF [BO - 4 * SIZE], b1
  1168. FADD4 c06, t4, c06
  1169. nop
  1170. FMUL a4, b2, t4
  1171. LDF [BO - 3 * SIZE], b2
  /* Second step: multiplies by b3/b4. */
  1172. FADD1 c03, t1, c03
  1173. nop
  1174. FMUL a1, b3, t1
  1175. LDF [AO - 9 * SIZE], a4
  1176. FADD3 c07, t2, c07
  1177. nop
  1178. FMUL a1, b4, t2
  1179. LDF [AO - 8 * SIZE], a1
  1180. FADD2 c04, t3, c04
  1181. nop
  1182. FMUL a2, b3, t3
  1183. nop
  1184. FADD4 c08, t4, c08
  1185. nop
  1186. FMUL a2, b4, t4
  1187. LDF [AO - 7 * SIZE], a2
  1188. FADD1 c01, t1, c01
  1189. nop
  1190. FMUL a3, b3, t1
  1191. nop
  1192. FADD3 c05, t2, c05
  1193. nop
  1194. FMUL a3, b4, t2
  1195. LDF [AO - 6 * SIZE], a3
  1196. FADD2 c02, t3, c02
  1197. nop
  1198. FMUL a4, b3, t3
  1199. LDF [BO - 2 * SIZE], b3
  1200. FADD4 c06, t4, c06
  1201. nop
  1202. FMUL a4, b4, t4
  1203. LDF [BO - 1 * SIZE], b4
  /* Third step: multiplies by the reloaded b1/b2. */
  1204. FADD1 c03, t1, c03
  1205. nop
  1206. FMUL a1, b1, t1
  1207. LDF [AO - 5 * SIZE], a4
  1208. FADD3 c07, t2, c07
  1209. nop
  1210. FMUL a1, b2, t2
  1211. LDF [AO - 4 * SIZE], a1
  1212. FADD2 c04, t3, c04
  1213. nop
  1214. FMUL a2, b1, t3
  1215. nop
  1216. FADD4 c08, t4, c08
  1217. nop
  1218. FMUL a2, b2, t4
  1219. LDF [AO - 3 * SIZE], a2
  1220. FADD1 c01, t1, c01
  1221. nop
  1222. FMUL a3, b1, t1
  1223. nop
  1224. FADD3 c05, t2, c05
  1225. nop
  1226. FMUL a3, b2, t2
  1227. LDF [AO - 2 * SIZE], a3
  1228. FADD2 c02, t3, c02
  1229. nop
  1230. FMUL a4, b1, t3
  1231. LDF [BO + 0 * SIZE], b1
  1232. FADD4 c06, t4, c06
  1233. nop
  1234. FMUL a4, b2, t4
  1235. LDF [BO + 1 * SIZE], b2
  /* Fourth step: multiplies by the reloaded b3/b4; the loads from here on
     fetch the operands for the next pass (offsets 0..3 from AO and BO). */
  1236. FADD1 c03, t1, c03
  1237. nop
  1238. FMUL a1, b3, t1
  1239. LDF [AO - 1 * SIZE], a4
  1240. FADD3 c07, t2, c07
  1241. nop
  1242. FMUL a1, b4, t2
  1243. LDF [AO + 0 * SIZE], a1
  1244. FADD2 c04, t3, c04
  1245. nop
  1246. FMUL a2, b3, t3
  1247. nop
  1248. FADD4 c08, t4, c08
  1249. nop
  1250. FMUL a2, b4, t4
  1251. LDF [AO + 1 * SIZE], a2
  1252. FADD1 c01, t1, c01
  1253. nop
  1254. FMUL a3, b3, t1
  1255. nop
  1256. FADD3 c05, t2, c05
  1257. nop
  1258. FMUL a3, b4, t2
  1259. LDF [AO + 2 * SIZE], a3
  1260. FADD2 c02, t3, c02
  1261. nop
  1262. FMUL a4, b3, t3
  1263. LDF [BO + 2 * SIZE], b3
  1264. FADD4 c06, t4, c06
  1265. FMUL a4, b4, t4
  1266. LDF [AO + 3 * SIZE], a4
  /* Repeat while L > 0; the b4 reload sits in the branch delay slot. */
  1267. bg,pt %icc, .LL122
  1268. LDF [BO + 3 * SIZE], b4
  /* .LL125: compute the leftover trip count L (low two bits of the k count)
     for the single-step loop at .LL126. */
  1269. .LL125:
  1270. #ifndef TRMMKERNEL
  /* Plain build: L = K & 3. */
  1271. and K, 3, L
  1272. #else
  /* TRMM build: the count is K - KK, KK + 2 or KK + 1 depending on
     LEFT/TRANSA, then masked to its low two bits. */
  1273. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1274. sub K, KK, L
  1275. #elif defined(LEFT)
  1276. add KK, 2, L
  1277. #else
  1278. add KK, 1, L
  1279. #endif
  1280. and L, 3, L
  1281. #endif
  1282. cmp L, 0
  /* Nothing left over: go to the write-back at .LL129 (annulled branch with a
     nop in its delay slot). */
  1283. ble,a,pn %icc, .LL129
  1284. nop
  /* .LL126: tail loop, one step per pass. Each pass advances AO by 4 * SIZE and
     BO by 2 * SIZE and decrements L. As in the unrolled loop, each FADDn folds
     the product from an earlier FMUL (held in t1..t4) into c01..c08 before the
     next FMUL overwrites that t register. */
  1285. .LL126:
  1286. FADD1 c03, t1, c03
  1287. add AO, 4 * SIZE, AO
  1288. FMUL a1, b1, t1
  1289. add BO, 2 * SIZE, BO
  1290. FADD3 c07, t2, c07
  1291. add L, -1, L
  1292. FMUL a1, b2, t2
  1293. LDF [AO + 0 * SIZE], a1
  1294. FADD2 c04, t3, c04
  /* Sets %icc for the bg at the bottom; only floating-point ops and loads follow. */
  1295. cmp L, 0
  1296. FMUL a2, b1, t3
  1297. FADD4 c08, t4, c08
  1298. FMUL a2, b2, t4
  1299. LDF [AO + 1 * SIZE], a2
  1300. FADD1 c01, t1, c01
  1301. FMUL a3, b1, t1
  1302. FADD3 c05, t2, c05
  1303. FMUL a3, b2, t2
  1304. LDF [AO + 2 * SIZE], a3
  1305. FADD2 c02, t3, c02
  1306. FMUL a4, b1, t3
  1307. LDF [BO + 0 * SIZE], b1
  1308. FADD4 c06, t4, c06
  1309. FMUL a4, b2, t4
  1310. LDF [BO + 1 * SIZE], b2
  /* Repeat while L > 0; the a4 reload sits in the branch delay slot. */
  1311. bg,pt %icc, .LL126
  1312. LDF [AO + 3 * SIZE], a4
  /* .LL129: drain the pipeline, write four floating-point values at
     C1[0..3], advance C1 by 4 * SIZE, and close the I loop. */
  1313. .LL129:
  1314. #ifndef TRMMKERNEL
  /* Fold the last pending products into the accumulators while loading the
     current contents of C1[0..3] into a1..a4. */
  1315. FADD1 c03, t1, c03
  1316. LDF [C1 + 0 * SIZE], a1
  1317. FADD3 c07, t2, c07
  1318. LDF [C1 + 1 * SIZE], a2
  1319. FADD2 c04, t3, c04
  1320. LDF [C1 + 2 * SIZE], a3
  1321. FADD4 c08, t4, c08
  1322. LDF [C1 + 3 * SIZE], a4
  /* Merge partial sums (c01+=c06, c02+=c05, c03+=c08, c04+=c07) and start the
     scaling by ALPHA_R. */
  1323. FADD c01, c06, c01
  1324. FMUL ALPHA_R, c01, t1
  1325. FADD c02, c05, c02
  1326. FMUL ALPHA_R, c02, t2
  1327. FADD c03, c08, c03
  1328. FMUL ALPHA_R, c03, t3
  1329. FADD c04, c07, c04
  1330. FMUL ALPHA_R, c04, t4
  /* Net effect of the sequence below:
       C1[0] += ALPHA_R * c01 - ALPHA_I * c02
       C1[1] += ALPHA_R * c02 + ALPHA_I * c01
       C1[2] += ALPHA_R * c03 - ALPHA_I * c04
       C1[3] += ALPHA_R * c04 + ALPHA_I * c03 */
  1331. FADD a1, t1, a1
  1332. FMUL ALPHA_I, c02, t1
  1333. FADD a2, t2, a2
  1334. FMUL ALPHA_I, c01, t2
  1335. FADD a3, t3, a3
  1336. FMUL ALPHA_I, c04, t3
  1337. FADD a4, t4, a4
  1338. FMUL ALPHA_I, c03, t4
  1339. FSUB a1, t1, a1
  1340. FADD a2, t2, a2
  1341. FSUB a3, t3, a3
  1342. FADD a4, t4, a4
  /* Store the results and clear t1..t4. */
  1343. STF a1, [C1 + 0 * SIZE]
  1344. FMOV FZERO, t1
  1345. STF a2, [C1 + 1 * SIZE]
  1346. FMOV FZERO, t2
  1347. STF a3, [C1 + 2 * SIZE]
  1348. FMOV FZERO, t3
  1349. STF a4, [C1 + 3 * SIZE]
  1350. FMOV FZERO, t4
  1351. #else
  /* TRMM build: the merged sums are stored directly; C is not read and no
     ALPHA scaling is applied here. */
  1352. FADD1 c03, t1, c03
  1353. FADD3 c07, t2, c07
  1354. FADD2 c04, t3, c04
  1355. FADD4 c08, t4, c08
  1356. FADD c01, c06, c01
  1357. FADD c02, c05, c02
  1358. FADD c03, c08, c03
  1359. FADD c04, c07, c04
  1360. STF c01, [C1 + 0 * SIZE]
  1361. FMOV FZERO, t1
  1362. STF c02, [C1 + 1 * SIZE]
  1363. FMOV FZERO, t2
  1364. STF c03, [C1 + 2 * SIZE]
  1365. FMOV FZERO, t3
  1366. STF c04, [C1 + 3 * SIZE]
  1367. FMOV FZERO, t4
  1368. #endif
  1369. add C1, 4 * SIZE, C1
  1370. #ifdef TRMMKERNEL
  1371. #if ( defined(LEFT) && defined(TRANSA)) || \
  1372. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP1 = K - KK - (2 if LEFT, else 1); AO advances by
     TEMP1 << (1 + ZBASE_SHIFT) and BO by TEMP1 << ZBASE_SHIFT. */
  1373. sub K, KK, TEMP1
  1374. #ifdef LEFT
  1375. add TEMP1, -2, TEMP1
  1376. #else
  1377. add TEMP1, -1, TEMP1
  1378. #endif
  1379. sll TEMP1, 1 + ZBASE_SHIFT, TEMP2
  1380. sll TEMP1, 0 + ZBASE_SHIFT, TEMP1
  1381. add AO, TEMP2, AO
  1382. add BO, TEMP1, BO
  1383. #endif
  1384. #ifdef LEFT
  1385. add KK, 2, KK
  1386. #endif
  1387. #endif
  /* Next I iteration while I > 0; c03 is cleared in the branch delay slot. */
  1388. add I, -1, I
  1389. cmp I, 0
  1390. bg,pt %icc, .LL121
  1391. FMOV FZERO, c03
  /* .LL150: entry for the M & 1 remainder. Leaves through .LL999 when
     M & 1 is zero; otherwise preloads a1..a4 / b1..b4, clears c01..c04 and
     t1..t4, and sets L to the trip count of the unrolled loop at .LL152. */
  1392. .LL150:
  1393. and M, 1, I
  1394. cmp I, 0
  1395. ble,pn %icc, .LL999
  1396. nop
  1397. #if !defined(TRMMKERNEL)
  1398. LDF [AO + 0 * SIZE], a1
  /* L = K >> 2. */
  1399. sra K, 2, L
  1400. FMOV FZERO, c01
  1401. LDF [B + 0 * SIZE], b1
  1402. mov B, BO
  1403. FMOV FZERO, t1
  1404. LDF [AO + 1 * SIZE], a2
  /* Sets %icc for the ble after the #endif; only floating-point moves and
     loads follow. */
  1405. cmp L, 0
  1406. FMOV FZERO, c02
  1407. LDF [B + 1 * SIZE], b2
  1408. FMOV FZERO, t2
  1409. LDF [AO + 2 * SIZE], a3
  1410. FMOV FZERO, c03
  1411. LDF [B + 2 * SIZE], b3
  1412. FMOV FZERO, t3
  1413. LDF [AO + 3 * SIZE], a4
  1414. FMOV FZERO, c04
  1415. LDF [B + 3 * SIZE], b4
  1416. FMOV FZERO, t4
  1417. #else
  /* TRMM build: either start at the head of B, or offset both AO and BO by
     KK << ZBASE_SHIFT. */
  1418. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1419. mov B, BO
  1420. #else
  1421. sll KK, 0 + ZBASE_SHIFT, TEMP1
  1422. sll KK, 0 + ZBASE_SHIFT, TEMP2
  1423. add AO, TEMP1, AO
  1424. add B, TEMP2, BO
  1425. #endif
  /* Count is K - KK or KK + 1 (both the LEFT and non-LEFT arms add 1). */
  1426. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1427. sub K, KK, L
  1428. #elif defined(LEFT)
  1429. add KK, 1, L
  1430. #else
  1431. add KK, 1, L
  1432. #endif
  1433. sra L, 2, L
  1434. cmp L, 0
  1435. LDF [AO + 0 * SIZE], a1
  1436. FMOV FZERO, c01
  1437. LDF [BO + 0 * SIZE], b1
  1438. FMOV FZERO, t1
  1439. LDF [AO + 1 * SIZE], a2
  1440. FMOV FZERO, c02
  1441. LDF [BO + 1 * SIZE], b2
  1442. FMOV FZERO, t2
  1443. LDF [AO + 2 * SIZE], a3
  1444. FMOV FZERO, c03
  1445. LDF [BO + 2 * SIZE], b3
  1446. FMOV FZERO, t3
  1447. LDF [AO + 3 * SIZE], a4
  1448. FMOV FZERO, c04
  1449. LDF [BO + 3 * SIZE], b4
  1450. FMOV FZERO, t4
  1451. #endif
  /* Skip the unrolled loop when L <= 0. */
  1452. ble,pn %icc, .LL155
  1453. nop
  /* .LL152: main loop, four steps per pass. Each pass advances both AO and BO
     by 8 * SIZE and decrements L. BO is bumped near the top (so the B reloads
     use offsets -4..+3), AO just before the branch (so the A reloads use
     offsets +4..+11 from the old AO). Every FADDn folds the product left in
     t1..t4 by an earlier FMUL into c01..c04. */
  1454. .LL152:
  1455. FADD1 c01, t1, c01
  1456. add L, -1, L
  1457. FMUL a1, b1, t1
  1458. prefetch [AO + APREFETCHSIZE * SIZE], 0
  1459. FADD3 c02, t2, c02
  1460. add BO, 8 * SIZE, BO
  1461. FMUL a1, b2, t2
  1462. LDF [AO + 4 * SIZE], a1
  1463. FADD2 c03, t3, c03
  /* Sets %icc for the bg at the bottom; the later `add AO` does not modify
     the condition codes. */
  1464. cmp L, 0
  1465. FMUL a2, b1, t3
  1466. LDF [BO - 4 * SIZE], b1
  1467. FADD4 c04, t4, c04
  1468. nop
  1469. FMUL a2, b2, t4
  1470. LDF [AO + 5 * SIZE], a2
  /* Second step: a3/a4 times b3/b4. */
  1471. FADD1 c01, t1, c01
  1472. nop
  1473. FMUL a3, b3, t1
  1474. LDF [BO - 3 * SIZE], b2
  1475. FADD3 c02, t2, c02
  1476. nop
  1477. FMUL a3, b4, t2
  1478. LDF [AO + 6 * SIZE], a3
  1479. FADD2 c03, t3, c03
  1480. nop
  1481. FMUL a4, b3, t3
  1482. LDF [BO - 2 * SIZE], b3
  1483. FADD4 c04, t4, c04
  1484. nop
  1485. FMUL a4, b4, t4
  1486. LDF [AO + 7 * SIZE], a4
  /* Third step: reloaded a1/a2 times reloaded b1/b2. */
  1487. FADD1 c01, t1, c01
  1488. nop
  1489. FMUL a1, b1, t1
  1490. LDF [BO - 1 * SIZE], b4
  1491. FADD3 c02, t2, c02
  1492. FMUL a1, b2, t2
  1493. LDF [AO + 8 * SIZE], a1
  1494. FADD2 c03, t3, c03
  1495. FMUL a2, b1, t3
  1496. LDF [BO + 0 * SIZE], b1
  1497. FADD4 c04, t4, c04
  1498. FMUL a2, b2, t4
  1499. LDF [AO + 9 * SIZE], a2
  /* Fourth step: reloaded a3/a4 times reloaded b3/b4. */
  1500. FADD1 c01, t1, c01
  1501. FMUL a3, b3, t1
  1502. LDF [BO + 1 * SIZE], b2
  1503. FADD3 c02, t2, c02
  1504. FMUL a3, b4, t2
  1505. LDF [AO + 10 * SIZE], a3
  1506. FADD2 c03, t3, c03
  1507. FMUL a4, b3, t3
  1508. LDF [BO + 2 * SIZE], b3
  1509. FADD4 c04, t4, c04
  1510. FMUL a4, b4, t4
  1511. LDF [AO + 11 * SIZE], a4
  1512. add AO, 8 * SIZE, AO
  /* Repeat while L > 0; the b4 reload sits in the branch delay slot. */
  1513. bg,pt %icc, .LL152
  1514. LDF [BO + 3 * SIZE], b4
  /* .LL155: compute the leftover trip count L (low two bits of the k count)
     for the single-step loop at .LL156. */
  1515. .LL155:
  1516. #ifndef TRMMKERNEL
  /* Plain build: L = K & 3. */
  1517. and K, 3, L
  1518. #else
  /* TRMM build: the count is K - KK or KK + 1 (both the LEFT and non-LEFT
     arms add 1), then masked to its low two bits. */
  1519. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1520. sub K, KK, L
  1521. #elif defined(LEFT)
  1522. add KK, 1, L
  1523. #else
  1524. add KK, 1, L
  1525. #endif
  1526. and L, 3, L
  1527. #endif
  1528. cmp L, 0
  /* Nothing left over: go to the write-back at .LL159 (annulled branch with a
     nop in its delay slot). */
  1529. ble,a,pn %icc, .LL159
  1530. nop
  /* .LL156: tail loop, one step per pass. Each pass advances AO and BO by
     2 * SIZE and decrements L. Each FADDn folds the product from an earlier
     FMUL (held in t1..t4) into c01..c04 before that t register is reused. */
  1531. .LL156:
  1532. FADD1 c01, t1, c01
  1533. add AO, 2 * SIZE, AO
  1534. FMUL a1, b1, t1
  1535. add BO, 2 * SIZE, BO
  1536. FADD3 c02, t2, c02
  1537. add L, -1, L
  1538. FMUL a1, b2, t2
  1539. LDF [AO + 0 * SIZE], a1
  1540. FADD2 c03, t3, c03
  1541. FMUL a2, b1, t3
  1542. LDF [BO + 0 * SIZE], b1
  /* Sets %icc for the bg at the bottom; only floating-point ops and loads follow. */
  1543. cmp L, 0
  1544. FADD4 c04, t4, c04
  1545. FMUL a2, b2, t4
  1546. LDF [BO + 1 * SIZE], b2
  /* Repeat while L > 0; the a2 reload sits in the branch delay slot. */
  1547. bg,pt %icc, .LL156
  1548. LDF [AO + 1 * SIZE], a2
  /* .LL159: drain the pipeline and write the final two floating-point values
     at C1[0..1], then advance C1 by 2 * SIZE. Execution falls through to the
     return sequence at .LL999. */
  1549. .LL159:
  1550. #ifndef TRMMKERNEL
  /* Fold the last pending products into c01..c04. */
  1551. FADD1 c01, t1, c01
  1552. FADD3 c02, t2, c02
  1553. FADD2 c03, t3, c03
  1554. FADD4 c04, t4, c04
  1555. LDF [C1 + 0 * SIZE], a1
  1556. LDF [C1 + 1 * SIZE], a2
  /* Merge partial sums (c01+=c04, c02+=c03), then:
       C1[0] += ALPHA_R * c01 - ALPHA_I * c02
       C1[1] += ALPHA_R * c02 + ALPHA_I * c01 */
  1557. FADD c01, c04, c01
  1558. FADD c02, c03, c02
  1559. FMUL ALPHA_R, c01, t1
  1560. FMUL ALPHA_R, c02, t2
  1561. FMUL ALPHA_I, c02, t3
  1562. FMUL ALPHA_I, c01, t4
  1563. FADD a1, t1, a1
  1564. FADD a2, t2, a2
  1565. FSUB a1, t3, a1
  1566. FADD a2, t4, a2
  1567. STF a1, [C1 + 0 * SIZE]
  1568. STF a2, [C1 + 1 * SIZE]
  1569. #else
  /* TRMM build: the merged sums are stored directly; C is not read and no
     ALPHA scaling is applied here. */
  1570. FADD1 c01, t1, c01
  1571. FADD3 c02, t2, c02
  1572. FADD2 c03, t3, c03
  1573. FADD4 c04, t4, c04
  1574. FADD c01, c04, c01
  1575. FADD c02, c03, c02
  1576. STF c01, [C1 + 0 * SIZE]
  1577. STF c02, [C1 + 1 * SIZE]
  1578. #endif
  1579. add C1, 2 * SIZE, C1
  /* The AO/BO/KK fix-up is TRMM-only bookkeeping, so it is guarded with #ifdef
     like the equivalent fix-ups after the other write-back blocks in this
     file. Guarding it with #ifndef would emit KK-based pointer arithmetic in
     non-TRMM builds and omit it from TRMM builds. */
  1580. #ifdef TRMMKERNEL
  1581. #if ( defined(LEFT) && defined(TRANSA)) || \
  1582. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP1 = K - KK - 1; AO and BO both advance by TEMP1 << ZBASE_SHIFT. */
  1583. sub K, KK, TEMP1
  1584. #ifdef LEFT
  1585. add TEMP1, -1, TEMP1
  1586. #else
  1587. add TEMP1, -1, TEMP1
  1588. #endif
  1589. sll TEMP1, 0 + ZBASE_SHIFT, TEMP2
  1590. sll TEMP1, 0 + ZBASE_SHIFT, TEMP1
  1591. add AO, TEMP2, AO
  1592. add BO, TEMP1, BO
  1593. #endif
  1594. #ifdef LEFT
  1595. add KK, 1, KK
  1596. #endif
  1597. #endif
  /* .LL999: common exit. `return %i7 + 8` transfers control back to the
     caller; the `clr %o0` in its delay slot zeroes the return-value register.
     EPILOGUE is a macro defined outside this chunk. */
  1598. .LL999:
  1599. return %i7 + 8
  1600. clr %o0
  1601. EPILOGUE