You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_kernel_power6.S 47 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build-time configuration for this kernel: the integer load mnemonic, */
  /* the stack-frame layout, and symbolic names for the argument and */
  /* working registers. All choices are made by the preprocessor from */
  /* __64BIT__, the target OS macros and DOUBLE. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD is the integer load mnemonic: lwz when __64BIT__ is not defined, */
  /* ld when it is. */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* Stack frame. The prologue lowers SP by STACKSIZE, stores f1 into the */
  /* ALPHA slot and stores r0 (set to 0 beforehand) into the FZERO slot. */
  /* ALPHA is read back with lfd as the scale factor applied to the */
  /* accumulators; FZERO is read back with lfs to clear them. */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA 296(SP)
  48. #define FZERO 304(SP)
  49. #else
  50. #define STACKSIZE 240
  51. #define ALPHA 224(SP)
  52. #define FZERO 232(SP)
  53. #endif
  /* Extents. The kernel branches to LL(999) if any of M, N, K is <= 0. */
  /* N >> 2 seeds the J loop, M >> 2 seeds the I loop, and K (shifted) is */
  /* moved into CTR for the inner loops. */
  54. #define M r3
  55. #define N r4
  56. #define K r5
  /* Pointer and stride arguments. Which register holds which argument */
  /* depends on the OS and on 32- vs 64-bit mode. */
  /* linux / FreeBSD: in the 64-bit TRMMKERNEL build the prologue reloads */
  /* OFFSET from the caller's frame instead of relying on r6. */
  57. #if defined(linux) || defined(__FreeBSD__)
  58. #ifndef __64BIT__
  59. #define A r6
  60. #define B r7
  61. #define C r8
  62. #define LDC r9
  63. #define OFFSET r10
  64. #else
  65. #define A r7
  66. #define B r8
  67. #define C r9
  68. #define LDC r10
  69. #define OFFSET r6
  70. #endif
  71. #endif
  /* AIX / Apple: in the 32-bit DOUBLE build the prologue reloads LDC from */
  /* the caller's frame (FRAMESLOT(0)); in TRMMKERNEL builds OFFSET is */
  /* reloaded from the frame as well. */
  72. #if defined(_AIX) || defined(__APPLE__)
  73. #if !defined(__64BIT__) && defined(DOUBLE)
  74. #define A r8
  75. #define B r9
  76. #define C r10
  77. #define LDC r7
  78. #define OFFSET r6
  79. #else
  80. #define A r7
  81. #define B r8
  82. #define C r9
  83. #define LDC r10
  84. #define OFFSET r6
  85. #endif
  86. #endif
  /* Working registers r19-r31. The prologue always saves r21-r31; it */
  /* saves r19 and r20 (TEMP, KK) only when TRMMKERNEL is defined. */
  /* TEMP  scratch for trip counts and byte offsets in TRMMKERNEL paths */
  /* KK    running offset derived from OFFSET (TRMMKERNEL only) */
  /* BB    index used with dcbtst on B, seeded from K << (BASE_SHIFT + 2) */
  /* I, J  outer loop counters derived from M and N */
  /* AO/BO running pointers into A and B for the current tile */
  /* CO1-4 four C pointers, each one LDC further than the previous */
  /* PREA  index used with dcbt on AO; PREC index used with dcbtst on CO1-4 */
  87. #define TEMP r19
  88. #define KK r20
  89. #define BB r21
  90. #define I r22
  91. #define J r23
  92. #define AO r24
  93. #define BO r25
  94. #define CO1 r26
  95. #define CO2 r27
  96. #define CO3 r28
  97. #define CO4 r29
  98. #define PREA r30
  99. #define PREC r31
  100. #ifndef NEEDPARAM
  101. PROLOGUE
  102. PROFCODE
  103. addi SP, SP, -STACKSIZE
  104. li r0, 0
  105. stfd f14, 0(SP)
  106. stfd f15, 8(SP)
  107. stfd f16, 16(SP)
  108. stfd f17, 24(SP)
  109. stfd f18, 32(SP)
  110. stfd f19, 40(SP)
  111. stfd f20, 48(SP)
  112. stfd f21, 56(SP)
  113. stfd f22, 64(SP)
  114. stfd f23, 72(SP)
  115. stfd f24, 80(SP)
  116. stfd f25, 88(SP)
  117. stfd f26, 96(SP)
  118. stfd f27, 104(SP)
  119. stfd f28, 112(SP)
  120. stfd f29, 120(SP)
  121. stfd f30, 128(SP)
  122. stfd f31, 136(SP)
  123. #ifdef __64BIT__
  124. std r31, 144(SP)
  125. std r30, 152(SP)
  126. std r29, 160(SP)
  127. std r28, 168(SP)
  128. std r27, 176(SP)
  129. std r26, 184(SP)
  130. std r25, 192(SP)
  131. std r24, 200(SP)
  132. std r23, 208(SP)
  133. std r22, 216(SP)
  134. std r21, 224(SP)
  135. #ifdef TRMMKERNEL
  136. std r20, 232(SP)
  137. std r19, 240(SP)
  138. #endif
  139. #else
  140. stw r31, 144(SP)
  141. stw r30, 148(SP)
  142. stw r29, 152(SP)
  143. stw r28, 156(SP)
  144. stw r27, 160(SP)
  145. stw r26, 164(SP)
  146. stw r25, 168(SP)
  147. stw r24, 172(SP)
  148. stw r23, 176(SP)
  149. stw r22, 180(SP)
  150. stw r21, 184(SP)
  151. #ifdef TRMMKERNEL
  152. stw r20, 188(SP)
  153. stw r19, 192(SP)
  154. #endif
  155. #endif
  156. stfd f1, ALPHA
  157. stw r0, FZERO
  158. #if defined(_AIX) || defined(__APPLE__)
  159. #if !defined(__64BIT__) && defined(DOUBLE)
  160. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  161. #endif
  162. #endif
  163. slwi LDC, LDC, BASE_SHIFT
  164. #if defined(TRMMKERNEL)
  165. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  166. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  167. #endif
  168. #if defined(_AIX) || defined(__APPLE__)
  169. #ifdef __64BIT__
  170. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  171. #else
  172. #ifdef DOUBLE
  173. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  174. #else
  175. lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  176. #endif
  177. #endif
  178. #endif
  179. #endif
  180. #if defined(TRMMKERNEL) && !defined(LEFT)
  181. neg KK, OFFSET
  182. #endif
  183. cmpwi cr0, M, 0
  184. ble LL(999)
  185. cmpwi cr0, N, 0
  186. ble LL(999)
  187. cmpwi cr0, K, 0
  188. ble LL(999)
  189. lfs f0, FZERO
  190. li PREA, (16 * 3) * SIZE
  191. srawi. J, N, 2
  192. li PREC, 3 * SIZE
  193. ble LL(40)
  194. .align 4
  195. LL(10):
  196. mr CO1, C
  197. fmr f1, f0
  198. add CO2, C, LDC
  199. fmr f2, f0
  200. add CO3, CO2, LDC
  201. fmr f3, f0
  202. add CO4, CO3, LDC
  203. fmr f4, f0
  204. #if defined(TRMMKERNEL) && defined(LEFT)
  205. mr KK, OFFSET
  206. #endif
  207. slwi BB, K, BASE_SHIFT + 2
  208. fmr f5, f0
  209. srawi. I, M, 2
  210. fmr f6, f0
  211. mr AO, A
  212. fmr f7, f0
  213. add C, CO4, LDC
  214. fmr f8, f0
  215. fmr f9, f0
  216. fmr f10, f0
  217. fmr f11, f0
  218. fmr f12, f0
  219. fmr f13, f0
  220. fmr f14, f0
  221. fmr f15, f0
  222. ble LL(20)
  223. .align 4
  224. LL(11):
  225. #if defined(TRMMKERNEL)
  226. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  227. LFD f16, 0 * SIZE(AO)
  228. LFD f20, 0 * SIZE(B)
  229. LFD f17, 1 * SIZE(AO)
  230. LFD f21, 1 * SIZE(B)
  231. LFD f18, 2 * SIZE(AO)
  232. LFD f22, 2 * SIZE(B)
  233. LFD f19, 3 * SIZE(AO)
  234. LFD f23, 3 * SIZE(B)
  235. mr BO, B
  236. #else
  237. slwi r0, KK, 2 + BASE_SHIFT
  238. add AO, AO, r0
  239. add BO, B, r0
  240. LFD f16, 0 * SIZE(AO)
  241. LFD f20, 0 * SIZE(BO)
  242. LFD f17, 1 * SIZE(AO)
  243. LFD f21, 1 * SIZE(BO)
  244. LFD f18, 2 * SIZE(AO)
  245. LFD f22, 2 * SIZE(BO)
  246. LFD f19, 3 * SIZE(AO)
  247. LFD f23, 3 * SIZE(BO)
  248. #endif
  249. dcbtst CO1, PREC
  250. dcbtst CO2, PREC
  251. dcbtst CO3, PREC
  252. dcbtst CO4, PREC
  253. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  254. sub TEMP, K, KK
  255. #elif defined(LEFT)
  256. addi TEMP, KK, 4
  257. #else
  258. addi TEMP, KK, 4
  259. #endif
  260. srawi. TEMP, TEMP, 3
  261. mtspr CTR, TEMP
  262. ble LL(15)
  263. #else
  264. LFD f16, 0 * SIZE(AO)
  265. LFD f20, 0 * SIZE(B)
  266. LFD f17, 1 * SIZE(AO)
  267. LFD f21, 1 * SIZE(B)
  268. LFD f18, 2 * SIZE(AO)
  269. LFD f22, 2 * SIZE(B)
  270. LFD f19, 3 * SIZE(AO)
  271. LFD f23, 3 * SIZE(B)
  272. dcbtst CO1, PREC
  273. dcbtst CO2, PREC
  274. dcbtst CO3, PREC
  275. dcbtst CO4, PREC
  276. srawi. r0, K, 3
  277. mtctr r0
  278. mr BO, B
  279. ble LL(15)
  280. #endif
  281. .align 4
  282. LL(12):
  283. dcbt AO, PREA
  284. FMADD f0, f16, f20, f0
  285. nop
  286. FMADD f4, f16, f21, f4
  287. FMADD f8, f16, f22, f8
  288. FMADD f12, f16, f23, f12
  289. LFD f24, 4 * SIZE(AO)
  290. LFD f25, 5 * SIZE(AO)
  291. FMADD f1, f17, f20, f1
  292. FMADD f5, f17, f21, f5
  293. LFD f28, 4 * SIZE(BO)
  294. LFD f29, 5 * SIZE(BO)
  295. FMADD f9, f17, f22, f9
  296. FMADD f13, f17, f23, f13
  297. FMADD f2, f18, f20, f2
  298. FMADD f6, f18, f21, f6
  299. FMADD f10, f18, f22, f10
  300. FMADD f14, f18, f23, f14
  301. LFD f26, 6 * SIZE(AO)
  302. LFD f27, 7 * SIZE(AO)
  303. FMADD f3, f19, f20, f3
  304. FMADD f7, f19, f21, f7
  305. LFD f30, 6 * SIZE(BO)
  306. LFD f31, 7 * SIZE(BO)
  307. FMADD f11, f19, f22, f11
  308. FMADD f15, f19, f23, f15
  309. FMADD f0, f24, f28, f0
  310. FMADD f4, f24, f29, f4
  311. FMADD f8, f24, f30, f8
  312. FMADD f12, f24, f31, f12
  313. LFD f16, 8 * SIZE(AO)
  314. LFD f17, 9 * SIZE(AO)
  315. FMADD f1, f25, f28, f1
  316. FMADD f5, f25, f29, f5
  317. LFD f20, 8 * SIZE(BO)
  318. LFD f21, 9 * SIZE(BO)
  319. FMADD f9, f25, f30, f9
  320. FMADD f13, f25, f31, f13
  321. FMADD f2, f26, f28, f2
  322. FMADD f6, f26, f29, f6
  323. FMADD f10, f26, f30, f10
  324. FMADD f14, f26, f31, f14
  325. LFD f18, 10 * SIZE(AO)
  326. LFD f19, 11 * SIZE(AO)
  327. FMADD f3, f27, f28, f3
  328. FMADD f7, f27, f29, f7
  329. LFD f22, 10 * SIZE(BO)
  330. LFD f23, 11 * SIZE(BO)
  331. FMADD f11, f27, f30, f11
  332. FMADD f15, f27, f31, f15
  333. FMADD f0, f16, f20, f0
  334. FMADD f4, f16, f21, f4
  335. FMADD f8, f16, f22, f8
  336. FMADD f12, f16, f23, f12
  337. LFD f24, 12 * SIZE(AO)
  338. LFD f25, 13 * SIZE(AO)
  339. FMADD f1, f17, f20, f1
  340. FMADD f5, f17, f21, f5
  341. LFD f28, 12 * SIZE(BO)
  342. LFD f29, 13 * SIZE(BO)
  343. FMADD f9, f17, f22, f9
  344. FMADD f13, f17, f23, f13
  345. FMADD f2, f18, f20, f2
  346. FMADD f6, f18, f21, f6
  347. FMADD f10, f18, f22, f10
  348. FMADD f14, f18, f23, f14
  349. LFD f26, 14 * SIZE(AO)
  350. LFD f27, 15 * SIZE(AO)
  351. FMADD f3, f19, f20, f3
  352. FMADD f7, f19, f21, f7
  353. LFD f30, 14 * SIZE(BO)
  354. LFD f31, 15 * SIZE(BO)
  355. FMADD f11, f19, f22, f11
  356. FMADD f15, f19, f23, f15
  357. FMADD f0, f24, f28, f0
  358. FMADD f4, f24, f29, f4
  359. FMADD f8, f24, f30, f8
  360. FMADD f12, f24, f31, f12
  361. LFD f16, 16 * SIZE(AO)
  362. LFD f17, 17 * SIZE(AO)
  363. FMADD f1, f25, f28, f1
  364. FMADD f5, f25, f29, f5
  365. LFD f20, 16 * SIZE(BO)
  366. LFD f21, 17 * SIZE(BO)
  367. FMADD f9, f25, f30, f9
  368. FMADD f13, f25, f31, f13
  369. FMADD f2, f26, f28, f2
  370. FMADD f6, f26, f29, f6
  371. FMADD f10, f26, f30, f10
  372. FMADD f14, f26, f31, f14
  373. LFD f18, 18 * SIZE(AO)
  374. LFD f19, 19 * SIZE(AO)
  375. FMADD f3, f27, f28, f3
  376. FMADD f7, f27, f29, f7
  377. LFD f22, 18 * SIZE(BO)
  378. LFD f23, 19 * SIZE(BO)
  379. FMADD f11, f27, f30, f11
  380. FMADD f15, f27, f31, f15
  381. FMADD f0, f16, f20, f0
  382. FMADD f4, f16, f21, f4
  383. FMADD f8, f16, f22, f8
  384. FMADD f12, f16, f23, f12
  385. LFD f24, 20 * SIZE(AO)
  386. LFD f25, 21 * SIZE(AO)
  387. FMADD f1, f17, f20, f1
  388. FMADD f5, f17, f21, f5
  389. LFD f28, 20 * SIZE(BO)
  390. LFD f29, 21 * SIZE(BO)
  391. FMADD f9, f17, f22, f9
  392. FMADD f13, f17, f23, f13
  393. FMADD f2, f18, f20, f2
  394. FMADD f6, f18, f21, f6
  395. FMADD f10, f18, f22, f10
  396. FMADD f14, f18, f23, f14
  397. LFD f26, 22 * SIZE(AO)
  398. LFD f27, 23 * SIZE(AO)
  399. FMADD f3, f19, f20, f3
  400. FMADD f7, f19, f21, f7
  401. LFD f30, 22 * SIZE(BO)
  402. LFD f31, 23 * SIZE(BO)
  403. FMADD f11, f19, f22, f11
  404. FMADD f15, f19, f23, f15
  405. FMADD f0, f24, f28, f0
  406. FMADD f4, f24, f29, f4
  407. FMADD f8, f24, f30, f8
  408. FMADD f12, f24, f31, f12
  409. LFD f16, 24 * SIZE(AO)
  410. LFD f17, 25 * SIZE(AO)
  411. FMADD f1, f25, f28, f1
  412. FMADD f5, f25, f29, f5
  413. LFD f20, 24 * SIZE(BO)
  414. LFD f21, 25 * SIZE(BO)
  415. FMADD f9, f25, f30, f9
  416. FMADD f13, f25, f31, f13
  417. FMADD f2, f26, f28, f2
  418. FMADD f6, f26, f29, f6
  419. FMADD f10, f26, f30, f10
  420. FMADD f14, f26, f31, f14
  421. LFD f18, 26 * SIZE(AO)
  422. LFD f19, 27 * SIZE(AO)
  423. FMADD f3, f27, f28, f3
  424. FMADD f7, f27, f29, f7
  425. LFD f22, 26 * SIZE(BO)
  426. LFD f23, 27 * SIZE(BO)
  427. FMADD f11, f27, f30, f11
  428. FMADD f15, f27, f31, f15
  429. FMADD f0, f16, f20, f0
  430. FMADD f4, f16, f21, f4
  431. FMADD f8, f16, f22, f8
  432. FMADD f12, f16, f23, f12
  433. LFD f24, 28 * SIZE(AO)
  434. LFD f25, 29 * SIZE(AO)
  435. FMADD f1, f17, f20, f1
  436. FMADD f5, f17, f21, f5
  437. LFD f28, 28 * SIZE(BO)
  438. LFD f29, 29 * SIZE(BO)
  439. FMADD f9, f17, f22, f9
  440. FMADD f13, f17, f23, f13
  441. FMADD f2, f18, f20, f2
  442. FMADD f6, f18, f21, f6
  443. FMADD f10, f18, f22, f10
  444. FMADD f14, f18, f23, f14
  445. LFD f26, 30 * SIZE(AO)
  446. LFD f27, 31 * SIZE(AO)
  447. FMADD f3, f19, f20, f3
  448. FMADD f7, f19, f21, f7
  449. LFD f30, 30 * SIZE(BO)
  450. LFD f31, 31 * SIZE(BO)
  451. FMADD f11, f19, f22, f11
  452. FMADD f15, f19, f23, f15
  453. FMADD f0, f24, f28, f0
  454. FMADD f4, f24, f29, f4
  455. FMADD f8, f24, f30, f8
  456. FMADD f12, f24, f31, f12
  457. LFD f16, 32 * SIZE(AO)
  458. LFD f17, 33 * SIZE(AO)
  459. FMADD f1, f25, f28, f1
  460. FMADD f5, f25, f29, f5
  461. LFD f20, 32 * SIZE(BO)
  462. LFD f21, 33 * SIZE(BO)
  463. FMADD f9, f25, f30, f9
  464. FMADD f13, f25, f31, f13
  465. LFD f18, 34 * SIZE(AO)
  466. LFD f19, 35 * SIZE(AO)
  467. FMADD f2, f26, f28, f2
  468. FMADD f6, f26, f29, f6
  469. LFD f22, 34 * SIZE(BO)
  470. LFD f23, 35 * SIZE(BO)
  471. FMADD f10, f26, f30, f10
  472. FMADD f14, f26, f31, f14
  473. FMADD f3, f27, f28, f3
  474. FMADD f7, f27, f29, f7
  475. FMADD f11, f27, f30, f11
  476. FMADD f15, f27, f31, f15
  477. addi AO, AO, 32 * SIZE
  478. addi BO, BO, 32 * SIZE
  479. bdnz LL(12)
  480. .align 4
  481. LL(15):
  482. lfd f30, ALPHA
  483. dcbtst B, BB
  484. addi BB, BB, 16 * SIZE
  485. dcbtst B, BB
  486. addi BB, BB, 16 * SIZE
  487. #if defined(TRMMKERNEL)
  488. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  489. sub TEMP, K, KK
  490. #elif defined(LEFT)
  491. addi TEMP, KK, 4
  492. #else
  493. addi TEMP, KK, 4
  494. #endif
  495. andi. TEMP, TEMP, 7
  496. mtspr CTR, TEMP
  497. #else
  498. andi. r0, K, 7
  499. mtspr CTR, r0
  500. #endif
  501. ble+ LL(18)
  502. .align 4
  503. LL(16):
  504. FMADD f0, f16, f20, f0
  505. FMADD f4, f16, f21, f4
  506. FMADD f8, f16, f22, f8
  507. FMADD f12, f16, f23, f12
  508. FMADD f1, f17, f20, f1
  509. FMADD f5, f17, f21, f5
  510. FMADD f9, f17, f22, f9
  511. FMADD f13, f17, f23, f13
  512. FMADD f2, f18, f20, f2
  513. FMADD f6, f18, f21, f6
  514. FMADD f10, f18, f22, f10
  515. FMADD f14, f18, f23, f14
  516. FMADD f3, f19, f20, f3
  517. FMADD f7, f19, f21, f7
  518. FMADD f11, f19, f22, f11
  519. FMADD f15, f19, f23, f15
  520. LFD f16, 4 * SIZE(AO)
  521. LFD f17, 5 * SIZE(AO)
  522. LFD f18, 6 * SIZE(AO)
  523. LFD f19, 7 * SIZE(AO)
  524. LFD f20, 4 * SIZE(BO)
  525. LFD f21, 5 * SIZE(BO)
  526. LFD f22, 6 * SIZE(BO)
  527. LFD f23, 7 * SIZE(BO)
  528. addi BO, BO, 4 * SIZE
  529. addi AO, AO, 4 * SIZE
  530. nop
  531. bdnz LL(16)
  532. .align 4
  533. LL(18):
  534. #ifndef TRMMKERNEL
  535. LFD f16, 0 * SIZE(CO1)
  536. LFD f17, 1 * SIZE(CO1)
  537. LFD f18, 2 * SIZE(CO1)
  538. LFD f19, 3 * SIZE(CO1)
  539. LFD f20, 0 * SIZE(CO2)
  540. LFD f21, 1 * SIZE(CO2)
  541. LFD f22, 2 * SIZE(CO2)
  542. LFD f23, 3 * SIZE(CO2)
  543. FMADD f0, f0, f30, f16
  544. FMADD f1, f1, f30, f17
  545. FMADD f2, f2, f30, f18
  546. FMADD f3, f3, f30, f19
  547. FMADD f4, f4, f30, f20
  548. FMADD f5, f5, f30, f21
  549. FMADD f6, f6, f30, f22
  550. FMADD f7, f7, f30, f23
  551. LFD f16, 0 * SIZE(CO3)
  552. LFD f17, 1 * SIZE(CO3)
  553. LFD f18, 2 * SIZE(CO3)
  554. LFD f19, 3 * SIZE(CO3)
  555. LFD f20, 0 * SIZE(CO4)
  556. LFD f21, 1 * SIZE(CO4)
  557. LFD f22, 2 * SIZE(CO4)
  558. LFD f23, 3 * SIZE(CO4)
  559. FMADD f8, f8, f30, f16
  560. FMADD f9, f9, f30, f17
  561. FMADD f10, f10, f30, f18
  562. FMADD f11, f11, f30, f19
  563. FMADD f12, f12, f30, f20
  564. FMADD f13, f13, f30, f21
  565. FMADD f14, f14, f30, f22
  566. FMADD f15, f15, f30, f23
  567. #else
  568. FMUL f0, f0, f30
  569. FMUL f1, f1, f30
  570. FMUL f2, f2, f30
  571. FMUL f3, f3, f30
  572. FMUL f4, f4, f30
  573. FMUL f5, f5, f30
  574. FMUL f6, f6, f30
  575. FMUL f7, f7, f30
  576. FMUL f8, f8, f30
  577. FMUL f9, f9, f30
  578. FMUL f10, f10, f30
  579. FMUL f11, f11, f30
  580. FMUL f12, f12, f30
  581. FMUL f13, f13, f30
  582. FMUL f14, f14, f30
  583. FMUL f15, f15, f30
  584. #endif
  585. STFD f0, 0 * SIZE(CO1)
  586. STFD f1, 1 * SIZE(CO1)
  587. STFD f2, 2 * SIZE(CO1)
  588. STFD f3, 3 * SIZE(CO1)
  589. lfs f0, FZERO
  590. fmr f1, f0
  591. fmr f2, f0
  592. fmr f3, f0
  593. STFD f4, 0 * SIZE(CO2)
  594. STFD f5, 1 * SIZE(CO2)
  595. STFD f6, 2 * SIZE(CO2)
  596. STFD f7, 3 * SIZE(CO2)
  597. fmr f4, f0
  598. fmr f5, f0
  599. fmr f6, f0
  600. fmr f7, f0
  601. STFD f8, 0 * SIZE(CO3)
  602. STFD f9, 1 * SIZE(CO3)
  603. STFD f10, 2 * SIZE(CO3)
  604. STFD f11, 3 * SIZE(CO3)
  605. fmr f8, f0
  606. fmr f9, f0
  607. fmr f10, f0
  608. fmr f11, f0
  609. STFD f12, 0 * SIZE(CO4)
  610. STFD f13, 1 * SIZE(CO4)
  611. STFD f14, 2 * SIZE(CO4)
  612. STFD f15, 3 * SIZE(CO4)
  613. fmr f12, f0
  614. fmr f13, f0
  615. fmr f14, f0
  616. fmr f15, f0
  617. addi CO1, CO1, 4 * SIZE
  618. addi CO2, CO2, 4 * SIZE
  619. addi CO3, CO3, 4 * SIZE
  620. addi CO4, CO4, 4 * SIZE
  621. #ifdef TRMMKERNEL
  622. #if ( defined(LEFT) && defined(TRANSA)) || \
  623. (!defined(LEFT) && !defined(TRANSA))
  624. sub TEMP, K, KK
  625. #ifdef LEFT
  626. addi TEMP, TEMP, -4
  627. #else
  628. addi TEMP, TEMP, -4
  629. #endif
  630. slwi TEMP, TEMP, 2 + BASE_SHIFT
  631. add AO, AO, TEMP
  632. add BO, BO, TEMP
  633. #endif
  634. #ifdef LEFT
  635. addi KK, KK, 4
  636. #endif
  637. #endif
  638. addic. I, I, -1
  639. bgt+ LL(11)
  640. .align 4
  641. LL(20):
  642. andi. I, M, 2
  643. ble LL(30)
  644. #if defined(TRMMKERNEL)
  645. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  646. LFD f16, 0 * SIZE(AO)
  647. LFD f17, 1 * SIZE(AO)
  648. LFD f18, 2 * SIZE(AO)
  649. LFD f19, 3 * SIZE(AO)
  650. LFD f20, 0 * SIZE(B)
  651. LFD f21, 1 * SIZE(B)
  652. LFD f22, 2 * SIZE(B)
  653. LFD f23, 3 * SIZE(B)
  654. LFD f24, 4 * SIZE(B)
  655. LFD f25, 5 * SIZE(B)
  656. LFD f26, 6 * SIZE(B)
  657. LFD f27, 7 * SIZE(B)
  658. mr BO, B
  659. #else
  660. slwi r0, KK, 1 + BASE_SHIFT
  661. slwi TEMP, KK, 2 + BASE_SHIFT
  662. add AO, AO, r0
  663. add BO, B, TEMP
  664. LFD f16, 0 * SIZE(AO)
  665. LFD f17, 1 * SIZE(AO)
  666. LFD f18, 2 * SIZE(AO)
  667. LFD f19, 3 * SIZE(AO)
  668. LFD f20, 0 * SIZE(BO)
  669. LFD f21, 1 * SIZE(BO)
  670. LFD f22, 2 * SIZE(BO)
  671. LFD f23, 3 * SIZE(BO)
  672. LFD f24, 4 * SIZE(BO)
  673. LFD f25, 5 * SIZE(BO)
  674. LFD f26, 6 * SIZE(BO)
  675. LFD f27, 7 * SIZE(BO)
  676. #endif
  677. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  678. sub TEMP, K, KK
  679. #elif defined(LEFT)
  680. addi TEMP, KK, 2
  681. #else
  682. addi TEMP, KK, 4
  683. #endif
  684. srawi. TEMP, TEMP, 2
  685. mtspr CTR, TEMP
  686. #else
  687. LFD f16, 0 * SIZE(AO)
  688. LFD f17, 1 * SIZE(AO)
  689. LFD f18, 2 * SIZE(AO)
  690. LFD f19, 3 * SIZE(AO)
  691. LFD f20, 0 * SIZE(B)
  692. LFD f21, 1 * SIZE(B)
  693. LFD f22, 2 * SIZE(B)
  694. LFD f23, 3 * SIZE(B)
  695. LFD f24, 4 * SIZE(B)
  696. LFD f25, 5 * SIZE(B)
  697. LFD f26, 6 * SIZE(B)
  698. LFD f27, 7 * SIZE(B)
  699. srawi. r0, K, 2
  700. mtspr CTR, r0
  701. mr BO, B
  702. #endif
  703. ble LL(25)
  704. .align 5
  705. LL(22):
  706. FMADD f0, f16, f20, f0
  707. FMADD f1, f17, f20, f1
  708. FMADD f4, f16, f21, f4
  709. FMADD f5, f17, f21, f5
  710. FMADD f8, f16, f22, f8
  711. FMADD f9, f17, f22, f9
  712. FMADD f12, f16, f23, f12
  713. FMADD f13, f17, f23, f13
  714. LFD f20, 8 * SIZE(BO)
  715. LFD f21, 9 * SIZE(BO)
  716. LFD f22, 10 * SIZE(BO)
  717. LFD f23, 11 * SIZE(BO)
  718. FMADD f2, f18, f24, f2
  719. FMADD f3, f19, f24, f3
  720. FMADD f6, f18, f25, f6
  721. FMADD f7, f19, f25, f7
  722. FMADD f10, f18, f26, f10
  723. FMADD f11, f19, f26, f11
  724. FMADD f14, f18, f27, f14
  725. FMADD f15, f19, f27, f15
  726. LFD f16, 4 * SIZE(AO)
  727. LFD f17, 5 * SIZE(AO)
  728. LFD f18, 6 * SIZE(AO)
  729. LFD f19, 7 * SIZE(AO)
  730. FMADD f0, f16, f20, f0
  731. FMADD f1, f17, f20, f1
  732. FMADD f4, f16, f21, f4
  733. FMADD f5, f17, f21, f5
  734. LFD f24, 12 * SIZE(BO)
  735. LFD f25, 13 * SIZE(BO)
  736. LFD f26, 14 * SIZE(BO)
  737. LFD f27, 15 * SIZE(BO)
  738. FMADD f8, f16, f22, f8
  739. FMADD f9, f17, f22, f9
  740. FMADD f12, f16, f23, f12
  741. FMADD f13, f17, f23, f13
  742. LFD f20, 16 * SIZE(BO)
  743. LFD f21, 17 * SIZE(BO)
  744. LFD f22, 18 * SIZE(BO)
  745. LFD f23, 19 * SIZE(BO)
  746. FMADD f2, f18, f24, f2
  747. FMADD f3, f19, f24, f3
  748. FMADD f6, f18, f25, f6
  749. FMADD f7, f19, f25, f7
  750. FMADD f10, f18, f26, f10
  751. FMADD f11, f19, f26, f11
  752. FMADD f14, f18, f27, f14
  753. FMADD f15, f19, f27, f15
  754. LFD f16, 8 * SIZE(AO)
  755. LFD f17, 9 * SIZE(AO)
  756. LFD f18, 10 * SIZE(AO)
  757. LFD f19, 11 * SIZE(AO)
  758. LFD f24, 20 * SIZE(BO)
  759. LFD f25, 21 * SIZE(BO)
  760. LFD f26, 22 * SIZE(BO)
  761. LFD f27, 23 * SIZE(BO)
  762. addi AO, AO, 8 * SIZE
  763. addi BO, BO, 16 * SIZE
  764. bdnz LL(22)
  765. fadd f0, f2, f0
  766. fadd f1, f3, f1
  767. fadd f4, f6, f4
  768. fadd f5, f7, f5
  769. fadd f8, f10, f8
  770. fadd f9, f11, f9
  771. fadd f12, f14, f12
  772. fadd f13, f15, f13
  /*
   * LL(25): K-remainder setup for the tile that updates two elements in each
   * of CO1..CO4.  Loads the scale factor from ALPHA into f30 and places
   * (count & 3) in CTR.  The count is K for plain GEMM; for TRMM it is
   * K - KK, KK + 2 (LEFT) or KK + 4, depending on LEFT/TRANSA.
   */
  773. .align 4
  774. LL(25):
  775. lfd f30, ALPHA
  776. #if defined(TRMMKERNEL)
  777. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  778. sub TEMP, K, KK
  779. #elif defined(LEFT)
  780. addi TEMP, KK, 2
  781. #else
  782. addi TEMP, KK, 4
  783. #endif
  784. andi. TEMP, TEMP, 3
  785. mtspr CTR, TEMP
  786. #else
  787. andi. r0, K, 3
  788. mtspr CTR, r0
  789. #endif
  /* andi. recorded the remainder in CR0: nothing left over -> skip LL(26). */
  790. ble+ LL(28)
  /*
   * LL(26): remainder loop, one k-step per iteration, CTR iterations.
   * Two A values (f16, f17) are combined with four B values (f20..f23)
   * into the eight accumulators f0/f1, f4/f5, f8/f9, f12/f13.
   * FMADD/LFD are macros defined outside this chunk; FMADD d, a, b, c is
   * presumably d = a * b + c.
   */
  791. .align 4
  792. LL(26):
  793. FMADD f0, f16, f20, f0
  794. FMADD f1, f17, f20, f1
  795. FMADD f4, f16, f21, f4
  796. FMADD f5, f17, f21, f5
  797. FMADD f8, f16, f22, f8
  798. FMADD f9, f17, f22, f9
  799. FMADD f12, f16, f23, f12
  800. FMADD f13, f17, f23, f13
  /* Fetch the operands of the next k-step before the pointers move on. */
  801. LFD f16, 2 * SIZE(AO)
  802. LFD f17, 3 * SIZE(AO)
  803. LFD f20, 4 * SIZE(BO)
  804. LFD f21, 5 * SIZE(BO)
  805. LFD f22, 6 * SIZE(BO)
  806. LFD f23, 7 * SIZE(BO)
  /* One k-step consumes four B values and two A values. */
  807. addi BO, BO, 4 * SIZE
  808. addi AO, AO, 2 * SIZE
  809. bdnz LL(26)
  /*
   * LL(28): write-back for the 2-row by 4-column tile.
   * Plain GEMM: each accumulator is multiplied by f30 (alpha) and added to
   * the value loaded from CO1..CO4.  TRMM: the accumulator is only scaled.
   * Afterwards f0..f15 are re-initialised from FZERO, the four output
   * pointers advance by two elements, and (TRMM only) AO/BO/KK are adjusted.
   */
  810. .align 4
  811. LL(28):
  812. #ifndef TRMMKERNEL
  813. LFD f16, 0 * SIZE(CO1)
  814. LFD f17, 1 * SIZE(CO1)
  815. LFD f18, 0 * SIZE(CO2)
  816. LFD f19, 1 * SIZE(CO2)
  817. FMADD f0, f0, f30, f16
  818. FMADD f1, f1, f30, f17
  819. FMADD f4, f4, f30, f18
  820. FMADD f5, f5, f30, f19
  821. LFD f20, 0 * SIZE(CO3)
  822. LFD f21, 1 * SIZE(CO3)
  823. LFD f22, 0 * SIZE(CO4)
  824. LFD f23, 1 * SIZE(CO4)
  825. FMADD f8, f8, f30, f20
  826. FMADD f9, f9, f30, f21
  827. FMADD f12, f12, f30, f22
  828. FMADD f13, f13, f30, f23
  829. #else
  830. FMUL f0, f0, f30
  831. FMUL f1, f1, f30
  832. FMUL f4, f4, f30
  833. FMUL f5, f5, f30
  834. FMUL f8, f8, f30
  835. FMUL f9, f9, f30
  836. FMUL f12, f12, f30
  837. FMUL f13, f13, f30
  838. #endif
  839. STFD f0, 0 * SIZE(CO1)
  840. STFD f1, 1 * SIZE(CO1)
  841. STFD f4, 0 * SIZE(CO2)
  842. STFD f5, 1 * SIZE(CO2)
  /* f0/f1 are already stored, so clearing starts while later stores are pending. */
  843. lfs f0, FZERO
  844. fmr f1, f0
  845. fmr f2, f0
  846. fmr f3, f0
  847. STFD f8, 0 * SIZE(CO3)
  848. STFD f9, 1 * SIZE(CO3)
  849. STFD f12, 0 * SIZE(CO4)
  850. STFD f13, 1 * SIZE(CO4)
  851. fmr f4, f0
  852. fmr f5, f0
  853. fmr f6, f0
  854. fmr f7, f0
  855. fmr f8, f0
  856. fmr f9, f0
  857. fmr f10, f0
  858. fmr f11, f0
  859. fmr f12, f0
  860. fmr f13, f0
  861. fmr f14, f0
  862. fmr f15, f0
  863. addi CO1, CO1, 2 * SIZE
  864. addi CO2, CO2, 2 * SIZE
  865. addi CO3, CO3, 2 * SIZE
  866. addi CO4, CO4, 2 * SIZE
  867. #ifdef TRMMKERNEL
  868. #if ( defined(LEFT) && defined(TRANSA)) || \
  869. (!defined(LEFT) && !defined(TRANSA))
  /*
   * TEMP = K - KK - (2 if LEFT else 4); AO advances by TEMP * 2 elements
   * and BO by TEMP * 4 elements (shift counts 1 and 2 on top of BASE_SHIFT).
   */
  870. sub TEMP, K, KK
  871. #ifdef LEFT
  872. addi TEMP, TEMP, -2
  873. #else
  874. addi TEMP, TEMP, -4
  875. #endif
  876. slwi r0, TEMP, 1 + BASE_SHIFT
  877. slwi TEMP, TEMP, 2 + BASE_SHIFT
  878. add AO, AO, r0
  879. add BO, BO, TEMP
  880. #endif
  881. #ifdef LEFT
  /* Two rows were handled by this tile. */
  882. addi KK, KK, 2
  883. #endif
  884. #endif
  /*
   * LL(30): entry for the single leftover row (M & 1) of the 4-column panel.
   * Skips to LL(39) when M is even.  Otherwise preloads four A values
   * (f16..f19) and eight B values (f20..f27) for the 4x-unrolled loop at
   * LL(32), sets BO, and loads CTR with (count >> 2).
   */
  885. .align 4
  886. LL(30):
  887. andi. I, M, 1
  888. ble LL(39)
  889. #if defined(TRMMKERNEL)
  890. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* No KK offset in this variant: read A at AO and B from the panel start. */
  891. LFD f16, 0 * SIZE(AO)
  892. LFD f17, 1 * SIZE(AO)
  893. LFD f18, 2 * SIZE(AO)
  894. LFD f19, 3 * SIZE(AO)
  895. LFD f20, 0 * SIZE(B)
  896. LFD f21, 1 * SIZE(B)
  897. LFD f22, 2 * SIZE(B)
  898. LFD f23, 3 * SIZE(B)
  899. LFD f24, 4 * SIZE(B)
  900. LFD f25, 5 * SIZE(B)
  901. LFD f26, 6 * SIZE(B)
  902. LFD f27, 7 * SIZE(B)
  903. mr BO, B
  904. #else
  /* Offset by KK: AO += KK * 1 element, BO = B + KK * 4 elements. */
  905. slwi r0, KK, 0 + BASE_SHIFT
  906. slwi TEMP, KK, 2 + BASE_SHIFT
  907. add AO, AO, r0
  908. add BO, B, TEMP
  909. LFD f16, 0 * SIZE(AO)
  910. LFD f17, 1 * SIZE(AO)
  911. LFD f18, 2 * SIZE(AO)
  912. LFD f19, 3 * SIZE(AO)
  913. LFD f20, 0 * SIZE(BO)
  914. LFD f21, 1 * SIZE(BO)
  915. LFD f22, 2 * SIZE(BO)
  916. LFD f23, 3 * SIZE(BO)
  917. LFD f24, 4 * SIZE(BO)
  918. LFD f25, 5 * SIZE(BO)
  919. LFD f26, 6 * SIZE(BO)
  920. LFD f27, 7 * SIZE(BO)
  921. #endif
  /* TRMM trip count: K - KK, KK + 1 (LEFT) or KK + 4, then divided by 4. */
  922. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  923. sub TEMP, K, KK
  924. #elif defined(LEFT)
  925. addi TEMP, KK, 1
  926. #else
  927. addi TEMP, KK, 4
  928. #endif
  929. srawi. TEMP, TEMP, 2
  930. mtspr CTR, TEMP
  931. #else
  932. LFD f16, 0 * SIZE(AO)
  933. LFD f17, 1 * SIZE(AO)
  934. LFD f18, 2 * SIZE(AO)
  935. LFD f19, 3 * SIZE(AO)
  936. LFD f20, 0 * SIZE(B)
  937. LFD f21, 1 * SIZE(B)
  938. LFD f22, 2 * SIZE(B)
  939. LFD f23, 3 * SIZE(B)
  940. LFD f24, 4 * SIZE(B)
  941. LFD f25, 5 * SIZE(B)
  942. LFD f26, 6 * SIZE(B)
  943. LFD f27, 7 * SIZE(B)
  944. srawi. r0, K, 2
  945. mtspr CTR, r0
  946. mr BO, B
  947. #endif
  /* srawi. recorded the quotient in CR0: fewer than 4 steps -> remainder path. */
  948. ble LL(35)
  /*
   * LL(32): main loop for the 1-row by 4-column tile, four k-steps per
   * iteration.  Even steps (A in f16, f18; B in f20..f23) accumulate into
   * f0/f4/f8/f12; odd steps (A in f17, f19; B in f24..f27) accumulate into
   * f1/f5/f9/f13.  Loads for later steps are interleaved with the FMADDs.
   */
  949. .align 5
  950. LL(32):
  951. FMADD f0, f16, f20, f0
  952. FMADD f4, f16, f21, f4
  953. FMADD f8, f16, f22, f8
  954. FMADD f12, f16, f23, f12
  955. LFD f20, 8 * SIZE(BO)
  956. LFD f21, 9 * SIZE(BO)
  957. LFD f22, 10 * SIZE(BO)
  958. LFD f23, 11 * SIZE(BO)
  959. FMADD f1, f17, f24, f1
  960. FMADD f5, f17, f25, f5
  961. FMADD f9, f17, f26, f9
  962. FMADD f13, f17, f27, f13
  963. LFD f24, 12 * SIZE(BO)
  964. LFD f25, 13 * SIZE(BO)
  965. LFD f26, 14 * SIZE(BO)
  966. LFD f27, 15 * SIZE(BO)
  967. FMADD f0, f18, f20, f0
  968. FMADD f4, f18, f21, f4
  969. FMADD f8, f18, f22, f8
  970. FMADD f12, f18, f23, f12
  971. LFD f20, 16 * SIZE(BO)
  972. LFD f21, 17 * SIZE(BO)
  973. LFD f22, 18 * SIZE(BO)
  974. LFD f23, 19 * SIZE(BO)
  975. FMADD f1, f19, f24, f1
  976. FMADD f5, f19, f25, f5
  977. FMADD f9, f19, f26, f9
  978. FMADD f13, f19, f27, f13
  /* Operands for the next iteration, addressed relative to the old pointers. */
  979. LFD f16, 4 * SIZE(AO)
  980. LFD f17, 5 * SIZE(AO)
  981. LFD f18, 6 * SIZE(AO)
  982. LFD f19, 7 * SIZE(AO)
  983. LFD f24, 20 * SIZE(BO)
  984. LFD f25, 21 * SIZE(BO)
  985. LFD f26, 22 * SIZE(BO)
  986. LFD f27, 23 * SIZE(BO)
  987. addi AO, AO, 4 * SIZE
  988. addi BO, BO, 16 * SIZE
  989. bdnz LL(32)
  /* Fold the odd-step partial sums into the accumulators that LL(38) stores. */
  990. fadd f0, f1, f0
  991. fadd f4, f5, f4
  992. fadd f8, f9, f8
  993. fadd f12, f13, f12
  /*
   * LL(35): K-remainder setup for the 1-row by 4-column tile.  Loads alpha
   * from ALPHA into f30 and places (count & 3) in CTR, using the same count
   * selection as the setup at LL(30).
   */
  994. .align 4
  995. LL(35):
  996. lfd f30, ALPHA
  997. #if defined(TRMMKERNEL)
  998. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  999. sub TEMP, K, KK
  1000. #elif defined(LEFT)
  1001. addi TEMP, KK, 1
  1002. #else
  1003. addi TEMP, KK, 4
  1004. #endif
  1005. andi. TEMP, TEMP, 3
  1006. mtspr CTR, TEMP
  1007. #else
  1008. andi. r0, K, 3
  1009. mtspr CTR, r0
  1010. #endif
  /* Remainder of zero -> go straight to the write-back at LL(38). */
  1011. ble+ LL(38)
  /*
   * LL(36): remainder loop for the 1-row by 4-column tile, one k-step per
   * iteration: one A value (f16) times four B values (f20..f23), accumulated
   * into f0/f4/f8/f12.
   */
  1012. .align 4
  1013. LL(36):
  1014. FMADD f0, f16, f20, f0
  1015. FMADD f4, f16, f21, f4
  1016. FMADD f8, f16, f22, f8
  1017. FMADD f12, f16, f23, f12
  /* Preload the next step's operands, then advance B by 4 and A by 1 element. */
  1018. LFD f16, 1 * SIZE(AO)
  1019. LFD f20, 4 * SIZE(BO)
  1020. LFD f21, 5 * SIZE(BO)
  1021. LFD f22, 6 * SIZE(BO)
  1022. LFD f23, 7 * SIZE(BO)
  1023. addi BO, BO, 4 * SIZE
  1024. addi AO, AO, 1 * SIZE
  1025. bdnz LL(36)
  /*
   * LL(38): write-back for the 1-row by 4-column tile.
   * Plain GEMM: each of f0/f4/f8/f12 is multiplied by f30 (alpha) and added
   * to the element loaded from CO1..CO4.  TRMM: the accumulator is only
   * scaled.  The accumulators this tile used (f0, f1, f4, f5, f8, f9, f12,
   * f13) are then re-initialised from FZERO, and for TRMM the AO/BO/KK
   * bookkeeping is updated.
   */
  1026. .align 4
  1027. LL(38):
  1028. #ifndef TRMMKERNEL
  1029. LFD f16, 0 * SIZE(CO1)
  1030. LFD f18, 0 * SIZE(CO2)
  1031. LFD f20, 0 * SIZE(CO3)
  1032. LFD f22, 0 * SIZE(CO4)
  1033. FMADD f0, f0, f30, f16
  1034. FMADD f4, f4, f30, f18
  1035. FMADD f8, f8, f30, f20
  1036. FMADD f12, f12, f30, f22
  1037. #else
  1038. FMUL f0, f0, f30
  1039. FMUL f4, f4, f30
  1040. FMUL f8, f8, f30
  1041. FMUL f12, f12, f30
  1042. #endif
  1043. STFD f0, 0 * SIZE(CO1)
  1044. STFD f4, 0 * SIZE(CO2)
  1045. STFD f8, 0 * SIZE(CO3)
  1046. STFD f12, 0 * SIZE(CO4)
  1047. lfs f0, FZERO
  1048. fmr f1, f0
  1049. fmr f4, f0
  1050. fmr f5, f0
  1051. fmr f8, f0
  1052. fmr f9, f0
  1053. fmr f12, f0
  1054. fmr f13, f0
  1055. #ifdef TRMMKERNEL
  1056. #if ( defined(LEFT) && defined(TRANSA)) || \
  1057. (!defined(LEFT) && !defined(TRANSA))
  /*
   * TEMP = K - KK - (1 if LEFT else 4); AO advances by TEMP * 1 element
   * and BO by TEMP * 4 elements.
   */
  1058. sub TEMP, K, KK
  1059. #ifdef LEFT
  1060. addi TEMP, TEMP, -1
  1061. #else
  1062. addi TEMP, TEMP, -4
  1063. #endif
  1064. slwi r0, TEMP, 0 + BASE_SHIFT
  1065. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1066. add AO, AO, r0
  1067. add BO, BO, TEMP
  1068. #endif
  1069. #ifdef LEFT
  /*
   * This tile handled one row, so KK advances by 1.  The other tiles bump
   * KK by their row count (4, 2, and 1 at LL(68)); the previous "+2" here
   * was inconsistent with that.
   * NOTE(review): KK appears to be reloaded from OFFSET before its next use
   * (as at LL(40)/LL(70)), so this is a consistency fix; confirm at LL(10).
   */
  1070. addi KK, KK, 1
  1071. #endif
  1072. #endif
  /*
   * LL(39): end of one 4-column panel.  For TRMM without LEFT, KK advances
   * by 4.  f0 is reloaded from FZERO, B is moved to where BO ended up, and
   * the panel counter J is decremented; the loop continues at LL(10) while
   * J stays positive.
   */
  1073. .align 4
  1074. LL(39):
  1075. #if defined(TRMMKERNEL) && !defined(LEFT)
  1076. addi KK, KK, 4
  1077. #endif
  1078. lfs f0, FZERO
  1079. mr B, BO
  1080. addic. J, J, -1
  1081. bgt LL(10)
  /*
   * LL(40): 2-column panel, taken when (N & 2) is set; otherwise control
   * goes to LL(70).  Sets CO1 = C and CO2 = C + LDC, resets KK from OFFSET
   * for TRMM with LEFT, clears f0..f7, and prepares the 4-row loop count
   * I = M >> 2.  C itself is advanced past the two columns.
   */
  1082. .align 4
  1083. LL(40):
  1084. mr CO1, C
  1085. add CO2, C, LDC
  1086. andi. J, N, 2
  1087. ble LL(70)
  1088. #if defined(TRMMKERNEL) && defined(LEFT)
  1089. mr KK, OFFSET
  1090. #endif
  1091. lfs f0, FZERO
  1092. fmr f1, f0
  1093. fmr f2, f0
  1094. fmr f3, f0
  1095. fmr f4, f0
  1096. fmr f5, f0
  1097. fmr f6, f0
  1098. fmr f7, f0
  /* The add/mr below do not touch CR0, so ble still tests the srawi. result. */
  1099. srawi. I, M, 2
  1100. add C, CO2, LDC
  1101. mr AO, A
  1102. ble LL(50)
  /*
   * LL(41): per-tile setup for the 4-row by 2-column tile (loop head of the
   * I loop that LL(48) closes).  Preloads four A values (f16..f19) and four
   * B values (f20..f23), touches the output lines with dcbtst, sets BO and
   * loads CTR with (count >> 2).
   */
  1103. .align 4
  1104. LL(41):
  1105. #if defined(TRMMKERNEL)
  1106. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1107. LFD f16, 0 * SIZE(AO)
  1108. LFD f17, 1 * SIZE(AO)
  1109. LFD f18, 2 * SIZE(AO)
  1110. LFD f19, 3 * SIZE(AO)
  1111. LFD f20, 0 * SIZE(B)
  1112. LFD f21, 1 * SIZE(B)
  1113. LFD f22, 2 * SIZE(B)
  1114. LFD f23, 3 * SIZE(B)
  1115. mr BO, B
  1116. #else
  /* Offset by KK: AO += KK * 4 elements, BO = B + KK * 2 elements. */
  1117. slwi r0, KK, 2 + BASE_SHIFT
  1118. slwi TEMP, KK, 1 + BASE_SHIFT
  1119. add AO, AO, r0
  1120. add BO, B, TEMP
  1121. LFD f16, 0 * SIZE(AO)
  1122. LFD f17, 1 * SIZE(AO)
  1123. LFD f18, 2 * SIZE(AO)
  1124. LFD f19, 3 * SIZE(AO)
  1125. LFD f20, 0 * SIZE(BO)
  1126. LFD f21, 1 * SIZE(BO)
  1127. LFD f22, 2 * SIZE(BO)
  1128. LFD f23, 3 * SIZE(BO)
  1129. #endif
  /* Cache hint for the locations CO1 + PREC and CO2 + PREC that will be stored. */
  1130. dcbtst CO1, PREC
  1131. dcbtst CO2, PREC
  /* TRMM trip count: K - KK, KK + 4 (LEFT) or KK + 2, then divided by 4. */
  1132. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1133. sub TEMP, K, KK
  1134. #elif defined(LEFT)
  1135. addi TEMP, KK, 4
  1136. #else
  1137. addi TEMP, KK, 2
  1138. #endif
  1139. srawi. TEMP, TEMP, 2
  1140. mtspr CTR, TEMP
  1141. #else
  1142. LFD f16, 0 * SIZE(AO)
  1143. LFD f17, 1 * SIZE(AO)
  1144. LFD f18, 2 * SIZE(AO)
  1145. LFD f19, 3 * SIZE(AO)
  1146. LFD f20, 0 * SIZE(B)
  1147. LFD f21, 1 * SIZE(B)
  1148. LFD f22, 2 * SIZE(B)
  1149. LFD f23, 3 * SIZE(B)
  1150. dcbtst CO1, PREC
  1151. dcbtst CO2, PREC
  1152. srawi. r0, K, 2
  1153. mtspr CTR, r0
  1154. mr BO, B
  1155. #endif
  /* Fewer than 4 k-steps -> only the remainder loop runs. */
  1156. ble LL(45)
  /*
   * LL(42): main loop for the 4-row by 2-column tile, four k-steps per
   * iteration.  Each step multiplies four A values (f16..f19) by two B
   * values and accumulates into f0..f3 (first column) and f4..f7 (second
   * column).  B pairs alternate between f20/f21 and f22/f23; A is reloaded
   * after every step.
   */
  1157. .align 5
  1158. LL(42):
  1159. FMADD f0, f16, f20, f0
  1160. FMADD f1, f17, f20, f1
  1161. FMADD f2, f18, f20, f2
  1162. FMADD f3, f19, f20, f3
  1163. FMADD f4, f16, f21, f4
  1164. FMADD f5, f17, f21, f5
  1165. FMADD f6, f18, f21, f6
  1166. FMADD f7, f19, f21, f7
  1167. LFD f16, 4 * SIZE(AO)
  1168. LFD f17, 5 * SIZE(AO)
  1169. LFD f18, 6 * SIZE(AO)
  1170. LFD f19, 7 * SIZE(AO)
  1171. FMADD f0, f16, f22, f0
  1172. FMADD f1, f17, f22, f1
  1173. FMADD f2, f18, f22, f2
  1174. FMADD f3, f19, f22, f3
  1175. FMADD f4, f16, f23, f4
  1176. FMADD f5, f17, f23, f5
  1177. FMADD f6, f18, f23, f6
  1178. FMADD f7, f19, f23, f7
  1179. LFD f16, 8 * SIZE(AO)
  1180. LFD f17, 9 * SIZE(AO)
  1181. LFD f18, 10 * SIZE(AO)
  1182. LFD f19, 11 * SIZE(AO)
  1183. LFD f20, 4 * SIZE(BO)
  1184. LFD f21, 5 * SIZE(BO)
  1185. LFD f22, 6 * SIZE(BO)
  1186. LFD f23, 7 * SIZE(BO)
  1187. FMADD f0, f16, f20, f0
  1188. FMADD f1, f17, f20, f1
  1189. FMADD f2, f18, f20, f2
  1190. FMADD f3, f19, f20, f3
  1191. FMADD f4, f16, f21, f4
  1192. FMADD f5, f17, f21, f5
  1193. FMADD f6, f18, f21, f6
  1194. FMADD f7, f19, f21, f7
  1195. LFD f16, 12 * SIZE(AO)
  1196. LFD f17, 13 * SIZE(AO)
  1197. LFD f18, 14 * SIZE(AO)
  1198. LFD f19, 15 * SIZE(AO)
  1199. FMADD f0, f16, f22, f0
  1200. FMADD f1, f17, f22, f1
  1201. FMADD f2, f18, f22, f2
  1202. FMADD f3, f19, f22, f3
  1203. FMADD f4, f16, f23, f4
  1204. FMADD f5, f17, f23, f5
  1205. FMADD f6, f18, f23, f6
  1206. FMADD f7, f19, f23, f7
  /* Operands for the next iteration, addressed relative to the old pointers. */
  1207. LFD f16, 16 * SIZE(AO)
  1208. LFD f17, 17 * SIZE(AO)
  1209. LFD f18, 18 * SIZE(AO)
  1210. LFD f19, 19 * SIZE(AO)
  1211. LFD f20, 8 * SIZE(BO)
  1212. LFD f21, 9 * SIZE(BO)
  1213. LFD f22, 10 * SIZE(BO)
  1214. LFD f23, 11 * SIZE(BO)
  /* Four k-steps consumed: 16 A values and 8 B values. */
  1215. addi AO, AO, 16 * SIZE
  1216. addi BO, BO, 8 * SIZE
  1217. bdnz LL(42)
  /*
   * LL(45): K-remainder setup for the 4-row by 2-column tile.  Loads alpha
   * from ALPHA into f30 and places (count & 3) in CTR, using the same count
   * selection as the setup at LL(41).
   */
  1218. .align 4
  1219. LL(45):
  1220. lfd f30, ALPHA
  1221. #if defined(TRMMKERNEL)
  1222. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1223. sub TEMP, K, KK
  1224. #elif defined(LEFT)
  1225. addi TEMP, KK, 4
  1226. #else
  1227. addi TEMP, KK, 2
  1228. #endif
  1229. andi. TEMP, TEMP, 3
  1230. mtspr CTR, TEMP
  1231. #else
  1232. andi. r0, K, 3
  1233. mtspr CTR, r0
  1234. #endif
  /* Remainder of zero -> go straight to the write-back at LL(48). */
  1235. ble+ LL(48)
  /*
   * LL(46): remainder loop for the 4-row by 2-column tile, one k-step per
   * iteration: four A values (f16..f19) times two B values (f20, f21),
   * accumulated into f0..f3 and f4..f7.
   */
  1236. .align 4
  1237. LL(46):
  1238. FMADD f0, f16, f20, f0
  1239. FMADD f1, f17, f20, f1
  1240. FMADD f2, f18, f20, f2
  1241. FMADD f3, f19, f20, f3
  1242. FMADD f4, f16, f21, f4
  1243. FMADD f5, f17, f21, f5
  1244. FMADD f6, f18, f21, f6
  1245. FMADD f7, f19, f21, f7
  /* Preload the next step's operands, then advance B by 2 and A by 4 elements. */
  1246. LFD f16, 4 * SIZE(AO)
  1247. LFD f17, 5 * SIZE(AO)
  1248. LFD f18, 6 * SIZE(AO)
  1249. LFD f19, 7 * SIZE(AO)
  1250. LFD f20, 2 * SIZE(BO)
  1251. LFD f21, 3 * SIZE(BO)
  1252. addi BO, BO, 2 * SIZE
  1253. addi AO, AO, 4 * SIZE
  1254. bdnz LL(46)
  /*
   * LL(48): write-back for the 4-row by 2-column tile and tail of the I loop.
   * Plain GEMM: f0..f3 and f4..f7 are multiplied by f30 (alpha) and added to
   * four elements at CO1 and CO2 respectively.  TRMM: they are only scaled.
   * f0..f7 are then re-initialised from FZERO, CO1/CO2 advance by four
   * elements, TRMM bookkeeping is applied, and the loop repeats at LL(41)
   * while I stays positive.
   */
  1255. .align 4
  1256. LL(48):
  1257. #ifndef TRMMKERNEL
  1258. LFD f16, 0 * SIZE(CO1)
  1259. LFD f17, 1 * SIZE(CO1)
  1260. LFD f18, 2 * SIZE(CO1)
  1261. LFD f19, 3 * SIZE(CO1)
  1262. LFD f20, 0 * SIZE(CO2)
  1263. LFD f21, 1 * SIZE(CO2)
  1264. LFD f22, 2 * SIZE(CO2)
  1265. LFD f23, 3 * SIZE(CO2)
  1266. FMADD f0, f0, f30, f16
  1267. FMADD f1, f1, f30, f17
  1268. FMADD f2, f2, f30, f18
  1269. FMADD f3, f3, f30, f19
  1270. FMADD f4, f4, f30, f20
  1271. FMADD f5, f5, f30, f21
  1272. FMADD f6, f6, f30, f22
  1273. FMADD f7, f7, f30, f23
  1274. #else
  1275. FMUL f0, f0, f30
  1276. FMUL f1, f1, f30
  1277. FMUL f2, f2, f30
  1278. FMUL f3, f3, f30
  1279. FMUL f4, f4, f30
  1280. FMUL f5, f5, f30
  1281. FMUL f6, f6, f30
  1282. FMUL f7, f7, f30
  1283. #endif
  1284. STFD f0, 0 * SIZE(CO1)
  1285. STFD f1, 1 * SIZE(CO1)
  1286. STFD f2, 2 * SIZE(CO1)
  1287. STFD f3, 3 * SIZE(CO1)
  /* f0..f3 are stored, so they can be cleared before the CO2 stores. */
  1288. lfs f0, FZERO
  1289. fmr f1, f0
  1290. fmr f2, f0
  1291. fmr f3, f0
  1292. STFD f4, 0 * SIZE(CO2)
  1293. STFD f5, 1 * SIZE(CO2)
  1294. STFD f6, 2 * SIZE(CO2)
  1295. STFD f7, 3 * SIZE(CO2)
  1296. fmr f4, f0
  1297. fmr f5, f0
  1298. fmr f6, f0
  1299. fmr f7, f0
  1300. addi CO1, CO1, 4 * SIZE
  1301. addi CO2, CO2, 4 * SIZE
  1302. #ifdef TRMMKERNEL
  1303. #if ( defined(LEFT) && defined(TRANSA)) || \
  1304. (!defined(LEFT) && !defined(TRANSA))
  /*
   * TEMP = K - KK - (4 if LEFT else 2); AO advances by TEMP * 4 elements
   * and BO by TEMP * 2 elements.
   */
  1305. sub TEMP, K, KK
  1306. #ifdef LEFT
  1307. addi TEMP, TEMP, -4
  1308. #else
  1309. addi TEMP, TEMP, -2
  1310. #endif
  1311. slwi r0, TEMP, 2 + BASE_SHIFT
  1312. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1313. add AO, AO, r0
  1314. add BO, BO, TEMP
  1315. #endif
  1316. #ifdef LEFT
  /* Four rows were handled by this tile. */
  1317. addi KK, KK, 4
  1318. #endif
  1319. #endif
  1320. addic. I, I, -1
  1321. bgt+ LL(41)
  /*
   * LL(50): entry for the 2-row leftover (M & 2) of the 2-column panel.
   * Skips to LL(60) when that bit is clear.  Otherwise preloads four A
   * values (f16..f19) and eight B values (f20..f27) for the unrolled loop
   * at LL(52), sets BO, and loads CTR with (count >> 2).
   */
  1322. .align 4
  1323. LL(50):
  1324. andi. I, M, 2
  1325. ble LL(60)
  1326. #if defined(TRMMKERNEL)
  1327. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1328. LFD f16, 0 * SIZE(AO)
  1329. LFD f17, 1 * SIZE(AO)
  1330. LFD f18, 2 * SIZE(AO)
  1331. LFD f19, 3 * SIZE(AO)
  1332. LFD f20, 0 * SIZE(B)
  1333. LFD f21, 1 * SIZE(B)
  1334. LFD f22, 2 * SIZE(B)
  1335. LFD f23, 3 * SIZE(B)
  1336. LFD f24, 4 * SIZE(B)
  1337. LFD f25, 5 * SIZE(B)
  1338. LFD f26, 6 * SIZE(B)
  1339. LFD f27, 7 * SIZE(B)
  1340. mr BO, B
  1341. #else
  /* Offset by KK: both AO and BO move by KK * 2 elements. */
  1342. slwi r0, KK, 1 + BASE_SHIFT
  1343. slwi TEMP, KK, 1 + BASE_SHIFT
  1344. add AO, AO, r0
  1345. add BO, B, TEMP
  1346. LFD f16, 0 * SIZE(AO)
  1347. LFD f17, 1 * SIZE(AO)
  1348. LFD f18, 2 * SIZE(AO)
  1349. LFD f19, 3 * SIZE(AO)
  1350. LFD f20, 0 * SIZE(BO)
  1351. LFD f21, 1 * SIZE(BO)
  1352. LFD f22, 2 * SIZE(BO)
  1353. LFD f23, 3 * SIZE(BO)
  1354. LFD f24, 4 * SIZE(BO)
  1355. LFD f25, 5 * SIZE(BO)
  1356. LFD f26, 6 * SIZE(BO)
  1357. LFD f27, 7 * SIZE(BO)
  1358. #endif
  /* TRMM trip count: K - KK or KK + 2 (both other branches agree), divided by 4. */
  1359. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1360. sub TEMP, K, KK
  1361. #elif defined(LEFT)
  1362. addi TEMP, KK, 2
  1363. #else
  1364. addi TEMP, KK, 2
  1365. #endif
  1366. srawi. TEMP, TEMP, 2
  1367. mtspr CTR, TEMP
  1368. #else
  1369. LFD f16, 0 * SIZE(AO)
  1370. LFD f17, 1 * SIZE(AO)
  1371. LFD f18, 2 * SIZE(AO)
  1372. LFD f19, 3 * SIZE(AO)
  1373. LFD f20, 0 * SIZE(B)
  1374. LFD f21, 1 * SIZE(B)
  1375. LFD f22, 2 * SIZE(B)
  1376. LFD f23, 3 * SIZE(B)
  1377. LFD f24, 4 * SIZE(B)
  1378. LFD f25, 5 * SIZE(B)
  1379. LFD f26, 6 * SIZE(B)
  1380. LFD f27, 7 * SIZE(B)
  1381. srawi. r0, K, 2
  1382. mtspr CTR, r0
  1383. mr BO, B
  1384. #endif
  /* Fewer than 4 k-steps -> only the remainder loop runs. */
  1385. ble LL(55)
  /*
   * LL(52): main loop for the 2-row by 2-column tile, four k-steps per
   * iteration.  Steps using A in f16/f17 accumulate into f0..f3; steps
   * using A in f18/f19 accumulate into f4..f7.  The two sets are summed
   * later at LL(58).
   */
  1386. .align 5
  1387. LL(52):
  1388. FMADD f0, f16, f20, f0
  1389. FMADD f1, f17, f20, f1
  1390. FMADD f2, f16, f21, f2
  1391. FMADD f3, f17, f21, f3
  1392. FMADD f4, f18, f22, f4
  1393. FMADD f5, f19, f22, f5
  1394. FMADD f6, f18, f23, f6
  1395. FMADD f7, f19, f23, f7
  1396. LFD f16, 4 * SIZE(AO)
  1397. LFD f17, 5 * SIZE(AO)
  1398. LFD f18, 6 * SIZE(AO)
  1399. LFD f19, 7 * SIZE(AO)
  /* f20..f23 are refilled for the next iteration; f24..f27 feed the second half. */
  1400. LFD f20, 8 * SIZE(BO)
  1401. LFD f21, 9 * SIZE(BO)
  1402. LFD f22, 10 * SIZE(BO)
  1403. LFD f23, 11 * SIZE(BO)
  1404. FMADD f0, f16, f24, f0
  1405. FMADD f1, f17, f24, f1
  1406. FMADD f2, f16, f25, f2
  1407. FMADD f3, f17, f25, f3
  1408. FMADD f4, f18, f26, f4
  1409. FMADD f5, f19, f26, f5
  1410. FMADD f6, f18, f27, f6
  1411. FMADD f7, f19, f27, f7
  1412. LFD f16, 8 * SIZE(AO)
  1413. LFD f17, 9 * SIZE(AO)
  1414. LFD f18, 10 * SIZE(AO)
  1415. LFD f19, 11 * SIZE(AO)
  1416. LFD f24, 12 * SIZE(BO)
  1417. LFD f25, 13 * SIZE(BO)
  1418. LFD f26, 14 * SIZE(BO)
  1419. LFD f27, 15 * SIZE(BO)
  /* Four k-steps consumed: 8 A values and 8 B values. */
  1420. addi AO, AO, 8 * SIZE
  1421. addi BO, BO, 8 * SIZE
  1422. bdnz LL(52)
  /*
   * LL(55): K-remainder setup for the 2-row by 2-column tile.  Loads alpha
   * from ALPHA into f30 and places (count & 3) in CTR, using the same count
   * selection as the setup at LL(50).
   */
  1423. .align 4
  1424. LL(55):
  1425. lfd f30, ALPHA
  1426. #if defined(TRMMKERNEL)
  1427. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1428. sub TEMP, K, KK
  1429. #elif defined(LEFT)
  1430. addi TEMP, KK, 2
  1431. #else
  1432. addi TEMP, KK, 2
  1433. #endif
  1434. andi. TEMP, TEMP, 3
  1435. mtspr CTR, TEMP
  1436. #else
  1437. andi. r0, K, 3
  1438. mtspr CTR, r0
  1439. #endif
  /* Remainder of zero -> go straight to the write-back at LL(58). */
  1440. ble+ LL(58)
  /*
   * LL(56): remainder loop for the 2-row by 2-column tile, one k-step per
   * iteration: two A values (f16, f17) times two B values (f20, f21),
   * accumulated into f0..f3.
   */
  1441. .align 4
  1442. LL(56):
  1443. FMADD f0, f16, f20, f0
  1444. FMADD f1, f17, f20, f1
  1445. FMADD f2, f16, f21, f2
  1446. FMADD f3, f17, f21, f3
  /* Preload the next step's operands, then advance both panels by 2 elements. */
  1447. LFD f16, 2 * SIZE(AO)
  1448. LFD f17, 3 * SIZE(AO)
  1449. LFD f20, 2 * SIZE(BO)
  1450. LFD f21, 3 * SIZE(BO)
  1451. addi BO, BO, 2 * SIZE
  1452. addi AO, AO, 2 * SIZE
  1453. bdnz LL(56)
  /*
   * LL(58): write-back for the 2-row by 2-column tile.  The partial sums
   * f4..f7 are first added into f0..f3.  Plain GEMM then computes
   * acc * f30 (alpha) + C for two elements at CO1 and two at CO2; TRMM only
   * scales by f30.  f0..f7 are re-initialised from FZERO, CO1/CO2 advance
   * by two elements, and TRMM bookkeeping is applied.
   */
  1454. .align 4
  1455. LL(58):
  1456. #ifndef TRMMKERNEL
  1457. LFD f16, 0 * SIZE(CO1)
  1458. LFD f17, 1 * SIZE(CO1)
  1459. LFD f18, 0 * SIZE(CO2)
  1460. LFD f19, 1 * SIZE(CO2)
  1461. FADD f0, f4, f0
  1462. FADD f1, f5, f1
  1463. FADD f2, f6, f2
  1464. FADD f3, f7, f3
  1465. FMADD f0, f0, f30, f16
  1466. FMADD f1, f1, f30, f17
  1467. FMADD f2, f2, f30, f18
  1468. FMADD f3, f3, f30, f19
  1469. #else
  1470. FADD f0, f4, f0
  1471. FADD f1, f5, f1
  1472. FADD f2, f6, f2
  1473. FADD f3, f7, f3
  1474. FMUL f0, f0, f30
  1475. FMUL f1, f1, f30
  1476. FMUL f2, f2, f30
  1477. FMUL f3, f3, f30
  1478. #endif
  /* f0/f1 go to the first column (CO1), f2/f3 to the second (CO2). */
  1479. STFD f0, 0 * SIZE(CO1)
  1480. STFD f1, 1 * SIZE(CO1)
  1481. STFD f2, 0 * SIZE(CO2)
  1482. STFD f3, 1 * SIZE(CO2)
  1483. lfs f0, FZERO
  1484. fmr f1, f0
  1485. fmr f2, f0
  1486. fmr f3, f0
  1487. fmr f4, f0
  1488. fmr f5, f0
  1489. fmr f6, f0
  1490. fmr f7, f0
  1491. addi CO1, CO1, 2 * SIZE
  1492. addi CO2, CO2, 2 * SIZE
  1493. #ifdef TRMMKERNEL
  1494. #if ( defined(LEFT) && defined(TRANSA)) || \
  1495. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP = K - KK - 2 in both variants; AO and BO each advance by TEMP * 2 elements. */
  1496. sub TEMP, K, KK
  1497. #ifdef LEFT
  1498. addi TEMP, TEMP, -2
  1499. #else
  1500. addi TEMP, TEMP, -2
  1501. #endif
  1502. slwi r0, TEMP, 1 + BASE_SHIFT
  1503. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1504. add AO, AO, r0
  1505. add BO, BO, TEMP
  1506. #endif
  1507. #ifdef LEFT
  /* Two rows were handled by this tile. */
  1508. addi KK, KK, 2
  1509. #endif
  1510. #endif
  /*
   * LL(60): entry for the single leftover row (M & 1) of the 2-column panel.
   * Skips to LL(69) when M is even.  Otherwise preloads four A values
   * (f16..f19) and eight B values (f20..f27) for the unrolled loop at
   * LL(62), sets BO, and loads CTR with (count >> 2).
   */
  1511. .align 4
  1512. LL(60):
  1513. andi. I, M, 1
  1514. ble LL(69)
  1515. #if defined(TRMMKERNEL)
  1516. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1517. LFD f16, 0 * SIZE(AO)
  1518. LFD f17, 1 * SIZE(AO)
  1519. LFD f18, 2 * SIZE(AO)
  1520. LFD f19, 3 * SIZE(AO)
  1521. LFD f20, 0 * SIZE(B)
  1522. LFD f21, 1 * SIZE(B)
  1523. LFD f22, 2 * SIZE(B)
  1524. LFD f23, 3 * SIZE(B)
  1525. LFD f24, 4 * SIZE(B)
  1526. LFD f25, 5 * SIZE(B)
  1527. LFD f26, 6 * SIZE(B)
  1528. LFD f27, 7 * SIZE(B)
  1529. mr BO, B
  1530. #else
  /* Offset by KK: AO += KK * 1 element, BO = B + KK * 2 elements. */
  1531. slwi r0, KK, 0 + BASE_SHIFT
  1532. slwi TEMP, KK, 1 + BASE_SHIFT
  1533. add AO, AO, r0
  1534. add BO, B, TEMP
  1535. LFD f16, 0 * SIZE(AO)
  1536. LFD f17, 1 * SIZE(AO)
  1537. LFD f18, 2 * SIZE(AO)
  1538. LFD f19, 3 * SIZE(AO)
  1539. LFD f20, 0 * SIZE(BO)
  1540. LFD f21, 1 * SIZE(BO)
  1541. LFD f22, 2 * SIZE(BO)
  1542. LFD f23, 3 * SIZE(BO)
  1543. LFD f24, 4 * SIZE(BO)
  1544. LFD f25, 5 * SIZE(BO)
  1545. LFD f26, 6 * SIZE(BO)
  1546. LFD f27, 7 * SIZE(BO)
  1547. #endif
  /* TRMM trip count: K - KK, KK + 1 (LEFT) or KK + 2, then divided by 4. */
  1548. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1549. sub TEMP, K, KK
  1550. #elif defined(LEFT)
  1551. addi TEMP, KK, 1
  1552. #else
  1553. addi TEMP, KK, 2
  1554. #endif
  1555. srawi. TEMP, TEMP, 2
  1556. mtspr CTR, TEMP
  1557. #else
  1558. LFD f16, 0 * SIZE(AO)
  1559. LFD f17, 1 * SIZE(AO)
  1560. LFD f18, 2 * SIZE(AO)
  1561. LFD f19, 3 * SIZE(AO)
  1562. LFD f20, 0 * SIZE(B)
  1563. LFD f21, 1 * SIZE(B)
  1564. LFD f22, 2 * SIZE(B)
  1565. LFD f23, 3 * SIZE(B)
  1566. LFD f24, 4 * SIZE(B)
  1567. LFD f25, 5 * SIZE(B)
  1568. LFD f26, 6 * SIZE(B)
  1569. LFD f27, 7 * SIZE(B)
  1570. srawi. r0, K, 2
  1571. mtspr CTR, r0
  1572. mr BO, B
  1573. #endif
  /* Fewer than 4 k-steps -> only the remainder loop runs. */
  1574. ble LL(65)
  /*
   * LL(62): main loop for the 1-row by 2-column tile, four k-steps per
   * iteration.  Steps using A in f16/f18 accumulate into f0/f1; steps using
   * A in f17/f19 accumulate into f2/f3.  The two pairs are summed later at
   * LL(68).
   */
  1575. .align 5
  1576. LL(62):
  1577. FMADD f0, f16, f20, f0
  1578. FMADD f1, f16, f21, f1
  1579. FMADD f2, f17, f22, f2
  1580. FMADD f3, f17, f23, f3
  1581. LFD f20, 8 * SIZE(BO)
  1582. LFD f21, 9 * SIZE(BO)
  1583. LFD f22, 10 * SIZE(BO)
  1584. LFD f23, 11 * SIZE(BO)
  1585. FMADD f0, f18, f24, f0
  1586. FMADD f1, f18, f25, f1
  1587. FMADD f2, f19, f26, f2
  1588. FMADD f3, f19, f27, f3
  /* Operands for the next iteration, addressed relative to the old pointers. */
  1589. LFD f16, 4 * SIZE(AO)
  1590. LFD f17, 5 * SIZE(AO)
  1591. LFD f18, 6 * SIZE(AO)
  1592. LFD f19, 7 * SIZE(AO)
  1593. LFD f24, 12 * SIZE(BO)
  1594. LFD f25, 13 * SIZE(BO)
  1595. LFD f26, 14 * SIZE(BO)
  1596. LFD f27, 15 * SIZE(BO)
  /* Four k-steps consumed: 4 A values and 8 B values. */
  1597. addi AO, AO, 4 * SIZE
  1598. addi BO, BO, 8 * SIZE
  1599. bdnz LL(62)
  /*
   * LL(65): K-remainder setup for the 1-row by 2-column tile.  Loads alpha
   * from ALPHA into f30 and places (count & 3) in CTR, using the same count
   * selection as the setup at LL(60).
   */
  1600. .align 4
  1601. LL(65):
  1602. lfd f30, ALPHA
  1603. #if defined(TRMMKERNEL)
  1604. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1605. sub TEMP, K, KK
  1606. #elif defined(LEFT)
  1607. addi TEMP, KK, 1
  1608. #else
  1609. addi TEMP, KK, 2
  1610. #endif
  1611. andi. TEMP, TEMP, 3
  1612. mtspr CTR, TEMP
  1613. #else
  1614. andi. r0, K, 3
  1615. mtspr CTR, r0
  1616. #endif
  /* Remainder of zero -> go straight to the write-back at LL(68). */
  1617. ble+ LL(68)
  /*
   * LL(66): remainder loop for the 1-row by 2-column tile, one k-step per
   * iteration: one A value (f16) times two B values (f20, f21), accumulated
   * into f0 and f1.
   */
  1618. .align 4
  1619. LL(66):
  1620. FMADD f0, f16, f20, f0
  1621. FMADD f1, f16, f21, f1
  /* Preload the next step's operands, then advance B by 2 and A by 1 element. */
  1622. LFD f16, 1 * SIZE(AO)
  1623. LFD f20, 2 * SIZE(BO)
  1624. LFD f21, 3 * SIZE(BO)
  1625. addi BO, BO, 2 * SIZE
  1626. addi AO, AO, 1 * SIZE
  1627. bdnz LL(66)
  /*
   * LL(68): write-back for the 1-row by 2-column tile.  The partial sums
   * f2/f3 are added into f0/f1.  Plain GEMM then computes
   * acc * f30 (alpha) + C for one element at CO1 and one at CO2; TRMM only
   * scales by f30.  f0, f1, f4 and f5 are re-initialised from FZERO and
   * TRMM bookkeeping is applied.
   */
  1628. .align 4
  1629. LL(68):
  1630. #ifndef TRMMKERNEL
  1631. LFD f16, 0 * SIZE(CO1)
  1632. LFD f18, 0 * SIZE(CO2)
  1633. FADD f0, f2, f0
  1634. FADD f1, f3, f1
  1635. FMADD f0, f0, f30, f16
  1636. FMADD f1, f1, f30, f18
  1637. #else
  1638. FADD f0, f2, f0
  1639. FADD f1, f3, f1
  1640. FMUL f0, f0, f30
  1641. FMUL f1, f1, f30
  1642. #endif
  1643. STFD f0, 0 * SIZE(CO1)
  1644. STFD f1, 0 * SIZE(CO2)
  /* Only f0, f1, f4, f5 are cleared here; f2/f3 keep their partial sums. */
  1645. lfs f0, FZERO
  1646. fmr f1, f0
  1647. fmr f4, f0
  1648. fmr f5, f0
  1649. #ifdef TRMMKERNEL
  1650. #if ( defined(LEFT) && defined(TRANSA)) || \
  1651. (!defined(LEFT) && !defined(TRANSA))
  /*
   * TEMP = K - KK - (1 if LEFT else 2); AO advances by TEMP * 1 element
   * and BO by TEMP * 2 elements.
   */
  1652. sub TEMP, K, KK
  1653. #ifdef LEFT
  1654. addi TEMP, TEMP, -1
  1655. #else
  1656. addi TEMP, TEMP, -2
  1657. #endif
  1658. slwi r0, TEMP, 0 + BASE_SHIFT
  1659. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1660. add AO, AO, r0
  1661. add BO, BO, TEMP
  1662. #endif
  1663. #ifdef LEFT
  /* One row was handled by this tile. */
  1664. addi KK, KK, 1
  1665. #endif
  1666. #endif
  /*
   * LL(69): end of the 2-column panel.  For TRMM without LEFT, KK advances
   * by 2; B is moved to where BO ended up.
   */
  1667. .align 4
  1668. LL(69):
  1669. #if defined(TRMMKERNEL) && !defined(LEFT)
  1670. addi KK, KK, 2
  1671. #endif
  1672. mr B, BO
  /*
   * LL(70): single-column panel, taken when (N & 1) is set; otherwise
   * control goes to LL(999).  Sets CO1 = C, resets KK from OFFSET for TRMM
   * with LEFT, clears f0..f3, and prepares the 4-row loop count I = M >> 2.
   */
  1673. .align 4
  1674. LL(70):
  1675. mr CO1, C
  1676. andi. J, N, 1
  1677. ble LL(999)
  1678. #if defined(TRMMKERNEL) && defined(LEFT)
  1679. mr KK, OFFSET
  1680. #endif
  1681. lfs f0, FZERO
  1682. fmr f1, f0
  1683. fmr f2, f0
  1684. fmr f3, f0
  /* mr does not touch CR0, so ble still tests the srawi. result. */
  1685. srawi. I, M, 2
  1686. mr AO, A
  1687. ble LL(80)
  /*
   * LL(71): per-tile setup for the 4-row by 1-column tile (loop head of the
   * I loop that LL(78) closes).  Preloads four A values (f16..f19) and four
   * B values (f20..f23), touches the output line with dcbtst, sets BO and
   * loads CTR with (count >> 2).
   */
  1688. .align 4
  1689. LL(71):
  1690. #if defined(TRMMKERNEL)
  1691. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1692. LFD f16, 0 * SIZE(AO)
  1693. LFD f17, 1 * SIZE(AO)
  1694. LFD f18, 2 * SIZE(AO)
  1695. LFD f19, 3 * SIZE(AO)
  1696. LFD f20, 0 * SIZE(B)
  1697. LFD f21, 1 * SIZE(B)
  1698. LFD f22, 2 * SIZE(B)
  1699. LFD f23, 3 * SIZE(B)
  1700. mr BO, B
  1701. #else
  /* Offset by KK: AO += KK * 4 elements, BO = B + KK * 1 element. */
  1702. slwi r0, KK, 2 + BASE_SHIFT
  1703. slwi TEMP, KK, 0 + BASE_SHIFT
  1704. add AO, AO, r0
  1705. add BO, B, TEMP
  1706. LFD f16, 0 * SIZE(AO)
  1707. LFD f17, 1 * SIZE(AO)
  1708. LFD f18, 2 * SIZE(AO)
  1709. LFD f19, 3 * SIZE(AO)
  1710. LFD f20, 0 * SIZE(BO)
  1711. LFD f21, 1 * SIZE(BO)
  1712. LFD f22, 2 * SIZE(BO)
  1713. LFD f23, 3 * SIZE(BO)
  1714. #endif
  /* Cache hint for the location CO1 + PREC that will be stored. */
  1715. dcbtst CO1, PREC
  /* TRMM trip count: K - KK, KK + 4 (LEFT) or KK + 1, then divided by 4. */
  1716. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1717. sub TEMP, K, KK
  1718. #elif defined(LEFT)
  1719. addi TEMP, KK, 4
  1720. #else
  1721. addi TEMP, KK, 1
  1722. #endif
  1723. srawi. TEMP, TEMP, 2
  1724. mtspr CTR, TEMP
  1725. #else
  1726. LFD f16, 0 * SIZE(AO)
  1727. LFD f17, 1 * SIZE(AO)
  1728. LFD f18, 2 * SIZE(AO)
  1729. LFD f19, 3 * SIZE(AO)
  1730. LFD f20, 0 * SIZE(B)
  1731. LFD f21, 1 * SIZE(B)
  1732. LFD f22, 2 * SIZE(B)
  1733. LFD f23, 3 * SIZE(B)
  1734. dcbtst CO1, PREC
  1735. srawi. r0, K, 2
  1736. mtspr CTR, r0
  1737. mr BO, B
  1738. #endif
  /* Fewer than 4 k-steps -> only the remainder loop runs. */
  1739. ble LL(75)
  /*
   * LL(72): main loop for the 4-row by 1-column tile, four k-steps per
   * iteration.  Each step multiplies four A values (f16..f19) by one B
   * value (f20, f21, f22, f23 in turn) and accumulates into f0..f3.  A is
   * reloaded after every step; B is reloaded once per iteration.
   */
  1740. .align 5
  1741. LL(72):
  1742. FMADD f0, f16, f20, f0
  1743. FMADD f1, f17, f20, f1
  1744. FMADD f2, f18, f20, f2
  1745. FMADD f3, f19, f20, f3
  1746. LFD f16, 4 * SIZE(AO)
  1747. LFD f17, 5 * SIZE(AO)
  1748. LFD f18, 6 * SIZE(AO)
  1749. LFD f19, 7 * SIZE(AO)
  1750. FMADD f0, f16, f21, f0
  1751. FMADD f1, f17, f21, f1
  1752. FMADD f2, f18, f21, f2
  1753. FMADD f3, f19, f21, f3
  1754. LFD f16, 8 * SIZE(AO)
  1755. LFD f17, 9 * SIZE(AO)
  1756. LFD f18, 10 * SIZE(AO)
  1757. LFD f19, 11 * SIZE(AO)
  1758. FMADD f0, f16, f22, f0
  1759. FMADD f1, f17, f22, f1
  1760. FMADD f2, f18, f22, f2
  1761. FMADD f3, f19, f22, f3
  1762. LFD f16, 12 * SIZE(AO)
  1763. LFD f17, 13 * SIZE(AO)
  1764. LFD f18, 14 * SIZE(AO)
  1765. LFD f19, 15 * SIZE(AO)
  1766. FMADD f0, f16, f23, f0
  1767. FMADD f1, f17, f23, f1
  1768. FMADD f2, f18, f23, f2
  1769. FMADD f3, f19, f23, f3
  /* Operands for the next iteration, addressed relative to the old pointers. */
  1770. LFD f16, 16 * SIZE(AO)
  1771. LFD f17, 17 * SIZE(AO)
  1772. LFD f18, 18 * SIZE(AO)
  1773. LFD f19, 19 * SIZE(AO)
  1774. LFD f20, 4 * SIZE(BO)
  1775. LFD f21, 5 * SIZE(BO)
  1776. LFD f22, 6 * SIZE(BO)
  1777. LFD f23, 7 * SIZE(BO)
  /* Four k-steps consumed: 16 A values and 4 B values. */
  1778. addi AO, AO, 16 * SIZE
  1779. addi BO, BO, 4 * SIZE
  1780. bdnz LL(72)
  /*
   * LL(75): K-remainder setup for the 4-row by 1-column tile.  Loads alpha
   * from ALPHA into f30 and places (count & 3) in CTR, using the same count
   * selection as the setup at LL(71).
   */
  1781. .align 4
  1782. LL(75):
  1783. lfd f30, ALPHA
  1784. #if defined(TRMMKERNEL)
  1785. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1786. sub TEMP, K, KK
  1787. #elif defined(LEFT)
  1788. addi TEMP, KK, 4
  1789. #else
  1790. addi TEMP, KK, 1
  1791. #endif
  1792. andi. TEMP, TEMP, 3
  1793. mtspr CTR, TEMP
  1794. #else
  1795. andi. r0, K, 3
  1796. mtspr CTR, r0
  1797. #endif
  /* Remainder of zero -> go straight to the write-back at LL(78). */
  1798. ble+ LL(78)
  /*
   * LL(76): remainder loop for the 4-row by 1-column tile, one k-step per
   * iteration: four A values (f16..f19) times one B value (f20),
   * accumulated into f0..f3.
   */
  1799. .align 4
  1800. LL(76):
  1801. FMADD f0, f16, f20, f0
  1802. FMADD f1, f17, f20, f1
  1803. FMADD f2, f18, f20, f2
  1804. FMADD f3, f19, f20, f3
  /* Preload the next step's operands, then advance B by 1 and A by 4 elements. */
  1805. LFD f16, 4 * SIZE(AO)
  1806. LFD f17, 5 * SIZE(AO)
  1807. LFD f18, 6 * SIZE(AO)
  1808. LFD f19, 7 * SIZE(AO)
  1809. LFD f20, 1 * SIZE(BO)
  1810. addi BO, BO, 1 * SIZE
  1811. addi AO, AO, 4 * SIZE
  1812. bdnz LL(76)
  /*
   * LL(78): write-back for the 4-row by 1-column tile and tail of the I loop.
   * Plain GEMM: f0..f3 are multiplied by f30 (alpha) and added to four
   * elements at CO1.  TRMM: they are only scaled.  f0..f3 are then
   * re-initialised from FZERO, TRMM bookkeeping is applied, CO1 advances by
   * four elements, and the loop repeats at LL(71) while I stays positive.
   */
  1813. .align 4
  1814. LL(78):
  1815. #ifndef TRMMKERNEL
  1816. LFD f16, 0 * SIZE(CO1)
  1817. LFD f17, 1 * SIZE(CO1)
  1818. LFD f18, 2 * SIZE(CO1)
  1819. LFD f19, 3 * SIZE(CO1)
  1820. FMADD f0, f0, f30, f16
  1821. FMADD f1, f1, f30, f17
  1822. FMADD f2, f2, f30, f18
  1823. FMADD f3, f3, f30, f19
  1824. #else
  1825. FMUL f0, f0, f30
  1826. FMUL f1, f1, f30
  1827. FMUL f2, f2, f30
  1828. FMUL f3, f3, f30
  1829. #endif
  1830. STFD f0, 0 * SIZE(CO1)
  1831. STFD f1, 1 * SIZE(CO1)
  1832. STFD f2, 2 * SIZE(CO1)
  1833. STFD f3, 3 * SIZE(CO1)
  1834. lfs f0, FZERO
  1835. fmr f1, f0
  1836. fmr f2, f0
  1837. fmr f3, f0
  1838. #ifdef TRMMKERNEL
  1839. #if ( defined(LEFT) && defined(TRANSA)) || \
  1840. (!defined(LEFT) && !defined(TRANSA))
  /*
   * TEMP = K - KK - (4 if LEFT else 1); AO advances by TEMP * 4 elements
   * and BO by TEMP * 1 element.
   */
  1841. sub TEMP, K, KK
  1842. #ifdef LEFT
  1843. addi TEMP, TEMP, -4
  1844. #else
  1845. addi TEMP, TEMP, -1
  1846. #endif
  1847. slwi r0 , TEMP, 2 + BASE_SHIFT
  1848. slwi TEMP, TEMP, 0 + BASE_SHIFT
  1849. add AO, AO, r0
  1850. add BO, BO, TEMP
  1851. #endif
  1852. #ifdef LEFT
  /* Four rows were handled by this tile. */
  1853. addi KK, KK, 4
  1854. #endif
  1855. #endif
  1856. addi CO1, CO1, 4 * SIZE
  1857. addic. I, I, -1
  1858. bgt+ LL(71)
  /*
   * LL(80): entry for the 2-row leftover (M & 2) of the single-column panel.
   * Skips to LL(90) when that bit is clear.  Otherwise preloads four A
   * values (f16..f19) and four B values (f20..f23) for the unrolled loop at
   * LL(82), sets BO, and loads CTR with (count >> 2).
   */
  1859. .align 4
  1860. LL(80):
  1861. andi. I, M, 2
  1862. ble LL(90)
  1863. #if defined(TRMMKERNEL)
  1864. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1865. LFD f16, 0 * SIZE(AO)
  1866. LFD f17, 1 * SIZE(AO)
  1867. LFD f18, 2 * SIZE(AO)
  1868. LFD f19, 3 * SIZE(AO)
  1869. LFD f20, 0 * SIZE(B)
  1870. LFD f21, 1 * SIZE(B)
  1871. LFD f22, 2 * SIZE(B)
  1872. LFD f23, 3 * SIZE(B)
  1873. mr BO, B
  1874. #else
  /* Offset by KK: AO += KK * 2 elements, BO = B + KK * 1 element. */
  1875. slwi r0, KK, 1 + BASE_SHIFT
  1876. slwi TEMP, KK, 0 + BASE_SHIFT
  1877. add AO, AO, r0
  1878. add BO, B, TEMP
  1879. LFD f16, 0 * SIZE(AO)
  1880. LFD f17, 1 * SIZE(AO)
  1881. LFD f18, 2 * SIZE(AO)
  1882. LFD f19, 3 * SIZE(AO)
  1883. LFD f20, 0 * SIZE(BO)
  1884. LFD f21, 1 * SIZE(BO)
  1885. LFD f22, 2 * SIZE(BO)
  1886. LFD f23, 3 * SIZE(BO)
  1887. #endif
  /* TRMM trip count: K - KK, KK + 2 (LEFT) or KK + 1, then divided by 4. */
  1888. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1889. sub TEMP, K, KK
  1890. #elif defined(LEFT)
  1891. addi TEMP, KK, 2
  1892. #else
  1893. addi TEMP, KK, 1
  1894. #endif
  1895. srawi. TEMP, TEMP, 2
  1896. mtspr CTR, TEMP
  1897. #else
  1898. LFD f16, 0 * SIZE(AO)
  1899. LFD f17, 1 * SIZE(AO)
  1900. LFD f18, 2 * SIZE(AO)
  1901. LFD f19, 3 * SIZE(AO)
  1902. LFD f20, 0 * SIZE(B)
  1903. LFD f21, 1 * SIZE(B)
  1904. LFD f22, 2 * SIZE(B)
  1905. LFD f23, 3 * SIZE(B)
  1906. srawi. r0, K, 2
  1907. mtspr CTR, r0
  1908. mr BO, B
  1909. #endif
  /* Fewer than 4 k-steps -> only the remainder loop runs. */
  1910. ble LL(85)
  /*
   * LL(82): main loop for the 2-row by 1-column tile, four k-steps per
   * iteration.  Steps using A in f16/f17 accumulate into f0/f1; steps using
   * A in f18/f19 accumulate into f2/f3.  The two pairs are summed later at
   * LL(88).
   */
  1911. .align 5
  1912. LL(82):
  1913. FMADD f0, f16, f20, f0
  1914. FMADD f1, f17, f20, f1
  1915. FMADD f2, f18, f21, f2
  1916. FMADD f3, f19, f21, f3
  1917. LFD f16, 4 * SIZE(AO)
  1918. LFD f17, 5 * SIZE(AO)
  1919. LFD f18, 6 * SIZE(AO)
  1920. LFD f19, 7 * SIZE(AO)
  1921. FMADD f0, f16, f22, f0
  1922. FMADD f1, f17, f22, f1
  1923. FMADD f2, f18, f23, f2
  1924. FMADD f3, f19, f23, f3
  /* Operands for the next iteration, addressed relative to the old pointers. */
  1925. LFD f16, 8 * SIZE(AO)
  1926. LFD f17, 9 * SIZE(AO)
  1927. LFD f18, 10 * SIZE(AO)
  1928. LFD f19, 11 * SIZE(AO)
  1929. LFD f20, 4 * SIZE(BO)
  1930. LFD f21, 5 * SIZE(BO)
  1931. LFD f22, 6 * SIZE(BO)
  1932. LFD f23, 7 * SIZE(BO)
  /* Four k-steps consumed: 8 A values and 4 B values. */
  1933. addi AO, AO, 8 * SIZE
  1934. addi BO, BO, 4 * SIZE
  1935. bdnz LL(82)
  /*
   * LL(85): K-remainder setup for the 2-row by 1-column tile.  Loads alpha
   * from ALPHA into f30 and places (count & 3) in CTR, using the same count
   * selection as the setup at LL(80).
   */
  1936. .align 4
  1937. LL(85):
  1938. lfd f30, ALPHA
  1939. #if defined(TRMMKERNEL)
  1940. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1941. sub TEMP, K, KK
  1942. #elif defined(LEFT)
  1943. addi TEMP, KK, 2
  1944. #else
  1945. addi TEMP, KK, 1
  1946. #endif
  1947. andi. TEMP, TEMP, 3
  1948. mtspr CTR, TEMP
  1949. #else
  1950. andi. r0, K, 3
  1951. mtspr CTR, r0
  1952. #endif
  /* Remainder of zero -> go straight to the write-back at LL(88). */
  1953. ble+ LL(88)
  /*
   * LL(86): remainder loop for the 2-row by 1-column tile, one k-step per
   * iteration: two A values (f16, f17) times one B value (f20), accumulated
   * into f0 and f1.
   */
  1954. .align 4
  1955. LL(86):
  1956. FMADD f0, f16, f20, f0
  1957. FMADD f1, f17, f20, f1
  /* Preload the next step's operands, then advance B by 1 and A by 2 elements. */
  1958. LFD f16, 2 * SIZE(AO)
  1959. LFD f17, 3 * SIZE(AO)
  1960. LFD f20, 1 * SIZE(BO)
  1961. addi BO, BO, 1 * SIZE
  1962. addi AO, AO, 2 * SIZE
  1963. bdnz LL(86)
  /*
   * LL(88): write-back for the 2-row by 1-column tile.  The partial sums
   * f2/f3 are added into f0/f1.  Plain GEMM then computes
   * acc * f30 (alpha) + C for two elements at CO1; TRMM only scales by f30.
   * f0..f3 are re-initialised from FZERO, CO1 advances by two elements, and
   * TRMM bookkeeping is applied.
   */
  1964. .align 4
  1965. LL(88):
  1966. #ifndef TRMMKERNEL
  1967. LFD f16, 0 * SIZE(CO1)
  1968. LFD f17, 1 * SIZE(CO1)
  1969. FADD f0, f2, f0
  1970. FADD f1, f3, f1
  1971. FMADD f0, f0, f30, f16
  1972. FMADD f1, f1, f30, f17
  1973. #else
  1974. FADD f0, f2, f0
  1975. FADD f1, f3, f1
  1976. FMUL f0, f0, f30
  1977. FMUL f1, f1, f30
  1978. #endif
  1979. STFD f0, 0 * SIZE(CO1)
  1980. STFD f1, 1 * SIZE(CO1)
  1981. lfs f0, FZERO
  1982. fmr f1, f0
  1983. fmr f2, f0
  1984. fmr f3, f0
  1985. addi CO1, CO1, 2 * SIZE
  1986. #ifdef TRMMKERNEL
  1987. #if ( defined(LEFT) && defined(TRANSA)) || \
  1988. (!defined(LEFT) && !defined(TRANSA))
  /*
   * TEMP = K - KK - (2 if LEFT else 1); AO advances by TEMP * 2 elements
   * and BO by TEMP * 1 element.
   */
  1989. sub TEMP, K, KK
  1990. #ifdef LEFT
  1991. addi TEMP, TEMP, -2
  1992. #else
  1993. addi TEMP, TEMP, -1
  1994. #endif
  1995. slwi r0 , TEMP, 1 + BASE_SHIFT
  1996. slwi TEMP, TEMP, 0 + BASE_SHIFT
  1997. add AO, AO, r0
  1998. add BO, BO, TEMP
  1999. #endif
  2000. #ifdef LEFT
  /* Two rows were handled by this tile. */
  2001. addi KK, KK, 2
  2002. #endif
  2003. #endif
  2004. .align 4
  /*
   * LL(90): setup for the case where bit 0 of M is set.
   * If the bit is clear, control goes straight to the exit at LL(999).
   * Otherwise this preloads f16-f19 from AO[0..3] and f20-f23 from the B
   * side, points BO at the B data to use, and loads CTR with the
   * iteration count shifted right by 3 for the 8-way unrolled loop at
   * LL(92).
   */
  2005. LL(90):
  /* I = M & 1; the masked value is 0 or 1, so ble is taken only on 0. */
  2006. andi. I, M, 1
  2007. ble LL(999)
  2008. #if defined(TRMMKERNEL)
  2009. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* These variants start at the current AO and at the beginning of B. */
  2010. LFD f16, 0 * SIZE(AO)
  2011. LFD f17, 1 * SIZE(AO)
  2012. LFD f18, 2 * SIZE(AO)
  2013. LFD f19, 3 * SIZE(AO)
  2014. LFD f20, 0 * SIZE(B)
  2015. LFD f21, 1 * SIZE(B)
  2016. LFD f22, 2 * SIZE(B)
  2017. LFD f23, 3 * SIZE(B)
  2018. mr BO, B
  2019. #else
  /*
   * Other variants skip KK elements on both sides first:
   * AO += KK << BASE_SHIFT and BO = B + (KK << BASE_SHIFT).
   * NOTE(review): BASE_SHIFT is defined outside this view - presumably
   * log2 of the element size in bytes; confirm.
   */
  2020. slwi r0, KK, 0 + BASE_SHIFT
  2021. slwi TEMP, KK, 0 + BASE_SHIFT
  2022. add AO, AO, r0
  2023. add BO, B, TEMP
  2024. LFD f16, 0 * SIZE(AO)
  2025. LFD f17, 1 * SIZE(AO)
  2026. LFD f18, 2 * SIZE(AO)
  2027. LFD f19, 3 * SIZE(AO)
  2028. LFD f20, 0 * SIZE(BO)
  2029. LFD f21, 1 * SIZE(BO)
  2030. LFD f22, 2 * SIZE(BO)
  2031. LFD f23, 3 * SIZE(BO)
  2032. #endif
  /*
   * TRMM iteration count: K - KK for the first group of variants,
   * KK + 1 otherwise (the LEFT and non-LEFT arms are identical here).
   */
  2033. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2034. sub TEMP, K, KK
  2035. #elif defined(LEFT)
  2036. addi TEMP, KK, 1
  2037. #else
  2038. addi TEMP, KK, 1
  2039. #endif
  /* CTR = count >> 3 (arithmetic shift); CR0 is set for the ble below. */
  2040. srawi. TEMP, TEMP, 3
  2041. mtspr CTR, TEMP
  2042. #else
  /* Non-TRMM build: preload from AO / B, CTR = K >> 3, BO = B. */
  2043. LFD f16, 0 * SIZE(AO)
  2044. LFD f17, 1 * SIZE(AO)
  2045. LFD f18, 2 * SIZE(AO)
  2046. LFD f19, 3 * SIZE(AO)
  2047. LFD f20, 0 * SIZE(B)
  2048. LFD f21, 1 * SIZE(B)
  2049. LFD f22, 2 * SIZE(B)
  2050. LFD f23, 3 * SIZE(B)
  2051. srawi. r0, K, 3
  2052. mtspr CTR, r0
  2053. mr BO, B
  2054. #endif
  /* Skip the unrolled loop when the shifted count is zero or negative. */
  2055. ble LL(95)
  2056. .align 5
  /*
   * LL(92): unrolled multiply-accumulate loop, repeated CTR times (bdnz).
   * Each pass consumes 8 values from AO and 8 values from BO, pairing
   * them element by element and spreading the products over the four
   * accumulators f0-f3 (summed together later at LL(98)).
   * NOTE(review): assumes FMADD d, a, c, b computes d = a * c + b; the
   * macro is defined outside this view - confirm.
   */
  2057. LL(92):
  /* f0 += f16 * f20, f1 += f17 * f21, f2 += f18 * f22, f3 += f19 * f23. */
  2058. FMADD f0, f16, f20, f0
  2059. FMADD f1, f17, f21, f1
  2060. FMADD f2, f18, f22, f2
  2061. FMADD f3, f19, f23, f3
  /* Second half of the pass: fetch AO[4..7] and BO[4..7]. */
  2062. LFD f16, 4 * SIZE(AO)
  2063. LFD f17, 5 * SIZE(AO)
  2064. LFD f18, 6 * SIZE(AO)
  2065. LFD f19, 7 * SIZE(AO)
  2066. LFD f20, 4 * SIZE(BO)
  2067. LFD f21, 5 * SIZE(BO)
  2068. LFD f22, 6 * SIZE(BO)
  2069. LFD f23, 7 * SIZE(BO)
  2070. FMADD f0, f16, f20, f0
  2071. FMADD f1, f17, f21, f1
  2072. FMADD f2, f18, f22, f2
  2073. FMADD f3, f19, f23, f3
  /*
   * Preload the operands of the next pass (AO[8..11], BO[8..11]) using
   * the pointers as they are before being advanced.
   */
  2074. LFD f16, 8 * SIZE(AO)
  2075. LFD f17, 9 * SIZE(AO)
  2076. LFD f18, 10 * SIZE(AO)
  2077. LFD f19, 11 * SIZE(AO)
  2078. LFD f20, 8 * SIZE(BO)
  2079. LFD f21, 9 * SIZE(BO)
  2080. LFD f22, 10 * SIZE(BO)
  2081. LFD f23, 11 * SIZE(BO)
  /* Advance both pointers by 8 elements; preloaded values are now [0..3]. */
  2082. addi AO, AO, 8 * SIZE
  2083. addi BO, BO, 8 * SIZE
  2084. bdnz LL(92)
  2085. .align 4
  /*
   * LL(95): prepare the remainder loop that follows LL(92).
   * Loads ALPHA into f30 (consumed at LL(98)) and sets CTR to the low
   * three bits of the iteration count, i.e. the steps left over after
   * the 8-way unrolled loop.
   * NOTE(review): ALPHA is a macro defined outside this view - presumably
   * a stack slot holding the scalar; confirm.
   */
  2086. LL(95):
  2087. lfd f30, ALPHA
  2088. #if defined(TRMMKERNEL)
  /*
   * TRMM build: count is K - KK for the first group of variants and
   * KK + 1 otherwise (the LEFT and non-LEFT arms are identical here).
   */
  2089. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2090. sub TEMP, K, KK
  2091. #elif defined(LEFT)
  2092. addi TEMP, KK, 1
  2093. #else
  2094. addi TEMP, KK, 1
  2095. #endif
  /* CTR = TEMP & 7; the record form also sets CR0 for the branch below. */
  2096. andi. TEMP, TEMP, 7
  2097. mtspr CTR, TEMP
  2098. #else
  /* Non-TRMM build: CTR = K & 7. */
  2099. andi. r0, K, 7
  2100. mtspr CTR, r0
  2101. #endif
  /*
   * The masked value is in 0..7, so "less or equal" here means zero:
   * skip the remainder loop when nothing is left.  The '+' suffix is a
   * static hint that the branch is expected to be taken.
   */
  2102. ble+ LL(98)
  2103. .align 4
  /*
   * LL(96): one step per iteration.
   * f0 += f16 * f20, then fetch the next AO and BO values and advance
   * both pointers by one element.  Assumes FMADD d, a, c, b computes
   * d = a * c + b - confirm.
   */
  2104. LL(96):
  2105. FMADD f0, f16, f20, f0
  2106. LFD f16, 1 * SIZE(AO)
  2107. LFD f20, 1 * SIZE(BO)
  2108. addi BO, BO, 1 * SIZE
  2109. addi AO, AO, 1 * SIZE
  2110. bdnz LL(96)
  2111. .align 4
  /*
   * LL(98): finish the single-element result.
   * Sums the four accumulators f0-f3 into f0, scales by the alpha value
   * held in f30 and writes one element at CO1.  Execution then falls
   * through to the exit code at LL(999).
   * NOTE(review): FADD/FMUL/FMADD/STFD are macros defined outside this
   * view; comments assume FADD d, a, b is d = a + b, FMUL d, a, c is
   * d = a * c and FMADD d, a, c, b is d = a * c + b - confirm.
   */
  2112. LL(98):
  2113. #ifndef TRMMKERNEL
  /* Non-TRMM: read the current value at CO1[0] and accumulate into it. */
  2114. LFD f16, 0 * SIZE(CO1)
  /* f0 = (f0 + f1) + (f2 + f3) */
  2115. FADD f0, f1, f0
  2116. FADD f2, f3, f2
  2117. FADD f0, f2, f0
  /* f0 = f0 * f30 + f16 */
  2118. FMADD f0, f0, f30, f16
  2119. #else
  /* TRMM: the existing value at CO1 is not read; result is f30 * sum. */
  2120. FADD f0, f1, f0
  2121. FADD f2, f3, f2
  2122. FADD f0, f2, f0
  2123. FMUL f0, f0, f30
  2124. #endif
  2125. STFD f0, 0 * SIZE(CO1)
  2126. .align 4
  /*
   * LL(999): common exit path.
   * Sets r3 to 0, reloads f14-f31 and r21-r31 (plus r19/r20 in TRMM
   * builds) from the frame at SP, releases the frame by adding STACKSIZE
   * to SP and returns with blr.
   * NOTE(review): the matching register saves and the definitions of
   * STACKSIZE and EPILOGUE are outside this view; the offsets used here
   * are presumed to mirror the save sequence - confirm.
   */
  2127. LL(999):
  /* addi with rA = 0 uses the literal 0, so this is r3 = 0. */
  2128. addi r3, 0, 0
  /* Floating-point registers: 18 doubles at SP + 0 .. SP + 136. */
  2129. lfd f14, 0(SP)
  2130. lfd f15, 8(SP)
  2131. lfd f16, 16(SP)
  2132. lfd f17, 24(SP)
  2133. lfd f18, 32(SP)
  2134. lfd f19, 40(SP)
  2135. lfd f20, 48(SP)
  2136. lfd f21, 56(SP)
  2137. lfd f22, 64(SP)
  2138. lfd f23, 72(SP)
  2139. lfd f24, 80(SP)
  2140. lfd f25, 88(SP)
  2141. lfd f26, 96(SP)
  2142. lfd f27, 104(SP)
  2143. lfd f28, 112(SP)
  2144. lfd f29, 120(SP)
  2145. lfd f30, 128(SP)
  2146. lfd f31, 136(SP)
  2147. #ifdef __64BIT__
  /* 64-bit build: general registers use 8-byte slots starting at SP + 144. */
  2148. ld r31, 144(SP)
  2149. ld r30, 152(SP)
  2150. ld r29, 160(SP)
  2151. ld r28, 168(SP)
  2152. ld r27, 176(SP)
  2153. ld r26, 184(SP)
  2154. ld r25, 192(SP)
  2155. ld r24, 200(SP)
  2156. ld r23, 208(SP)
  2157. ld r22, 216(SP)
  2158. ld r21, 224(SP)
  2159. #ifdef TRMMKERNEL
  /* r20 and r19 are only restored in TRMM builds. */
  2160. ld r20, 232(SP)
  2161. ld r19, 240(SP)
  2162. #endif
  2163. #else
  /* 32-bit build: general registers use 4-byte slots starting at SP + 144. */
  2164. lwz r31, 144(SP)
  2165. lwz r30, 148(SP)
  2166. lwz r29, 152(SP)
  2167. lwz r28, 156(SP)
  2168. lwz r27, 160(SP)
  2169. lwz r26, 164(SP)
  2170. lwz r25, 168(SP)
  2171. lwz r24, 172(SP)
  2172. lwz r23, 176(SP)
  2173. lwz r22, 180(SP)
  2174. lwz r21, 184(SP)
  2175. #ifdef TRMMKERNEL
  /* r20 and r19 are only restored in TRMM builds. */
  2176. lwz r20, 188(SP)
  2177. lwz r19, 192(SP)
  2178. #endif
  2179. #endif
  /* Release the stack frame and return to the caller. */
  2180. addi SP, SP, STACKSIZE
  2181. blr
  2182. EPILOGUE
  2183. #endif