
gemm_kernel_altivec_g4.S 48 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
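/* Single-precision GEMM kernel for the PowerPC G4 (AltiVec/VMX). */
/* Computes C := alpha * A * B + C with 4-float vmaddfp vectors, */
/* working on C in panels of up to 16 rows by 4 columns per pass. */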
#define ASSEMBLER
#include "common.h"

#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif

#ifdef __64BIT__
#define STACKSIZE 360
#else
#define STACKSIZE 272
#endif

#define ALPHA 0
#define FZERO 16

#define M r3
#define N r4
#define K r5

#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A r8
#define B r9
#define C r10
#define LDC r7
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#endif
#endif

#define STACK r11
#define I r21
#define J r22
#define AO r23
#define BO r24
#define CO1 r25
#define CO2 r26
#define CO3 r27
#define CO4 r28
#define PREA r29
#define PREB r29
#define PREC r30
#define VREG r31

#define LOAD_A lvx
#define LOAD_B lvx

#define OFFSET_0 0
#define OFFSET_1 r14
#define OFFSET_2 r15
#define OFFSET_3 r16
#define OFFSET_4 r17
#define OFFSET_5 r18
#define OFFSET_6 r19
#define OFFSET_7 r20

#define c01 v0
#define c02 v1
#define c03 v2
#define c04 v3
#define c05 v4
#define c06 v5
#define c07 v6
#define c08 v7
#define c09 v8
#define c10 v9
#define c11 v10
#define c12 v11
#define c13 v12
#define c14 v13
#define c15 v14
#define c16 v15

#define a1 v16
#define a2 v17
#define a3 v18
#define a4 v19
#define a5 v20
#define a6 v21
#define a7 v22
#define a8 v23

#define b1 v24
#define b2 v25
#define bp1 v26
#define bp2 v27

#define C1 v16
#define C2 v17
#define C3 v18
#define C4 v19
#define C5 v20
#define C6 v21
#define C7 v22
#define C8 v23
#define C9 v24
#define c00 v25

#define PERMRSHIFT1 v26
#define PERMRSHIFT2 v27
#define PERMRSHIFT3 v28
#define PERMRSHIFT4 v29

#define VZERO v30
#define alpha v31
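
/* v0-v15 hold the 16x4 block of accumulators (c01-c16); v16-v27 hold */
/* A vectors (a1-a8), B vectors (b1, b2) and broadcast words (bp1, bp2) */
/* during the multiply phase. The writeback phase reuses those same */
/* registers as C1-C9, c00 and the lvsr permute masks PERMRSHIFT1-4. */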
#ifndef NEEDPARAM

PROLOGUE
PROFCODE

addi SP, SP, -STACKSIZE
mr STACK, SP

li r0, 0 * 16
stvx v20, SP, r0
li r0, 1 * 16
stvx v21, SP, r0
li r0, 2 * 16
stvx v22, SP, r0
li r0, 3 * 16
stvx v23, SP, r0
li r0, 4 * 16
stvx v24, SP, r0
li r0, 5 * 16
stvx v25, SP, r0
li r0, 6 * 16
stvx v26, SP, r0
li r0, 7 * 16
stvx v27, SP, r0
li r0, 8 * 16
stvx v28, SP, r0
li r0, 9 * 16
stvx v29, SP, r0
li r0, 10 * 16
stvx v30, SP, r0
li r0, 11 * 16
stvx v31, SP, r0

#ifdef __64BIT__
std r31, 192(SP)
std r30, 200(SP)
std r29, 208(SP)
std r28, 216(SP)
std r27, 224(SP)
std r26, 232(SP)
std r25, 240(SP)
std r24, 248(SP)
std r23, 256(SP)
std r22, 264(SP)
std r21, 272(SP)
std r20, 280(SP)
std r19, 288(SP)
std r18, 296(SP)
std r17, 304(SP)
std r16, 312(SP)
std r15, 320(SP)
std r14, 328(SP)
#else
stw r31, 192(SP)
stw r30, 196(SP)
stw r29, 200(SP)
stw r28, 204(SP)
stw r27, 208(SP)
stw r26, 212(SP)
stw r25, 216(SP)
stw r24, 220(SP)
stw r23, 224(SP)
stw r22, 228(SP)
stw r21, 232(SP)
stw r20, 236(SP)
stw r19, 240(SP)
stw r18, 244(SP)
stw r17, 248(SP)
stw r16, 252(SP)
stw r15, 256(SP)
stw r14, 260(SP)
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif

li r0, -1
mfspr VREG, VRsave
mtspr VRsave, r0

addi SP, SP, -128
li r0, -128
and SP, SP, r0
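
/* Save VRsave and set it to all-ones to mark every vector register */
/* in use, then round SP down to a 128-byte boundary so the ALPHA and */
/* FZERO scratch slots below sit at a vector-aligned address for lvx. */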
li OFFSET_1, 4 * SIZE
li OFFSET_2, 8 * SIZE
li OFFSET_3, 12 * SIZE
li OFFSET_4, 16 * SIZE
li OFFSET_5, 20 * SIZE
li OFFSET_6, 24 * SIZE
li OFFSET_7, 28 * SIZE

stfs f1, ALPHA + 0(SP)
stfs f1, ALPHA + 4(SP)
stfs f1, ALPHA + 8(SP)
stfs f1, ALPHA + 12(SP)

li r29, 0
stw r29, FZERO(SP)

slwi LDC, LDC, BASE_SHIFT

li PREC, (15 * SIZE)
li PREB, (25 * 8 * SIZE)

cmpwi cr0, M, 0
ble LL(999)
cmpwi cr0, N, 0
ble LL(999)
cmpwi cr0, K, 0
ble LL(999)

srawi. J, N, 2
ble LL(60)
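
/* Outer loop: J counts panels of four C columns (N >> 2); the N & 2 */
/* and N & 1 remainders are handled at LL(60) and LL(120). The scalar */
/* alpha in f1 was replicated into four words at ALPHA(SP) above so it */
/* can be reloaded as a full vector with lvx during writeback. */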
.align 4
LL(01):
mr CO1, C
add CO2, C, LDC
add CO3, CO2, LDC
add CO4, CO3, LDC
add C, CO4, LDC
mr AO, A
srawi. I, M, 4
ble LL(20)
.align 4
LL(11):
vxor c01, c01, c01
LOAD_B b1, OFFSET_0, B
vxor c02, c02, c02
LOAD_A a1, OFFSET_0, AO
vxor c03, c03, c03
LOAD_A a2, OFFSET_1, AO
vxor c04, c04, c04
LOAD_A a3, OFFSET_2, AO
vxor c05, c05, c05
LOAD_A a4, OFFSET_3, AO
vxor c06, c06, c06
LOAD_B b2, OFFSET_2, B
vxor c07, c07, c07
LOAD_A a5, OFFSET_4, AO
vxor c08, c08, c08
LOAD_A a6, OFFSET_5, AO
vxor c09, c09, c09
dcbtst CO1, PREC
vxor c10, c10, c10
dcbtst CO2, PREC
vxor c11, c11, c11
dcbtst CO3, PREC
vxor c12, c12, c12
dcbtst CO4, PREC
vxor c13, c13, c13
mr BO, B
vxor c14, c14, c14
srawi. r0, K, 2
vxor c15, c15, c15
mtspr CTR, r0
vxor c16, c16, c16
vspltw bp1, b1, 0
ble LL(15)
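
/* 16x4 microkernel: each pass through LL(12) covers four k steps. */
/* For each k, the four words of the current B vector are broadcast */
/* with vspltw and multiplied into four A vectors (16 vmaddfp), with */
/* A/B loads and dcbt prefetches interleaved to hide latency. */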
.align 4
LL(12):
/* 1 */
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
addi AO, AO, 8 * SIZE
vmaddfp c03, a3, bp1, c03
LOAD_A a7, OFFSET_4, AO
vmaddfp c04, a4, bp1, c04
LOAD_A a8, OFFSET_5, AO
/* 2 */
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
dcbt BO, PREB
vmaddfp c07, a3, bp2, c07
dcbt AO, PREB
vmaddfp c08, a4, bp2, c08
addi AO, AO, 8 * SIZE
/* 3 */
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c10, a2, bp1, c10
LOAD_B b1, OFFSET_1, BO
vmaddfp c11, a3, bp1, c11
dcbt AO, PREB
vmaddfp c12, a4, bp1, c12
addi AO, AO, 8 * SIZE
/* 4 */
vmaddfp c13, a1, bp2, c13
vspltw bp1, b1, 0
vmaddfp c14, a2, bp2, c14
LOAD_A a1, OFFSET_2, AO
vmaddfp c15, a3, bp2, c15
dcbt AO, PREB
vmaddfp c16, a4, bp2, c16
addi AO, AO, 8 * SIZE
/* 5 */
vmaddfp c01, a5, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a6, bp1, c02
LOAD_A a2, OFFSET_1, AO
vmaddfp c03, a7, bp1, c03
LOAD_A a3, OFFSET_2, AO
vmaddfp c04, a8, bp1, c04
LOAD_A a4, OFFSET_3, AO
/* 6 */
vmaddfp c05, a5, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a6, bp2, c06
nop
vmaddfp c07, a7, bp2, c07
dcbt AO, PREA
vmaddfp c08, a8, bp2, c08
addi AO, AO, 8 * SIZE
/* 7 */
vmaddfp c09, a5, bp1, c09
vspltw bp2, b1, 3
vmaddfp c10, a6, bp1, c10
LOAD_B b1, OFFSET_4, BO
vmaddfp c11, a7, bp1, c11
nop
vmaddfp c12, a8, bp1, c12
nop
/* 8 */
vmaddfp c13, a5, bp2, c13
vspltw bp1, b2, 0
vmaddfp c14, a6, bp2, c14
LOAD_A a5, OFFSET_2, AO
vmaddfp c15, a7, bp2, c15
LOAD_A a6, OFFSET_3, AO
vmaddfp c16, a8, bp2, c16
LOAD_A a7, OFFSET_4, AO
/* 9 */
vmaddfp c01, a1, bp1, c01
vspltw bp2, b2, 1
vmaddfp c02, a2, bp1, c02
LOAD_A a8, OFFSET_5, AO
vmaddfp c03, a3, bp1, c03
addi BO, BO, 8 * SIZE
vmaddfp c04, a4, bp1, c04
nop
/* 10 */
vmaddfp c05, a1, bp2, c05
vspltw bp1, b2, 2
vmaddfp c06, a2, bp2, c06
nop
vmaddfp c07, a3, bp2, c07
nop
vmaddfp c08, a4, bp2, c08
nop
/* 11 */
vmaddfp c09, a1, bp1, c09
vspltw bp2, b2, 3
vmaddfp c10, a2, bp1, c10
LOAD_B b2, OFFSET_1, BO
vmaddfp c11, a3, bp1, c11
dcbt AO, PREA
vmaddfp c12, a4, bp1, c12
addi AO, AO, 8 * SIZE
/* 12 */
vmaddfp c13, a1, bp2, c13
vspltw bp1, b2, 0
vmaddfp c14, a2, bp2, c14
LOAD_A a1, OFFSET_4, AO
vmaddfp c15, a3, bp2, c15
LOAD_A a2, OFFSET_5, AO
vmaddfp c16, a4, bp2, c16
LOAD_A a3, OFFSET_6, AO
/* 13 */
vmaddfp c01, a5, bp1, c01
vspltw bp2, b2, 1
vmaddfp c02, a6, bp1, c02
LOAD_A a4, OFFSET_7, AO
vmaddfp c03, a7, bp1, c03
dcbt AO, PREA
vmaddfp c04, a8, bp1, c04
addi AO, AO, 8 * SIZE
/* 14 */
vmaddfp c05, a5, bp2, c05
vspltw bp1, b2, 2
vmaddfp c06, a6, bp2, c06
nop
vmaddfp c07, a7, bp2, c07
dcbt AO, PREA
vmaddfp c08, a8, bp2, c08
addi AO, AO, 8 * SIZE
/* 15 */
vmaddfp c09, a5, bp1, c09
vspltw bp2, b2, 3
vmaddfp c10, a6, bp1, c10
LOAD_B b2, OFFSET_4, BO
vmaddfp c11, a7, bp1, c11
dcbt AO, PREA
vmaddfp c12, a8, bp1, c12
addi BO, BO, 8 * SIZE
/* 16 */
vmaddfp c13, a5, bp2, c13
vspltw bp1, b1, 0
vmaddfp c14, a6, bp2, c14
LOAD_A a5, OFFSET_4, AO
vmaddfp c15, a7, bp2, c15
LOAD_A a6, OFFSET_5, AO
vmaddfp c16, a8, bp2, c16
bdnz+ LL(12)
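
/* Remainder: handle the K % 4 leftover iterations one at a time, and */
/* reload the vector alpha from the stack for the writeback below. */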
.align 4
LL(15):
andi. r0, K, 3
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
mtspr CTR, r0
ble+ LL(18)
.align 4
LL(16):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
nop
vmaddfp c03, a3, bp1, c03
nop
vmaddfp c04, a4, bp1, c04
nop
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
nop
vmaddfp c07, a3, bp2, c07
nop
vmaddfp c08, a4, bp2, c08
nop
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c10, a2, bp1, c10
LOAD_B b1, OFFSET_1, BO
vmaddfp c11, a3, bp1, c11
addi AO, AO, 16 * SIZE
vmaddfp c12, a4, bp1, c12
addi BO, BO, 4 * SIZE
vmaddfp c13, a1, bp2, c13
vspltw bp1, b1, 0
vmaddfp c14, a2, bp2, c14
LOAD_A a1, OFFSET_0, AO
vmaddfp c15, a3, bp2, c15
LOAD_A a2, OFFSET_1, AO
vmaddfp c16, a4, bp2, c16
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
bdnz+ LL(16)
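
/* Writeback for the 16x4 block. C rows may be unaligned: an lvsr */
/* mask per row plus vperm shifts the accumulators into place (with */
/* VZERO filling the edges), then vmaddfp merges them as alpha*acc + C. */
/* Two load/store orderings are used depending on how far apart the C */
/* rows are (the LDC vs. 32*SIZE test); LL(19) is the close-rows path. */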
.align 4
LL(18):
lvx C1, OFFSET_0, CO1
cmpwi cr0, LDC, 32 * SIZE
lvx C2, OFFSET_1, CO1
lvsr PERMRSHIFT1, 0, CO1
lvx C3, OFFSET_2, CO1
lvsr PERMRSHIFT2, 0, CO2
lvx C4, OFFSET_3, CO1
lvsr PERMRSHIFT3, 0, CO3
lvx C5, OFFSET_4, CO1
lvsr PERMRSHIFT4, 0, CO4
ble LL(19)
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, c03, PERMRSHIFT1
vperm c03, c03, c04, PERMRSHIFT1
vperm c04, c04, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
lvx C1, OFFSET_0, CO2
vmaddfp c01, alpha, c01, C2
lvx C6, OFFSET_1, CO2
vmaddfp c02, alpha, c02, C3
lvx C7, OFFSET_2, CO2
vmaddfp c03, alpha, c03, C4
lvx C8, OFFSET_3, CO2
vmaddfp c04, alpha, c04, C5
lvx C9, OFFSET_4, CO2
stvx c00, OFFSET_0, CO1
vperm c00, VZERO, c05, PERMRSHIFT2
stvx c01, OFFSET_1, CO1
vperm c05, c05, c06, PERMRSHIFT2
stvx c02, OFFSET_2, CO1
vperm c06, c06, c07, PERMRSHIFT2
stvx c03, OFFSET_3, CO1
vperm c07, c07, c08, PERMRSHIFT2
stvx c04, OFFSET_4, CO1
vperm c08, c08, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
lvx C1, OFFSET_0, CO3
vmaddfp c05, alpha, c05, C6
lvx C2, OFFSET_1, CO3
vmaddfp c06, alpha, c06, C7
lvx C3, OFFSET_2, CO3
vmaddfp c07, alpha, c07, C8
lvx C4, OFFSET_3, CO3
vmaddfp c08, alpha, c08, C9
lvx C5, OFFSET_4, CO3
stvx c00, OFFSET_0, CO2
vperm c00, VZERO, c09, PERMRSHIFT3
stvx c05, OFFSET_1, CO2
vperm c09, c09, c10, PERMRSHIFT3
stvx c06, OFFSET_2, CO2
vperm c10, c10, c11, PERMRSHIFT3
stvx c07, OFFSET_3, CO2
vperm c11, c11, c12, PERMRSHIFT3
stvx c08, OFFSET_4, CO2
vperm c12, c12, VZERO, PERMRSHIFT3
vmaddfp c00, alpha, c00, C1
lvx C9, OFFSET_4, CO4
vmaddfp c09, alpha, c09, C2
lvx C1, OFFSET_0, CO4
vmaddfp c10, alpha, c10, C3
lvx C6, OFFSET_1, CO4
vmaddfp c11, alpha, c11, C4
lvx C7, OFFSET_2, CO4
vmaddfp c12, alpha, c12, C5
lvx C8, OFFSET_3, CO4
stvx c00, OFFSET_0, CO3
vperm c00, VZERO, c13, PERMRSHIFT4
stvx c09, OFFSET_1, CO3
vperm c13, c13, c14, PERMRSHIFT4
stvx c10, OFFSET_2, CO3
vperm c14, c14, c15, PERMRSHIFT4
stvx c11, OFFSET_3, CO3
vperm c15, c15, c16, PERMRSHIFT4
stvx c12, OFFSET_4, CO3
vperm c16, c16, VZERO, PERMRSHIFT4
vmaddfp c00, alpha, c00, C1
vmaddfp c13, alpha, c13, C6
vmaddfp c14, alpha, c14, C7
vmaddfp c15, alpha, c15, C8
vmaddfp c16, alpha, c16, C9
stvx c00, OFFSET_0, CO4
stvx c13, OFFSET_1, CO4
stvx c14, OFFSET_2, CO4
stvx c15, OFFSET_3, CO4
stvx c16, OFFSET_4, CO4
addi CO1, CO1, 16 * SIZE
addi CO2, CO2, 16 * SIZE
addi CO3, CO3, 16 * SIZE
addi CO4, CO4, 16 * SIZE
addic. I, I, -1
bgt+ LL(11)
b LL(20)
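
/* LL(19): same writeback with a load ordering that stays safe when */
/* the four C rows are close together (LDC <= 32*SIZE). */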
.align 4
LL(19):
lvx C6, OFFSET_1, CO2
lvx C7, OFFSET_2, CO2
lvx C8, OFFSET_3, CO2
lvx C9, OFFSET_4, CO2
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, c03, PERMRSHIFT1
vperm c03, c03, c04, PERMRSHIFT1
vperm c04, c04, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
lvx C2, OFFSET_1, CO3
vmaddfp c02, alpha, c02, C3
lvx C3, OFFSET_2, CO3
vmaddfp c03, alpha, c03, C4
lvx C4, OFFSET_3, CO3
vmaddfp c04, alpha, c04, C5
lvx C5, OFFSET_4, CO3
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
stvx c02, OFFSET_2, CO1
stvx c03, OFFSET_3, CO1
stvx c04, OFFSET_4, CO1
lvx C1, OFFSET_0, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, c06, PERMRSHIFT2
vperm c06, c06, c07, PERMRSHIFT2
vperm c07, c07, c08, PERMRSHIFT2
vperm c08, c08, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C6
lvx C6, OFFSET_1, CO4
vmaddfp c06, alpha, c06, C7
lvx C7, OFFSET_2, CO4
vmaddfp c07, alpha, c07, C8
lvx C8, OFFSET_3, CO4
vmaddfp c08, alpha, c08, C9
lvx C9, OFFSET_4, CO4
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
stvx c06, OFFSET_2, CO2
stvx c07, OFFSET_3, CO2
stvx c08, OFFSET_4, CO2
lvx C1, OFFSET_0, CO3
vperm c00, VZERO, c09, PERMRSHIFT3
vperm c09, c09, c10, PERMRSHIFT3
vperm c10, c10, c11, PERMRSHIFT3
vperm c11, c11, c12, PERMRSHIFT3
vperm c12, c12, VZERO, PERMRSHIFT3
vmaddfp c00, alpha, c00, C1
vmaddfp c09, alpha, c09, C2
vmaddfp c10, alpha, c10, C3
vmaddfp c11, alpha, c11, C4
vmaddfp c12, alpha, c12, C5
stvx c00, OFFSET_0, CO3
stvx c09, OFFSET_1, CO3
stvx c10, OFFSET_2, CO3
stvx c11, OFFSET_3, CO3
stvx c12, OFFSET_4, CO3
lvx C1, OFFSET_0, CO4
vperm c00, VZERO, c13, PERMRSHIFT4
vperm c13, c13, c14, PERMRSHIFT4
vperm c14, c14, c15, PERMRSHIFT4
vperm c15, c15, c16, PERMRSHIFT4
vperm c16, c16, VZERO, PERMRSHIFT4
vmaddfp c00, alpha, c00, C1
vmaddfp c13, alpha, c13, C6
vmaddfp c14, alpha, c14, C7
vmaddfp c15, alpha, c15, C8
vmaddfp c16, alpha, c16, C9
stvx c00, OFFSET_0, CO4
stvx c13, OFFSET_1, CO4
stvx c14, OFFSET_2, CO4
stvx c15, OFFSET_3, CO4
stvx c16, OFFSET_4, CO4
addi CO1, CO1, 16 * SIZE
addi CO2, CO2, 16 * SIZE
addi CO3, CO3, 16 * SIZE
addi CO4, CO4, 16 * SIZE
addic. I, I, -1
bgt+ LL(11)
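
/* M remainders for the 4-column panel: 8 rows (LL(20)), 4 rows */
/* (LL(30)), then 2 rows and 1 row, which drop to the scalar FPU. */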
.align 4
LL(20):
andi. I, M, 8
ble LL(30)
vxor c01, c01, c01
LOAD_A a1, OFFSET_0, AO
vxor c02, c02, c02
LOAD_A a2, OFFSET_1, AO
vxor c05, c05, c05
LOAD_A a3, OFFSET_2, AO
vxor c06, c06, c06
LOAD_A a4, OFFSET_3, AO
vxor c09, c09, c09
LOAD_B b1, OFFSET_0, B
vxor c10, c10, c10
LOAD_B b2, OFFSET_1, B
vxor c13, c13, c13
vxor c14, c14, c14
mr BO, B
vspltw bp1, b1, 0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(25)
.align 4
LL(22):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
addi AO, AO, 16 * SIZE
vmaddfp c02, a2, bp1, c02
addi BO, BO, 8 * SIZE
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
LOAD_B b1, OFFSET_0, BO
vmaddfp c10, a2, bp1, c10
vmaddfp c13, a1, bp2, c13
LOAD_A a1, OFFSET_0, AO
vspltw bp1, b2, 0
vmaddfp c14, a2, bp2, c14
LOAD_A a2, OFFSET_1, AO
vmaddfp c01, a3, bp1, c01
vspltw bp2, b2, 1
vmaddfp c02, a4, bp1, c02
vmaddfp c05, a3, bp2, c05
vspltw bp1, b2, 2
vmaddfp c06, a4, bp2, c06
vmaddfp c09, a3, bp1, c09
vspltw bp2, b2, 3
LOAD_B b2, OFFSET_1, BO
vmaddfp c10, a4, bp1, c10
vmaddfp c13, a3, bp2, c13
LOAD_A a3, OFFSET_2, AO
vmaddfp c14, a4, bp2, c14
LOAD_A a4, OFFSET_3, AO
vspltw bp1, b1, 0
bdnz LL(22)
.align 4
LL(25):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(28)
.align 4
LL(26):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
nop
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
nop
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c10, a2, bp1, c10
addi AO, AO, 8 * SIZE
vmaddfp c13, a1, bp2, c13
addi BO, BO, 4 * SIZE
vmaddfp c14, a2, bp2, c14
nop
.align 4
LL(28):
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvx C3, OFFSET_2, CO1
lvsr PERMRSHIFT1, 0, CO1
lvsr PERMRSHIFT2, 0, CO2
lvsr PERMRSHIFT3, 0, CO3
lvsr PERMRSHIFT4, 0, CO4
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
vmaddfp c02, alpha, c02, C3
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
stvx c02, OFFSET_2, CO1
lvx C1, OFFSET_0, CO2
lvx C2, OFFSET_1, CO2
lvx C3, OFFSET_2, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, c06, PERMRSHIFT2
vperm c06, c06, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C2
vmaddfp c06, alpha, c06, C3
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
stvx c06, OFFSET_2, CO2
lvx C1, OFFSET_0, CO3
lvx C2, OFFSET_1, CO3
lvx C3, OFFSET_2, CO3
vperm c00, VZERO, c09, PERMRSHIFT3
vperm c09, c09, c10, PERMRSHIFT3
vperm c10, c10, VZERO, PERMRSHIFT3
vmaddfp c00, alpha, c00, C1
vmaddfp c09, alpha, c09, C2
vmaddfp c10, alpha, c10, C3
stvx c00, OFFSET_0, CO3
stvx c09, OFFSET_1, CO3
stvx c10, OFFSET_2, CO3
lvx C1, OFFSET_0, CO4
lvx C2, OFFSET_1, CO4
lvx C3, OFFSET_2, CO4
vperm c00, VZERO, c13, PERMRSHIFT4
vperm c13, c13, c14, PERMRSHIFT4
vperm c14, c14, VZERO, PERMRSHIFT4
vmaddfp c00, alpha, c00, C1
vmaddfp c13, alpha, c13, C2
vmaddfp c14, alpha, c14, C3
stvx c00, OFFSET_0, CO4
stvx c13, OFFSET_1, CO4
stvx c14, OFFSET_2, CO4
addi CO1, CO1, 8 * SIZE
addi CO2, CO2, 8 * SIZE
addi CO3, CO3, 8 * SIZE
addi CO4, CO4, 8 * SIZE
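
/* LL(30): 4x4 tail. K is unrolled by two with separate even/odd */
/* accumulators (c01/c02, c05/c06, ...) that are summed with vaddfp */
/* at LL(38) before the writeback. */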
.align 4
LL(30):
andi. I, M, 4
ble LL(40)
vxor c01, c01, c01
LOAD_A a1, OFFSET_0, AO
vxor c02, c02, c02
LOAD_A a2, OFFSET_1, AO
vxor c05, c05, c05
LOAD_B b1, OFFSET_0, B
vxor c06, c06, c06
LOAD_B b2, OFFSET_1, B
vxor c09, c09, c09
vxor c10, c10, c10
vxor c13, c13, c13
vxor c14, c14, c14
vspltw bp1, b1, 0
mr BO, B
srawi. r0, K, 1
mtspr CTR, r0
ble LL(35)
.align 4
LL(32):
vmaddfp c01, a1, bp1, c01
addi AO, AO, 8 * SIZE
vspltw bp2, b1, 1
vmaddfp c05, a1, bp2, c05
addi BO, BO, 8 * SIZE
vspltw bp1, b1, 2
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c13, a1, bp2, c13
LOAD_A a1, OFFSET_0, AO
vspltw bp1, b2, 0
LOAD_B b1, OFFSET_0, BO
vmaddfp c02, a2, bp1, c02
vspltw bp2, b2, 1
vmaddfp c06, a2, bp2, c06
vspltw bp1, b2, 2
vmaddfp c10, a2, bp1, c10
vspltw bp2, b2, 3
LOAD_B b2, OFFSET_1, BO
vmaddfp c14, a2, bp2, c14
LOAD_A a2, OFFSET_1, AO
vspltw bp1, b1, 0
bdnz LL(32)
.align 4
LL(35):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(38)
.align 4
LL(36):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c13, a1, bp2, c13
addi AO, AO, 4 * SIZE
addi BO, BO, 4 * SIZE
.align 4
LL(38):
vaddfp c01, c01, c02
vaddfp c05, c05, c06
vaddfp c09, c09, c10
vaddfp c13, c13, c14
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvsr PERMRSHIFT1, 0, CO1
lvsr PERMRSHIFT2, 0, CO2
lvsr PERMRSHIFT3, 0, CO3
lvsr PERMRSHIFT4, 0, CO4
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
lvx C1, OFFSET_0, CO2
lvx C2, OFFSET_1, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C2
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
lvx C1, OFFSET_0, CO3
lvx C2, OFFSET_1, CO3
vperm c00, VZERO, c09, PERMRSHIFT3
vperm c09, c09, VZERO, PERMRSHIFT3
vmaddfp c00, alpha, c00, C1
vmaddfp c09, alpha, c09, C2
stvx c00, OFFSET_0, CO3
stvx c09, OFFSET_1, CO3
lvx C1, OFFSET_0, CO4
lvx C2, OFFSET_1, CO4
vperm c00, VZERO, c13, PERMRSHIFT4
vperm c13, c13, VZERO, PERMRSHIFT4
vmaddfp c00, alpha, c00, C1
vmaddfp c13, alpha, c13, C2
stvx c00, OFFSET_0, CO4
stvx c13, OFFSET_1, CO4
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
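
/* LL(40)/LL(50): 2-row and 1-row tails, computed with scalar FPU */
/* fused multiply-adds (FMADD) since they no longer fill a vector. */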
.align 4
LL(40):
andi. I, M, 2
ble LL(50)
mr BO, B
LFD f8, 0 * SIZE(AO)
LFD f9, 1 * SIZE(AO)
LFD f10, 0 * SIZE(B)
LFD f11, 1 * SIZE(B)
LFD f12, 2 * SIZE(B)
LFD f13, 3 * SIZE(B)
lfs f0, FZERO(SP)
fmr f1, f0
fmr f2, f0
fmr f3, f0
fmr f4, f0
fmr f5, f0
fmr f6, f0
fmr f7, f0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(45)
.align 4
LL(42):
FMADD f0, f8, f10, f0
FMADD f2, f8, f11, f2
FMADD f4, f8, f12, f4
FMADD f6, f8, f13, f6
FMADD f1, f9, f10, f1
FMADD f3, f9, f11, f3
FMADD f5, f9, f12, f5
FMADD f7, f9, f13, f7
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
FMADD f0, f8, f10, f0
FMADD f2, f8, f11, f2
FMADD f4, f8, f12, f4
FMADD f6, f8, f13, f6
FMADD f1, f9, f10, f1
FMADD f3, f9, f11, f3
FMADD f5, f9, f12, f5
FMADD f7, f9, f13, f7
LFD f8, 4 * SIZE(AO)
LFD f9, 5 * SIZE(AO)
LFD f10, 8 * SIZE(BO)
LFD f11, 9 * SIZE(BO)
LFD f12, 10 * SIZE(BO)
LFD f13, 11 * SIZE(BO)
addi AO, AO, 4 * SIZE
addi BO, BO, 8 * SIZE
bdnz LL(42)
.align 4
LL(45):
andi. r0, K, 1
ble LL(48)
.align 4
LL(46):
FMADD f0, f8, f10, f0
FMADD f2, f8, f11, f2
FMADD f4, f8, f12, f4
FMADD f6, f8, f13, f6
FMADD f1, f9, f10, f1
FMADD f3, f9, f11, f3
FMADD f5, f9, f12, f5
FMADD f7, f9, f13, f7
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
addi AO, AO, 2 * SIZE
addi BO, BO, 4 * SIZE
.align 4
LL(48):
lfs f13, ALPHA(SP)
LFD f8, 0 * SIZE(CO1)
LFD f9, 1 * SIZE(CO1)
LFD f10, 0 * SIZE(CO2)
LFD f11, 1 * SIZE(CO2)
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
FMADD f2, f2, f13, f10
FMADD f3, f3, f13, f11
LFD f8, 0 * SIZE(CO3)
LFD f9, 1 * SIZE(CO3)
LFD f10, 0 * SIZE(CO4)
LFD f11, 1 * SIZE(CO4)
FMADD f4, f4, f13, f8
FMADD f5, f5, f13, f9
FMADD f6, f6, f13, f10
FMADD f7, f7, f13, f11
STFD f0, 0 * SIZE(CO1)
STFD f1, 1 * SIZE(CO1)
STFD f2, 0 * SIZE(CO2)
STFD f3, 1 * SIZE(CO2)
STFD f4, 0 * SIZE(CO3)
STFD f5, 1 * SIZE(CO3)
STFD f6, 0 * SIZE(CO4)
STFD f7, 1 * SIZE(CO4)
addi CO1, CO1, 2 * SIZE
addi CO2, CO2, 2 * SIZE
addi CO3, CO3, 2 * SIZE
addi CO4, CO4, 2 * SIZE
.align 4
LL(50):
andi. I, M, 1
ble LL(59)
mr BO, B
LFD f8, 0 * SIZE(AO)
LFD f9, 1 * SIZE(AO)
LFD f10, 0 * SIZE(B)
LFD f11, 1 * SIZE(B)
LFD f12, 2 * SIZE(B)
LFD f13, 3 * SIZE(B)
lfs f0, FZERO(SP)
fmr f1, f0
fmr f2, f0
fmr f3, f0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(55)
.align 4
LL(52):
FMADD f0, f8, f10, f0
FMADD f1, f8, f11, f1
FMADD f2, f8, f12, f2
FMADD f3, f8, f13, f3
LFD f8, 2 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
FMADD f0, f9, f10, f0
FMADD f1, f9, f11, f1
FMADD f2, f9, f12, f2
FMADD f3, f9, f13, f3
LFD f9, 3 * SIZE(AO)
LFD f10, 8 * SIZE(BO)
LFD f11, 9 * SIZE(BO)
LFD f12, 10 * SIZE(BO)
LFD f13, 11 * SIZE(BO)
addi AO, AO, 2 * SIZE
addi BO, BO, 8 * SIZE
bdnz LL(52)
.align 4
LL(55):
andi. r0, K, 1
ble LL(58)
.align 4
LL(56):
FMADD f0, f8, f10, f0
FMADD f1, f8, f11, f1
FMADD f2, f8, f12, f2
FMADD f3, f8, f13, f3
LFD f8, 2 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
addi AO, AO, 1 * SIZE
addi BO, BO, 4 * SIZE
.align 4
LL(58):
lfs f13, ALPHA(SP)
LFD f8, 0 * SIZE(CO1)
LFD f9, 0 * SIZE(CO2)
LFD f10, 0 * SIZE(CO3)
LFD f11, 0 * SIZE(CO4)
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
FMADD f2, f2, f13, f10
FMADD f3, f3, f13, f11
STFD f0, 0 * SIZE(CO1)
STFD f1, 0 * SIZE(CO2)
STFD f2, 0 * SIZE(CO3)
STFD f3, 0 * SIZE(CO4)
.align 4
LL(59):
mr B, BO
addic. J, J, -1
bgt LL(01)
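
/* LL(60): N & 2 panel, two C columns at a time; same structure as */
/* the 4-column code with half the accumulators. */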
.align 4
LL(60):
andi. r0, N, 2
ble LL(120)
mr CO1, C
add CO2, C, LDC
add C, CO2, LDC
mr AO, A
srawi. I, M, 4
ble LL(80)
.align 4
LL(71):
vxor c01, c01, c01
LOAD_B b1, OFFSET_0, B
vxor c02, c02, c02
vxor c03, c03, c03
LOAD_A a1, OFFSET_0, AO
vxor c04, c04, c04
LOAD_A a2, OFFSET_1, AO
vxor c05, c05, c05
LOAD_A a3, OFFSET_2, AO
vxor c06, c06, c06
LOAD_A a4, OFFSET_3, AO
vxor c07, c07, c07
vxor c08, c08, c08
mr BO, B
dcbtst CO1, PREC
dcbtst CO2, PREC
vspltw bp1, b1, 0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(75)
.align 4
LL(72):
LOAD_A a5, OFFSET_4, AO
LOAD_A a6, OFFSET_5, AO
LOAD_A a7, OFFSET_6, AO
LOAD_A a8, OFFSET_7, AO
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
vmaddfp c03, a3, bp1, c03
vmaddfp c04, a4, bp1, c04
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
vmaddfp c07, a3, bp2, c07
vmaddfp c08, a4, bp2, c08
vmaddfp c01, a5, bp1, c01
vspltw bp2, b1, 3
vmaddfp c02, a6, bp1, c02
vmaddfp c03, a7, bp1, c03
vmaddfp c04, a8, bp1, c04
LOAD_B b1, OFFSET_1, BO
vspltw bp1, b1, 0
vmaddfp c05, a5, bp2, c05
vmaddfp c06, a6, bp2, c06
vmaddfp c07, a7, bp2, c07
vmaddfp c08, a8, bp2, c08
addi AO, AO, 32 * SIZE
addi BO, BO, 4 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
bdnz LL(72)
.align 4
LL(75):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(78)
.align 4
LL(76):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
addi AO, AO, 16 * SIZE
vmaddfp c03, a3, bp1, c03
addi BO, BO, 2 * SIZE
vmaddfp c04, a4, bp1, c04
nop
vmaddfp c05, a1, bp2, c05
vmaddfp c06, a2, bp2, c06
vmaddfp c07, a3, bp2, c07
vmaddfp c08, a4, bp2, c08
.align 4
LL(78):
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvx C3, OFFSET_2, CO1
lvx C4, OFFSET_3, CO1
lvx C5, OFFSET_4, CO1
lvsr PERMRSHIFT1, 0, CO1
lvsr PERMRSHIFT2, 0, CO2
lvsr PERMRSHIFT3, 0, CO3
lvsr PERMRSHIFT4, 0, CO4
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, c03, PERMRSHIFT1
vperm c03, c03, c04, PERMRSHIFT1
vperm c04, c04, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
vmaddfp c02, alpha, c02, C3
vmaddfp c03, alpha, c03, C4
vmaddfp c04, alpha, c04, C5
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
stvx c02, OFFSET_2, CO1
stvx c03, OFFSET_3, CO1
stvx c04, OFFSET_4, CO1
lvx C1, OFFSET_0, CO2
lvx C2, OFFSET_1, CO2
lvx C3, OFFSET_2, CO2
lvx C4, OFFSET_3, CO2
lvx C5, OFFSET_4, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, c06, PERMRSHIFT2
vperm c06, c06, c07, PERMRSHIFT2
vperm c07, c07, c08, PERMRSHIFT2
vperm c08, c08, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C2
vmaddfp c06, alpha, c06, C3
vmaddfp c07, alpha, c07, C4
vmaddfp c08, alpha, c08, C5
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
stvx c06, OFFSET_2, CO2
stvx c07, OFFSET_3, CO2
stvx c08, OFFSET_4, CO2
addi CO1, CO1, 16 * SIZE
addi CO2, CO2, 16 * SIZE
addic. I, I, -1
bgt+ LL(71)
.align 4
LL(80):
andi. I, M, 8
ble LL(90)
vxor c01, c01, c01
LOAD_B b1, OFFSET_0, B
vxor c02, c02, c02
vxor c03, c03, c03
LOAD_A a1, OFFSET_0, AO
vxor c04, c04, c04
LOAD_A a2, OFFSET_1, AO
vxor c05, c05, c05
LOAD_A a3, OFFSET_2, AO
vxor c06, c06, c06
LOAD_A a4, OFFSET_3, AO
vxor c07, c07, c07
vxor c08, c08, c08
mr BO, B
vspltw bp1, b1, 0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(85)
.align 4
LL(82):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
vmaddfp c03, a3, bp1, c03
vspltw bp2, b1, 3
vmaddfp c04, a4, bp1, c04
LOAD_B b1, OFFSET_1, BO
vspltw bp1, b1, 0
vmaddfp c07, a3, bp2, c07
vmaddfp c08, a4, bp2, c08
addi AO, AO, 16 * SIZE
addi BO, BO, 4 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
bdnz LL(82)
.align 4
LL(85):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(88)
.align 4
LL(86):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
addi AO, AO, 8 * SIZE
vmaddfp c05, a1, bp2, c05
addi BO, BO, 2 * SIZE
vmaddfp c06, a2, bp2, c06
.align 4
LL(88):
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvx C3, OFFSET_2, CO1
vaddfp c01, c01, c03
vaddfp c02, c02, c04
vaddfp c05, c05, c07
vaddfp c06, c06, c08
lvsr PERMRSHIFT1, 0, CO1
lvsr PERMRSHIFT2, 0, CO2
lvsr PERMRSHIFT3, 0, CO3
lvsr PERMRSHIFT4, 0, CO4
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
vmaddfp c02, alpha, c02, C3
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
stvx c02, OFFSET_2, CO1
lvx C1, OFFSET_0, CO2
lvx C2, OFFSET_1, CO2
lvx C3, OFFSET_2, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, c06, PERMRSHIFT2
vperm c06, c06, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C2
vmaddfp c06, alpha, c06, C3
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
stvx c06, OFFSET_2, CO2
addi CO1, CO1, 8 * SIZE
addi CO2, CO2, 8 * SIZE
.align 4
LL(90):
andi. I, M, 4
ble LL(100)
vxor c01, c01, c01
LOAD_B b1, OFFSET_0, B
vxor c02, c02, c02
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
vxor c05, c05, c05
vxor c06, c06, c06
mr BO, B
vspltw bp1, b1, 0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(95)
.align 4
LL(92):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c02, a2, bp1, c02
vspltw bp2, b1, 3
LOAD_B b1, OFFSET_1, BO
vspltw bp1, b1, 0
vmaddfp c06, a2, bp2, c06
addi AO, AO, 8 * SIZE
addi BO, BO, 4 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
bdnz LL(92)
.align 4
LL(95):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(98)
.align 4
LL(96):
vspltw bp2, b1, 1
vmaddfp c01, a1, bp1, c01
vmaddfp c05, a1, bp2, c05
addi AO, AO, 4 * SIZE
addi BO, BO, 2 * SIZE
.align 4
LL(98):
vaddfp c01, c01, c02
vaddfp c05, c05, c06
vaddfp c09, c09, c10
vaddfp c13, c13, c14
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvsr PERMRSHIFT1, 0, CO1
lvsr PERMRSHIFT2, 0, CO2
lvsr PERMRSHIFT3, 0, CO3
lvsr PERMRSHIFT4, 0, CO4
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
lvx C1, OFFSET_0, CO2
lvx C2, OFFSET_1, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C2
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
.align 4
LL(100):
andi. I, M, 2
ble LL(110)
mr BO, B
LFD f8, 0 * SIZE(AO)
LFD f9, 1 * SIZE(AO)
LFD f10, 0 * SIZE(B)
LFD f11, 1 * SIZE(B)
LFD f12, 2 * SIZE(B)
LFD f13, 3 * SIZE(B)
lfs f0, FZERO(SP)
fmr f1, f0
fmr f2, f0
fmr f3, f0
fmr f4, f0
fmr f5, f0
fmr f6, f0
fmr f7, f0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(105)
.align 4
LL(102):
FMADD f0, f8, f10, f0
FMADD f1, f9, f10, f1
FMADD f2, f8, f11, f2
FMADD f3, f9, f11, f3
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
FMADD f4, f8, f12, f4
FMADD f5, f9, f12, f5
FMADD f6, f8, f13, f6
FMADD f7, f9, f13, f7
LFD f8, 4 * SIZE(AO)
LFD f9, 5 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
addi AO, AO, 4 * SIZE
addi BO, BO, 4 * SIZE
bdnz LL(102)
.align 4
LL(105):
andi. r0, K, 1
lfs f13, ALPHA(SP)
ble LL(108)
.align 4
LL(106):
FMADD f0, f8, f10, f0
FMADD f1, f9, f10, f1
FMADD f2, f8, f11, f2
FMADD f3, f9, f11, f3
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
LFD f10, 2 * SIZE(BO)
LFD f11, 3 * SIZE(BO)
addi AO, AO, 2 * SIZE
addi BO, BO, 2 * SIZE
.align 4
LL(108):
LFD f8, 0 * SIZE(CO1)
LFD f9, 1 * SIZE(CO1)
LFD f10, 0 * SIZE(CO2)
LFD f11, 1 * SIZE(CO2)
FADD f0, f0, f4
FADD f1, f1, f5
FADD f2, f2, f6
FADD f3, f3, f7
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
FMADD f2, f2, f13, f10
FMADD f3, f3, f13, f11
STFD f0, 0 * SIZE(CO1)
STFD f1, 1 * SIZE(CO1)
STFD f2, 0 * SIZE(CO2)
STFD f3, 1 * SIZE(CO2)
addi CO1, CO1, 2 * SIZE
addi CO2, CO2, 2 * SIZE
.align 4
LL(110):
andi. I, M, 1
ble LL(119)
mr BO, B
LFD f8, 0 * SIZE(AO)
LFD f9, 1 * SIZE(AO)
LFD f10, 0 * SIZE(B)
LFD f11, 1 * SIZE(B)
LFD f12, 2 * SIZE(B)
LFD f13, 3 * SIZE(B)
lfs f0, FZERO(SP)
fmr f1, f0
fmr f2, f0
fmr f3, f0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(115)
.align 4
LL(112):
FMADD f0, f8, f10, f0
FMADD f1, f8, f11, f1
FMADD f2, f9, f12, f2
FMADD f3, f9, f13, f3
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
addi AO, AO, 2 * SIZE
addi BO, BO, 4 * SIZE
bdnz LL(112)
.align 4
LL(115):
andi. r0, K, 1
lfs f13, ALPHA(SP)
ble LL(118)
.align 4
LL(116):
FMADD f0, f8, f10, f0
FMADD f1, f8, f11, f1
LFD f8, 1 * SIZE(AO)
LFD f10, 2 * SIZE(BO)
LFD f11, 3 * SIZE(BO)
addi AO, AO, 1 * SIZE
addi BO, BO, 2 * SIZE
.align 4
LL(118):
LFD f8, 0 * SIZE(CO1)
LFD f9, 0 * SIZE(CO2)
FADD f0, f0, f2
FADD f1, f1, f3
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
STFD f0, 0 * SIZE(CO1)
STFD f1, 0 * SIZE(CO2)
.align 4
LL(119):
mr B, BO
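
/* LL(120): final single-column panel (N & 1). Here B advances one */
/* float per k step, so BO can fall off 16-byte alignment; the vector */
/* paths below peel k iterations until it is aligned again. */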
.align 4
LL(120):
andi. r0, N, 1
ble LL(999)
mr CO1, C
mr AO, A
srawi. I, M, 4
ble LL(140)
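
/* LL(130): 16x1 kernel. The peeled iterations broadcast B words 2 */
/* and 3 straight out of the aligned vector containing BO; the main */
/* loop LL(133) then consumes four aligned B words per pass. */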
  1550. .align 4
  1551. LL(130):
  1552. vxor c01, c01, c01
  1553. vxor c02, c02, c02
  1554. vxor c03, c03, c03
  1555. vxor c04, c04, c04
  1556. mr BO, B
  1557. dcbtst CO1, PREC
  1558. mr J, K
  1559. andi. r0, B, 15
  1560. ble+ LL(131)
  1561. LOAD_A a1, OFFSET_0, AO
  1562. LOAD_A a2, OFFSET_1, AO
  1563. LOAD_A a3, OFFSET_2, AO
  1564. LOAD_A a4, OFFSET_3, AO
  1565. LOAD_B b1, OFFSET_0, BO
  1566. vspltw bp1, b1, 2
  1567. vspltw bp2, b1, 3
  1568. addi AO, AO, 16 * SIZE
  1569. addi BO, BO, SIZE
  1570. vmaddfp c01, a1, bp1, c01
  1571. vmaddfp c02, a2, bp1, c02
  1572. vmaddfp c03, a3, bp1, c03
  1573. vmaddfp c04, a4, bp1, c04
  1574. subi J, J, 1
  1575. cmpwi cr0, J, 0
  1576. ble LL(138)
  1577. LOAD_A a1, OFFSET_0, AO
  1578. LOAD_A a2, OFFSET_1, AO
  1579. LOAD_A a3, OFFSET_2, AO
  1580. LOAD_A a4, OFFSET_3, AO
  1581. addi AO, AO, 16 * SIZE
  1582. addi BO, BO, SIZE
  1583. vmaddfp c01, a1, bp2, c01
  1584. vmaddfp c02, a2, bp2, c02
  1585. vmaddfp c03, a3, bp2, c03
  1586. vmaddfp c04, a4, bp2, c04
  1587. subi J, J, 1
  1588. cmpwi cr0, J, 0
  1589. ble LL(138)
  1590. .align 4
  1591. LL(131):
  1592. LOAD_A a1, OFFSET_0, AO
  1593. LOAD_A a2, OFFSET_1, AO
  1594. LOAD_A a3, OFFSET_2, AO
  1595. LOAD_A a4, OFFSET_3, AO
  1596. LOAD_A a5, OFFSET_4, AO
  1597. LOAD_A a6, OFFSET_5, AO
  1598. LOAD_A a7, OFFSET_6, AO
  1599. LOAD_A a8, OFFSET_7, AO
  1600. LOAD_B b1, OFFSET_0, BO
  1601. srawi. r0, J, 2
  1602. mtspr CTR, r0
  1603. ble LL(135)
  1604. .align 4
  1605. LL(133):
  1606. vspltw bp1, b1, 0
  1607. vmaddfp c01, a1, bp1, c01
  1608. vmaddfp c02, a2, bp1, c02
  1609. vmaddfp c03, a3, bp1, c03
  1610. vmaddfp c04, a4, bp1, c04
  1611. vspltw bp2, b1, 1
  1612. vmaddfp c01, a5, bp2, c01
  1613. vmaddfp c02, a6, bp2, c02
  1614. vmaddfp c03, a7, bp2, c03
  1615. vmaddfp c04, a8, bp2, c04
  1616. addi AO, AO, 32 * SIZE
  1617. LOAD_A a1, OFFSET_0, AO
  1618. LOAD_A a2, OFFSET_1, AO
  1619. LOAD_A a3, OFFSET_2, AO
  1620. LOAD_A a4, OFFSET_3, AO
  1621. vspltw bp1, b1, 2
  1622. vmaddfp c01, a1, bp1, c01
  1623. vmaddfp c02, a2, bp1, c02
  1624. vmaddfp c03, a3, bp1, c03
  1625. vmaddfp c04, a4, bp1, c04
  1626. LOAD_A a5, OFFSET_4, AO
  1627. LOAD_A a6, OFFSET_5, AO
  1628. LOAD_A a7, OFFSET_6, AO
  1629. LOAD_A a8, OFFSET_7, AO
  1630. vspltw bp2, b1, 3
  1631. vmaddfp c01, a5, bp2, c01
  1632. vmaddfp c02, a6, bp2, c02
  1633. vmaddfp c03, a7, bp2, c03
  1634. vmaddfp c04, a8, bp2, c04
  1635. addi AO, AO, 32 * SIZE
  1636. addi BO, BO, 4 * SIZE
  1637. LOAD_A a1, OFFSET_0, AO
  1638. LOAD_A a2, OFFSET_1, AO
  1639. LOAD_A a3, OFFSET_2, AO
  1640. LOAD_A a4, OFFSET_3, AO
  1641. LOAD_A a5, OFFSET_4, AO
  1642. LOAD_A a6, OFFSET_5, AO
  1643. LOAD_A a7, OFFSET_6, AO
  1644. LOAD_A a8, OFFSET_7, AO
  1645. LOAD_B b1, OFFSET_0, BO
  1646. bdnz LL(133)
  1647. .align 4
LL(135):
andi. r0, J, 3
ble+ LL(138)
cmpwi cr0, r0, 3
bne LL(136)
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vmaddfp c03, a3, bp1, c03
vmaddfp c04, a4, bp1, c04
addi AO, AO, 16 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
vspltw bp2, b1, 1
vmaddfp c01, a1, bp2, c01
vmaddfp c02, a2, bp2, c02
vmaddfp c03, a3, bp2, c03
vmaddfp c04, a4, bp2, c04
addi AO, AO, 16 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
vspltw bp1, b1, 2
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vmaddfp c03, a3, bp1, c03
vmaddfp c04, a4, bp1, c04
addi AO, AO, 16 * SIZE
addi BO, BO, 3 * SIZE
b LL(138)
.align 4
LL(136):
cmpwi cr0, r0, 2
bne LL(137)
vspltw bp1, b1, 0
vspltw bp2, b1, 1
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vmaddfp c03, a3, bp1, c03
vmaddfp c04, a4, bp1, c04
LOAD_A a1, OFFSET_4, AO
LOAD_A a2, OFFSET_5, AO
LOAD_A a3, OFFSET_6, AO
LOAD_A a4, OFFSET_7, AO
vmaddfp c01, a1, bp2, c01
vmaddfp c02, a2, bp2, c02
vmaddfp c03, a3, bp2, c03
vmaddfp c04, a4, bp2, c04
addi AO, AO, 32 * SIZE
addi BO, BO, 2 * SIZE
b LL(138)
.align 4
LL(137):
cmpwi cr0, r0, 1
bne LL(138)
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vmaddfp c03, a3, bp1, c03
vmaddfp c04, a4, bp1, c04
addi AO, AO, 16 * SIZE
addi BO, BO, 1 * SIZE
.align 4
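/* Write-back for the 16-row panel.  CO1 may be unaligned: lvsr builds
   a permute mask from the low bits of CO1, and the chained vperm ops
   shift the four accumulators across five output quadwords, feeding
   zeros from VZERO into the head and tail lanes.  vmaddfp fuses the
   alpha scale with the C update; in the zero-padded lanes it computes
   alpha * 0 + C, so the partial first and last quadwords are stored
   back unchanged.  Net effect (sketch):

       for (i = 0; i < 16; i++)
           CO1[i] += alpha * acc[i];

   where acc is c01..c04 concatenated. */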
LL(138):
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvx C3, OFFSET_2, CO1
lvx C4, OFFSET_3, CO1
lvx C5, OFFSET_4, CO1
lvsr PERMRSHIFT1, 0, CO1
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, c03, PERMRSHIFT1
vperm c03, c03, c04, PERMRSHIFT1
vperm c04, c04, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
vmaddfp c02, alpha, c02, C3
vmaddfp c03, alpha, c03, C4
vmaddfp c04, alpha, c04, C5
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
stvx c02, OFFSET_2, CO1
stvx c03, OFFSET_3, CO1
stvx c04, OFFSET_4, CO1
addi CO1, CO1, 16 * SIZE
addic. I, I, -1
bgt+ LL(130)
.align 4
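/* M & 8: same scheme with two accumulators (8 rows); the unaligned
   write-back now spans three quadwords of C. */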
LL(140):
andi. I, M, 8
ble LL(150)
vxor c01, c01, c01
vxor c02, c02, c02
mr BO, B
mr J, K
andi. r0, B, 15
ble+ LL(141)
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_B b1, OFFSET_0, BO
vspltw bp1, b1, 2
vspltw bp2, b1, 3
addi AO, AO, 8 * SIZE
addi BO, BO, SIZE
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
subi J, J, 1
cmpwi cr0, J, 0
ble LL(148)
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
addi AO, AO, 8 * SIZE
addi BO, BO, SIZE
vmaddfp c01, a1, bp2, c01
vmaddfp c02, a2, bp2, c02
subi J, J, 1
cmpwi cr0, J, 0
ble LL(148)
.align 4
LL(141):
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
LOAD_A a5, OFFSET_4, AO
LOAD_A a6, OFFSET_5, AO
LOAD_A a7, OFFSET_6, AO
LOAD_A a8, OFFSET_7, AO
LOAD_B b1, OFFSET_0, BO
srawi. r0, J, 2
mtspr CTR, r0
ble LL(145)
.align 4
LL(143):
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vspltw bp2, b1, 1
vmaddfp c01, a3, bp2, c01
vmaddfp c02, a4, bp2, c02
vspltw bp1, b1, 2
vmaddfp c01, a5, bp1, c01
vmaddfp c02, a6, bp1, c02
vspltw bp2, b1, 3
vmaddfp c01, a7, bp2, c01
vmaddfp c02, a8, bp2, c02
addi AO, AO, 32 * SIZE
addi BO, BO, 4 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
LOAD_A a5, OFFSET_4, AO
LOAD_A a6, OFFSET_5, AO
LOAD_A a7, OFFSET_6, AO
LOAD_A a8, OFFSET_7, AO
LOAD_B b1, OFFSET_0, BO
bdnz LL(143)
.align 4
LL(145):
andi. r0, J, 3
ble+ LL(148)
cmpwi cr0, r0, 3
bne LL(146)
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vspltw bp2, b1, 1
vmaddfp c01, a3, bp2, c01
vmaddfp c02, a4, bp2, c02
LOAD_A a1, OFFSET_4, AO
LOAD_A a2, OFFSET_5, AO
vspltw bp1, b1, 2
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
addi AO, AO, 24 * SIZE
addi BO, BO, 3 * SIZE
b LL(148)
.align 4
LL(146):
cmpwi cr0, r0, 2
bne LL(147)
vspltw bp1, b1, 0
vspltw bp2, b1, 1
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vmaddfp c01, a3, bp2, c01
vmaddfp c02, a4, bp2, c02
addi AO, AO, 16 * SIZE
addi BO, BO, 2 * SIZE
b LL(148)
.align 4
LL(147):
cmpwi cr0, r0, 1
bne LL(148)
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
addi AO, AO, 8 * SIZE
addi BO, BO, 1 * SIZE
.align 4
LL(148):
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvx C3, OFFSET_2, CO1
lvsr PERMRSHIFT1, 0, CO1
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
vmaddfp c02, alpha, c02, C3
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
stvx c02, OFFSET_2, CO1
addi CO1, CO1, 8 * SIZE
.align 4
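/* M & 4: one accumulator, write-back across two quadwords of C. */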
LL(150):
andi. I, M, 4
ble LL(160)
vxor c01, c01, c01
mr BO, B
mr J, K
andi. r0, B, 15
ble+ LL(151)
LOAD_A a1, OFFSET_0, AO
LOAD_B b1, OFFSET_0, BO
vspltw bp1, b1, 2
vspltw bp2, b1, 3
addi AO, AO, 4 * SIZE
addi BO, BO, SIZE
vmaddfp c01, a1, bp1, c01
subi J, J, 1
cmpwi cr0, J, 0
ble LL(158)
LOAD_A a1, OFFSET_0, AO
addi AO, AO, 4 * SIZE
addi BO, BO, SIZE
vmaddfp c01, a1, bp2, c01
subi J, J, 1
cmpwi cr0, J, 0
ble LL(158)
.align 4
LL(151):
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
LOAD_B b1, OFFSET_0, BO
srawi. r0, J, 2
mtspr CTR, r0
ble LL(155)
.align 4
LL(153):
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c01, a2, bp2, c01
vspltw bp1, b1, 2
vmaddfp c01, a3, bp1, c01
vspltw bp2, b1, 3
vmaddfp c01, a4, bp2, c01
addi AO, AO, 16 * SIZE
addi BO, BO, 4 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
LOAD_B b1, OFFSET_0, BO
bdnz LL(153)
.align 4
LL(155):
andi. r0, J, 3
ble+ LL(158)
cmpwi cr0, r0, 3
bne LL(156)
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c01, a2, bp2, c01
vspltw bp1, b1, 2
vmaddfp c01, a3, bp1, c01
addi AO, AO, 12 * SIZE
addi BO, BO, 3 * SIZE
b LL(158)
.align 4
LL(156):
cmpwi cr0, r0, 2
bne LL(157)
vspltw bp1, b1, 0
vspltw bp2, b1, 1
vmaddfp c01, a1, bp1, c01
vmaddfp c01, a2, bp2, c01
addi AO, AO, 8 * SIZE
addi BO, BO, 2 * SIZE
b LL(158)
.align 4
LL(157):
cmpwi cr0, r0, 1
bne LL(158)
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
addi AO, AO, 4 * SIZE
addi BO, BO, 1 * SIZE
.align 4
LL(158):
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvsr PERMRSHIFT1, 0, CO1
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
addi CO1, CO1, 4 * SIZE
.align 4
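/* M & 2: two rows are too narrow for AltiVec, so the scalar FPU takes
   over.  The k loop is unrolled 2x into partial sums (f0/f2 for row 0,
   f1/f3 for row 1) that are folded before the alpha update.  Roughly,
   as a sketch with AO and BO viewed as float pointers:

       for (k = 0; k < K; k++) {
           f0 += AO[2 * k + 0] * BO[k];
           f1 += AO[2 * k + 1] * BO[k];
       }
       CO1[0] += alpha * f0;
       CO1[1] += alpha * f1;
*/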
LL(160):
andi. I, M, 2
ble LL(170)
mr BO, B
LFD f8, 0 * SIZE(AO)
LFD f9, 1 * SIZE(AO)
LFD f10, 2 * SIZE(AO)
LFD f11, 3 * SIZE(AO)
LFD f12, 0 * SIZE(B)
LFD f13, 1 * SIZE(B)
lfs f0, FZERO(SP)
fmr f1, f0
fmr f2, f0
fmr f3, f0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(165)
.align 4
LL(162):
FMADD f0, f8, f12, f0
FMADD f1, f9, f12, f1
FMADD f2, f10, f13, f2
FMADD f3, f11, f13, f3
LFD f8, 4 * SIZE(AO)
LFD f9, 5 * SIZE(AO)
LFD f10, 6 * SIZE(AO)
LFD f11, 7 * SIZE(AO)
LFD f12, 2 * SIZE(BO)
LFD f13, 3 * SIZE(BO)
addi AO, AO, 4 * SIZE
addi BO, BO, 2 * SIZE
bdnz LL(162)
.align 4
LL(165):
andi. r0, K, 1
lfs f13, ALPHA(SP)
ble LL(168)
.align 4
LL(166):
FMADD f0, f8, f12, f0
FMADD f1, f9, f12, f1
addi AO, AO, 2 * SIZE
addi BO, BO, 1 * SIZE
.align 4
LL(168):
LFD f8, 0 * SIZE(CO1)
LFD f9, 1 * SIZE(CO1)
FADD f0, f0, f2
FADD f1, f1, f3
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
STFD f0, 0 * SIZE(CO1)
STFD f1, 1 * SIZE(CO1)
addi CO1, CO1, 2 * SIZE
.align 4
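/* M & 1: last row, a plain scalar dot product with two partial sums
   (f0 for even k, f1 for odd k) folded before the alpha update. */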
LL(170):
andi. I, M, 1
ble LL(999)
mr BO, B
LFD f8, 0 * SIZE(AO)
LFD f9, 1 * SIZE(AO)
LFD f10, 0 * SIZE(B)
LFD f11, 1 * SIZE(B)
lfs f0, FZERO(SP)
fmr f1, f0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(175)
.align 4
LL(172):
FMADD f0, f8, f10, f0
FMADD f1, f9, f11, f1
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
LFD f10, 2 * SIZE(BO)
LFD f11, 3 * SIZE(BO)
addi AO, AO, 2 * SIZE
addi BO, BO, 2 * SIZE
bdnz LL(172)
.align 4
LL(175):
andi. r0, K, 1
lfs f13, ALPHA(SP)
ble LL(178)
.align 4
LL(176):
FMADD f0, f8, f10, f0
addi AO, AO, 1 * SIZE
addi BO, BO, 1 * SIZE
.align 4
LL(178):
LFD f8, 0 * SIZE(CO1)
FADD f0, f0, f1
FMADD f0, f0, f13, f8
STFD f0, 0 * SIZE(CO1)
.align 4
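/* Common exit: restore the callee-saved vector registers v20..v31
   and VRsave, then the nonvolatile GPRs r14..r31 (ld on 64-bit,
   lwz on 32-bit), pop the frame and return. */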
LL(999):
mr SP, STACK
li r0, 0 * 16
lvx v20, SP, r0
li r0, 1 * 16
lvx v21, SP, r0
li r0, 2 * 16
lvx v22, SP, r0
li r0, 3 * 16
lvx v23, SP, r0
li r0, 4 * 16
lvx v24, SP, r0
li r0, 5 * 16
lvx v25, SP, r0
li r0, 6 * 16
lvx v26, SP, r0
li r0, 7 * 16
lvx v27, SP, r0
li r0, 8 * 16
lvx v28, SP, r0
li r0, 9 * 16
lvx v29, SP, r0
li r0, 10 * 16
lvx v30, SP, r0
li r0, 11 * 16
lvx v31, SP, r0
mtspr VRsave, VREG
#ifdef __64BIT__
ld r31, 192(SP)
ld r30, 200(SP)
ld r29, 208(SP)
ld r28, 216(SP)
ld r27, 224(SP)
ld r26, 232(SP)
ld r25, 240(SP)
ld r24, 248(SP)
ld r23, 256(SP)
ld r22, 264(SP)
ld r21, 272(SP)
ld r20, 280(SP)
ld r19, 288(SP)
ld r18, 296(SP)
ld r17, 304(SP)
ld r16, 312(SP)
ld r15, 320(SP)
ld r14, 328(SP)
#else
lwz r31, 192(SP)
lwz r30, 196(SP)
lwz r29, 200(SP)
lwz r28, 204(SP)
lwz r27, 208(SP)
lwz r26, 212(SP)
lwz r25, 216(SP)
lwz r24, 220(SP)
lwz r23, 224(SP)
lwz r22, 228(SP)
lwz r21, 232(SP)
lwz r20, 236(SP)
lwz r19, 240(SP)
lwz r18, 244(SP)
lwz r17, 248(SP)
lwz r16, 252(SP)
lwz r15, 256(SP)
lwz r14, 260(SP)
#endif
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif