gemm_kernel_altivec.S (49 kB)

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
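
/* What follows appears to be the standard OpenBLAS packed-panel SGEMM
   update, C := alpha * A * B + C, with C column-major (column stride
   LDC) and the A/B panels pre-packed by the level-3 driver.  The main
   tile is 16 rows by 4 columns of C; narrower M and N remainder paths
   follow.  A rough C sketch of that contract, as inferred from the
   address arithmetic below (loop and array names are illustrative
   only, not taken from this file):

       for (j = 0; j < N; j += 4)        // LL(01): four columns of C
         for (i = 0; i < M; i += 16)     // LL(11): sixteen rows of C
           for (k = 0; k < K; k++)       // LL(12): unrolled by four
             for (jj = 0; jj < 4; jj++)
               for (ii = 0; ii < 16; ii++)
                 C[(j + jj) * LDC + i + ii] +=
                     alpha * Apanel[k * 16 + ii] * Bpanel[k * 4 + jj];
*/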
#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif

#ifdef __64BIT__
#define STACKSIZE 360
#else
#define STACKSIZE 272
#endif

#define ALPHA 0
#define FZERO 16

#define M r3
#define N r4
#define K r5

#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A r8
#define B r9
#define C r10
#define LDC r7
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#endif
#endif

#define STACK r11

#define I r21
#define J r22
#define AO r23
#define BO r24
#define CO1 r25
#define CO2 r26
#define CO3 r27
#define CO4 r28

#define PREA r29
#define PREB r29
#define PREC r30
#define VREG r31

#define LOAD_A lvx
#define LOAD_B lvx

#define OFFSET_0 0
#define OFFSET_1 r14
#define OFFSET_2 r15
#define OFFSET_3 r16
#define OFFSET_4 r17
#define OFFSET_5 r18
#define OFFSET_6 r19
#define OFFSET_7 r20
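
/* OFFSET_n holds the byte offset n * 4 * SIZE, i.e. n quadwords in
   single precision, so consecutive vectors can be addressed with the
   indexed lvx/stvx forms without extra pointer bumps. */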
#define c01 v0
#define c02 v1
#define c03 v2
#define c04 v3
#define c05 v4
#define c06 v5
#define c07 v6
#define c08 v7
#define c09 v8
#define c10 v9
#define c11 v10
#define c12 v11
#define c13 v12
#define c14 v13
#define c15 v14
#define c16 v15

#define a1 v16
#define a2 v17
#define a3 v18
#define a4 v19
#define a5 v20
#define a6 v21
#define a7 v22
#define a8 v23

#define b1 v24
#define b2 v25
#define bp1 v26
#define bp2 v27

#define C1 v16
#define C2 v17
#define C3 v18
#define C4 v19
#define C5 v20
#define C6 v21
#define C7 v22
#define C8 v23
#define C9 v24

#define c00 v25
#define PERMRSHIFT1 v26
#define PERMRSHIFT2 v27
#define PERMRSHIFT3 v28
#define PERMRSHIFT4 v29
#define VZERO v30
#define alpha v31
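
/* Note that v16-v27 do double duty: during accumulation they hold the
   A vectors (a1-a8) and B vectors (b1, b2, bp1, bp2); in the writeback
   phase the same registers are reused as C1-C9, c00 and the four
   PERMRSHIFT permute controls. */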
#ifndef NEEDPARAM

        PROLOGUE
        PROFCODE
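
/* Prologue: spill the non-volatile vector registers v20-v31 and the
   GPRs r14-r31, save VRsave and mark every vector register live, then
   carve out a 128-byte-aligned scratch area on the stack (the ALPHA
   splat and the FZERO word live there). */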
        addi SP, SP, -STACKSIZE
        mr STACK, SP

        li r0, 0 * 16
        stvx v20, SP, r0
        li r0, 1 * 16
        stvx v21, SP, r0
        li r0, 2 * 16
        stvx v22, SP, r0
        li r0, 3 * 16
        stvx v23, SP, r0
        li r0, 4 * 16
        stvx v24, SP, r0
        li r0, 5 * 16
        stvx v25, SP, r0
        li r0, 6 * 16
        stvx v26, SP, r0
        li r0, 7 * 16
        stvx v27, SP, r0
        li r0, 8 * 16
        stvx v28, SP, r0
        li r0, 9 * 16
        stvx v29, SP, r0
        li r0, 10 * 16
        stvx v30, SP, r0
        li r0, 11 * 16
        stvx v31, SP, r0

#ifdef __64BIT__
        std r31, 192(SP)
        std r30, 200(SP)
        std r29, 208(SP)
        std r28, 216(SP)
        std r27, 224(SP)
        std r26, 232(SP)
        std r25, 240(SP)
        std r24, 248(SP)
        std r23, 256(SP)
        std r22, 264(SP)
        std r21, 272(SP)
        std r20, 280(SP)
        std r19, 288(SP)
        std r18, 296(SP)
        std r17, 304(SP)
        std r16, 312(SP)
        std r15, 320(SP)
        std r14, 328(SP)
#else
        stw r31, 192(SP)
        stw r30, 196(SP)
        stw r29, 200(SP)
        stw r28, 204(SP)
        stw r27, 208(SP)
        stw r26, 212(SP)
        stw r25, 216(SP)
        stw r24, 220(SP)
        stw r23, 224(SP)
        stw r22, 228(SP)
        stw r21, 232(SP)
        stw r20, 236(SP)
        stw r19, 240(SP)
        stw r18, 244(SP)
        stw r17, 248(SP)
        stw r16, 252(SP)
        stw r15, 256(SP)
        stw r14, 260(SP)
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
        lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif

        li r0, -1
        mfspr VREG, VRsave
        mtspr VRsave, r0

        addi SP, SP, -128
        li r0, -128
        and SP, SP, r0

        li OFFSET_1, 4 * SIZE
        li OFFSET_2, 8 * SIZE
        li OFFSET_3, 12 * SIZE
        li OFFSET_4, 16 * SIZE
        li OFFSET_5, 20 * SIZE
        li OFFSET_6, 24 * SIZE
        li OFFSET_7, 28 * SIZE

        stfs f1, ALPHA + 0(SP)
        stfs f1, ALPHA + 4(SP)
        stfs f1, ALPHA + 8(SP)
        stfs f1, ALPHA + 12(SP)

        li r29, 0
        stw r29, FZERO(SP)

        slwi LDC, LDC, BASE_SHIFT

        li PREC, (15 * SIZE)

#ifdef CELL
        li PREB, (3 * 32 * SIZE)
#else
        li PREB, (5 * 32 * SIZE)
#endif
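
/* Prefetch setup: PREC reaches into the C tile about to be written,
   PREB streams ahead in B.  The shorter PREB under CELL presumably
   suits that core's cache behaviour; the file gives no rationale. */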
        cmpwi cr0, M, 0
        ble LL(999)
        cmpwi cr0, N, 0
        ble LL(999)
        cmpwi cr0, K, 0
        ble LL(999)

        srawi. J, N, 2
        ble LL(60)
        .align 4

LL(01):
        mr CO1, C
        add CO2, C, LDC
        add CO3, CO2, LDC
        add CO4, CO3, LDC
        add C, CO4, LDC
        mr AO, A

        srawi. I, M, 4
        ble LL(20)
        .align 4

LL(11):
        vxor c01, c01, c01
        LOAD_B b1, OFFSET_0, B
        vxor c02, c02, c02
        LOAD_A a1, OFFSET_0, AO
        vxor c03, c03, c03
        LOAD_A a2, OFFSET_1, AO
        vxor c04, c04, c04
        LOAD_A a3, OFFSET_2, AO
        vxor c05, c05, c05
        LOAD_A a4, OFFSET_3, AO
        vxor c06, c06, c06
        LOAD_A a5, OFFSET_4, AO
        vxor c07, c07, c07
        nop
        vxor c08, c08, c08

        vxor c09, c09, c09
        dcbtst CO1, PREC
        vxor c10, c10, c10
        dcbtst CO2, PREC
        vxor c11, c11, c11
        dcbtst CO3, PREC
        vxor c12, c12, c12
        dcbtst CO4, PREC
        vxor c13, c13, c13
        mr BO, B
        vxor c14, c14, c14
        srawi. r0, K, 2
        vxor c15, c15, c15
        mtspr CTR, r0
        vxor c16, c16, c16
        vspltw bp1, b1, 0
        ble LL(13)
        .align 4

#define NOP1 mr r3, r3
#define NOP2 mr r4, r4
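
/* NOP1/NOP2 expand to register-preserving no-ops (mr rX,rX).  They pad
   the unrolled loop body, apparently to keep the vmaddfp stream and
   the load/splat stream in separate dispatch slots; treat the exact
   placement as empirically tuned rather than documented. */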
LL(12):
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        DCBT(A, PREA)
        vmaddfp c03, a3, bp1, c03
        NOP1
        vmaddfp c04, a4, bp1, c04
        vspltw bp1, b1, 2

        vmaddfp c05, a1, bp2, c05
        DCBT(B, PREB)
        vmaddfp c06, a2, bp2, c06
        NOP2
        vmaddfp c07, a3, bp2, c07
        NOP1
        vmaddfp c08, a4, bp2, c08
        vspltw bp2, b1, 3

        vmaddfp c09, a1, bp1, c09
        NOP1
        vmaddfp c10, a2, bp1, c10
        LOAD_B b2, OFFSET_1, BO
        vmaddfp c11, a3, bp1, c11
        addi BO, BO, 8 * SIZE
        vmaddfp c12, a4, bp1, c12
        vspltw bp1, b2, 0

        vmaddfp c13, a1, bp2, c13
        NOP1
        vmaddfp c14, a2, bp2, c14
        LOAD_A a5, OFFSET_4, AO
        vmaddfp c15, a3, bp2, c15
        LOAD_A a6, OFFSET_5, AO
        vmaddfp c16, a4, bp2, c16
        vspltw bp2, b2, 1

        vmaddfp c01, a5, bp1, c01
        LOAD_A a7, OFFSET_6, AO
        vmaddfp c02, a6, bp1, c02
        LOAD_A a8, OFFSET_7, AO
        vmaddfp c03, a7, bp1, c03
        NOP1
        vmaddfp c04, a8, bp1, c04
        NOP2

        vmaddfp c05, a5, bp2, c05
        vspltw bp1, b2, 2
        vmaddfp c06, a6, bp2, c06
        addi AO, AO, 32 * SIZE
        vmaddfp c07, a7, bp2, c07
        LOAD_B b1, OFFSET_0, BO
        vmaddfp c08, a8, bp2, c08
        vspltw bp2, b2, 3

        vmaddfp c09, a5, bp1, c09
        NOP1
        vmaddfp c10, a6, bp1, c10
        NOP2
        vmaddfp c11, a7, bp1, c11
        NOP1
        vmaddfp c12, a8, bp1, c12
        vspltw bp1, b1, 0

        vmaddfp c13, a5, bp2, c13
        DCBT(A, PREA)
        vmaddfp c14, a6, bp2, c14
        LOAD_A a1, OFFSET_0, AO
        vmaddfp c15, a7, bp2, c15
        LOAD_A a2, OFFSET_1, AO
        vmaddfp c16, a8, bp2, c16
        vspltw bp2, b1, 1

        vmaddfp c01, a1, bp1, c01
        LOAD_A a3, OFFSET_2, AO
        vmaddfp c02, a2, bp1, c02
        LOAD_A a4, OFFSET_3, AO
        vmaddfp c03, a3, bp1, c03
        NOP1
        vmaddfp c04, a4, bp1, c04
        vspltw bp1, b1, 2

        vmaddfp c05, a1, bp2, c05
        NOP1
        vmaddfp c06, a2, bp2, c06
        NOP2
        vmaddfp c07, a3, bp2, c07
        NOP1
        vmaddfp c08, a4, bp2, c08
        vspltw bp2, b1, 3

        vmaddfp c09, a1, bp1, c09
        LOAD_B b2, OFFSET_1, BO
        vmaddfp c10, a2, bp1, c10
        NOP2
        vmaddfp c11, a3, bp1, c11
        NOP1
        vmaddfp c12, a4, bp1, c12
        addi BO, BO, 8 * SIZE

        vmaddfp c13, a1, bp2, c13
        vspltw bp1, b2, 0
        vmaddfp c14, a2, bp2, c14
        LOAD_A a5, OFFSET_4, AO
        vmaddfp c15, a3, bp2, c15
        LOAD_A a6, OFFSET_5, AO
        vmaddfp c16, a4, bp2, c16
        vspltw bp2, b2, 1

        vmaddfp c01, a5, bp1, c01
        LOAD_A a7, OFFSET_6, AO
        vmaddfp c02, a6, bp1, c02
        LOAD_A a8, OFFSET_7, AO
        vmaddfp c03, a7, bp1, c03
        addi AO, AO, 32 * SIZE
        vmaddfp c04, a8, bp1, c04
        NOP2

        vmaddfp c05, a5, bp2, c05
        vspltw bp1, b2, 2
        vmaddfp c06, a6, bp2, c06
        NOP2
        vmaddfp c07, a7, bp2, c07
        NOP1
        vmaddfp c08, a8, bp2, c08
        LOAD_B b1, OFFSET_0, BO

        vmaddfp c09, a5, bp1, c09
        vspltw bp2, b2, 3
        vmaddfp c10, a6, bp1, c10
        LOAD_A a1, OFFSET_0, AO
        vmaddfp c11, a7, bp1, c11
        LOAD_A a2, OFFSET_1, AO
        vmaddfp c12, a8, bp1, c12
        NOP2

        vmaddfp c13, a5, bp2, c13
        vspltw bp1, b1, 0
        vmaddfp c14, a6, bp2, c14
        LOAD_A a3, OFFSET_2, AO
        vmaddfp c15, a7, bp2, c15
        LOAD_A a4, OFFSET_3, AO
        vmaddfp c16, a8, bp2, c16
        bdnz+ LL(12)
        .align 4

LL(13):
        andi. r0, K, 2
        nop
        nop
        ble+ LL(15)
        .align 4

        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        NOP2
        vmaddfp c03, a3, bp1, c03
        NOP1
        vmaddfp c04, a4, bp1, c04
        NOP2

        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        NOP2
        vmaddfp c07, a3, bp2, c07
        NOP1
        vmaddfp c08, a4, bp2, c08
        LOAD_B b2, OFFSET_1, BO

        vmaddfp c09, a1, bp1, c09
        vspltw bp2, b1, 3
        vmaddfp c10, a2, bp1, c10
        LOAD_A a5, OFFSET_4, AO
        vmaddfp c11, a3, bp1, c11
        LOAD_A a6, OFFSET_5, AO
        vmaddfp c12, a4, bp1, c12
        addi BO, BO, 8 * SIZE

        vmaddfp c13, a1, bp2, c13
        vspltw bp1, b2, 0
        vmaddfp c14, a2, bp2, c14
        LOAD_A a7, OFFSET_6, AO
        vmaddfp c15, a3, bp2, c15
        LOAD_A a8, OFFSET_7, AO
        vmaddfp c16, a4, bp2, c16
        addi AO, AO, 32 * SIZE

        vmaddfp c01, a5, bp1, c01
        vspltw bp2, b2, 1
        vmaddfp c02, a6, bp1, c02
        NOP2
        vmaddfp c03, a7, bp1, c03
        NOP1
        vmaddfp c04, a8, bp1, c04
        NOP2

        vmaddfp c05, a5, bp2, c05
        vspltw bp1, b2, 2
        vmaddfp c06, a6, bp2, c06
        NOP2
        vmaddfp c07, a7, bp2, c07
        NOP1
        vmaddfp c08, a8, bp2, c08
        LOAD_B b1, OFFSET_0, BO

        vmaddfp c09, a5, bp1, c09
        vspltw bp2, b2, 3
        vmaddfp c10, a6, bp1, c10
        LOAD_A a1, OFFSET_0, AO
        vmaddfp c11, a7, bp1, c11
        LOAD_A a2, OFFSET_1, AO
        vmaddfp c12, a8, bp1, c12
        NOP2

        vmaddfp c13, a5, bp2, c13
        vspltw bp1, b1, 0
        vmaddfp c14, a6, bp2, c14
        LOAD_A a3, OFFSET_2, AO
        vmaddfp c15, a7, bp2, c15
        LOAD_A a4, OFFSET_3, AO
        vmaddfp c16, a8, bp2, c16
        .align 4

LL(15):
        andi. r0, K, 1
        lvx alpha, OFFSET_0, SP
        vxor VZERO, VZERO, VZERO
        ble+ LL(18)
        .align 4

        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        nop
        vmaddfp c03, a3, bp1, c03
        nop
        vmaddfp c04, a4, bp1, c04
        nop

        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        nop
        vmaddfp c07, a3, bp2, c07
        nop
        vmaddfp c08, a4, bp2, c08
        nop

        vmaddfp c09, a1, bp1, c09
        vspltw bp2, b1, 3
        vmaddfp c10, a2, bp1, c10
        addi AO, AO, 16 * SIZE
        vmaddfp c11, a3, bp1, c11
        addi BO, BO, 4 * SIZE
        vmaddfp c12, a4, bp1, c12
        nop

        vmaddfp c13, a1, bp2, c13
        vmaddfp c14, a2, bp2, c14
        vmaddfp c15, a3, bp2, c15
        vmaddfp c16, a4, bp2, c16
        .align 4
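
/* Writeback of the 16x4 tile.  C rows may be unaligned, so each column
   is realigned with lvsr/vperm: c00 takes the spill-in from VZERO at
   the head, interior vperms stitch neighbouring accumulators together,
   and the tail spills back out to VZERO.  vmaddfp then scales by alpha
   and adds the C values just loaded.  When LDC <= 32*SIZE the columns
   sit close enough to interact, and the branch below takes the more
   conservative LL(19) ordering instead. */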
LL(18):
        lvx C1, OFFSET_0, CO1
        cmpwi cr0, LDC, 32 * SIZE
        lvx C2, OFFSET_1, CO1
        lvsr PERMRSHIFT1, 0, CO1
        lvx C3, OFFSET_2, CO1
        lvsr PERMRSHIFT2, 0, CO2
        lvx C4, OFFSET_3, CO1
        lvsr PERMRSHIFT3, 0, CO3
        lvx C5, OFFSET_4, CO1
        lvsr PERMRSHIFT4, 0, CO4
        ble LL(19)

        vperm c00, VZERO, c01, PERMRSHIFT1
        vperm c01, c01, c02, PERMRSHIFT1
        vperm c02, c02, c03, PERMRSHIFT1
        vperm c03, c03, c04, PERMRSHIFT1
        vperm c04, c04, VZERO, PERMRSHIFT1

        vmaddfp c00, alpha, c00, C1
        lvx C1, OFFSET_0, CO2
        vmaddfp c01, alpha, c01, C2
        lvx C6, OFFSET_1, CO2
        vmaddfp c02, alpha, c02, C3
        lvx C7, OFFSET_2, CO2
        vmaddfp c03, alpha, c03, C4
        lvx C8, OFFSET_3, CO2
        vmaddfp c04, alpha, c04, C5
        lvx C9, OFFSET_4, CO2

        stvx c00, OFFSET_0, CO1
        vperm c00, VZERO, c05, PERMRSHIFT2
        stvx c01, OFFSET_1, CO1
        vperm c05, c05, c06, PERMRSHIFT2
        stvx c02, OFFSET_2, CO1
        vperm c06, c06, c07, PERMRSHIFT2
        stvx c03, OFFSET_3, CO1
        vperm c07, c07, c08, PERMRSHIFT2
        stvx c04, OFFSET_4, CO1
        vperm c08, c08, VZERO, PERMRSHIFT2

        vmaddfp c00, alpha, c00, C1
        lvx C1, OFFSET_0, CO3
        vmaddfp c05, alpha, c05, C6
        lvx C2, OFFSET_1, CO3
        vmaddfp c06, alpha, c06, C7
        lvx C3, OFFSET_2, CO3
        vmaddfp c07, alpha, c07, C8
        lvx C4, OFFSET_3, CO3
        vmaddfp c08, alpha, c08, C9
        lvx C5, OFFSET_4, CO3

        stvx c00, OFFSET_0, CO2
        vperm c00, VZERO, c09, PERMRSHIFT3
        stvx c05, OFFSET_1, CO2
        vperm c09, c09, c10, PERMRSHIFT3
        stvx c06, OFFSET_2, CO2
        vperm c10, c10, c11, PERMRSHIFT3
        stvx c07, OFFSET_3, CO2
        vperm c11, c11, c12, PERMRSHIFT3
        stvx c08, OFFSET_4, CO2
        vperm c12, c12, VZERO, PERMRSHIFT3

        vmaddfp c00, alpha, c00, C1
        lvx C9, OFFSET_4, CO4
        vmaddfp c09, alpha, c09, C2
        lvx C1, OFFSET_0, CO4
        vmaddfp c10, alpha, c10, C3
        lvx C6, OFFSET_1, CO4
        vmaddfp c11, alpha, c11, C4
        lvx C7, OFFSET_2, CO4
        vmaddfp c12, alpha, c12, C5
        lvx C8, OFFSET_3, CO4

        stvx c00, OFFSET_0, CO3
        vperm c00, VZERO, c13, PERMRSHIFT4
        stvx c09, OFFSET_1, CO3
        vperm c13, c13, c14, PERMRSHIFT4
        stvx c10, OFFSET_2, CO3
        vperm c14, c14, c15, PERMRSHIFT4
        stvx c11, OFFSET_3, CO3
        vperm c15, c15, c16, PERMRSHIFT4
        stvx c12, OFFSET_4, CO3
        vperm c16, c16, VZERO, PERMRSHIFT4

        vmaddfp c00, alpha, c00, C1
        vmaddfp c13, alpha, c13, C6
        vmaddfp c14, alpha, c14, C7
        vmaddfp c15, alpha, c15, C8
        vmaddfp c16, alpha, c16, C9

        stvx c00, OFFSET_0, CO4
        stvx c13, OFFSET_1, CO4
        stvx c14, OFFSET_2, CO4
        stvx c15, OFFSET_3, CO4
        stvx c16, OFFSET_4, CO4

        addi CO1, CO1, 16 * SIZE
        addi CO2, CO2, 16 * SIZE
        addi CO3, CO3, 16 * SIZE
        addi CO4, CO4, 16 * SIZE
        addic. I, I, -1
        bgt+ LL(11)
        b LL(20)
        .align 4
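
/* LL(19): writeback variant for small LDC (taken when LDC <= 32*SIZE).
   Adjacent columns of C can overlap the same vectors here, so each
   column's C data is loaded only after the previous column has been
   stored back; compare the eager cross-column loads above.  (That is
   the apparent intent; the original carries no comment.) */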
LL(19):
        lvx C6, OFFSET_1, CO2
        lvx C7, OFFSET_2, CO2
        lvx C8, OFFSET_3, CO2
        lvx C9, OFFSET_4, CO2

        vperm c00, VZERO, c01, PERMRSHIFT1
        vperm c01, c01, c02, PERMRSHIFT1
        vperm c02, c02, c03, PERMRSHIFT1
        vperm c03, c03, c04, PERMRSHIFT1
        vperm c04, c04, VZERO, PERMRSHIFT1

        vmaddfp c00, alpha, c00, C1
        vmaddfp c01, alpha, c01, C2
        lvx C2, OFFSET_1, CO3
        vmaddfp c02, alpha, c02, C3
        lvx C3, OFFSET_2, CO3
        vmaddfp c03, alpha, c03, C4
        lvx C4, OFFSET_3, CO3
        vmaddfp c04, alpha, c04, C5
        lvx C5, OFFSET_4, CO3

        stvx c00, OFFSET_0, CO1
        stvx c01, OFFSET_1, CO1
        stvx c02, OFFSET_2, CO1
        stvx c03, OFFSET_3, CO1
        stvx c04, OFFSET_4, CO1

        lvx C1, OFFSET_0, CO2
        vperm c00, VZERO, c05, PERMRSHIFT2
        vperm c05, c05, c06, PERMRSHIFT2
        vperm c06, c06, c07, PERMRSHIFT2
        vperm c07, c07, c08, PERMRSHIFT2
        vperm c08, c08, VZERO, PERMRSHIFT2

        vmaddfp c00, alpha, c00, C1
        vmaddfp c05, alpha, c05, C6
        lvx C6, OFFSET_1, CO4
        vmaddfp c06, alpha, c06, C7
        lvx C7, OFFSET_2, CO4
        vmaddfp c07, alpha, c07, C8
        lvx C8, OFFSET_3, CO4
        vmaddfp c08, alpha, c08, C9
        lvx C9, OFFSET_4, CO4

        stvx c00, OFFSET_0, CO2
        stvx c05, OFFSET_1, CO2
        stvx c06, OFFSET_2, CO2
        stvx c07, OFFSET_3, CO2
        stvx c08, OFFSET_4, CO2

        lvx C1, OFFSET_0, CO3
        vperm c00, VZERO, c09, PERMRSHIFT3
        vperm c09, c09, c10, PERMRSHIFT3
        vperm c10, c10, c11, PERMRSHIFT3
        vperm c11, c11, c12, PERMRSHIFT3
        vperm c12, c12, VZERO, PERMRSHIFT3

        vmaddfp c00, alpha, c00, C1
        vmaddfp c09, alpha, c09, C2
        vmaddfp c10, alpha, c10, C3
        vmaddfp c11, alpha, c11, C4
        vmaddfp c12, alpha, c12, C5

        stvx c00, OFFSET_0, CO3
        stvx c09, OFFSET_1, CO3
        stvx c10, OFFSET_2, CO3
        stvx c11, OFFSET_3, CO3
        stvx c12, OFFSET_4, CO3

        lvx C1, OFFSET_0, CO4
        vperm c00, VZERO, c13, PERMRSHIFT4
        vperm c13, c13, c14, PERMRSHIFT4
        vperm c14, c14, c15, PERMRSHIFT4
        vperm c15, c15, c16, PERMRSHIFT4
        vperm c16, c16, VZERO, PERMRSHIFT4

        vmaddfp c00, alpha, c00, C1
        vmaddfp c13, alpha, c13, C6
        vmaddfp c14, alpha, c14, C7
        vmaddfp c15, alpha, c15, C8
        vmaddfp c16, alpha, c16, C9

        stvx c00, OFFSET_0, CO4
        stvx c13, OFFSET_1, CO4
        stvx c14, OFFSET_2, CO4
        stvx c15, OFFSET_3, CO4
        stvx c16, OFFSET_4, CO4

        addi CO1, CO1, 16 * SIZE
        addi CO2, CO2, 16 * SIZE
        addi CO3, CO3, 16 * SIZE
        addi CO4, CO4, 16 * SIZE
        addic. I, I, -1
        bgt+ LL(11)
        .align 4
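
/* Remainder rows for this group of four columns: 8-wide (LL(20)) and
   4-wide (LL(30)) vector tiles, then scalar FPU paths for the 2-row
   (LL(40)) and 1-row (LL(50)) leftovers. */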
LL(20):
        andi. I, M, 8
        ble LL(30)

        vxor c01, c01, c01
        LOAD_A a1, OFFSET_0, AO
        vxor c02, c02, c02
        LOAD_A a2, OFFSET_1, AO
        vxor c05, c05, c05
        LOAD_A a3, OFFSET_2, AO
        vxor c06, c06, c06
        LOAD_A a4, OFFSET_3, AO
        vxor c09, c09, c09
        LOAD_B b1, OFFSET_0, B
        vxor c10, c10, c10
        LOAD_B b2, OFFSET_1, B
        vxor c13, c13, c13
        vxor c14, c14, c14
        mr BO, B
        vspltw bp1, b1, 0

        srawi. r0, K, 1
        mtspr CTR, r0
        ble LL(25)
        .align 4

LL(22):
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        addi AO, AO, 16 * SIZE
        vmaddfp c02, a2, bp1, c02
        addi BO, BO, 8 * SIZE
        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        vmaddfp c09, a1, bp1, c09
        vspltw bp2, b1, 3
        LOAD_B b1, OFFSET_0, BO
        vmaddfp c10, a2, bp1, c10
        vmaddfp c13, a1, bp2, c13
        LOAD_A a1, OFFSET_0, AO
        vspltw bp1, b2, 0
        vmaddfp c14, a2, bp2, c14
        LOAD_A a2, OFFSET_1, AO

        vmaddfp c01, a3, bp1, c01
        vspltw bp2, b2, 1
        vmaddfp c02, a4, bp1, c02
        vmaddfp c05, a3, bp2, c05
        vspltw bp1, b2, 2
        vmaddfp c06, a4, bp2, c06
        vmaddfp c09, a3, bp1, c09
        vspltw bp2, b2, 3
        LOAD_B b2, OFFSET_1, BO
        vmaddfp c10, a4, bp1, c10
        vmaddfp c13, a3, bp2, c13
        LOAD_A a3, OFFSET_2, AO
        vmaddfp c14, a4, bp2, c14
        LOAD_A a4, OFFSET_3, AO
        vspltw bp1, b1, 0
        bdnz LL(22)
        .align 4

LL(25):
        andi. r0, K, 1
        lvx alpha, OFFSET_0, SP
        vxor VZERO, VZERO, VZERO
        ble+ LL(28)
        .align 4

LL(26):
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        nop
        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        nop
        vmaddfp c09, a1, bp1, c09
        vspltw bp2, b1, 3
        vmaddfp c10, a2, bp1, c10
        addi AO, AO, 8 * SIZE
        vmaddfp c13, a1, bp2, c13
        addi BO, BO, 4 * SIZE
        vmaddfp c14, a2, bp2, c14
        nop
        .align 4

LL(28):
        lvx C1, OFFSET_0, CO1
        lvx C2, OFFSET_1, CO1
        lvx C3, OFFSET_2, CO1
        lvsr PERMRSHIFT1, 0, CO1
        lvsr PERMRSHIFT2, 0, CO2
        lvsr PERMRSHIFT3, 0, CO3
        lvsr PERMRSHIFT4, 0, CO4

        vperm c00, VZERO, c01, PERMRSHIFT1
        vperm c01, c01, c02, PERMRSHIFT1
        vperm c02, c02, VZERO, PERMRSHIFT1

        vmaddfp c00, alpha, c00, C1
        vmaddfp c01, alpha, c01, C2
        vmaddfp c02, alpha, c02, C3

        stvx c00, OFFSET_0, CO1
        stvx c01, OFFSET_1, CO1
        stvx c02, OFFSET_2, CO1

        lvx C1, OFFSET_0, CO2
        lvx C2, OFFSET_1, CO2
        lvx C3, OFFSET_2, CO2

        vperm c00, VZERO, c05, PERMRSHIFT2
        vperm c05, c05, c06, PERMRSHIFT2
        vperm c06, c06, VZERO, PERMRSHIFT2

        vmaddfp c00, alpha, c00, C1
        vmaddfp c05, alpha, c05, C2
        vmaddfp c06, alpha, c06, C3

        stvx c00, OFFSET_0, CO2
        stvx c05, OFFSET_1, CO2
        stvx c06, OFFSET_2, CO2

        lvx C1, OFFSET_0, CO3
        lvx C2, OFFSET_1, CO3
        lvx C3, OFFSET_2, CO3

        vperm c00, VZERO, c09, PERMRSHIFT3
        vperm c09, c09, c10, PERMRSHIFT3
        vperm c10, c10, VZERO, PERMRSHIFT3

        vmaddfp c00, alpha, c00, C1
        vmaddfp c09, alpha, c09, C2
        vmaddfp c10, alpha, c10, C3

        stvx c00, OFFSET_0, CO3
        stvx c09, OFFSET_1, CO3
        stvx c10, OFFSET_2, CO3

        lvx C1, OFFSET_0, CO4
        lvx C2, OFFSET_1, CO4
        lvx C3, OFFSET_2, CO4

        vperm c00, VZERO, c13, PERMRSHIFT4
        vperm c13, c13, c14, PERMRSHIFT4
        vperm c14, c14, VZERO, PERMRSHIFT4

        vmaddfp c00, alpha, c00, C1
        vmaddfp c13, alpha, c13, C2
        vmaddfp c14, alpha, c14, C3

        stvx c00, OFFSET_0, CO4
        stvx c13, OFFSET_1, CO4
        stvx c14, OFFSET_2, CO4

        addi CO1, CO1, 8 * SIZE
        addi CO2, CO2, 8 * SIZE
        addi CO3, CO3, 8 * SIZE
        addi CO4, CO4, 8 * SIZE
        .align 4

LL(30):
        andi. I, M, 4
        ble LL(40)

        vxor c01, c01, c01
        LOAD_A a1, OFFSET_0, AO
        vxor c02, c02, c02
        LOAD_A a2, OFFSET_1, AO
        vxor c05, c05, c05
        LOAD_B b1, OFFSET_0, B
        vxor c06, c06, c06
        LOAD_B b2, OFFSET_1, B
        vxor c09, c09, c09
        vxor c10, c10, c10
        vxor c13, c13, c13
        vxor c14, c14, c14
        vspltw bp1, b1, 0
        mr BO, B

        srawi. r0, K, 1
        mtspr CTR, r0
        ble LL(35)
        .align 4

LL(32):
        vmaddfp c01, a1, bp1, c01
        addi AO, AO, 8 * SIZE
        vspltw bp2, b1, 1
        vmaddfp c05, a1, bp2, c05
        addi BO, BO, 8 * SIZE
        vspltw bp1, b1, 2
        vmaddfp c09, a1, bp1, c09
        vspltw bp2, b1, 3
        vmaddfp c13, a1, bp2, c13
        LOAD_A a1, OFFSET_0, AO
        vspltw bp1, b2, 0
        LOAD_B b1, OFFSET_0, BO
        vmaddfp c02, a2, bp1, c02
        vspltw bp2, b2, 1
        vmaddfp c06, a2, bp2, c06
        vspltw bp1, b2, 2
        vmaddfp c10, a2, bp1, c10
        vspltw bp2, b2, 3
        LOAD_B b2, OFFSET_1, BO
        vmaddfp c14, a2, bp2, c14
        LOAD_A a2, OFFSET_1, AO
        vspltw bp1, b1, 0
        bdnz LL(32)
        .align 4

LL(35):
        andi. r0, K, 1
        lvx alpha, OFFSET_0, SP
        vxor VZERO, VZERO, VZERO
        ble+ LL(38)
        .align 4

LL(36):
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c09, a1, bp1, c09
        vspltw bp2, b1, 3
        vmaddfp c13, a1, bp2, c13
        addi AO, AO, 4 * SIZE
        addi BO, BO, 4 * SIZE
        .align 4

LL(38):
        vaddfp c01, c01, c02
        vaddfp c05, c05, c06
        vaddfp c09, c09, c10
        vaddfp c13, c13, c14

        lvx C1, OFFSET_0, CO1
        lvx C2, OFFSET_1, CO1
        lvsr PERMRSHIFT1, 0, CO1
        lvsr PERMRSHIFT2, 0, CO2
        lvsr PERMRSHIFT3, 0, CO3
        lvsr PERMRSHIFT4, 0, CO4

        vperm c00, VZERO, c01, PERMRSHIFT1
        vperm c01, c01, VZERO, PERMRSHIFT1
        vmaddfp c00, alpha, c00, C1
        vmaddfp c01, alpha, c01, C2
        stvx c00, OFFSET_0, CO1
        stvx c01, OFFSET_1, CO1

        lvx C1, OFFSET_0, CO2
        lvx C2, OFFSET_1, CO2
        vperm c00, VZERO, c05, PERMRSHIFT2
        vperm c05, c05, VZERO, PERMRSHIFT2
        vmaddfp c00, alpha, c00, C1
        vmaddfp c05, alpha, c05, C2
        stvx c00, OFFSET_0, CO2
        stvx c05, OFFSET_1, CO2

        lvx C1, OFFSET_0, CO3
        lvx C2, OFFSET_1, CO3
        vperm c00, VZERO, c09, PERMRSHIFT3
        vperm c09, c09, VZERO, PERMRSHIFT3
        vmaddfp c00, alpha, c00, C1
        vmaddfp c09, alpha, c09, C2
        stvx c00, OFFSET_0, CO3
        stvx c09, OFFSET_1, CO3

        lvx C1, OFFSET_0, CO4
        lvx C2, OFFSET_1, CO4
        vperm c00, VZERO, c13, PERMRSHIFT4
        vperm c13, c13, VZERO, PERMRSHIFT4
        vmaddfp c00, alpha, c00, C1
        vmaddfp c13, alpha, c13, C2
        stvx c00, OFFSET_0, CO4
        stvx c13, OFFSET_1, CO4

        addi CO1, CO1, 4 * SIZE
        addi CO2, CO2, 4 * SIZE
        addi CO3, CO3, 4 * SIZE
        addi CO4, CO4, 4 * SIZE
        .align 4

LL(40):
        andi. I, M, 2
        ble LL(50)

        mr BO, B
        LFD f8, 0 * SIZE(AO)
        LFD f9, 1 * SIZE(AO)
        LFD f10, 0 * SIZE(B)
        LFD f11, 1 * SIZE(B)
        LFD f12, 2 * SIZE(B)
        LFD f13, 3 * SIZE(B)

        lfs f0, FZERO(SP)
        fmr f1, f0
        fmr f2, f0
        fmr f3, f0
        fmr f4, f0
        fmr f5, f0
        fmr f6, f0
        fmr f7, f0

        srawi. r0, K, 1
        mtspr CTR, r0
        ble LL(45)
        .align 4

LL(42):
        FMADD f0, f8, f10, f0
        FMADD f2, f8, f11, f2
        FMADD f4, f8, f12, f4
        FMADD f6, f8, f13, f6

        FMADD f1, f9, f10, f1
        FMADD f3, f9, f11, f3
        FMADD f5, f9, f12, f5
        FMADD f7, f9, f13, f7

        LFD f8, 2 * SIZE(AO)
        LFD f9, 3 * SIZE(AO)
        LFD f10, 4 * SIZE(BO)
        LFD f11, 5 * SIZE(BO)
        LFD f12, 6 * SIZE(BO)
        LFD f13, 7 * SIZE(BO)

        FMADD f0, f8, f10, f0
        FMADD f2, f8, f11, f2
        FMADD f4, f8, f12, f4
        FMADD f6, f8, f13, f6

        FMADD f1, f9, f10, f1
        FMADD f3, f9, f11, f3
        FMADD f5, f9, f12, f5
        FMADD f7, f9, f13, f7

        LFD f8, 4 * SIZE(AO)
        LFD f9, 5 * SIZE(AO)
        LFD f10, 8 * SIZE(BO)
        LFD f11, 9 * SIZE(BO)
        LFD f12, 10 * SIZE(BO)
        LFD f13, 11 * SIZE(BO)

        addi AO, AO, 4 * SIZE
        addi BO, BO, 8 * SIZE
        bdnz LL(42)
        .align 4

LL(45):
        andi. r0, K, 1
        ble LL(48)
        .align 4

LL(46):
        FMADD f0, f8, f10, f0
        FMADD f2, f8, f11, f2
        FMADD f4, f8, f12, f4
        FMADD f6, f8, f13, f6

        FMADD f1, f9, f10, f1
        FMADD f3, f9, f11, f3
        FMADD f5, f9, f12, f5
        FMADD f7, f9, f13, f7

        LFD f8, 2 * SIZE(AO)
        LFD f9, 3 * SIZE(AO)
        LFD f10, 4 * SIZE(BO)
        LFD f11, 5 * SIZE(BO)
        LFD f12, 6 * SIZE(BO)
        LFD f13, 7 * SIZE(BO)

        addi AO, AO, 2 * SIZE
        addi BO, BO, 4 * SIZE
        .align 4

LL(48):
        lfs f13, ALPHA(SP)

        LFD f8, 0 * SIZE(CO1)
        LFD f9, 1 * SIZE(CO1)
        LFD f10, 0 * SIZE(CO2)
        LFD f11, 1 * SIZE(CO2)

        FMADD f0, f0, f13, f8
        FMADD f1, f1, f13, f9
        FMADD f2, f2, f13, f10
        FMADD f3, f3, f13, f11

        LFD f8, 0 * SIZE(CO3)
        LFD f9, 1 * SIZE(CO3)
        LFD f10, 0 * SIZE(CO4)
        LFD f11, 1 * SIZE(CO4)

        FMADD f4, f4, f13, f8
        FMADD f5, f5, f13, f9
        FMADD f6, f6, f13, f10
        FMADD f7, f7, f13, f11

        STFD f0, 0 * SIZE(CO1)
        STFD f1, 1 * SIZE(CO1)
        STFD f2, 0 * SIZE(CO2)
        STFD f3, 1 * SIZE(CO2)
        STFD f4, 0 * SIZE(CO3)
        STFD f5, 1 * SIZE(CO3)
        STFD f6, 0 * SIZE(CO4)
        STFD f7, 1 * SIZE(CO4)

        addi CO1, CO1, 2 * SIZE
        addi CO2, CO2, 2 * SIZE
        addi CO3, CO3, 2 * SIZE
        addi CO4, CO4, 2 * SIZE
        .align 4

LL(50):
        andi. I, M, 1
        ble LL(59)

        mr BO, B
        LFD f8, 0 * SIZE(AO)
        LFD f9, 1 * SIZE(AO)
        LFD f10, 0 * SIZE(B)
        LFD f11, 1 * SIZE(B)
        LFD f12, 2 * SIZE(B)
        LFD f13, 3 * SIZE(B)

        lfs f0, FZERO(SP)
        fmr f1, f0
        fmr f2, f0
        fmr f3, f0

        srawi. r0, K, 1
        mtspr CTR, r0
        ble LL(55)
        .align 4

LL(52):
        FMADD f0, f8, f10, f0
        FMADD f1, f8, f11, f1
        FMADD f2, f8, f12, f2
        FMADD f3, f8, f13, f3

        LFD f8, 2 * SIZE(AO)
        LFD f10, 4 * SIZE(BO)
        LFD f11, 5 * SIZE(BO)
        LFD f12, 6 * SIZE(BO)
        LFD f13, 7 * SIZE(BO)

        FMADD f0, f9, f10, f0
        FMADD f1, f9, f11, f1
        FMADD f2, f9, f12, f2
        FMADD f3, f9, f13, f3

        LFD f9, 3 * SIZE(AO)
        LFD f10, 8 * SIZE(BO)
        LFD f11, 9 * SIZE(BO)
        LFD f12, 10 * SIZE(BO)
        LFD f13, 11 * SIZE(BO)

        addi AO, AO, 2 * SIZE
        addi BO, BO, 8 * SIZE
        bdnz LL(52)
        .align 4

LL(55):
        andi. r0, K, 1
        ble LL(58)
        .align 4

LL(56):
        FMADD f0, f8, f10, f0
        FMADD f1, f8, f11, f1
        FMADD f2, f8, f12, f2
        FMADD f3, f8, f13, f3

        LFD f8, 2 * SIZE(AO)
        LFD f10, 4 * SIZE(BO)
        LFD f11, 5 * SIZE(BO)
        LFD f12, 6 * SIZE(BO)
        LFD f13, 7 * SIZE(BO)

        addi AO, AO, 1 * SIZE
        addi BO, BO, 4 * SIZE
        .align 4

LL(58):
        lfs f13, ALPHA(SP)

        LFD f8, 0 * SIZE(CO1)
        LFD f9, 0 * SIZE(CO2)
        LFD f10, 0 * SIZE(CO3)
        LFD f11, 0 * SIZE(CO4)

        FMADD f0, f0, f13, f8
        FMADD f1, f1, f13, f9
        FMADD f2, f2, f13, f10
        FMADD f3, f3, f13, f11

        STFD f0, 0 * SIZE(CO1)
        STFD f1, 0 * SIZE(CO2)
        STFD f2, 0 * SIZE(CO3)
        STFD f3, 0 * SIZE(CO4)
        .align 4

LL(59):
        mr B, BO
        addic. J, J, -1
        bgt LL(01)
        .align 4
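
/* N remainder, two-column case (N & 2): same structure as above with
   16/8/4-wide vector tiles (LL(71)/LL(80)/LL(90)) and scalar 2/1-row
   tails (LL(100)/LL(110)). */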
LL(60):
        andi. r0, N, 2
        ble LL(120)

        mr CO1, C
        add CO2, C, LDC
        add C, CO2, LDC
        mr AO, A

        srawi. I, M, 4
        ble LL(80)
        .align 4

LL(71):
        vxor c01, c01, c01
        LOAD_B b1, OFFSET_0, B
        vxor c02, c02, c02
        vxor c03, c03, c03
        LOAD_A a1, OFFSET_0, AO
        vxor c04, c04, c04
        LOAD_A a2, OFFSET_1, AO
        vxor c05, c05, c05
        LOAD_A a3, OFFSET_2, AO
        vxor c06, c06, c06
        LOAD_A a4, OFFSET_3, AO
        vxor c07, c07, c07
        vxor c08, c08, c08
        mr BO, B

        dcbtst CO1, PREC
        dcbtst CO2, PREC

        vspltw bp1, b1, 0
        srawi. r0, K, 1
        mtspr CTR, r0
        ble LL(75)
        .align 4

LL(72):
        LOAD_A a5, OFFSET_4, AO
        LOAD_A a6, OFFSET_5, AO
        LOAD_A a7, OFFSET_6, AO
        LOAD_A a8, OFFSET_7, AO

        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        vmaddfp c03, a3, bp1, c03
        vmaddfp c04, a4, bp1, c04

        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        vmaddfp c07, a3, bp2, c07
        vmaddfp c08, a4, bp2, c08

        vmaddfp c01, a5, bp1, c01
        vspltw bp2, b1, 3
        vmaddfp c02, a6, bp1, c02
        vmaddfp c03, a7, bp1, c03
        vmaddfp c04, a8, bp1, c04
        LOAD_B b1, OFFSET_1, BO
        vspltw bp1, b1, 0

        vmaddfp c05, a5, bp2, c05
        vmaddfp c06, a6, bp2, c06
        vmaddfp c07, a7, bp2, c07
        vmaddfp c08, a8, bp2, c08

        addi AO, AO, 32 * SIZE
        addi BO, BO, 4 * SIZE

        LOAD_A a1, OFFSET_0, AO
        LOAD_A a2, OFFSET_1, AO
        LOAD_A a3, OFFSET_2, AO
        LOAD_A a4, OFFSET_3, AO
        bdnz LL(72)
        .align 4

LL(75):
        andi. r0, K, 1
        lvx alpha, OFFSET_0, SP
        vxor VZERO, VZERO, VZERO
        ble+ LL(78)
        .align 4

LL(76):
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        addi AO, AO, 16 * SIZE
        vmaddfp c03, a3, bp1, c03
        addi BO, BO, 2 * SIZE
        vmaddfp c04, a4, bp1, c04
        nop

        vmaddfp c05, a1, bp2, c05
        vmaddfp c06, a2, bp2, c06
        vmaddfp c07, a3, bp2, c07
        vmaddfp c08, a4, bp2, c08
        .align 4

LL(78):
        lvx C1, OFFSET_0, CO1
        lvx C2, OFFSET_1, CO1
        lvx C3, OFFSET_2, CO1
        lvx C4, OFFSET_3, CO1
        lvx C5, OFFSET_4, CO1

        lvsr PERMRSHIFT1, 0, CO1
        lvsr PERMRSHIFT2, 0, CO2
        lvsr PERMRSHIFT3, 0, CO3
        lvsr PERMRSHIFT4, 0, CO4

        vperm c00, VZERO, c01, PERMRSHIFT1
        vperm c01, c01, c02, PERMRSHIFT1
        vperm c02, c02, c03, PERMRSHIFT1
        vperm c03, c03, c04, PERMRSHIFT1
        vperm c04, c04, VZERO, PERMRSHIFT1

        vmaddfp c00, alpha, c00, C1
        vmaddfp c01, alpha, c01, C2
        vmaddfp c02, alpha, c02, C3
        vmaddfp c03, alpha, c03, C4
        vmaddfp c04, alpha, c04, C5

        stvx c00, OFFSET_0, CO1
        stvx c01, OFFSET_1, CO1
        stvx c02, OFFSET_2, CO1
        stvx c03, OFFSET_3, CO1
        stvx c04, OFFSET_4, CO1

        lvx C1, OFFSET_0, CO2
        lvx C2, OFFSET_1, CO2
        lvx C3, OFFSET_2, CO2
        lvx C4, OFFSET_3, CO2
        lvx C5, OFFSET_4, CO2

        vperm c00, VZERO, c05, PERMRSHIFT2
        vperm c05, c05, c06, PERMRSHIFT2
        vperm c06, c06, c07, PERMRSHIFT2
        vperm c07, c07, c08, PERMRSHIFT2
        vperm c08, c08, VZERO, PERMRSHIFT2

        vmaddfp c00, alpha, c00, C1
        vmaddfp c05, alpha, c05, C2
        vmaddfp c06, alpha, c06, C3
        vmaddfp c07, alpha, c07, C4
        vmaddfp c08, alpha, c08, C5

        stvx c00, OFFSET_0, CO2
        stvx c05, OFFSET_1, CO2
        stvx c06, OFFSET_2, CO2
        stvx c07, OFFSET_3, CO2
        stvx c08, OFFSET_4, CO2

        addi CO1, CO1, 16 * SIZE
        addi CO2, CO2, 16 * SIZE
        addic. I, I, -1
        bgt+ LL(71)
        .align 4

LL(80):
        andi. I, M, 8
        ble LL(90)

        vxor c01, c01, c01
        LOAD_B b1, OFFSET_0, B
        vxor c02, c02, c02
        vxor c03, c03, c03
        LOAD_A a1, OFFSET_0, AO
        vxor c04, c04, c04
        LOAD_A a2, OFFSET_1, AO
        vxor c05, c05, c05
        LOAD_A a3, OFFSET_2, AO
        vxor c06, c06, c06
        LOAD_A a4, OFFSET_3, AO
        vxor c07, c07, c07
        vxor c08, c08, c08
        mr BO, B

        vspltw bp1, b1, 0
        srawi. r0, K, 1
        mtspr CTR, r0
        ble LL(85)
        .align 4

LL(82):
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        vmaddfp c03, a3, bp1, c03
        vspltw bp2, b1, 3
        vmaddfp c04, a4, bp1, c04
        LOAD_B b1, OFFSET_1, BO
        vspltw bp1, b1, 0
        vmaddfp c07, a3, bp2, c07
        vmaddfp c08, a4, bp2, c08

        addi AO, AO, 16 * SIZE
        addi BO, BO, 4 * SIZE

        LOAD_A a1, OFFSET_0, AO
        LOAD_A a2, OFFSET_1, AO
        LOAD_A a3, OFFSET_2, AO
        LOAD_A a4, OFFSET_3, AO
        bdnz LL(82)
        .align 4

LL(85):
        andi. r0, K, 1
        lvx alpha, OFFSET_0, SP
        vxor VZERO, VZERO, VZERO
        ble+ LL(88)
        .align 4

LL(86):
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        addi AO, AO, 8 * SIZE
        vmaddfp c05, a1, bp2, c05
        addi BO, BO, 2 * SIZE
        vmaddfp c06, a2, bp2, c06
        .align 4

LL(88):
        lvx C1, OFFSET_0, CO1
        lvx C2, OFFSET_1, CO1
        lvx C3, OFFSET_2, CO1

        vaddfp c01, c01, c03
        vaddfp c02, c02, c04
        vaddfp c05, c05, c07
        vaddfp c06, c06, c08

        lvsr PERMRSHIFT1, 0, CO1
        lvsr PERMRSHIFT2, 0, CO2
        lvsr PERMRSHIFT3, 0, CO3
        lvsr PERMRSHIFT4, 0, CO4

        vperm c00, VZERO, c01, PERMRSHIFT1
        vperm c01, c01, c02, PERMRSHIFT1
        vperm c02, c02, VZERO, PERMRSHIFT1

        vmaddfp c00, alpha, c00, C1
        vmaddfp c01, alpha, c01, C2
        vmaddfp c02, alpha, c02, C3

        stvx c00, OFFSET_0, CO1
        stvx c01, OFFSET_1, CO1
        stvx c02, OFFSET_2, CO1

        lvx C1, OFFSET_0, CO2
        lvx C2, OFFSET_1, CO2
        lvx C3, OFFSET_2, CO2

        vperm c00, VZERO, c05, PERMRSHIFT2
        vperm c05, c05, c06, PERMRSHIFT2
        vperm c06, c06, VZERO, PERMRSHIFT2

        vmaddfp c00, alpha, c00, C1
        vmaddfp c05, alpha, c05, C2
        vmaddfp c06, alpha, c06, C3

        stvx c00, OFFSET_0, CO2
        stvx c05, OFFSET_1, CO2
        stvx c06, OFFSET_2, CO2

        addi CO1, CO1, 8 * SIZE
        addi CO2, CO2, 8 * SIZE
        .align 4

LL(90):
        andi. I, M, 4
        ble LL(100)

        vxor c01, c01, c01
        LOAD_B b1, OFFSET_0, B
        vxor c02, c02, c02
        LOAD_A a1, OFFSET_0, AO
        LOAD_A a2, OFFSET_1, AO
        vxor c05, c05, c05
        vxor c06, c06, c06
        mr BO, B

        vspltw bp1, b1, 0
        srawi. r0, K, 1
        mtspr CTR, r0
        ble LL(95)
        .align 4

LL(92):
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c02, a2, bp1, c02
        vspltw bp2, b1, 3
        LOAD_B b1, OFFSET_1, BO
        vspltw bp1, b1, 0
        vmaddfp c06, a2, bp2, c06

        addi AO, AO, 8 * SIZE
        addi BO, BO, 4 * SIZE
        LOAD_A a1, OFFSET_0, AO
        LOAD_A a2, OFFSET_1, AO
        bdnz LL(92)
        .align 4

LL(95):
        andi. r0, K, 1
        lvx alpha, OFFSET_0, SP
        vxor VZERO, VZERO, VZERO
        ble+ LL(98)
        .align 4

LL(96):
        vspltw bp2, b1, 1
        vmaddfp c01, a1, bp1, c01
        vmaddfp c05, a1, bp2, c05
        addi AO, AO, 4 * SIZE
        addi BO, BO, 2 * SIZE
        .align 4

LL(98):
        vaddfp c01, c01, c02
        vaddfp c05, c05, c06
        vaddfp c09, c09, c10
        vaddfp c13, c13, c14

        lvx C1, OFFSET_0, CO1
        lvx C2, OFFSET_1, CO1
        lvsr PERMRSHIFT1, 0, CO1
        lvsr PERMRSHIFT2, 0, CO2
        lvsr PERMRSHIFT3, 0, CO3
        lvsr PERMRSHIFT4, 0, CO4

        vperm c00, VZERO, c01, PERMRSHIFT1
        vperm c01, c01, VZERO, PERMRSHIFT1
        vmaddfp c00, alpha, c00, C1
        vmaddfp c01, alpha, c01, C2
        stvx c00, OFFSET_0, CO1
        stvx c01, OFFSET_1, CO1

        lvx C1, OFFSET_0, CO2
        lvx C2, OFFSET_1, CO2
        vperm c00, VZERO, c05, PERMRSHIFT2
        vperm c05, c05, VZERO, PERMRSHIFT2
        vmaddfp c00, alpha, c00, C1
        vmaddfp c05, alpha, c05, C2
        stvx c00, OFFSET_0, CO2
        stvx c05, OFFSET_1, CO2

        addi CO1, CO1, 4 * SIZE
        addi CO2, CO2, 4 * SIZE
        .align 4

LL(100):
        andi. I, M, 2
        ble LL(110)

        mr BO, B
        LFD f8, 0 * SIZE(AO)
        LFD f9, 1 * SIZE(AO)
        LFD f10, 0 * SIZE(B)
        LFD f11, 1 * SIZE(B)
        LFD f12, 2 * SIZE(B)
        LFD f13, 3 * SIZE(B)

        lfs f0, FZERO(SP)
        fmr f1, f0
        fmr f2, f0
        fmr f3, f0
        fmr f4, f0
        fmr f5, f0
        fmr f6, f0
        fmr f7, f0

        srawi. r0, K, 1
        mtspr CTR, r0
        ble LL(105)
        .align 4

LL(102):
        FMADD f0, f8, f10, f0
        FMADD f1, f9, f10, f1
        FMADD f2, f8, f11, f2
        FMADD f3, f9, f11, f3

        LFD f8, 2 * SIZE(AO)
        LFD f9, 3 * SIZE(AO)

        FMADD f4, f8, f12, f4
        FMADD f5, f9, f12, f5
        FMADD f6, f8, f13, f6
        FMADD f7, f9, f13, f7

        LFD f8, 4 * SIZE(AO)
        LFD f9, 5 * SIZE(AO)
        LFD f10, 4 * SIZE(BO)
        LFD f11, 5 * SIZE(BO)
        LFD f12, 6 * SIZE(BO)
        LFD f13, 7 * SIZE(BO)

        addi AO, AO, 4 * SIZE
        addi BO, BO, 4 * SIZE
        bdnz LL(102)
        .align 4

LL(105):
        andi. r0, K, 1
        lfs f13, ALPHA(SP)
        ble LL(108)
        .align 4

LL(106):
        FMADD f0, f8, f10, f0
        FMADD f1, f9, f10, f1
        FMADD f2, f8, f11, f2
        FMADD f3, f9, f11, f3

        LFD f8, 2 * SIZE(AO)
        LFD f9, 3 * SIZE(AO)
        LFD f10, 2 * SIZE(BO)
        LFD f11, 3 * SIZE(BO)

        addi AO, AO, 2 * SIZE
        addi BO, BO, 2 * SIZE
        .align 4

LL(108):
        LFD f8, 0 * SIZE(CO1)
        LFD f9, 1 * SIZE(CO1)
        LFD f10, 0 * SIZE(CO2)
        LFD f11, 1 * SIZE(CO2)

        FADD f0, f0, f4
        FADD f1, f1, f5
        FADD f2, f2, f6
        FADD f3, f3, f7

        FMADD f0, f0, f13, f8
        FMADD f1, f1, f13, f9
        FMADD f2, f2, f13, f10
        FMADD f3, f3, f13, f11

        STFD f0, 0 * SIZE(CO1)
        STFD f1, 1 * SIZE(CO1)
        STFD f2, 0 * SIZE(CO2)
        STFD f3, 1 * SIZE(CO2)

        addi CO1, CO1, 2 * SIZE
        addi CO2, CO2, 2 * SIZE
        .align 4

LL(110):
        andi. I, M, 1
        ble LL(119)

        mr BO, B
        LFD f8, 0 * SIZE(AO)
        LFD f9, 1 * SIZE(AO)
        LFD f10, 0 * SIZE(B)
        LFD f11, 1 * SIZE(B)
        LFD f12, 2 * SIZE(B)
        LFD f13, 3 * SIZE(B)

        lfs f0, FZERO(SP)
        fmr f1, f0
        fmr f2, f0
        fmr f3, f0

        srawi. r0, K, 1
        mtspr CTR, r0
        ble LL(115)
        .align 4

LL(112):
        FMADD f0, f8, f10, f0
        FMADD f1, f8, f11, f1
        FMADD f2, f9, f12, f2
        FMADD f3, f9, f13, f3

        LFD f8, 2 * SIZE(AO)
        LFD f9, 3 * SIZE(AO)
        LFD f10, 4 * SIZE(BO)
        LFD f11, 5 * SIZE(BO)
        LFD f12, 6 * SIZE(BO)
        LFD f13, 7 * SIZE(BO)

        addi AO, AO, 2 * SIZE
        addi BO, BO, 4 * SIZE
        bdnz LL(112)
        .align 4

LL(115):
        andi. r0, K, 1
        lfs f13, ALPHA(SP)
        ble LL(118)
        .align 4

LL(116):
        FMADD f0, f8, f10, f0
        FMADD f1, f8, f11, f1

        LFD f8, 1 * SIZE(AO)
        LFD f10, 2 * SIZE(BO)
        LFD f11, 3 * SIZE(BO)

        addi AO, AO, 1 * SIZE
        addi BO, BO, 2 * SIZE
        .align 4

LL(118):
        LFD f8, 0 * SIZE(CO1)
        LFD f9, 0 * SIZE(CO2)

        FADD f0, f0, f2
        FADD f1, f1, f3

        FMADD f0, f0, f13, f8
        FMADD f1, f1, f13, f9

        STFD f0, 0 * SIZE(CO1)
        STFD f1, 0 * SIZE(CO2)
        .align 4

LL(119):
        mr B, BO
        .align 4
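
/* N remainder, final single column (N & 1). */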
LL(120):
        andi. r0, N, 1
        ble LL(999)

        mr CO1, C
        mr AO, A
        srawi. I, M, 4
        ble LL(140)
        .align 4
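
/* Single-column case.  lvx needs B 16-byte aligned, and after the
   4- and 2-column loops B can only be misaligned by 8 bytes (those
   loops consume B in multiples of two floats), so up to two scalar
   B values are peeled off first (the andi. B, 15 test below).  The
   peeled values sit in lanes 2 and 3 of the aligned quadword that
   lvx fetched, hence the vspltw ..., 2 and ..., 3. */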
  1602. LL(130):
  1603. vxor c01, c01, c01
  1604. vxor c02, c02, c02
  1605. vxor c03, c03, c03
  1606. vxor c04, c04, c04
  1607. mr BO, B
  1608. dcbtst CO1, PREC
  1609. mr J, K
  1610. andi. r0, B, 15
  1611. ble+ LL(131)
  1612. LOAD_A a1, OFFSET_0, AO
  1613. LOAD_A a2, OFFSET_1, AO
  1614. LOAD_A a3, OFFSET_2, AO
  1615. LOAD_A a4, OFFSET_3, AO
  1616. LOAD_B b1, OFFSET_0, BO
  1617. vspltw bp1, b1, 2
  1618. vspltw bp2, b1, 3
  1619. addi AO, AO, 16 * SIZE
  1620. addi BO, BO, SIZE
  1621. vmaddfp c01, a1, bp1, c01
  1622. vmaddfp c02, a2, bp1, c02
  1623. vmaddfp c03, a3, bp1, c03
  1624. vmaddfp c04, a4, bp1, c04
  1625. subi J, J, 1
  1626. cmpwi cr0, J, 0
  1627. ble LL(138)
  1628. LOAD_A a1, OFFSET_0, AO
  1629. LOAD_A a2, OFFSET_1, AO
  1630. LOAD_A a3, OFFSET_2, AO
  1631. LOAD_A a4, OFFSET_3, AO
  1632. addi AO, AO, 16 * SIZE
  1633. addi BO, BO, SIZE
  1634. vmaddfp c01, a1, bp2, c01
  1635. vmaddfp c02, a2, bp2, c02
  1636. vmaddfp c03, a3, bp2, c03
  1637. vmaddfp c04, a4, bp2, c04
  1638. subi J, J, 1
  1639. cmpwi cr0, J, 0
  1640. ble LL(138)
  1641. .align 4
  1642. LL(131):
  1643. LOAD_A a1, OFFSET_0, AO
  1644. LOAD_A a2, OFFSET_1, AO
  1645. LOAD_A a3, OFFSET_2, AO
  1646. LOAD_A a4, OFFSET_3, AO
  1647. LOAD_A a5, OFFSET_4, AO
  1648. LOAD_A a6, OFFSET_5, AO
  1649. LOAD_A a7, OFFSET_6, AO
  1650. LOAD_A a8, OFFSET_7, AO
  1651. LOAD_B b1, OFFSET_0, BO
  1652. srawi. r0, J, 2
  1653. mtspr CTR, r0
  1654. ble LL(135)
  1655. .align 4
LL(133):
	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	vspltw	bp2, b1, 1
	vmaddfp	c01, a5, bp2, c01
	vmaddfp	c02, a6, bp2, c02
	vmaddfp	c03, a7, bp2, c03
	vmaddfp	c04, a8, bp2, c04

	addi	AO, AO, 32 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	vspltw	bp1, b1, 2
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	vspltw	bp2, b1, 3
	vmaddfp	c01, a5, bp2, c01
	vmaddfp	c02, a6, bp2, c02
	vmaddfp	c03, a7, bp2, c03
	vmaddfp	c04, a8, bp2, c04

	addi	AO, AO, 32 * SIZE
	addi	BO, BO, 4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO
	LOAD_B	b1, OFFSET_0, BO

	bdnz	LL(133)
	.align 4
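/*
 * K remainder (J % 4): dispatch on 3, 2 or 1 leftover iterations,
 * reusing the B values already sitting in b1.
 */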
LL(135):
	andi.	r0, J, 3
	ble+	LL(138)

	cmpwi	cr0, r0, 3
	bne	LL(136)

	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	addi	AO, AO, 16 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	vspltw	bp2, b1, 1
	vmaddfp	c01, a1, bp2, c01
	vmaddfp	c02, a2, bp2, c02
	vmaddfp	c03, a3, bp2, c03
	vmaddfp	c04, a4, bp2, c04

	addi	AO, AO, 16 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	vspltw	bp1, b1, 2
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	addi	AO, AO, 16 * SIZE
	addi	BO, BO, 3 * SIZE
	b	LL(138)
	.align 4

LL(136):
	cmpwi	cr0, r0, 2
	bne	LL(137)

	vspltw	bp1, b1, 0
	vspltw	bp2, b1, 1

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	LOAD_A	a1, OFFSET_4, AO
	LOAD_A	a2, OFFSET_5, AO
	LOAD_A	a3, OFFSET_6, AO
	LOAD_A	a4, OFFSET_7, AO

	vmaddfp	c01, a1, bp2, c01
	vmaddfp	c02, a2, bp2, c02
	vmaddfp	c03, a3, bp2, c03
	vmaddfp	c04, a4, bp2, c04

	addi	AO, AO, 32 * SIZE
	addi	BO, BO, 2 * SIZE
	b	LL(138)
	.align 4

LL(137):
	cmpwi	cr0, r0, 1
	bne	LL(138)

	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	addi	AO, AO, 16 * SIZE
	addi	BO, BO, 1 * SIZE
	.align 4
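/*
 * Store-back for a possibly unaligned CO1: lvsr builds a permute
 * mask from CO1's misalignment, vperm shifts the accumulators into
 * the alignment of C (padding both ends with VZERO), and then
 * C += alpha * acc is applied with aligned load/store pairs.
 */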
LL(138):
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1
	lvx	C4, OFFSET_3, CO1
	lvx	C5, OFFSET_4, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01, PERMRSHIFT1
	vperm	c01, c01, c02, PERMRSHIFT1
	vperm	c02, c02, c03, PERMRSHIFT1
	vperm	c03, c03, c04, PERMRSHIFT1
	vperm	c04, c04, VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3
	vmaddfp	c03, alpha, c03, C4
	vmaddfp	c04, alpha, c04, C5

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	addi	CO1, CO1, 16 * SIZE

	addic.	I, I, -1
	bgt+	LL(130)
	.align 4
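/*
 * M % 16 >= 8: same scheme with two accumulators (8 floats of C).
 */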
LL(140):
	andi.	I, M, 8
	ble	LL(150)

	vxor	c01, c01, c01
	vxor	c02, c02, c02

	mr	BO, B
	mr	J, K

	andi.	r0, B, 15
	ble+	LL(141)

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_B	b1, OFFSET_0, BO

	vspltw	bp1, b1, 2
	vspltw	bp2, b1, 3

	addi	AO, AO, 8 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(148)

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO

	addi	AO, AO, 8 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp2, c01
	vmaddfp	c02, a2, bp2, c02

	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(148)
	.align 4

LL(141):
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO
	LOAD_B	b1, OFFSET_0, BO

	srawi.	r0, J, 2
	mtspr	CTR, r0
	ble	LL(145)
	.align 4

LL(143):
	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	vspltw	bp2, b1, 1
	vmaddfp	c01, a3, bp2, c01
	vmaddfp	c02, a4, bp2, c02

	vspltw	bp1, b1, 2
	vmaddfp	c01, a5, bp1, c01
	vmaddfp	c02, a6, bp1, c02

	vspltw	bp2, b1, 3
	vmaddfp	c01, a7, bp2, c01
	vmaddfp	c02, a8, bp2, c02

	addi	AO, AO, 32 * SIZE
	addi	BO, BO, 4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO
	LOAD_B	b1, OFFSET_0, BO

	bdnz	LL(143)
	.align 4

LL(145):
	andi.	r0, J, 3
	ble+	LL(148)

	cmpwi	cr0, r0, 3
	bne	LL(146)

	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	vspltw	bp2, b1, 1
	vmaddfp	c01, a3, bp2, c01
	vmaddfp	c02, a4, bp2, c02

	LOAD_A	a1, OFFSET_4, AO
	LOAD_A	a2, OFFSET_5, AO

	vspltw	bp1, b1, 2
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	addi	AO, AO, 24 * SIZE
	addi	BO, BO, 3 * SIZE
	b	LL(148)
	.align 4

LL(146):
	cmpwi	cr0, r0, 2
	bne	LL(147)

	vspltw	bp1, b1, 0
	vspltw	bp2, b1, 1

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c01, a3, bp2, c01
	vmaddfp	c02, a4, bp2, c02

	addi	AO, AO, 16 * SIZE
	addi	BO, BO, 2 * SIZE
	b	LL(148)
	.align 4

LL(147):
	cmpwi	cr0, r0, 1
	bne	LL(148)

	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	addi	AO, AO, 8 * SIZE
	addi	BO, BO, 1 * SIZE
	.align 4

LL(148):
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01, PERMRSHIFT1
	vperm	c01, c01, c02, PERMRSHIFT1
	vperm	c02, c02, VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1

	addi	CO1, CO1, 8 * SIZE
	.align 4
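/*
 * M % 8 >= 4: one accumulator (4 floats of C).
 */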
LL(150):
	andi.	I, M, 4
	ble	LL(160)

	vxor	c01, c01, c01

	mr	BO, B
	mr	J, K

	andi.	r0, B, 15
	ble+	LL(151)

	LOAD_A	a1, OFFSET_0, AO
	LOAD_B	b1, OFFSET_0, BO

	vspltw	bp1, b1, 2
	vspltw	bp2, b1, 3

	addi	AO, AO, 4 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp1, c01

	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(158)

	LOAD_A	a1, OFFSET_0, AO

	addi	AO, AO, 4 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp2, c01

	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(158)
	.align 4

LL(151):
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_B	b1, OFFSET_0, BO

	srawi.	r0, J, 2
	mtspr	CTR, r0
	ble	LL(155)
	.align 4

LL(153):
	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01

	vspltw	bp2, b1, 1
	vmaddfp	c01, a2, bp2, c01

	vspltw	bp1, b1, 2
	vmaddfp	c01, a3, bp1, c01

	vspltw	bp2, b1, 3
	vmaddfp	c01, a4, bp2, c01

	addi	AO, AO, 16 * SIZE
	addi	BO, BO, 4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_B	b1, OFFSET_0, BO

	bdnz	LL(153)
	.align 4

LL(155):
	andi.	r0, J, 3
	ble+	LL(158)

	cmpwi	cr0, r0, 3
	bne	LL(156)

	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01

	vspltw	bp2, b1, 1
	vmaddfp	c01, a2, bp2, c01

	vspltw	bp1, b1, 2
	vmaddfp	c01, a3, bp1, c01

	addi	AO, AO, 12 * SIZE
	addi	BO, BO, 3 * SIZE
	b	LL(158)
	.align 4

LL(156):
	cmpwi	cr0, r0, 2
	bne	LL(157)

	vspltw	bp1, b1, 0
	vspltw	bp2, b1, 1

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c01, a2, bp2, c01

	addi	AO, AO, 8 * SIZE
	addi	BO, BO, 2 * SIZE
	b	LL(158)
	.align 4

LL(157):
	cmpwi	cr0, r0, 1
	bne	LL(158)

	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01

	addi	AO, AO, 4 * SIZE
	addi	BO, BO, 1 * SIZE
	.align 4

LL(158):
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01, PERMRSHIFT1
	vperm	c01, c01, VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1

	addi	CO1, CO1, 4 * SIZE
	.align 4
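/*
 * M % 4 >= 2: two rows of C, computed with the scalar FPU
 * (K unrolled by 2, partial sums in f0..f3).
 */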
LL(160):
	andi.	I, M, 2
	ble	LL(170)

	mr	BO, B

	LFD	f8,  0 * SIZE(AO)
	LFD	f9,  1 * SIZE(AO)
	LFD	f10, 2 * SIZE(AO)
	LFD	f11, 3 * SIZE(AO)
	LFD	f12, 0 * SIZE(B)
	LFD	f13, 1 * SIZE(B)

	lfs	f0, FZERO(SP)
	fmr	f1, f0
	fmr	f2, f0
	fmr	f3, f0

	srawi.	r0, K, 1
	mtspr	CTR, r0
	ble	LL(165)
	.align 4

LL(162):
	FMADD	f0, f8,  f12, f0
	FMADD	f1, f9,  f12, f1
	FMADD	f2, f10, f13, f2
	FMADD	f3, f11, f13, f3

	LFD	f8,  4 * SIZE(AO)
	LFD	f9,  5 * SIZE(AO)
	LFD	f10, 6 * SIZE(AO)
	LFD	f11, 7 * SIZE(AO)
	LFD	f12, 2 * SIZE(BO)
	LFD	f13, 3 * SIZE(BO)

	addi	AO, AO, 4 * SIZE
	addi	BO, BO, 2 * SIZE
	bdnz	LL(162)
	.align 4

LL(165):
	andi.	r0, K, 1
	lfs	f13, ALPHA(SP)
	ble	LL(168)
	.align 4

LL(166):
	FMADD	f0, f8, f12, f0
	FMADD	f1, f9, f12, f1

	addi	AO, AO, 2 * SIZE
	addi	BO, BO, 1 * SIZE
	.align 4

LL(168):
	LFD	f8, 0 * SIZE(CO1)
	LFD	f9, 1 * SIZE(CO1)

	FADD	f0, f0, f2
	FADD	f1, f1, f3

	FMADD	f0, f0, f13, f8
	FMADD	f1, f1, f13, f9

	STFD	f0, 0 * SIZE(CO1)
	STFD	f1, 1 * SIZE(CO1)

	addi	CO1, CO1, 2 * SIZE
	.align 4
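/*
 * M % 2 == 1: one last element of this column; the dot product over
 * K accumulates in f0/f1, which are folded and scaled by alpha.
 */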
LL(170):
	andi.	I, M, 1
	ble	LL(999)

	mr	BO, B

	LFD	f8,  0 * SIZE(AO)
	LFD	f9,  1 * SIZE(AO)
	LFD	f10, 0 * SIZE(B)
	LFD	f11, 1 * SIZE(B)

	lfs	f0, FZERO(SP)
	fmr	f1, f0

	srawi.	r0, K, 1
	mtspr	CTR, r0
	ble	LL(175)
	.align 4

LL(172):
	FMADD	f0, f8, f10, f0
	FMADD	f1, f9, f11, f1

	LFD	f8,  2 * SIZE(AO)
	LFD	f9,  3 * SIZE(AO)
	LFD	f10, 2 * SIZE(BO)
	LFD	f11, 3 * SIZE(BO)

	addi	AO, AO, 2 * SIZE
	addi	BO, BO, 2 * SIZE
	bdnz	LL(172)
	.align 4

LL(175):
	andi.	r0, K, 1
	lfs	f13, ALPHA(SP)
	ble	LL(178)
	.align 4

LL(176):
	FMADD	f0, f8, f10, f0

	addi	AO, AO, 1 * SIZE
	addi	BO, BO, 1 * SIZE
	.align 4

LL(178):
	LFD	f8, 0 * SIZE(CO1)

	FADD	f0, f0, f1
	FMADD	f0, f0, f13, f8

	STFD	f0, 0 * SIZE(CO1)
	.align 4
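/*
 * Epilogue: restore the callee-saved vector registers v20..v31 from
 * their 16-byte save slots, write the saved mask in VREG back to
 * VRsave, restore the non-volatile GPRs r14..r31, release the stack
 * frame and return.
 */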
LL(999):
	mr	SP, STACK

	li	r0, 0 * 16
	lvx	v20, SP, r0
	li	r0, 1 * 16
	lvx	v21, SP, r0
	li	r0, 2 * 16
	lvx	v22, SP, r0
	li	r0, 3 * 16
	lvx	v23, SP, r0
	li	r0, 4 * 16
	lvx	v24, SP, r0
	li	r0, 5 * 16
	lvx	v25, SP, r0
	li	r0, 6 * 16
	lvx	v26, SP, r0
	li	r0, 7 * 16
	lvx	v27, SP, r0
	li	r0, 8 * 16
	lvx	v28, SP, r0
	li	r0, 9 * 16
	lvx	v29, SP, r0
	li	r0, 10 * 16
	lvx	v30, SP, r0
	li	r0, 11 * 16
	lvx	v31, SP, r0

	mtspr	VRsave, VREG

#ifdef __64BIT__
	ld	r31, 192(SP)
	ld	r30, 200(SP)
	ld	r29, 208(SP)
	ld	r28, 216(SP)
	ld	r27, 224(SP)
	ld	r26, 232(SP)
	ld	r25, 240(SP)
	ld	r24, 248(SP)
	ld	r23, 256(SP)
	ld	r22, 264(SP)
	ld	r21, 272(SP)
	ld	r20, 280(SP)
	ld	r19, 288(SP)
	ld	r18, 296(SP)
	ld	r17, 304(SP)
	ld	r16, 312(SP)
	ld	r15, 320(SP)
	ld	r14, 328(SP)
#else
	lwz	r31, 192(SP)
	lwz	r30, 196(SP)
	lwz	r29, 200(SP)
	lwz	r28, 204(SP)
	lwz	r27, 208(SP)
	lwz	r26, 212(SP)
	lwz	r25, 216(SP)
	lwz	r24, 220(SP)
	lwz	r23, 224(SP)
	lwz	r22, 228(SP)
	lwz	r21, 232(SP)
	lwz	r20, 236(SP)
	lwz	r19, 240(SP)
	lwz	r18, 244(SP)
	lwz	r17, 248(SP)
	lwz	r16, 252(SP)
	lwz	r15, 256(SP)
	lwz	r14, 260(SP)
#endif

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
#endif