
gemm_kernel_altivec_cell.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
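
/*
 * Single-precision GEMM kernel for PowerPC AltiVec (VMX), tuned
 * for the Cell PPU. It computes C := alpha * A * B + C: N is
 * walked in panels of 4, 2 and 1 columns, and M in blocks of 16,
 * 8, 4, 2 and 1 rows. The vector paths accumulate with vmaddfp
 * on 4-float vectors; the 2- and 1-row remainders use the scalar
 * FPU (LFD/FMADD).
 */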
#define ASSEMBLER
#include "common.h"
#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif
#ifdef __64BIT__
#define STACKSIZE 360
#else
#define STACKSIZE 272
#endif
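/*
 * Stack scratch slots: ALPHA holds alpha splatted into four
 * consecutive words (so it can be reloaded with an aligned lvx),
 * FZERO holds a 0.0f used to clear the scalar accumulators.
 */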
#define ALPHA 0
#define FZERO 16
#define M r3
#define N r4
#define K r5
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A r8
#define B r9
#define C r10
#define LDC r7
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#endif
#endif
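/*
 * Working registers: STACK saves the original SP, I/J count the
 * M/N blocks, AO/BO walk the current A and B panels, CO1..CO4
 * point at the up-to-four C columns of a panel. PREA/PREB (note:
 * both alias r29) and PREC are prefetch distances for dcbt and
 * dcbtst, and VREG saves VRsave across the kernel.
 */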
#define STACK r11
#define I r21
#define J r22
#define AO r23
#define BO r24
#define CO1 r25
#define CO2 r26
#define CO3 r27
#define CO4 r28
#define PREA r29
#define PREB r29
#define PREC r30
#define VREG r31
#define LOAD_A lvx
#define LOAD_B lvx
#define OFFSET_0 0
#define OFFSET_1 r14
#define OFFSET_2 r15
#define OFFSET_3 r16
#define OFFSET_4 r17
#define OFFSET_5 r18
#define OFFSET_6 r19
#define OFFSET_7 r20
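/*
 * The OFFSET_n registers above are loaded in the prologue with
 * n * 16 bytes (n * 4 floats) and serve as index operands for
 * lvx/stvx base+index addressing into A, B and C.
 */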
#define c01 v0
#define c02 v1
#define c03 v2
#define c04 v3
#define c05 v4
#define c06 v5
#define c07 v6
#define c08 v7
#define c09 v8
#define c10 v9
#define c11 v10
#define c12 v11
#define c13 v12
#define c14 v13
#define c15 v14
#define c16 v15
#define a1 v16
#define a2 v17
#define a3 v18
#define a4 v19
#define a5 v20
#define a6 v21
#define a7 v22
#define a8 v23
#define b1 v24
#define b2 v25
#define bp1 v26
#define bp2 v27
#define C1 v16
#define C2 v17
#define C3 v18
#define C4 v19
#define C5 v20
#define C6 v21
#define C7 v22
#define C8 v23
#define C9 v24
#define c00 v25
#define PERMRSHIFT1 v26
#define PERMRSHIFT2 v27
#define PERMRSHIFT3 v28
#define PERMRSHIFT4 v29
#define VZERO v30
#define alpha v31
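/*
 * c01..c16 accumulate the (at most) 16x4 output block. Once the
 * multiply loops are done, the same vector registers are reused
 * under new names: C1..C9 (aliasing a1..b1) hold tiles of C,
 * c00 (aliasing b2) stages shifted results, PERMRSHIFT1..4 hold
 * lvsr permute masks, VZERO is all zeroes and alpha the splatted
 * scalar.
 */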
#ifndef NEEDPARAM
#ifndef DOUBLE
#include "../sparam.h"
#else
#include "../dparam.h"
#endif
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
mr STACK, SP
li r0, 0 * 16
stvx v20, SP, r0
li r0, 1 * 16
stvx v21, SP, r0
li r0, 2 * 16
stvx v22, SP, r0
li r0, 3 * 16
stvx v23, SP, r0
li r0, 4 * 16
stvx v24, SP, r0
li r0, 5 * 16
stvx v25, SP, r0
li r0, 6 * 16
stvx v26, SP, r0
li r0, 7 * 16
stvx v27, SP, r0
li r0, 8 * 16
stvx v28, SP, r0
li r0, 9 * 16
stvx v29, SP, r0
li r0, 10 * 16
stvx v30, SP, r0
li r0, 11 * 16
stvx v31, SP, r0
#ifdef __64BIT__
std r31, 192(SP)
std r30, 200(SP)
std r29, 208(SP)
std r28, 216(SP)
std r27, 224(SP)
std r26, 232(SP)
std r25, 240(SP)
std r24, 248(SP)
std r23, 256(SP)
std r22, 264(SP)
std r21, 272(SP)
std r20, 280(SP)
std r19, 288(SP)
std r18, 296(SP)
std r17, 304(SP)
std r16, 312(SP)
std r15, 320(SP)
std r14, 328(SP)
#else
stw r31, 192(SP)
stw r30, 196(SP)
stw r29, 200(SP)
stw r28, 204(SP)
stw r27, 208(SP)
stw r26, 212(SP)
stw r25, 216(SP)
stw r24, 220(SP)
stw r23, 224(SP)
stw r22, 228(SP)
stw r21, 232(SP)
stw r20, 236(SP)
stw r19, 240(SP)
stw r18, 244(SP)
stw r17, 248(SP)
stw r16, 252(SP)
stw r15, 256(SP)
stw r14, 260(SP)
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
li r0, -1
mfspr VREG, VRsave
mtspr VRsave, r0
addi SP, SP, -128
li r0, -128
and SP, SP, r0
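/*
 * SP has been pushed down 128 bytes and rounded to a 128-byte
 * boundary, so the vector store/load of alpha below is aligned.
 */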
li OFFSET_1, 4 * SIZE
li OFFSET_2, 8 * SIZE
li OFFSET_3, 12 * SIZE
li OFFSET_4, 16 * SIZE
li OFFSET_5, 20 * SIZE
li OFFSET_6, 24 * SIZE
li OFFSET_7, 28 * SIZE
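/* Splat the scalar alpha (f1) into four consecutive words at ALPHA(SP). */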
stfs f1, ALPHA + 0(SP)
stfs f1, ALPHA + 4(SP)
stfs f1, ALPHA + 8(SP)
stfs f1, ALPHA + 12(SP)
li r29, 0
stw r29, FZERO(SP)
slwi LDC, LDC, BASE_SHIFT
li PREC, (15 * SIZE)
#ifdef CELL
li PREB, (5 * 32 * SIZE)
#else
li PREB, (5 * 32 * SIZE)
#endif
cmpwi cr0, M, 0
ble LL(999)
cmpwi cr0, N, 0
ble LL(999)
cmpwi cr0, K, 0
ble LL(999)
srawi. J, N, 2
ble LL(60)
.align 4
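/* Outer loop: one iteration per panel of four C columns (J = N >> 2). */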
LL(01):
mr CO1, C
add CO2, C, LDC
add CO3, CO2, LDC
add CO4, CO3, LDC
add C, CO4, LDC
mr AO, A
srawi. I, M, 4
ble LL(20)
.align 4
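/*
 * 16x4 block: c01..c16 accumulate sixteen rows by four columns.
 * The K loop at LL(12) is unrolled four times; every K step
 * consumes 16 floats of A and 4 floats of B.
 */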
LL(11):
vxor c01, c01, c01
LOAD_B b1, OFFSET_0, B
vxor c02, c02, c02
LOAD_A a1, OFFSET_0, AO
vxor c03, c03, c03
LOAD_A a2, OFFSET_1, AO
vxor c04, c04, c04
LOAD_A a3, OFFSET_2, AO
vxor c05, c05, c05
vxor c06, c06, c06
vxor c07, c07, c07
vxor c08, c08, c08
vxor c09, c09, c09
dcbtst CO1, PREC
vxor c10, c10, c10
dcbtst CO2, PREC
vxor c11, c11, c11
dcbtst CO3, PREC
vxor c12, c12, c12
dcbtst CO4, PREC
vxor c13, c13, c13
mr BO, B
vxor c14, c14, c14
srawi. r0, K, 2
vxor c15, c15, c15
mtspr CTR, r0
vxor c16, c16, c16
vspltw bp1, b1, 0
ble LL(13)
.align 4
#define NOP1 mr r3, r3
#define NOP2 mr r4, r4
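/*
 * NOP1/NOP2 are register-move no-ops, presumably placed to fill
 * dispatch slots and keep the in-order Cell PPU pipelines fed at
 * an even rhythm of vector and load/store instructions.
 */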
LL(12):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
LOAD_A a4, OFFSET_3, AO
vmaddfp c03, a3, bp1, c03
dcbt AO, PREA
vmaddfp c04, a4, bp1, c04
NOP2
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
NOP2
vmaddfp c07, a3, bp2, c07
NOP1
vmaddfp c08, a4, bp2, c08
dcbt BO, PREB
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c10, a2, bp1, c10
LOAD_B b2, OFFSET_1, BO
vmaddfp c11, a3, bp1, c11
addi BO, BO, 8 * SIZE
vmaddfp c12, a4, bp1, c12
NOP1
vmaddfp c13, a1, bp2, c13
vspltw bp1, b2, 0
vmaddfp c14, a2, bp2, c14
LOAD_A a5, OFFSET_4, AO
vmaddfp c15, a3, bp2, c15
LOAD_A a6, OFFSET_5, AO
vmaddfp c16, a4, bp2, c16
vspltw bp2, b2, 1
vmaddfp c01, a5, bp1, c01
LOAD_A a7, OFFSET_6, AO
vmaddfp c02, a6, bp1, c02
LOAD_A a8, OFFSET_7, AO
vmaddfp c03, a7, bp1, c03
NOP1
vmaddfp c04, a8, bp1, c04
NOP2
vmaddfp c05, a5, bp2, c05
vspltw bp1, b2, 2
vmaddfp c06, a6, bp2, c06
addi AO, AO, 32 * SIZE
vmaddfp c07, a7, bp2, c07
LOAD_B b1, OFFSET_0, BO
vmaddfp c08, a8, bp2, c08
NOP1
vmaddfp c09, a5, bp1, c09
vspltw bp2, b2, 3
vmaddfp c10, a6, bp1, c10
NOP2
vmaddfp c11, a7, bp1, c11
NOP1
vmaddfp c12, a8, bp1, c12
dcbt AO, PREA
vmaddfp c13, a5, bp2, c13
vspltw bp1, b1, 0
vmaddfp c14, a6, bp2, c14
LOAD_A a1, OFFSET_0, AO //
vmaddfp c15, a7, bp2, c15
LOAD_A a2, OFFSET_1, AO
vmaddfp c16, a8, bp2, c16
vspltw bp2, b1, 1
vmaddfp c01, a1, bp1, c01
LOAD_A a3, OFFSET_2, AO
vmaddfp c02, a2, bp1, c02
LOAD_A a4, OFFSET_3, AO
vmaddfp c03, a3, bp1, c03
NOP1
vmaddfp c04, a4, bp1, c04
NOP2
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
NOP2
vmaddfp c07, a3, bp2, c07
NOP1
vmaddfp c08, a4, bp2, c08
LOAD_B b2, OFFSET_1, BO
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c10, a2, bp1, c10
NOP2
vmaddfp c11, a3, bp1, c11
NOP1
vmaddfp c12, a4, bp1, c12
addi BO, BO, 8 * SIZE
vmaddfp c13, a1, bp2, c13
vspltw bp1, b2, 0
vmaddfp c14, a2, bp2, c14
LOAD_A a5, OFFSET_4, AO
vmaddfp c15, a3, bp2, c15
LOAD_A a6, OFFSET_5, AO
vmaddfp c16, a4, bp2, c16
vspltw bp2, b2, 1
vmaddfp c01, a5, bp1, c01
LOAD_A a7, OFFSET_6, AO
vmaddfp c02, a6, bp1, c02
LOAD_A a8, OFFSET_7, AO
vmaddfp c03, a7, bp1, c03
addi AO, AO, 32 * SIZE
vmaddfp c04, a8, bp1, c04
NOP2
vmaddfp c05, a5, bp2, c05
vspltw bp1, b2, 2
vmaddfp c06, a6, bp2, c06
NOP2
vmaddfp c07, a7, bp2, c07
NOP1
vmaddfp c08, a8, bp2, c08
LOAD_B b1, OFFSET_0, BO
vmaddfp c09, a5, bp1, c09
vspltw bp2, b2, 3
vmaddfp c10, a6, bp1, c10
LOAD_A a1, OFFSET_0, AO //
vmaddfp c11, a7, bp1, c11
NOP2
vmaddfp c12, a8, bp1, c12
vspltw bp1, b1, 0
vmaddfp c13, a5, bp2, c13
LOAD_A a2, OFFSET_1, AO
vmaddfp c14, a6, bp2, c14
LOAD_A a3, OFFSET_2, AO
vmaddfp c15, a7, bp2, c15
NOP1
vmaddfp c16, a8, bp2, c16
bdnz+ LL(12)
.align 4
LL(13):
andi. r0, K, 2
nop
nop
ble+ LL(15)
.align 4
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
LOAD_A a4, OFFSET_3, AO
vmaddfp c03, a3, bp1, c03
NOP1
vmaddfp c04, a4, bp1, c04
NOP2
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
NOP2
vmaddfp c07, a3, bp2, c07
NOP1
vmaddfp c08, a4, bp2, c08
LOAD_B b2, OFFSET_1, BO
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c10, a2, bp1, c10
LOAD_A a5, OFFSET_4, AO
vmaddfp c11, a3, bp1, c11
LOAD_A a6, OFFSET_5, AO
vmaddfp c12, a4, bp1, c12
addi BO, BO, 8 * SIZE
vmaddfp c13, a1, bp2, c13
vspltw bp1, b2, 0
vmaddfp c14, a2, bp2, c14
LOAD_A a7, OFFSET_6, AO
vmaddfp c15, a3, bp2, c15
LOAD_A a8, OFFSET_7, AO
vmaddfp c16, a4, bp2, c16
addi AO, AO, 32 * SIZE
vmaddfp c01, a5, bp1, c01
vspltw bp2, b2, 1
vmaddfp c02, a6, bp1, c02
NOP2
vmaddfp c03, a7, bp1, c03
NOP1
vmaddfp c04, a8, bp1, c04
NOP2
vmaddfp c05, a5, bp2, c05
vspltw bp1, b2, 2
vmaddfp c06, a6, bp2, c06
NOP2
vmaddfp c07, a7, bp2, c07
NOP1
vmaddfp c08, a8, bp2, c08
LOAD_B b1, OFFSET_0, BO
vmaddfp c09, a5, bp1, c09
vspltw bp2, b2, 3
vmaddfp c10, a6, bp1, c10
LOAD_A a1, OFFSET_0, AO
vmaddfp c11, a7, bp1, c11
LOAD_A a2, OFFSET_1, AO
vmaddfp c12, a8, bp1, c12
NOP2
vmaddfp c13, a5, bp2, c13
vspltw bp1, b1, 0
vmaddfp c14, a6, bp2, c14
LOAD_A a3, OFFSET_2, AO
vmaddfp c15, a7, bp2, c15
vmaddfp c16, a8, bp2, c16
.align 4
LL(15):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(18)
.align 4
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
LOAD_A a4, OFFSET_3, AO
vmaddfp c03, a3, bp1, c03
nop
vmaddfp c04, a4, bp1, c04
nop
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
nop
vmaddfp c07, a3, bp2, c07
nop
vmaddfp c08, a4, bp2, c08
nop
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c10, a2, bp1, c10
addi AO, AO, 16 * SIZE
vmaddfp c11, a3, bp1, c11
addi BO, BO, 4 * SIZE
vmaddfp c12, a4, bp1, c12
nop
vmaddfp c13, a1, bp2, c13
vmaddfp c14, a2, bp2, c14
vmaddfp c15, a3, bp2, c15
vmaddfp c16, a4, bp2, c16
.align 4
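/*
 * Store-back of the 16x4 block. C may be unaligned: lvsr builds
 * a right-shift permute mask per column, vperm shifts the
 * accumulators across vector boundaries (VZERO pads the head and
 * tail, so memory outside the block only has alpha * 0 added),
 * and vmaddfp merges C := alpha * acc + C before each stvx.
 */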
LL(18):
lvx C1, OFFSET_0, CO1
cmpwi cr0, LDC, 32 * SIZE
lvx C2, OFFSET_1, CO1
lvsr PERMRSHIFT1, 0, CO1
lvx C3, OFFSET_2, CO1
lvsr PERMRSHIFT2, 0, CO2
lvx C4, OFFSET_3, CO1
lvsr PERMRSHIFT3, 0, CO3
lvx C5, OFFSET_4, CO1
lvsr PERMRSHIFT4, 0, CO4
ble LL(19)
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, c03, PERMRSHIFT1
vperm c03, c03, c04, PERMRSHIFT1
vperm c04, c04, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
lvx C1, OFFSET_0, CO2
vmaddfp c01, alpha, c01, C2
lvx C6, OFFSET_1, CO2
vmaddfp c02, alpha, c02, C3
lvx C7, OFFSET_2, CO2
vmaddfp c03, alpha, c03, C4
lvx C8, OFFSET_3, CO2
vmaddfp c04, alpha, c04, C5
lvx C9, OFFSET_4, CO2
stvx c00, OFFSET_0, CO1
vperm c00, VZERO, c05, PERMRSHIFT2
stvx c01, OFFSET_1, CO1
vperm c05, c05, c06, PERMRSHIFT2
stvx c02, OFFSET_2, CO1
vperm c06, c06, c07, PERMRSHIFT2
stvx c03, OFFSET_3, CO1
vperm c07, c07, c08, PERMRSHIFT2
stvx c04, OFFSET_4, CO1
vperm c08, c08, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
lvx C1, OFFSET_0, CO3
vmaddfp c05, alpha, c05, C6
lvx C2, OFFSET_1, CO3
vmaddfp c06, alpha, c06, C7
lvx C3, OFFSET_2, CO3
vmaddfp c07, alpha, c07, C8
lvx C4, OFFSET_3, CO3
vmaddfp c08, alpha, c08, C9
lvx C5, OFFSET_4, CO3
stvx c00, OFFSET_0, CO2
vperm c00, VZERO, c09, PERMRSHIFT3
stvx c05, OFFSET_1, CO2
vperm c09, c09, c10, PERMRSHIFT3
stvx c06, OFFSET_2, CO2
vperm c10, c10, c11, PERMRSHIFT3
stvx c07, OFFSET_3, CO2
vperm c11, c11, c12, PERMRSHIFT3
stvx c08, OFFSET_4, CO2
vperm c12, c12, VZERO, PERMRSHIFT3
vmaddfp c00, alpha, c00, C1
lvx C9, OFFSET_4, CO4
vmaddfp c09, alpha, c09, C2
lvx C1, OFFSET_0, CO4
vmaddfp c10, alpha, c10, C3
lvx C6, OFFSET_1, CO4
vmaddfp c11, alpha, c11, C4
lvx C7, OFFSET_2, CO4
vmaddfp c12, alpha, c12, C5
lvx C8, OFFSET_3, CO4
stvx c00, OFFSET_0, CO3
vperm c00, VZERO, c13, PERMRSHIFT4
stvx c09, OFFSET_1, CO3
vperm c13, c13, c14, PERMRSHIFT4
stvx c10, OFFSET_2, CO3
vperm c14, c14, c15, PERMRSHIFT4
stvx c11, OFFSET_3, CO3
vperm c15, c15, c16, PERMRSHIFT4
stvx c12, OFFSET_4, CO3
vperm c16, c16, VZERO, PERMRSHIFT4
vmaddfp c00, alpha, c00, C1
vmaddfp c13, alpha, c13, C6
vmaddfp c14, alpha, c14, C7
vmaddfp c15, alpha, c15, C8
vmaddfp c16, alpha, c16, C9
stvx c00, OFFSET_0, CO4
stvx c13, OFFSET_1, CO4
stvx c14, OFFSET_2, CO4
stvx c15, OFFSET_3, CO4
stvx c16, OFFSET_4, CO4
addi CO1, CO1, 16 * SIZE
addi CO2, CO2, 16 * SIZE
addi CO3, CO3, 16 * SIZE
addi CO4, CO4, 16 * SIZE
addic. I, I, -1
bgt+ LL(11)
b LL(20)
.align 4
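/*
 * LL(19): same store-back, taken when LDC <= 32 * SIZE. With a
 * small column stride an unaligned tile can share a quadword
 * with the neighbouring column, so each column's C is loaded
 * only after the previous column's stores are issued.
 */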
LL(19):
lvx C6, OFFSET_1, CO2
lvx C7, OFFSET_2, CO2
lvx C8, OFFSET_3, CO2
lvx C9, OFFSET_4, CO2
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, c03, PERMRSHIFT1
vperm c03, c03, c04, PERMRSHIFT1
vperm c04, c04, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
lvx C2, OFFSET_1, CO3
vmaddfp c02, alpha, c02, C3
lvx C3, OFFSET_2, CO3
vmaddfp c03, alpha, c03, C4
lvx C4, OFFSET_3, CO3
vmaddfp c04, alpha, c04, C5
lvx C5, OFFSET_4, CO3
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
stvx c02, OFFSET_2, CO1
stvx c03, OFFSET_3, CO1
stvx c04, OFFSET_4, CO1
lvx C1, OFFSET_0, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, c06, PERMRSHIFT2
vperm c06, c06, c07, PERMRSHIFT2
vperm c07, c07, c08, PERMRSHIFT2
vperm c08, c08, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C6
lvx C6, OFFSET_1, CO4
vmaddfp c06, alpha, c06, C7
lvx C7, OFFSET_2, CO4
vmaddfp c07, alpha, c07, C8
lvx C8, OFFSET_3, CO4
vmaddfp c08, alpha, c08, C9
lvx C9, OFFSET_4, CO4
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
stvx c06, OFFSET_2, CO2
stvx c07, OFFSET_3, CO2
stvx c08, OFFSET_4, CO2
lvx C1, OFFSET_0, CO3
vperm c00, VZERO, c09, PERMRSHIFT3
vperm c09, c09, c10, PERMRSHIFT3
vperm c10, c10, c11, PERMRSHIFT3
vperm c11, c11, c12, PERMRSHIFT3
vperm c12, c12, VZERO, PERMRSHIFT3
vmaddfp c00, alpha, c00, C1
vmaddfp c09, alpha, c09, C2
vmaddfp c10, alpha, c10, C3
vmaddfp c11, alpha, c11, C4
vmaddfp c12, alpha, c12, C5
stvx c00, OFFSET_0, CO3
stvx c09, OFFSET_1, CO3
stvx c10, OFFSET_2, CO3
stvx c11, OFFSET_3, CO3
stvx c12, OFFSET_4, CO3
lvx C1, OFFSET_0, CO4
vperm c00, VZERO, c13, PERMRSHIFT4
vperm c13, c13, c14, PERMRSHIFT4
vperm c14, c14, c15, PERMRSHIFT4
vperm c15, c15, c16, PERMRSHIFT4
vperm c16, c16, VZERO, PERMRSHIFT4
vmaddfp c00, alpha, c00, C1
vmaddfp c13, alpha, c13, C6
vmaddfp c14, alpha, c14, C7
vmaddfp c15, alpha, c15, C8
vmaddfp c16, alpha, c16, C9
stvx c00, OFFSET_0, CO4
stvx c13, OFFSET_1, CO4
stvx c14, OFFSET_2, CO4
stvx c15, OFFSET_3, CO4
stvx c16, OFFSET_4, CO4
addi CO1, CO1, 16 * SIZE
addi CO2, CO2, 16 * SIZE
addi CO3, CO3, 16 * SIZE
addi CO4, CO4, 16 * SIZE
addic. I, I, -1
bgt+ LL(11)
.align 4
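/* Remaining rows of the four-column panel: blocks of 8, 4, 2, then 1. */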
LL(20):
andi. I, M, 8
ble LL(30)
vxor c01, c01, c01
LOAD_A a1, OFFSET_0, AO
vxor c02, c02, c02
LOAD_A a2, OFFSET_1, AO
vxor c05, c05, c05
LOAD_A a3, OFFSET_2, AO
vxor c06, c06, c06
LOAD_A a4, OFFSET_3, AO
vxor c09, c09, c09
LOAD_B b1, OFFSET_0, B
vxor c10, c10, c10
LOAD_B b2, OFFSET_1, B
vxor c13, c13, c13
vxor c14, c14, c14
mr BO, B
vspltw bp1, b1, 0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(25)
.align 4
LL(22):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
addi AO, AO, 16 * SIZE
vmaddfp c02, a2, bp1, c02
addi BO, BO, 8 * SIZE
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
LOAD_B b1, OFFSET_0, BO
vmaddfp c10, a2, bp1, c10
vmaddfp c13, a1, bp2, c13
LOAD_A a1, OFFSET_0, AO
vspltw bp1, b2, 0
vmaddfp c14, a2, bp2, c14
LOAD_A a2, OFFSET_1, AO
vmaddfp c01, a3, bp1, c01
vspltw bp2, b2, 1
vmaddfp c02, a4, bp1, c02
vmaddfp c05, a3, bp2, c05
vspltw bp1, b2, 2
vmaddfp c06, a4, bp2, c06
vmaddfp c09, a3, bp1, c09
vspltw bp2, b2, 3
LOAD_B b2, OFFSET_1, BO
vmaddfp c10, a4, bp1, c10
vmaddfp c13, a3, bp2, c13
LOAD_A a3, OFFSET_2, AO
vmaddfp c14, a4, bp2, c14
LOAD_A a4, OFFSET_3, AO
vspltw bp1, b1, 0
bdnz LL(22)
.align 4
LL(25):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(28)
.align 4
LL(26):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
nop
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
nop
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c10, a2, bp1, c10
addi AO, AO, 8 * SIZE
vmaddfp c13, a1, bp2, c13
addi BO, BO, 4 * SIZE
vmaddfp c14, a2, bp2, c14
nop
.align 4
LL(28):
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvx C3, OFFSET_2, CO1
lvsr PERMRSHIFT1, 0, CO1
lvsr PERMRSHIFT2, 0, CO2
lvsr PERMRSHIFT3, 0, CO3
lvsr PERMRSHIFT4, 0, CO4
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
vmaddfp c02, alpha, c02, C3
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
stvx c02, OFFSET_2, CO1
lvx C1, OFFSET_0, CO2
lvx C2, OFFSET_1, CO2
lvx C3, OFFSET_2, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, c06, PERMRSHIFT2
vperm c06, c06, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C2
vmaddfp c06, alpha, c06, C3
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
stvx c06, OFFSET_2, CO2
lvx C1, OFFSET_0, CO3
lvx C2, OFFSET_1, CO3
lvx C3, OFFSET_2, CO3
vperm c00, VZERO, c09, PERMRSHIFT3
vperm c09, c09, c10, PERMRSHIFT3
vperm c10, c10, VZERO, PERMRSHIFT3
vmaddfp c00, alpha, c00, C1
vmaddfp c09, alpha, c09, C2
vmaddfp c10, alpha, c10, C3
stvx c00, OFFSET_0, CO3
stvx c09, OFFSET_1, CO3
stvx c10, OFFSET_2, CO3
lvx C1, OFFSET_0, CO4
lvx C2, OFFSET_1, CO4
lvx C3, OFFSET_2, CO4
vperm c00, VZERO, c13, PERMRSHIFT4
vperm c13, c13, c14, PERMRSHIFT4
vperm c14, c14, VZERO, PERMRSHIFT4
vmaddfp c00, alpha, c00, C1
vmaddfp c13, alpha, c13, C2
vmaddfp c14, alpha, c14, C3
stvx c00, OFFSET_0, CO4
stvx c13, OFFSET_1, CO4
stvx c14, OFFSET_2, CO4
addi CO1, CO1, 8 * SIZE
addi CO2, CO2, 8 * SIZE
addi CO3, CO3, 8 * SIZE
addi CO4, CO4, 8 * SIZE
.align 4
LL(30):
andi. I, M, 4
ble LL(40)
vxor c01, c01, c01
LOAD_A a1, OFFSET_0, AO
vxor c02, c02, c02
LOAD_A a2, OFFSET_1, AO
vxor c05, c05, c05
LOAD_B b1, OFFSET_0, B
vxor c06, c06, c06
LOAD_B b2, OFFSET_1, B
vxor c09, c09, c09
vxor c10, c10, c10
vxor c13, c13, c13
vxor c14, c14, c14
vspltw bp1, b1, 0
mr BO, B
srawi. r0, K, 1
mtspr CTR, r0
ble LL(35)
.align 4
LL(32):
vmaddfp c01, a1, bp1, c01
addi AO, AO, 8 * SIZE
vspltw bp2, b1, 1
vmaddfp c05, a1, bp2, c05
addi BO, BO, 8 * SIZE
vspltw bp1, b1, 2
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c13, a1, bp2, c13
LOAD_A a1, OFFSET_0, AO
vspltw bp1, b2, 0
LOAD_B b1, OFFSET_0, BO
vmaddfp c02, a2, bp1, c02
vspltw bp2, b2, 1
vmaddfp c06, a2, bp2, c06
vspltw bp1, b2, 2
vmaddfp c10, a2, bp1, c10
vspltw bp2, b2, 3
LOAD_B b2, OFFSET_1, BO
vmaddfp c14, a2, bp2, c14
LOAD_A a2, OFFSET_1, AO
vspltw bp1, b1, 0
bdnz LL(32)
.align 4
LL(35):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(38)
.align 4
LL(36):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c13, a1, bp2, c13
addi AO, AO, 4 * SIZE
addi BO, BO, 4 * SIZE
.align 4
LL(38):
vaddfp c01, c01, c02
vaddfp c05, c05, c06
vaddfp c09, c09, c10
vaddfp c13, c13, c14
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvsr PERMRSHIFT1, 0, CO1
lvsr PERMRSHIFT2, 0, CO2
lvsr PERMRSHIFT3, 0, CO3
lvsr PERMRSHIFT4, 0, CO4
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
lvx C1, OFFSET_0, CO2
lvx C2, OFFSET_1, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C2
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
lvx C1, OFFSET_0, CO3
lvx C2, OFFSET_1, CO3
vperm c00, VZERO, c09, PERMRSHIFT3
vperm c09, c09, VZERO, PERMRSHIFT3
vmaddfp c00, alpha, c00, C1
vmaddfp c09, alpha, c09, C2
stvx c00, OFFSET_0, CO3
stvx c09, OFFSET_1, CO3
lvx C1, OFFSET_0, CO4
lvx C2, OFFSET_1, CO4
vperm c00, VZERO, c13, PERMRSHIFT4
vperm c13, c13, VZERO, PERMRSHIFT4
vmaddfp c00, alpha, c00, C1
vmaddfp c13, alpha, c13, C2
stvx c00, OFFSET_0, CO4
stvx c13, OFFSET_1, CO4
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
.align 4
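/*
 * M&2 and M&1 remainders: too narrow for the vector unit, so the
 * dot products are accumulated in scalar FPRs with FMADD.
 */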
LL(40):
andi. I, M, 2
ble LL(50)
mr BO, B
LFD f8, 0 * SIZE(AO)
LFD f9, 1 * SIZE(AO)
LFD f10, 0 * SIZE(B)
LFD f11, 1 * SIZE(B)
LFD f12, 2 * SIZE(B)
LFD f13, 3 * SIZE(B)
lfs f0, FZERO(SP)
fmr f1, f0
fmr f2, f0
fmr f3, f0
fmr f4, f0
fmr f5, f0
fmr f6, f0
fmr f7, f0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(45)
.align 4
LL(42):
FMADD f0, f8, f10, f0
FMADD f2, f8, f11, f2
FMADD f4, f8, f12, f4
FMADD f6, f8, f13, f6
FMADD f1, f9, f10, f1
FMADD f3, f9, f11, f3
FMADD f5, f9, f12, f5
FMADD f7, f9, f13, f7
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
FMADD f0, f8, f10, f0
FMADD f2, f8, f11, f2
FMADD f4, f8, f12, f4
FMADD f6, f8, f13, f6
FMADD f1, f9, f10, f1
FMADD f3, f9, f11, f3
FMADD f5, f9, f12, f5
FMADD f7, f9, f13, f7
LFD f8, 4 * SIZE(AO)
LFD f9, 5 * SIZE(AO)
LFD f10, 8 * SIZE(BO)
LFD f11, 9 * SIZE(BO)
LFD f12, 10 * SIZE(BO)
LFD f13, 11 * SIZE(BO)
addi AO, AO, 4 * SIZE
addi BO, BO, 8 * SIZE
bdnz LL(42)
.align 4
LL(45):
andi. r0, K, 1
ble LL(48)
.align 4
LL(46):
FMADD f0, f8, f10, f0
FMADD f2, f8, f11, f2
FMADD f4, f8, f12, f4
FMADD f6, f8, f13, f6
FMADD f1, f9, f10, f1
FMADD f3, f9, f11, f3
FMADD f5, f9, f12, f5
FMADD f7, f9, f13, f7
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
addi AO, AO, 2 * SIZE
addi BO, BO, 4 * SIZE
.align 4
LL(48):
lfs f13, ALPHA(SP)
LFD f8, 0 * SIZE(CO1)
LFD f9, 1 * SIZE(CO1)
LFD f10, 0 * SIZE(CO2)
LFD f11, 1 * SIZE(CO2)
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
FMADD f2, f2, f13, f10
FMADD f3, f3, f13, f11
LFD f8, 0 * SIZE(CO3)
LFD f9, 1 * SIZE(CO3)
LFD f10, 0 * SIZE(CO4)
LFD f11, 1 * SIZE(CO4)
FMADD f4, f4, f13, f8
FMADD f5, f5, f13, f9
FMADD f6, f6, f13, f10
FMADD f7, f7, f13, f11
STFD f0, 0 * SIZE(CO1)
STFD f1, 1 * SIZE(CO1)
STFD f2, 0 * SIZE(CO2)
STFD f3, 1 * SIZE(CO2)
STFD f4, 0 * SIZE(CO3)
STFD f5, 1 * SIZE(CO3)
STFD f6, 0 * SIZE(CO4)
STFD f7, 1 * SIZE(CO4)
addi CO1, CO1, 2 * SIZE
addi CO2, CO2, 2 * SIZE
addi CO3, CO3, 2 * SIZE
addi CO4, CO4, 2 * SIZE
.align 4
LL(50):
andi. I, M, 1
ble LL(59)
mr BO, B
LFD f8, 0 * SIZE(AO)
LFD f9, 1 * SIZE(AO)
LFD f10, 0 * SIZE(B)
LFD f11, 1 * SIZE(B)
LFD f12, 2 * SIZE(B)
LFD f13, 3 * SIZE(B)
lfs f0, FZERO(SP)
fmr f1, f0
fmr f2, f0
fmr f3, f0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(55)
.align 4
LL(52):
FMADD f0, f8, f10, f0
FMADD f1, f8, f11, f1
FMADD f2, f8, f12, f2
FMADD f3, f8, f13, f3
LFD f8, 2 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
FMADD f0, f9, f10, f0
FMADD f1, f9, f11, f1
FMADD f2, f9, f12, f2
FMADD f3, f9, f13, f3
LFD f9, 3 * SIZE(AO)
LFD f10, 8 * SIZE(BO)
LFD f11, 9 * SIZE(BO)
LFD f12, 10 * SIZE(BO)
LFD f13, 11 * SIZE(BO)
addi AO, AO, 2 * SIZE
addi BO, BO, 8 * SIZE
bdnz LL(52)
.align 4
LL(55):
andi. r0, K, 1
ble LL(58)
.align 4
LL(56):
FMADD f0, f8, f10, f0
FMADD f1, f8, f11, f1
FMADD f2, f8, f12, f2
FMADD f3, f8, f13, f3
LFD f8, 2 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
addi AO, AO, 1 * SIZE
addi BO, BO, 4 * SIZE
.align 4
LL(58):
lfs f13, ALPHA(SP)
LFD f8, 0 * SIZE(CO1)
LFD f9, 0 * SIZE(CO2)
LFD f10, 0 * SIZE(CO3)
LFD f11, 0 * SIZE(CO4)
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
FMADD f2, f2, f13, f10
FMADD f3, f3, f13, f11
STFD f0, 0 * SIZE(CO1)
STFD f1, 0 * SIZE(CO2)
STFD f2, 0 * SIZE(CO3)
STFD f3, 0 * SIZE(CO4)
.align 4
LL(59):
mr B, BO
addic. J, J, -1
bgt LL(01)
.align 4
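/* N&2 panel: same structure as above, two C columns at a time. */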
LL(60):
andi. r0, N, 2
ble LL(120)
mr CO1, C
add CO2, C, LDC
add C, CO2, LDC
mr AO, A
srawi. I, M, 4
ble LL(80)
.align 4
LL(71):
vxor c01, c01, c01
LOAD_B b1, OFFSET_0, B
vxor c02, c02, c02
vxor c03, c03, c03
LOAD_A a1, OFFSET_0, AO
vxor c04, c04, c04
LOAD_A a2, OFFSET_1, AO
vxor c05, c05, c05
LOAD_A a3, OFFSET_2, AO
vxor c06, c06, c06
LOAD_A a4, OFFSET_3, AO
vxor c07, c07, c07
vxor c08, c08, c08
mr BO, B
dcbtst CO1, PREC
dcbtst CO2, PREC
vspltw bp1, b1, 0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(75)
.align 4
LL(72):
LOAD_A a5, OFFSET_4, AO
LOAD_A a6, OFFSET_5, AO
LOAD_A a7, OFFSET_6, AO
LOAD_A a8, OFFSET_7, AO
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
vmaddfp c03, a3, bp1, c03
vmaddfp c04, a4, bp1, c04
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
vmaddfp c07, a3, bp2, c07
vmaddfp c08, a4, bp2, c08
vmaddfp c01, a5, bp1, c01
vspltw bp2, b1, 3
vmaddfp c02, a6, bp1, c02
vmaddfp c03, a7, bp1, c03
vmaddfp c04, a8, bp1, c04
LOAD_B b1, OFFSET_1, BO
vspltw bp1, b1, 0
vmaddfp c05, a5, bp2, c05
vmaddfp c06, a6, bp2, c06
vmaddfp c07, a7, bp2, c07
vmaddfp c08, a8, bp2, c08
addi AO, AO, 32 * SIZE
addi BO, BO, 4 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
bdnz LL(72)
.align 4
LL(75):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(78)
.align 4
LL(76):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
addi AO, AO, 16 * SIZE
vmaddfp c03, a3, bp1, c03
addi BO, BO, 2 * SIZE
vmaddfp c04, a4, bp1, c04
nop
vmaddfp c05, a1, bp2, c05
vmaddfp c06, a2, bp2, c06
vmaddfp c07, a3, bp2, c07
vmaddfp c08, a4, bp2, c08
.align 4
LL(78):
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvx C3, OFFSET_2, CO1
lvx C4, OFFSET_3, CO1
lvx C5, OFFSET_4, CO1
lvsr PERMRSHIFT1, 0, CO1
lvsr PERMRSHIFT2, 0, CO2
lvsr PERMRSHIFT3, 0, CO3
lvsr PERMRSHIFT4, 0, CO4
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, c03, PERMRSHIFT1
vperm c03, c03, c04, PERMRSHIFT1
vperm c04, c04, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
vmaddfp c02, alpha, c02, C3
vmaddfp c03, alpha, c03, C4
vmaddfp c04, alpha, c04, C5
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
stvx c02, OFFSET_2, CO1
stvx c03, OFFSET_3, CO1
stvx c04, OFFSET_4, CO1
lvx C1, OFFSET_0, CO2
lvx C2, OFFSET_1, CO2
lvx C3, OFFSET_2, CO2
lvx C4, OFFSET_3, CO2
lvx C5, OFFSET_4, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, c06, PERMRSHIFT2
vperm c06, c06, c07, PERMRSHIFT2
vperm c07, c07, c08, PERMRSHIFT2
vperm c08, c08, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C2
vmaddfp c06, alpha, c06, C3
vmaddfp c07, alpha, c07, C4
vmaddfp c08, alpha, c08, C5
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
stvx c06, OFFSET_2, CO2
stvx c07, OFFSET_3, CO2
stvx c08, OFFSET_4, CO2
addi CO1, CO1, 16 * SIZE
addi CO2, CO2, 16 * SIZE
addic. I, I, -1
bgt+ LL(71)
.align 4
LL(80):
andi. I, M, 8
ble LL(90)
vxor c01, c01, c01
LOAD_B b1, OFFSET_0, B
vxor c02, c02, c02
vxor c03, c03, c03
LOAD_A a1, OFFSET_0, AO
vxor c04, c04, c04
LOAD_A a2, OFFSET_1, AO
vxor c05, c05, c05
LOAD_A a3, OFFSET_2, AO
vxor c06, c06, c06
LOAD_A a4, OFFSET_3, AO
vxor c07, c07, c07
vxor c08, c08, c08
mr BO, B
vspltw bp1, b1, 0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(85)
.align 4
LL(82):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
vmaddfp c03, a3, bp1, c03
vspltw bp2, b1, 3
vmaddfp c04, a4, bp1, c04
LOAD_B b1, OFFSET_1, BO
vspltw bp1, b1, 0
vmaddfp c07, a3, bp2, c07
vmaddfp c08, a4, bp2, c08
addi AO, AO, 16 * SIZE
addi BO, BO, 4 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
bdnz LL(82)
.align 4
LL(85):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(88)
.align 4
LL(86):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
addi AO, AO, 8 * SIZE
vmaddfp c05, a1, bp2, c05
addi BO, BO, 2 * SIZE
vmaddfp c06, a2, bp2, c06
.align 4
LL(88):
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvx C3, OFFSET_2, CO1
vaddfp c01, c01, c03
vaddfp c02, c02, c04
vaddfp c05, c05, c07
vaddfp c06, c06, c08
lvsr PERMRSHIFT1, 0, CO1
lvsr PERMRSHIFT2, 0, CO2
lvsr PERMRSHIFT3, 0, CO3
lvsr PERMRSHIFT4, 0, CO4
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
vmaddfp c02, alpha, c02, C3
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
stvx c02, OFFSET_2, CO1
lvx C1, OFFSET_0, CO2
lvx C2, OFFSET_1, CO2
lvx C3, OFFSET_2, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, c06, PERMRSHIFT2
vperm c06, c06, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C2
vmaddfp c06, alpha, c06, C3
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
stvx c06, OFFSET_2, CO2
addi CO1, CO1, 8 * SIZE
addi CO2, CO2, 8 * SIZE
.align 4
LL(90):
andi. I, M, 4
ble LL(100)
vxor c01, c01, c01
LOAD_B b1, OFFSET_0, B
vxor c02, c02, c02
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
vxor c05, c05, c05
vxor c06, c06, c06
mr BO, B
vspltw bp1, b1, 0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(95)
.align 4
LL(92):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c02, a2, bp1, c02
vspltw bp2, b1, 3
LOAD_B b1, OFFSET_1, BO
vspltw bp1, b1, 0
vmaddfp c06, a2, bp2, c06
addi AO, AO, 8 * SIZE
addi BO, BO, 4 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
bdnz LL(92)
.align 4
LL(95):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(98)
.align 4
LL(96):
vspltw bp2, b1, 1
vmaddfp c01, a1, bp1, c01
vmaddfp c05, a1, bp2, c05
addi AO, AO, 4 * SIZE
addi BO, BO, 2 * SIZE
.align 4
LL(98):
vaddfp c01, c01, c02
vaddfp c05, c05, c06
vaddfp c09, c09, c10
vaddfp c13, c13, c14
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvsr PERMRSHIFT1, 0, CO1
lvsr PERMRSHIFT2, 0, CO2
lvsr PERMRSHIFT3, 0, CO3
lvsr PERMRSHIFT4, 0, CO4
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
lvx C1, OFFSET_0, CO2
lvx C2, OFFSET_1, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C2
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
.align 4
LL(100):
andi. I, M, 2
ble LL(110)
mr BO, B
LFD f8, 0 * SIZE(AO)
LFD f9, 1 * SIZE(AO)
LFD f10, 0 * SIZE(B)
LFD f11, 1 * SIZE(B)
LFD f12, 2 * SIZE(B)
LFD f13, 3 * SIZE(B)
lfs f0, FZERO(SP)
fmr f1, f0
fmr f2, f0
fmr f3, f0
fmr f4, f0
fmr f5, f0
fmr f6, f0
fmr f7, f0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(105)
.align 4
LL(102):
FMADD f0, f8, f10, f0
FMADD f1, f9, f10, f1
FMADD f2, f8, f11, f2
FMADD f3, f9, f11, f3
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
FMADD f4, f8, f12, f4
FMADD f5, f9, f12, f5
FMADD f6, f8, f13, f6
FMADD f7, f9, f13, f7
LFD f8, 4 * SIZE(AO)
LFD f9, 5 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
addi AO, AO, 4 * SIZE
addi BO, BO, 4 * SIZE
bdnz LL(102)
.align 4
LL(105):
andi. r0, K, 1
lfs f13, ALPHA(SP)
ble LL(108)
.align 4
LL(106):
FMADD f0, f8, f10, f0
FMADD f1, f9, f10, f1
FMADD f2, f8, f11, f2
FMADD f3, f9, f11, f3
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
LFD f10, 2 * SIZE(BO)
LFD f11, 3 * SIZE(BO)
addi AO, AO, 2 * SIZE
addi BO, BO, 2 * SIZE
.align 4
LL(108):
LFD f8, 0 * SIZE(CO1)
LFD f9, 1 * SIZE(CO1)
LFD f10, 0 * SIZE(CO2)
LFD f11, 1 * SIZE(CO2)
FADD f0, f0, f4
FADD f1, f1, f5
FADD f2, f2, f6
FADD f3, f3, f7
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
FMADD f2, f2, f13, f10
FMADD f3, f3, f13, f11
STFD f0, 0 * SIZE(CO1)
STFD f1, 1 * SIZE(CO1)
STFD f2, 0 * SIZE(CO2)
STFD f3, 1 * SIZE(CO2)
addi CO1, CO1, 2 * SIZE
addi CO2, CO2, 2 * SIZE
.align 4
LL(110):
andi. I, M, 1
ble LL(119)
mr BO, B
LFD f8, 0 * SIZE(AO)
LFD f9, 1 * SIZE(AO)
LFD f10, 0 * SIZE(B)
LFD f11, 1 * SIZE(B)
LFD f12, 2 * SIZE(B)
LFD f13, 3 * SIZE(B)
lfs f0, FZERO(SP)
fmr f1, f0
fmr f2, f0
fmr f3, f0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(115)
.align 4
LL(112):
FMADD f0, f8, f10, f0
FMADD f1, f8, f11, f1
FMADD f2, f9, f12, f2
FMADD f3, f9, f13, f3
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
addi AO, AO, 2 * SIZE
addi BO, BO, 4 * SIZE
bdnz LL(112)
.align 4
LL(115):
andi. r0, K, 1
lfs f13, ALPHA(SP)
ble LL(118)
.align 4
LL(116):
FMADD f0, f8, f10, f0
FMADD f1, f8, f11, f1
LFD f8, 1 * SIZE(AO)
LFD f10, 2 * SIZE(BO)
LFD f11, 3 * SIZE(BO)
addi AO, AO, 1 * SIZE
addi BO, BO, 2 * SIZE
.align 4
LL(118):
LFD f8, 0 * SIZE(CO1)
LFD f9, 0 * SIZE(CO2)
FADD f0, f0, f2
FADD f1, f1, f3
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
STFD f0, 0 * SIZE(CO1)
STFD f1, 0 * SIZE(CO2)
.align 4
LL(119):
mr B, BO
.align 4
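/* N&1: last remaining column of C. */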
LL(120):
andi. r0, N, 1
ble LL(999)
mr CO1, C
mr AO, A
srawi. I, M, 4
ble LL(140)
.align 4
LL(130):
vxor c01, c01, c01
vxor c02, c02, c02
vxor c03, c03, c03
vxor c04, c04, c04
mr BO, B
dcbtst CO1, PREC
mr J, K
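/*
 * B may not be 16-byte aligned here, so up to two scalar elements
 * of B are consumed first (lvx fetches the aligned quadword that
 * contains BO, and the splats pick the words holding the current
 * element) before the 4-way unrolled aligned loop at LL(131).
 */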
  1611. andi. r0, B, 15
  1612. ble+ LL(131)
  1613. LOAD_A a1, OFFSET_0, AO
  1614. LOAD_A a2, OFFSET_1, AO
  1615. LOAD_A a3, OFFSET_2, AO
  1616. LOAD_A a4, OFFSET_3, AO
  1617. LOAD_B b1, OFFSET_0, BO
  1618. vspltw bp1, b1, 2
  1619. vspltw bp2, b1, 3
  1620. addi AO, AO, 16 * SIZE
  1621. addi BO, BO, SIZE
  1622. vmaddfp c01, a1, bp1, c01
  1623. vmaddfp c02, a2, bp1, c02
  1624. vmaddfp c03, a3, bp1, c03
  1625. vmaddfp c04, a4, bp1, c04
  1626. subi J, J, 1
  1627. cmpwi cr0, J, 0
  1628. ble LL(138)
  1629. LOAD_A a1, OFFSET_0, AO
  1630. LOAD_A a2, OFFSET_1, AO
  1631. LOAD_A a3, OFFSET_2, AO
  1632. LOAD_A a4, OFFSET_3, AO
  1633. addi AO, AO, 16 * SIZE
  1634. addi BO, BO, SIZE
  1635. vmaddfp c01, a1, bp2, c01
  1636. vmaddfp c02, a2, bp2, c02
  1637. vmaddfp c03, a3, bp2, c03
  1638. vmaddfp c04, a4, bp2, c04
  1639. subi J, J, 1
  1640. cmpwi cr0, J, 0
  1641. ble LL(138)
  1642. .align 4
LL(131):
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO
	LOAD_B	b1, OFFSET_0, BO

	srawi.	r0, J, 2
	mtspr	CTR, r0
	ble	LL(135)
	.align 4

LL(133):
	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	vspltw	bp2, b1, 1
	vmaddfp	c01, a5, bp2, c01
	vmaddfp	c02, a6, bp2, c02
	vmaddfp	c03, a7, bp2, c03
	vmaddfp	c04, a8, bp2, c04

	addi	AO, AO, 32 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	vspltw	bp1, b1, 2
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	vspltw	bp2, b1, 3
	vmaddfp	c01, a5, bp2, c01
	vmaddfp	c02, a6, bp2, c02
	vmaddfp	c03, a7, bp2, c03
	vmaddfp	c04, a8, bp2, c04

	addi	AO, AO, 32 * SIZE
	addi	BO, BO, 4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO
	LOAD_B	b1, OFFSET_0, BO
	bdnz	LL(133)
	.align 4
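/* LL(135): K % 4 remainder for the 16 x 1 block.  The three leftover
   cases (r0 == 3, 2, 1) are dispatched explicitly below. */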
LL(135):
	andi.	r0, J, 3
	ble+	LL(138)

	cmpwi	cr0, r0, 3
	bne	LL(136)

	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	addi	AO, AO, 16 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	vspltw	bp2, b1, 1
	vmaddfp	c01, a1, bp2, c01
	vmaddfp	c02, a2, bp2, c02
	vmaddfp	c03, a3, bp2, c03
	vmaddfp	c04, a4, bp2, c04

	addi	AO, AO, 16 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	vspltw	bp1, b1, 2
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	addi	AO, AO, 16 * SIZE
	addi	BO, BO, 3 * SIZE
	b	LL(138)
	.align 4

LL(136):
	cmpwi	cr0, r0, 2
	bne	LL(137)

	vspltw	bp1, b1, 0
	vspltw	bp2, b1, 1

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	LOAD_A	a1, OFFSET_4, AO
	LOAD_A	a2, OFFSET_5, AO
	LOAD_A	a3, OFFSET_6, AO
	LOAD_A	a4, OFFSET_7, AO

	vmaddfp	c01, a1, bp2, c01
	vmaddfp	c02, a2, bp2, c02
	vmaddfp	c03, a3, bp2, c03
	vmaddfp	c04, a4, bp2, c04

	addi	AO, AO, 32 * SIZE
	addi	BO, BO, 2 * SIZE
	b	LL(138)
	.align 4

LL(137):
	cmpwi	cr0, r0, 1
	bne	LL(138)

	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	addi	AO, AO, 16 * SIZE
	addi	BO, BO, 1 * SIZE
	.align 4
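/* LL(138): write back 16 results.  CO1 may be unaligned, so lvsr builds a
   permute mask and vperm shifts the accumulators across the five loaded
   C vectors; vmaddfp folds in alpha during the merge. */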
LL(138):
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1
	lvx	C4, OFFSET_3, CO1
	lvx	C5, OFFSET_4, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3
	vmaddfp	c03, alpha, c03, C4
	vmaddfp	c04, alpha, c04, C5

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	addi	CO1, CO1, 16 * SIZE
	addic.	I, I, -1
	bgt+	LL(130)
	.align 4
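/* LL(140): 8 x 1 block.  Mirrors LL(130) with two accumulators (c01, c02)
   and the same B-alignment peel before the unrolled loop. */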
LL(140):
	andi.	I, M, 8
	ble	LL(150)

	vxor	c01, c01, c01
	vxor	c02, c02, c02

	mr	BO, B
	mr	J, K

	andi.	r0, B, 15
	ble+	LL(141)

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_B	b1, OFFSET_0, BO

	vspltw	bp1, b1, 2
	vspltw	bp2, b1, 3

	addi	AO, AO, 8 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(148)

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO

	addi	AO, AO, 8 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp2, c01
	vmaddfp	c02, a2, bp2, c02

	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(148)
	.align 4

LL(141):
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO
	LOAD_B	b1, OFFSET_0, BO

	srawi.	r0, J, 2
	mtspr	CTR, r0
	ble	LL(145)
	.align 4

LL(143):
	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	vspltw	bp2, b1, 1
	vmaddfp	c01, a3, bp2, c01
	vmaddfp	c02, a4, bp2, c02

	vspltw	bp1, b1, 2
	vmaddfp	c01, a5, bp1, c01
	vmaddfp	c02, a6, bp1, c02

	vspltw	bp2, b1, 3
	vmaddfp	c01, a7, bp2, c01
	vmaddfp	c02, a8, bp2, c02

	addi	AO, AO, 32 * SIZE
	addi	BO, BO, 4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO
	LOAD_B	b1, OFFSET_0, BO
	bdnz	LL(143)
	.align 4

LL(145):
	andi.	r0, J, 3
	ble+	LL(148)

	cmpwi	cr0, r0, 3
	bne	LL(146)

	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	vspltw	bp2, b1, 1
	vmaddfp	c01, a3, bp2, c01
	vmaddfp	c02, a4, bp2, c02

	LOAD_A	a1, OFFSET_4, AO
	LOAD_A	a2, OFFSET_5, AO

	vspltw	bp1, b1, 2
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	addi	AO, AO, 24 * SIZE
	addi	BO, BO, 3 * SIZE
	b	LL(148)
	.align 4

LL(146):
	cmpwi	cr0, r0, 2
	bne	LL(147)

	vspltw	bp1, b1, 0
	vspltw	bp2, b1, 1

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c01, a3, bp2, c01
	vmaddfp	c02, a4, bp2, c02

	addi	AO, AO, 16 * SIZE
	addi	BO, BO, 2 * SIZE
	b	LL(148)
	.align 4

LL(147):
	cmpwi	cr0, r0, 1
	bne	LL(148)

	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	addi	AO, AO, 8 * SIZE
	addi	BO, BO, 1 * SIZE
	.align 4

LL(148):
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1

	addi	CO1, CO1, 8 * SIZE
	.align 4
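/* LL(150): 4 x 1 block, single accumulator. */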
LL(150):
	andi.	I, M, 4
	ble	LL(160)

	vxor	c01, c01, c01

	mr	BO, B
	mr	J, K

	andi.	r0, B, 15
	ble+	LL(151)

	LOAD_A	a1, OFFSET_0, AO
	LOAD_B	b1, OFFSET_0, BO

	vspltw	bp1, b1, 2
	vspltw	bp2, b1, 3

	addi	AO, AO, 4 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp1, c01

	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(158)

	LOAD_A	a1, OFFSET_0, AO

	addi	AO, AO, 4 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp2, c01

	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(158)
	.align 4

LL(151):
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_B	b1, OFFSET_0, BO

	srawi.	r0, J, 2
	mtspr	CTR, r0
	ble	LL(155)
	.align 4

LL(153):
	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c01, a2, bp2, c01
	vspltw	bp1, b1, 2
	vmaddfp	c01, a3, bp1, c01
	vspltw	bp2, b1, 3
	vmaddfp	c01, a4, bp2, c01

	addi	AO, AO, 16 * SIZE
	addi	BO, BO, 4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_B	b1, OFFSET_0, BO
	bdnz	LL(153)
	.align 4

LL(155):
	andi.	r0, J, 3
	ble+	LL(158)

	cmpwi	cr0, r0, 3
	bne	LL(156)

	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c01, a2, bp2, c01
	vspltw	bp1, b1, 2
	vmaddfp	c01, a3, bp1, c01

	addi	AO, AO, 12 * SIZE
	addi	BO, BO, 3 * SIZE
	b	LL(158)
	.align 4

LL(156):
	cmpwi	cr0, r0, 2
	bne	LL(157)

	vspltw	bp1, b1, 0
	vspltw	bp2, b1, 1
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c01, a2, bp2, c01

	addi	AO, AO, 8 * SIZE
	addi	BO, BO, 2 * SIZE
	b	LL(158)
	.align 4

LL(157):
	cmpwi	cr0, r0, 1
	bne	LL(158)

	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01

	addi	AO, AO, 4 * SIZE
	addi	BO, BO, 1 * SIZE
	.align 4

LL(158):
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1

	addi	CO1, CO1, 4 * SIZE
	.align 4
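/* LL(160): 2 x 1 block, back on the scalar FPU with K unrolled by two. */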
LL(160):
	andi.	I, M, 2
	ble	LL(170)

	mr	BO, B

	LFD	f8,  0 * SIZE(AO)
	LFD	f9,  1 * SIZE(AO)
	LFD	f10, 2 * SIZE(AO)
	LFD	f11, 3 * SIZE(AO)
	LFD	f12, 0 * SIZE(B)
	LFD	f13, 1 * SIZE(B)

	lfs	f0, FZERO(SP)
	fmr	f1, f0
	fmr	f2, f0
	fmr	f3, f0

	srawi.	r0, K, 1
	mtspr	CTR, r0
	ble	LL(165)
	.align 4

LL(162):
	FMADD	f0, f8,  f12, f0
	FMADD	f1, f9,  f12, f1
	FMADD	f2, f10, f13, f2
	FMADD	f3, f11, f13, f3

	LFD	f8,  4 * SIZE(AO)
	LFD	f9,  5 * SIZE(AO)
	LFD	f10, 6 * SIZE(AO)
	LFD	f11, 7 * SIZE(AO)
	LFD	f12, 2 * SIZE(BO)
	LFD	f13, 3 * SIZE(BO)

	addi	AO, AO, 4 * SIZE
	addi	BO, BO, 2 * SIZE
	bdnz	LL(162)
	.align 4

LL(165):
	andi.	r0, K, 1
	lfs	f13, ALPHA(SP)
	ble	LL(168)
	.align 4

LL(166):
	FMADD	f0, f8, f12, f0
	FMADD	f1, f9, f12, f1

	addi	AO, AO, 2 * SIZE
	addi	BO, BO, 1 * SIZE
	.align 4

LL(168):
	LFD	f8, 0 * SIZE(CO1)
	LFD	f9, 1 * SIZE(CO1)

	FADD	f0, f0, f2
	FADD	f1, f1, f3

	FMADD	f0, f0, f13, f8
	FMADD	f1, f1, f13, f9

	STFD	f0, 0 * SIZE(CO1)
	STFD	f1, 1 * SIZE(CO1)

	addi	CO1, CO1, 2 * SIZE
	.align 4
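/* LL(170): final 1 x 1 element of the last column. */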
LL(170):
	andi.	I, M, 1
	ble	LL(999)

	mr	BO, B

	LFD	f8,  0 * SIZE(AO)
	LFD	f9,  1 * SIZE(AO)
	LFD	f10, 0 * SIZE(B)
	LFD	f11, 1 * SIZE(B)

	lfs	f0, FZERO(SP)
	fmr	f1, f0

	srawi.	r0, K, 1
	mtspr	CTR, r0
	ble	LL(175)
	.align 4

LL(172):
	FMADD	f0, f8, f10, f0
	FMADD	f1, f9, f11, f1

	LFD	f8,  2 * SIZE(AO)
	LFD	f9,  3 * SIZE(AO)
	LFD	f10, 2 * SIZE(BO)
	LFD	f11, 3 * SIZE(BO)

	addi	AO, AO, 2 * SIZE
	addi	BO, BO, 2 * SIZE
	bdnz	LL(172)
	.align 4

LL(175):
	andi.	r0, K, 1
	lfs	f13, ALPHA(SP)
	ble	LL(178)
	.align 4

LL(176):
	FMADD	f0, f8, f10, f0

	addi	AO, AO, 1 * SIZE
	addi	BO, BO, 1 * SIZE
	.align 4

LL(178):
	LFD	f8, 0 * SIZE(CO1)

	FADD	f0, f0, f1
	FMADD	f0, f0, f13, f8
	STFD	f0, 0 * SIZE(CO1)
	.align 4
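/* LL(999): epilogue.  Restore v20-v31 and VRsave, reload the nonvolatile
   GPRs saved by the prologue (64-bit offsets step by 8, 32-bit by 4),
   pop the frame and return. */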
LL(999):
	mr	SP, STACK

	li	r0,  0 * 16
	lvx	v20, SP, r0
	li	r0,  1 * 16
	lvx	v21, SP, r0
	li	r0,  2 * 16
	lvx	v22, SP, r0
	li	r0,  3 * 16
	lvx	v23, SP, r0
	li	r0,  4 * 16
	lvx	v24, SP, r0
	li	r0,  5 * 16
	lvx	v25, SP, r0
	li	r0,  6 * 16
	lvx	v26, SP, r0
	li	r0,  7 * 16
	lvx	v27, SP, r0
	li	r0,  8 * 16
	lvx	v28, SP, r0
	li	r0,  9 * 16
	lvx	v29, SP, r0
	li	r0, 10 * 16
	lvx	v30, SP, r0
	li	r0, 11 * 16
	lvx	v31, SP, r0

	mtspr	VRsave, VREG

#ifdef __64BIT__
	ld	r31, 192(SP)
	ld	r30, 200(SP)
	ld	r29, 208(SP)
	ld	r28, 216(SP)
	ld	r27, 224(SP)
	ld	r26, 232(SP)
	ld	r25, 240(SP)
	ld	r24, 248(SP)
	ld	r23, 256(SP)
	ld	r22, 264(SP)
	ld	r21, 272(SP)
	ld	r20, 280(SP)
	ld	r19, 288(SP)
	ld	r18, 296(SP)
	ld	r17, 304(SP)
	ld	r16, 312(SP)
	ld	r15, 320(SP)
	ld	r14, 328(SP)
#else
	lwz	r31, 192(SP)
	lwz	r30, 196(SP)
	lwz	r29, 200(SP)
	lwz	r28, 204(SP)
	lwz	r27, 208(SP)
	lwz	r26, 212(SP)
	lwz	r25, 216(SP)
	lwz	r24, 220(SP)
	lwz	r23, 224(SP)
	lwz	r22, 228(SP)
	lwz	r21, 232(SP)
	lwz	r20, 236(SP)
	lwz	r19, 240(SP)
	lwz	r18, 244(SP)
	lwz	r17, 248(SP)
	lwz	r16, 252(SP)
	lwz	r15, 256(SP)
	lwz	r14, 260(SP)
#endif

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
#endif