You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_hummer_LN.S 51 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
7277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Register aliases and instruction-selection macros used by the code in
   * this file. ASSEMBLER is defined before common.h is included. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* ZERO is redefined further down as a register alias (r31), so any earlier
   * definition is dropped first. */
  40. #undef ZERO
  /* Byte offsets from SP. The prologue in this file leaves f1 at SP+0, f2 at
   * SP+8 and two zero words at SP+16; FZERO is the index used with lfpsx to
   * load f0 from that zeroed location. ALPHA is not referenced in the visible
   * part of the file -- presumably it addresses the stored f1/f2 pair;
   * TODO confirm. */
  41. #define ALPHA 0
  42. #define FZERO 16
  /* Integer arguments in r3-r5. The prologue branches to .L999 if any of
   * M, N or K is <= 0. */
  43. #define M r3
  44. #define N r4
  45. #define K r5
  /* The remaining argument aliases are only defined when linux or
   * __FreeBSD__ is defined; no alternative mapping is visible here. */
  46. #if defined(linux) || defined(__FreeBSD__)
  47. #define A r6
  48. #define B r7
  49. #define C r8
  50. #define LDC r9
  51. #define OFFSET r10
  52. #endif
  /* TEMP: scratch. AORIG: base from which AO is recomputed in the LN/RT
   * paths. KK: counter initialised from OFFSET and adjusted after each tile. */
  53. #define TEMP r11
  54. #define AORIG r12
  55. #define KK r14
  /* Stride registers. The prologue loads each with the multiple of SIZE its
   * name suggests (INC = 1 * SIZE, INC2 = 2 * SIZE, INC4 = 4 * SIZE,
   * INCM1 = -1 * SIZE, INCM3 = -3 * SIZE, INCM5 = -5 * SIZE,
   * INCM7 = -7 * SIZE). */
  56. #define INCM1 r15
  57. #define INCM3 r16
  58. #define INCM5 r17
  59. #define INCM7 r18
  60. #define INC2 r19
  61. #define INC r20
  62. #define INC4 r21
  /* Counters: I is derived from M, J from N. */
  63. #define I r22
  64. #define J r23
  /* Pointers derived from A and B for the update-form loads; AO2 and BO2 are
   * kept 2 * SIZE past AO and BO. */
  65. #define AO r24
  66. #define BO r25
  67. #define AO2 r26
  68. #define BO2 r27
  /* Store pointers: at the top of each .L10 iteration CO1 = C and
   * CO2 = C + LDC. */
  69. #define CO1 r28
  70. #define CO2 r29
  71. #define ZERO r31
  /* The definitions and code that follow are skipped when NEEDPARAM is
   * defined; the matching #endif lies past the end of the visible source. */
  72. #ifndef NEEDPARAM
  /* Floating-point register aliases for operands loaded through AO/BO. The
   * A/B prefix does not strictly track the source matrix: for example A5-A8
   * are loaded through BO/BO2 ahead of the .L32 loop. */
  73. #define A1 f16
  74. #define A2 f17
  75. #define A3 f18
  76. #define A4 f19
  77. #define A5 f20
  78. #define A6 f21
  79. #define A7 f22
  80. #define A8 f23
  81. #define A9 f24
  82. #define A10 f25
  83. #define B1 f26
  84. #define B2 f27
  85. #define B3 f28
  86. #define B4 f29
  87. #define B5 f30
  88. #define B6 f31
  /* AP shares f31 with B6. */
  89. #define AP B6
  /* Pick the mnemonics behind FXCPMADD / FXCSMADD (used in the accumulation
   * loops). The choice depends on CONJ and, when CONJ is defined, on whether
   * LN or LT is defined. */
  90. #ifndef CONJ
  91. #define FXCPMADD fxcpmadd
  92. #define FXCSMADD fxcxnpma
  93. #else
  94. #if defined(LN) || defined(LT)
  95. #define FXCPMADD fxcpnsma
  96. #define FXCSMADD fxcxma
  97. #else
  98. #define FXCPMADD fxcpmadd
  99. #define FXCSMADD fxcxnsma
  100. #endif
  101. #endif
  /* FXCXNPMA / FXCXNSMA (used in the solve steps after the accumulation
   * loops) map to the same-named mnemonics without CONJ and are swapped when
   * CONJ is defined. */
  102. #ifndef CONJ
  103. #define FXCXNPMA fxcxnpma
  104. #define FXCXNSMA fxcxnsma
  105. #else
  106. #define FXCXNPMA fxcxnsma
  107. #define FXCXNSMA fxcxnpma
  108. #endif
  109. PROLOGUE
  110. PROFCODE
  111. li r0, -16
  112. stfpdux f14, SP, r0
  113. stfpdux f15, SP, r0
  114. stfpdux f16, SP, r0
  115. stfpdux f17, SP, r0
  116. stfpdux f18, SP, r0
  117. stfpdux f19, SP, r0
  118. stfpdux f20, SP, r0
  119. stfpdux f21, SP, r0
  120. stfpdux f22, SP, r0
  121. stfpdux f23, SP, r0
  122. stfpdux f24, SP, r0
  123. stfpdux f25, SP, r0
  124. stfpdux f26, SP, r0
  125. stfpdux f27, SP, r0
  126. stfpdux f28, SP, r0
  127. stfpdux f29, SP, r0
  128. stfpdux f30, SP, r0
  129. stfpdux f31, SP, r0
  130. stwu r31, -4(SP)
  131. stwu r30, -4(SP)
  132. stwu r29, -4(SP)
  133. stwu r28, -4(SP)
  134. stwu r27, -4(SP)
  135. stwu r26, -4(SP)
  136. stwu r25, -4(SP)
  137. stwu r24, -4(SP)
  138. stwu r23, -4(SP)
  139. stwu r22, -4(SP)
  140. stwu r21, -4(SP)
  141. stwu r20, -4(SP)
  142. stwu r19, -4(SP)
  143. stwu r18, -4(SP)
  144. stwu r17, -4(SP)
  145. stwu r16, -4(SP)
  146. stwu r15, -4(SP)
  147. stwu r14, -4(SP)
  148. li r0, 0
  149. stwu r0, -4(SP)
  150. stwu r0, -4(SP)
  151. stfdu f2, -8(SP)
  152. stfdu f1, -8(SP)
  153. slwi LDC, LDC, ZBASE_SHIFT
  154. cmpwi cr0, M, 0
  155. ble .L999
  156. cmpwi cr0, N, 0
  157. ble .L999
  158. cmpwi cr0, K, 0
  159. ble .L999
  160. li INC, 1 * SIZE
  161. li INC2, 2 * SIZE
  162. li INC4, 4 * SIZE
  163. li INCM1, -1 * SIZE
  164. li INCM3, -3 * SIZE
  165. li INCM5, -5 * SIZE
  166. li INCM7, -7 * SIZE
  167. addi C, C, - 1 * SIZE
  168. #ifdef LN
  169. mullw r0, M, K
  170. slwi r0, r0, ZBASE_SHIFT
  171. add A, A, r0
  172. slwi r0, M, ZBASE_SHIFT
  173. add C, C, r0
  174. #endif
  175. #ifdef RN
  176. neg KK, OFFSET
  177. #endif
  178. #ifdef RT
  179. mullw r0, N, K
  180. slwi r0, r0, ZBASE_SHIFT
  181. add B, B, r0
  182. mullw r0, N, LDC
  183. add C, C, r0
  184. sub KK, N, OFFSET
  185. #endif
  186. srawi. J, N, 1
  187. ble .L50
  188. .align 4
  189. .L10:
  190. #ifdef RT
  191. slwi r0, K, 1 + ZBASE_SHIFT
  192. sub B, B, r0
  193. slwi r0, LDC, 1
  194. sub C, C, r0
  195. #endif
  196. mr CO1, C
  197. add CO2, C, LDC
  198. #ifdef LN
  199. add KK, M, OFFSET
  200. #endif
  201. #ifdef LT
  202. mr KK, OFFSET
  203. #endif
  204. #if defined(LN) || defined(RT)
  205. addi AORIG, A, -4 * SIZE
  206. #else
  207. addi AO, A, -4 * SIZE
  208. #endif
  209. #ifndef RT
  210. add C, CO2, LDC
  211. #endif
  212. li r0, FZERO
  213. lfpsx f0, SP, r0
  214. andi. I, M, 1
  215. beq .L20
  216. #if defined(LT) || defined(RN)
  217. addi AO2, AO, 2 * SIZE
  218. fpmr f1, f0
  219. addi BO, B, - 4 * SIZE
  220. fpmr f2, f0
  221. addi BO2, B, - 2 * SIZE
  222. fpmr f3, f0
  223. srawi. r0, KK, 2
  224. mtspr CTR, r0
  225. ble .L34
  226. #else
  227. #ifdef LN
  228. slwi r0, K, 0 + ZBASE_SHIFT
  229. sub AORIG, AORIG, r0
  230. #endif
  231. slwi r0 , KK, 0 + ZBASE_SHIFT
  232. slwi TEMP, KK, 1 + ZBASE_SHIFT
  233. add AO, AORIG, r0
  234. add BO, B, TEMP
  235. sub TEMP, K, KK
  236. addi AO2, AO, 2 * SIZE
  237. fpmr f1, f0
  238. addi BO, BO, - 4 * SIZE
  239. fpmr f2, f0
  240. addi BO2, BO, 2 * SIZE
  241. fpmr f3, f0
  242. srawi. r0, TEMP, 2
  243. mtspr CTR, r0
  244. ble .L34
  245. #endif
  246. LFPDUX A1, AO, INC4
  247. LFPDUX B1, BO, INC4
  248. LFPDUX B2, BO2, INC4
  249. LFPDUX A2, AO2, INC4
  250. LFPDUX B3, BO, INC4
  251. LFPDUX B4, BO2, INC4
  252. LFPDUX A3, AO, INC4
  253. LFPDUX A5, BO, INC4
  254. LFPDUX A6, BO2, INC4
  255. LFPDUX A4, AO2, INC4
  256. LFPDUX A7, BO, INC4
  257. LFPDUX A8, BO2, INC4
  258. bdz- .L33
  259. .align 4
  260. .L32:
  261. FXCPMADD f0, B1, A1, f0
  262. FXCSMADD f1, B1, A1, f1
  263. LFPDUX B1, BO, INC4
  264. FXCPMADD f2, B2, A1, f2
  265. FXCSMADD f3, B2, A1, f3
  266. LFPDUX B2, BO2, INC4
  267. LFPDUX A1, AO, INC4
  268. FXCPMADD f0, B3, A2, f0
  269. FXCSMADD f1, B3, A2, f1
  270. LFPDUX B3, BO, INC4
  271. FXCPMADD f2, B4, A2, f2
  272. FXCSMADD f3, B4, A2, f3
  273. LFPDUX B4, BO2, INC4
  274. LFPDUX A2, AO2, INC4
  275. FXCPMADD f0, A5, A3, f0
  276. FXCSMADD f1, A5, A3, f1
  277. LFPDUX A5, BO, INC4
  278. FXCPMADD f2, A6, A3, f2
  279. FXCSMADD f3, A6, A3, f3
  280. LFPDUX A6, BO2, INC4
  281. LFPDUX A3, AO, INC4
  282. FXCPMADD f0, A7, A4, f0
  283. FXCSMADD f1, A7, A4, f1
  284. LFPDUX A7, BO, INC4
  285. FXCPMADD f2, A8, A4, f2
  286. FXCSMADD f3, A8, A4, f3
  287. LFPDUX A8, BO2, INC4
  288. LFPDUX A4, AO2, INC4
  289. bdnz+ .L32
  290. .align 4
  291. .L33:
  292. FXCPMADD f0, B1, A1, f0
  293. FXCSMADD f1, B1, A1, f1
  294. FXCPMADD f2, B2, A1, f2
  295. FXCSMADD f3, B2, A1, f3
  296. FXCPMADD f0, B3, A2, f0
  297. FXCSMADD f1, B3, A2, f1
  298. FXCPMADD f2, B4, A2, f2
  299. FXCSMADD f3, B4, A2, f3
  300. FXCPMADD f0, A5, A3, f0
  301. FXCSMADD f1, A5, A3, f1
  302. FXCPMADD f2, A6, A3, f2
  303. FXCSMADD f3, A6, A3, f3
  304. FXCPMADD f0, A7, A4, f0
  305. FXCSMADD f1, A7, A4, f1
  306. FXCPMADD f2, A8, A4, f2
  307. FXCSMADD f3, A8, A4, f3
  308. .align 4
  309. .L34:
  310. #if defined(LT) || defined(RN)
  311. andi. r0, KK, 3
  312. mtspr CTR, r0
  313. ble+ .L38
  314. #else
  315. andi. r0, TEMP, 3
  316. mtspr CTR, r0
  317. ble+ .L38
  318. #endif
  319. LFPDX A1, AO, INC4
  320. LFPDUX B1, BO, INC4
  321. LFPDUX B2, BO2, INC4
  322. add AO, AO, INC2
  323. bdz- .L37
  324. .align 4
  325. .L36:
  326. FXCPMADD f0, B1, A1, f0
  327. FXCSMADD f1, B1, A1, f1
  328. LFPDUX B1, BO, INC4
  329. FXCPMADD f2, B2, A1, f2
  330. FXCSMADD f3, B2, A1, f3
  331. LFPDX A1, AO, INC4
  332. LFPDUX B2, BO2, INC4
  333. add AO, AO, INC2
  334. bdnz+ .L36
  335. .align 4
  336. .L37:
  337. FXCPMADD f0, B1, A1, f0
  338. FXCSMADD f1, B1, A1, f1
  339. FXCPMADD f2, B2, A1, f2
  340. FXCSMADD f3, B2, A1, f3
  341. .align 4
  342. .L38:
  343. fpadd f0, f0, f1
  344. fpadd f2, f2, f3
  345. #if defined(LN) || defined(RT)
  346. #ifdef LN
  347. subi r0, KK, 1
  348. #else
  349. subi r0, KK, 2
  350. #endif
  351. slwi TEMP, r0, 0 + ZBASE_SHIFT
  352. slwi r0, r0, 1 + ZBASE_SHIFT
  353. add AO, AORIG, TEMP
  354. add BO, B, r0
  355. addi BO, BO, - 4 * SIZE
  356. #endif
  357. addi AO2, AO, 2 * SIZE
  358. addi BO2, BO, 2 * SIZE
  359. #if defined(LN) || defined(LT)
  360. LFPDX f16, BO, INC4
  361. LFPDX f17, BO2, INC4
  362. #else
  363. LFPDX f16, AO, INC4
  364. LFPDX f17, AO2, INC4
  365. #endif
  366. fpsub f0, f16, f0
  367. fpsub f2, f17, f2
  368. #ifdef LN
  369. LFPDX A1, AO, INC4
  370. fxpmul f4, A1, f0
  371. fxpmul f5, A1, f2
  372. FXCXNPMA f0, A1, f0, f4
  373. FXCXNPMA f2, A1, f2, f5
  374. #endif
  375. #ifdef LT
  376. LFPDX A1, AO, INC4
  377. fxpmul f4, A1, f0
  378. fxpmul f5, A1, f2
  379. FXCXNPMA f0, A1, f0, f4
  380. FXCXNPMA f2, A1, f2, f5
  381. #endif
  382. #ifdef RN
  383. LFPDUX A1, BO, INC4
  384. LFPDUX A2, BO2, INC4
  385. add BO, BO, INC4
  386. LFPDUX A3, BO2, INC4
  387. subi BO, BO, 8 * SIZE
  388. subi BO2, BO2, 8 * SIZE
  389. fxpmul f4, A1, f0
  390. FXCXNPMA f0, A1, f0, f4
  391. fxcpnmsub f2, A2, f0, f2
  392. FXCXNSMA f2, A2, f0, f2
  393. fxpmul f4, A3, f2
  394. FXCXNPMA f2, A3, f2, f4
  395. #endif
  396. #ifdef RT
  397. LFPDUX A1, BO, INC4
  398. add BO2, BO2, INC4
  399. LFPDUX A2, BO, INC4
  400. LFPDUX A3, BO2, INC4
  401. subi BO, BO, 8 * SIZE
  402. subi BO2, BO2, 8 * SIZE
  403. fxpmul f4, A3, f2
  404. FXCXNPMA f2, A3, f2, f4
  405. fxcpnmsub f0, A2, f2, f0
  406. FXCXNSMA f0, A2, f2, f0
  407. fxpmul f4, A1, f0
  408. FXCXNPMA f0, A1, f0, f4
  409. #endif
  410. #ifdef LN
  411. subi CO1, CO1, 2 * SIZE
  412. subi CO2, CO2, 2 * SIZE
  413. #endif
  414. #if defined(LN) || defined(LT)
  415. STFPDX f0, BO, INC4
  416. STFPDX f2, BO2, INC4
  417. #else
  418. STFPDX f0, AO, INC4
  419. STFPDX f2, AO2, INC4
  420. #endif
  421. STFDUX f0, CO1, INC
  422. STFSDUX f0, CO1, INC
  423. STFDUX f2, CO2, INC
  424. STFSDUX f2, CO2, INC
  425. #ifdef LN
  426. subi CO1, CO1, 2 * SIZE
  427. subi CO2, CO2, 2 * SIZE
  428. #endif
  429. #ifdef RT
  430. slwi r0, K, 0 + ZBASE_SHIFT
  431. add AORIG, AORIG, r0
  432. #endif
  433. #if defined(LT) || defined(RN)
  434. sub TEMP, K, KK
  435. slwi r0, TEMP, 0 + ZBASE_SHIFT
  436. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  437. add AO, AO, r0
  438. add BO, BO, TEMP
  439. #endif
  440. #ifdef LT
  441. addi KK, KK, 1
  442. #endif
  443. #ifdef LN
  444. subi KK, KK, 1
  445. #endif
  446. li r0, FZERO
  447. lfpsx f0, SP, r0
  448. .align 4
  449. .L20:
  450. andi. I, M, 2
  451. beq .L30
  452. #if defined(LT) || defined(RN)
  453. addi AO2, AO, 2 * SIZE
  454. fpmr f4, f0
  455. addi BO, B, - 4 * SIZE
  456. fpmr f8, f0
  457. addi BO2, B, - 2 * SIZE
  458. fpmr f12, f0
  459. srawi. r0, KK, 2
  460. fpmr f1, f0
  461. fpmr f5, f0
  462. fpmr f9, f0
  463. mtspr CTR, r0
  464. fpmr f13, f0
  465. ble .L24
  466. #else
  467. #ifdef LN
  468. slwi r0, K, 1 + ZBASE_SHIFT
  469. sub AORIG, AORIG, r0
  470. #endif
  471. slwi r0 , KK, 1 + ZBASE_SHIFT
  472. add AO, AORIG, r0
  473. add BO, B, r0
  474. sub TEMP, K, KK
  475. addi AO2, AO, 2 * SIZE
  476. fpmr f4, f0
  477. addi BO, BO, - 4 * SIZE
  478. fpmr f8, f0
  479. addi BO2, BO, 2 * SIZE
  480. fpmr f12, f0
  481. fpmr f1, f0
  482. fpmr f5, f0
  483. fpmr f9, f0
  484. fpmr f13, f0
  485. srawi. r0, TEMP, 2
  486. mtspr CTR, r0
  487. ble .L24
  488. #endif
  489. LFPDUX A1, AO, INC4
  490. LFPDUX B1, BO, INC4
  491. LFPDUX A2, AO2, INC4
  492. LFPDUX B2, BO2, INC4
  493. LFPDUX A3, AO, INC4
  494. LFPDUX B3, BO, INC4
  495. LFPDUX A4, AO2, INC4
  496. LFPDUX B4, BO2, INC4
  497. LFPDUX A5, AO, INC4
  498. LFPDUX B5, BO, INC4
  499. LFPDUX A6, AO2, INC4
  500. LFPDUX B6, BO2, INC4
  501. LFPDUX A7, AO, INC4
  502. LFPDUX A9, BO, INC4
  503. LFPDUX A10, BO2, INC4
  504. bdz- .L23
  505. .align 4
  506. .L22:
  507. FXCPMADD f0, B1, A1, f0
  508. nop
  509. FXCSMADD f4, B1, A1, f4
  510. LFPDUX A8, AO2, INC4
  511. FXCPMADD f8, B2, A1, f8
  512. nop
  513. FXCSMADD f12, B2, A1, f12
  514. LFPDUX A1, AO, INC4
  515. FXCPMADD f1, B1, A2, f1
  516. nop
  517. FXCSMADD f5, B1, A2, f5
  518. LFPDUX B1, BO, INC4
  519. FXCPMADD f9, B2, A2, f9
  520. nop
  521. FXCSMADD f13, B2, A2, f13
  522. LFPDUX B2, BO2, INC4
  523. FXCPMADD f0, B3, A3, f0
  524. nop
  525. FXCSMADD f4, B3, A3, f4
  526. LFPDUX A2, AO2, INC4
  527. FXCPMADD f8, B4, A3, f8
  528. nop
  529. FXCSMADD f12, B4, A3, f12
  530. LFPDUX A3, AO, INC4
  531. FXCPMADD f1, B3, A4, f1
  532. nop
  533. FXCSMADD f5, B3, A4, f5
  534. LFPDUX B3, BO, INC4
  535. FXCPMADD f9, B4, A4, f9
  536. nop
  537. FXCSMADD f13, B4, A4, f13
  538. LFPDUX B4, BO2, INC4
  539. FXCPMADD f0, B5, A5, f0
  540. nop
  541. FXCSMADD f4, B5, A5, f4
  542. LFPDUX A4, AO2, INC4
  543. FXCPMADD f8, B6, A5, f8
  544. nop
  545. FXCSMADD f12, B6, A5, f12
  546. LFPDUX A5, AO, INC4
  547. FXCPMADD f1, B5, A6, f1
  548. nop
  549. FXCSMADD f5, B5, A6, f5
  550. LFPDUX B5, BO, INC4
  551. FXCPMADD f9, B6, A6, f9
  552. nop
  553. FXCSMADD f13, B6, A6, f13
  554. LFPDUX B6, BO2, INC4
  555. FXCPMADD f0, A9, A7, f0
  556. nop
  557. FXCSMADD f4, A9, A7, f4
  558. LFPDUX A6, AO2, INC4
  559. FXCPMADD f8, A10, A7, f8
  560. nop
  561. FXCSMADD f12, A10, A7, f12
  562. LFPDUX A7, AO, INC4
  563. FXCPMADD f1, A9, A8, f1
  564. nop
  565. FXCSMADD f5, A9, A8, f5
  566. LFPDUX A9, BO, INC4
  567. FXCPMADD f9, A10, A8, f9
  568. nop
  569. FXCSMADD f13, A10, A8, f13
  570. LFPDUX A10, BO2, INC4
  571. bdnz+ .L22
  572. .align 4
  573. .L23:
  574. FXCPMADD f0, B1, A1, f0
  575. FXCSMADD f4, B1, A1, f4
  576. LFPDUX A8, AO2, INC4
  577. FXCPMADD f8, B2, A1, f8
  578. FXCSMADD f12, B2, A1, f12
  579. FXCPMADD f1, B1, A2, f1
  580. FXCSMADD f5, B1, A2, f5
  581. FXCPMADD f9, B2, A2, f9
  582. FXCSMADD f13, B2, A2, f13
  583. FXCPMADD f0, B3, A3, f0
  584. FXCSMADD f4, B3, A3, f4
  585. FXCPMADD f8, B4, A3, f8
  586. FXCSMADD f12, B4, A3, f12
  587. FXCPMADD f1, B3, A4, f1
  588. FXCSMADD f5, B3, A4, f5
  589. FXCPMADD f9, B4, A4, f9
  590. FXCSMADD f13, B4, A4, f13
  591. FXCPMADD f0, B5, A5, f0
  592. FXCSMADD f4, B5, A5, f4
  593. FXCPMADD f8, B6, A5, f8
  594. FXCSMADD f12, B6, A5, f12
  595. FXCPMADD f1, B5, A6, f1
  596. FXCSMADD f5, B5, A6, f5
  597. FXCPMADD f9, B6, A6, f9
  598. FXCSMADD f13, B6, A6, f13
  599. FXCPMADD f0, A9, A7, f0
  600. FXCSMADD f4, A9, A7, f4
  601. FXCPMADD f8, A10, A7, f8
  602. FXCSMADD f12, A10, A7, f12
  603. FXCPMADD f1, A9, A8, f1
  604. FXCSMADD f5, A9, A8, f5
  605. FXCPMADD f9, A10, A8, f9
  606. FXCSMADD f13, A10, A8, f13
  607. .align 4
  608. .L24:
  609. #if defined(LT) || defined(RN)
  610. andi. r0, KK, 3
  611. mtspr CTR, r0
  612. ble+ .L28
  613. #else
  614. andi. r0, TEMP, 3
  615. mtspr CTR, r0
  616. ble+ .L28
  617. #endif
  618. LFPDUX A1, AO, INC4
  619. LFPDUX A2, AO2, INC4
  620. LFPDUX B1, BO, INC4
  621. LFPDUX B2, BO2, INC4
  622. bdz- .L27
  623. .align 4
  624. .L26:
  625. FXCPMADD f0, B1, A1, f0
  626. FXCSMADD f4, B1, A1, f4
  627. FXCPMADD f8, B2, A1, f8
  628. FXCSMADD f12, B2, A1, f12
  629. LFPDUX A1, AO, INC4
  630. FXCPMADD f1, B1, A2, f1
  631. FXCSMADD f5, B1, A2, f5
  632. LFPDUX B1, BO, INC4
  633. FXCPMADD f9, B2, A2, f9
  634. FXCSMADD f13, B2, A2, f13
  635. LFPDUX A2, AO2, INC4
  636. LFPDUX B2, BO2, INC4
  637. bdnz+ .L26
  638. .align 4
  639. .L27:
  640. FXCPMADD f0, B1, A1, f0
  641. FXCSMADD f4, B1, A1, f4
  642. FXCPMADD f8, B2, A1, f8
  643. FXCSMADD f12, B2, A1, f12
  644. FXCPMADD f1, B1, A2, f1
  645. FXCSMADD f5, B1, A2, f5
  646. FXCPMADD f9, B2, A2, f9
  647. FXCSMADD f13, B2, A2, f13
  648. .align 4
  649. .L28:
  650. fpadd f0, f0, f4
  651. fpadd f8, f8, f12
  652. fpadd f1, f1, f5
  653. fpadd f9, f9, f13
  654. #if defined(LN) || defined(RT)
  655. #ifdef LN
  656. subi r0, KK, 2
  657. #else
  658. subi r0, KK, 2
  659. #endif
  660. slwi r0, r0, 1 + ZBASE_SHIFT
  661. add AO, AORIG, r0
  662. add BO, B, r0
  663. addi AO2, AO, 2 * SIZE
  664. addi BO, BO, - 4 * SIZE
  665. addi BO2, BO, 2 * SIZE
  666. #endif
  667. #if defined(LN) || defined(LT)
  668. LFPDUX f16, BO, INC4
  669. LFPDUX f18, BO2, INC4
  670. LFPDUX f17, BO, INC4
  671. LFPDUX f19, BO2, INC4
  672. subi BO, BO, 8 * SIZE
  673. subi BO2, BO2, 8 * SIZE
  674. #else
  675. LFPDUX f16, AO, INC4
  676. LFPDUX f17, AO2, INC4
  677. LFPDUX f18, AO, INC4
  678. LFPDUX f19, AO2, INC4
  679. subi AO, AO, 8 * SIZE
  680. subi AO2, AO2, 8 * SIZE
  681. #endif
  682. fpsub f0, f16, f0
  683. fpsub f1, f17, f1
  684. fpsub f8, f18, f8
  685. fpsub f9, f19, f9
  686. #ifdef LN
  687. LFPDUX A1, AO, INC4
  688. add AO2, AO2, INC4
  689. LFPDUX A2, AO, INC4
  690. LFPDUX A3, AO2, INC4
  691. subi AO, AO, 8 * SIZE
  692. subi AO2, AO2, 8 * SIZE
  693. fxpmul f4, A3, f1
  694. fxpmul f5, A3, f9
  695. FXCXNPMA f1, A3, f1, f4
  696. FXCXNPMA f9, A3, f9, f5
  697. fxcpnmsub f0, A2, f1, f0
  698. fxcpnmsub f8, A2, f9, f8
  699. FXCXNSMA f0, A2, f1, f0
  700. FXCXNSMA f8, A2, f9, f8
  701. fxpmul f4, A1, f0
  702. fxpmul f5, A1, f8
  703. FXCXNPMA f0, A1, f0, f4
  704. FXCXNPMA f8, A1, f8, f5
  705. #endif
  706. #ifdef LT
  707. LFPDUX A1, AO, INC4
  708. LFPDUX A2, AO2, INC4
  709. add AO, AO, INC4
  710. LFPDUX A3, AO2, INC4
  711. subi AO, AO, 8 * SIZE
  712. subi AO2, AO2, 8 * SIZE
  713. fxpmul f4, A1, f0
  714. fxpmul f5, A1, f8
  715. FXCXNPMA f0, A1, f0, f4
  716. FXCXNPMA f8, A1, f8, f5
  717. fxcpnmsub f1, A2, f0, f1
  718. fxcpnmsub f9, A2, f8, f9
  719. FXCXNSMA f1, A2, f0, f1
  720. FXCXNSMA f9, A2, f8, f9
  721. fxpmul f6, A3, f1
  722. fxpmul f7, A3, f9
  723. FXCXNPMA f1, A3, f1, f6
  724. FXCXNPMA f9, A3, f9, f7
  725. #endif
  726. #ifdef RN
  727. LFPDUX A1, BO, INC4
  728. LFPDUX A2, BO2, INC4
  729. add BO, BO, INC4
  730. LFPDUX A3, BO2, INC4
  731. subi BO, BO, 8 * SIZE
  732. subi BO2, BO2, 8 * SIZE
  733. fxpmul f4, A1, f0
  734. fxpmul f5, A1, f1
  735. FXCXNPMA f0, A1, f0, f4
  736. FXCXNPMA f1, A1, f1, f5
  737. fxcpnmsub f8, A2, f0, f8
  738. fxcpnmsub f9, A2, f1, f9
  739. FXCXNSMA f8, A2, f0, f8
  740. FXCXNSMA f9, A2, f1, f9
  741. fxpmul f4, A3, f8
  742. fxpmul f5, A3, f9
  743. FXCXNPMA f8, A3, f8, f4
  744. FXCXNPMA f9, A3, f9, f5
  745. #endif
  746. #ifdef RT
  747. LFPDUX A1, BO, INC4
  748. add BO2, BO2, INC4
  749. LFPDUX A2, BO, INC4
  750. LFPDUX A3, BO2, INC4
  751. subi BO, BO, 8 * SIZE
  752. subi BO2, BO2, 8 * SIZE
  753. fxpmul f4, A3, f8
  754. fxpmul f5, A3, f9
  755. FXCXNPMA f8, A3, f8, f4
  756. FXCXNPMA f9, A3, f9, f5
  757. fxcpnmsub f0, A2, f8, f0
  758. fxcpnmsub f1, A2, f9, f1
  759. FXCXNSMA f0, A2, f8, f0
  760. FXCXNSMA f1, A2, f9, f1
  761. fxpmul f4, A1, f0
  762. fxpmul f5, A1, f1
  763. FXCXNPMA f0, A1, f0, f4
  764. FXCXNPMA f1, A1, f1, f5
  765. #endif
  766. #ifdef LN
  767. subi CO1, CO1, 4 * SIZE
  768. subi CO2, CO2, 4 * SIZE
  769. #endif
  770. #if defined(LN) || defined(LT)
  771. STFPDUX f0, BO, INC4
  772. STFPDUX f8, BO2, INC4
  773. STFPDUX f1, BO, INC4
  774. STFPDUX f9, BO2, INC4
  775. subi BO, BO, 8 * SIZE
  776. subi BO2, BO2, 8 * SIZE
  777. #else
  778. STFPDUX f0, AO, INC4
  779. STFPDUX f1, AO2, INC4
  780. STFPDUX f8, AO, INC4
  781. STFPDUX f9, AO2, INC4
  782. subi AO, AO, 8 * SIZE
  783. subi AO2, AO2, 8 * SIZE
  784. #endif
  785. STFDUX f0, CO1, INC
  786. STFSDUX f0, CO1, INC
  787. STFDUX f1, CO1, INC
  788. STFSDUX f1, CO1, INC
  789. STFDUX f8, CO2, INC
  790. STFSDUX f8, CO2, INC
  791. STFDUX f9, CO2, INC
  792. STFSDUX f9, CO2, INC
  793. #ifdef LN
  794. subi CO1, CO1, 4 * SIZE
  795. subi CO2, CO2, 4 * SIZE
  796. #endif
  797. #ifdef RT
  798. slwi r0, K, 1 + ZBASE_SHIFT
  799. add AORIG, AORIG, r0
  800. #endif
  801. #if defined(LT) || defined(RN)
  802. sub TEMP, K, KK
  803. slwi r0, TEMP, 1 + ZBASE_SHIFT
  804. add AO, AO, r0
  805. add BO, BO, r0
  806. #endif
  807. #ifdef LT
  808. addi KK, KK, 2
  809. #endif
  810. #ifdef LN
  811. subi KK, KK, 2
  812. #endif
  813. li r0, FZERO
  814. lfpsx f0, SP, r0
  815. .align 4
  816. .L30:
  /* Four-row blocks: I = M >> 2 (record form sets CR0); skip to .L49 when I <= 0. */
  817. srawi. I, M, 2
  818. ble .L49
  819. .align 4
  820. .L11:
  /* Top of the per-block loop; re-entered by "bgt+ .L11" at the bottom of this pass.
     f0 is expected to hold the value loaded from SP + FZERO (presumably 0.0 -- confirm
     against the prologue); it is copied into f1-f15 to clear the accumulators. */
  821. #if defined(LT) || defined(RN)
  /* LT/RN: AO is already positioned; derive AO2 = AO + 2*SIZE and bias BO/BO2 below B
     (the LFPDUX loads that follow are presumably update-form, hence the pre-bias). */
  822. addi AO2, AO, 2 * SIZE
  823. fpmr f4, f0
  824. addi BO, B, - 4 * SIZE
  825. fpmr f8, f0
  826. addi BO2, B, - 2 * SIZE
  827. fpmr f12, f0
  828. fpmr f5, f0
  829. fpmr f9, f0
  830. fpmr f13, f0
  831. fpmr f2, f0
  832. fpmr f6, f0
  833. fpmr f10, f0
  834. fpmr f14, f0
  835. fpmr f3, f0
  836. fpmr f7, f0
  837. fpmr f11, f0
  838. fpmr f15, f0
  /* CTR = KK >> 2 (groups of four K steps); the ble below tests CR0 from this srawi. */
  839. srawi. r0, KK, 2
  840. fpmr f1, f0
  841. mtspr CTR, r0
  842. ble .L14
  843. #else
  844. #ifdef LN
  /* LN: step AORIG back by K << (2 + ZBASE_SHIFT) before addressing this block. */
  845. slwi r0, K, 2 + ZBASE_SHIFT
  846. sub AORIG, AORIG, r0
  847. #endif
  /* LN/RT: AO = AORIG + (KK << (2 + ZBASE_SHIFT)), BO = B + (KK << (1 + ZBASE_SHIFT)),
     TEMP = K - KK is the trip count for the product loops. */
  848. slwi r0 , KK, 2 + ZBASE_SHIFT
  849. slwi TEMP, KK, 1 + ZBASE_SHIFT
  850. add AO, AORIG, r0
  851. add BO, B, TEMP
  852. sub TEMP, K, KK
  853. fpmr f5, f0
  854. fpmr f9, f0
  855. fpmr f13, f0
  856. fpmr f2, f0
  857. fpmr f6, f0
  858. fpmr f10, f0
  859. fpmr f14, f0
  860. fpmr f3, f0
  861. fpmr f7, f0
  862. fpmr f11, f0
  863. fpmr f15, f0
  864. addi AO2, AO, 2 * SIZE
  865. fpmr f4, f0
  866. addi BO, BO, - 4 * SIZE
  867. fpmr f8, f0
  868. addi BO2, BO, 2 * SIZE
  869. fpmr f12, f0
  /* CTR = TEMP >> 2; the ble below tests CR0 from this srawi. */
  870. srawi. r0, TEMP, 2
  871. fpmr f1, f0
  872. mtspr CTR, r0
  873. ble .L14
  874. #endif
  /* Preload operands for the first unrolled group: A1, A3, A5-A9 via AO, A2 via AO2,
     B1/B3/B5 via BO, B2 via BO2. The interleaved fpmr instructions repeat clears that
     both paths above already performed. */
  875. LFPDUX A1, AO, INC4
  876. fpmr f5, f0
  877. LFPDUX A3, AO, INC4
  878. fpmr f9, f0
  879. LFPDUX B1, BO, INC4
  880. fpmr f13, f0
  881. LFPDUX A5, AO, INC4
  882. fpmr f2, f0
  883. LFPDUX A6, AO, INC4
  884. fpmr f6, f0
  885. LFPDUX B3, BO, INC4
  886. fpmr f10, f0
  887. LFPDUX A7, AO, INC4
  888. fpmr f14, f0
  889. LFPDUX A8, AO, INC4
  890. fpmr f3, f0
  891. LFPDUX B5, BO, INC4
  892. fpmr f7, f0
  893. LFPDUX A9, AO, INC4
  894. fpmr f11, f0
  895. LFPDUX A2, AO2, INC4
  896. fpmr f15, f0
  897. LFPDUX B2, BO2, INC4
  /* Decrement CTR; if only one group remains go straight to the drain at .L13. */
  898. bdz- .L13
  899. .align 4
  900. .L12:
  /* Main product loop, unrolled into four sections (## 1 ## .. ## 4 ##). Each section
     issues 16 FXCPMADD/FXCSMADD operations, one into each of f0-f15, combining two B
     operands with four A operands. Loads for later sections and for the next iteration
     are interleaved; nop occupies the slots that carry no load. FXCPMADD/FXCSMADD are
     defined outside this chunk (presumably the two cross multiply-add forms whose
     results are summed at .L18 to form the complex products -- confirm). */
  901. ## 1 ##
  902. FXCPMADD f0, B1, A1, f0
  903. nop
  904. FXCSMADD f4, B1, A1, f4
  905. nop
  906. FXCPMADD f8, B2, A1, f8
  907. LFPDUX B4, BO2, INC4
  908. FXCSMADD f12, B2, A1, f12
  909. LFPDUX B6, BO, INC4
  910. FXCPMADD f1, B1, A2, f1
  911. nop
  912. FXCSMADD f5, B1, A2, f5
  913. LFPDUX A4, AO2, INC4
  914. FXCPMADD f9, B2, A2, f9
  915. LFPDUX A10, AO, INC4
  916. FXCSMADD f13, B2, A2, f13
  917. nop
  918. FXCPMADD f2, B1, A3, f2
  919. nop
  920. FXCSMADD f6, B1, A3, f6
  921. nop
  922. FXCPMADD f10, B2, A3, f10
  923. nop
  924. FXCSMADD f14, B2, A3, f14
  925. nop
  926. FXCPMADD f3, B1, A4, f3
  927. nop
  928. FXCSMADD f7, B1, A4, f7
  929. LFPDUX A2, AO2, INC4
  930. FXCPMADD f11, B2, A4, f11
  931. LFPDUX A1, AO, INC4
  932. FXCSMADD f15, B2, A4, f15
  933. nop
  934. ## 2 ##
  935. FXCPMADD f0, B3, A5, f0
  936. nop
  937. FXCSMADD f4, B3, A5, f4
  938. nop
  939. FXCPMADD f8, B4, A5, f8
  940. LFPDUX B2, BO2, INC4
  941. FXCSMADD f12, B4, A5, f12
  942. LFPDUX B1, BO, INC4
  943. FXCPMADD f1, B3, A2, f1
  944. nop
  945. FXCSMADD f5, B3, A2, f5
  946. LFPDUX A4, AO2, INC4
  947. FXCPMADD f9, B4, A2, f9
  948. LFPDUX A3, AO, INC4
  949. FXCSMADD f13, B4, A2, f13
  950. nop
  951. FXCPMADD f2, B3, A6, f2
  952. nop
  953. FXCSMADD f6, B3, A6, f6
  954. nop
  955. FXCPMADD f10, B4, A6, f10
  956. nop
  957. FXCSMADD f14, B4, A6, f14
  958. nop
  959. FXCPMADD f3, B3, A4, f3
  960. nop
  961. FXCSMADD f7, B3, A4, f7
  962. LFPDUX A2, AO2, INC4
  963. FXCPMADD f11, B4, A4, f11
  964. LFPDUX A5, AO, INC4
  965. FXCSMADD f15, B4, A4, f15
  966. nop
  967. ## 3 ##
  968. FXCPMADD f0, B5, A7, f0
  969. nop
  970. FXCSMADD f4, B5, A7, f4
  971. nop
  972. FXCPMADD f8, B2, A7, f8
  973. LFPDUX B4, BO2, INC4
  974. FXCSMADD f12, B2, A7, f12
  975. LFPDUX B3, BO, INC4
  976. FXCPMADD f1, B5, A2, f1
  977. nop
  978. FXCSMADD f5, B5, A2, f5
  979. LFPDUX A4, AO2, INC4
  980. FXCPMADD f9, B2, A2, f9
  981. LFPDUX A6, AO, INC4
  982. FXCSMADD f13, B2, A2, f13
  983. nop
  984. FXCPMADD f2, B5, A8, f2
  985. nop
  986. FXCSMADD f6, B5, A8, f6
  987. nop
  988. FXCPMADD f10, B2, A8, f10
  989. nop
  990. FXCSMADD f14, B2, A8, f14
  991. nop
  992. FXCPMADD f3, B5, A4, f3
  993. nop
  994. FXCSMADD f7, B5, A4, f7
  995. LFPDUX A2, AO2, INC4
  996. FXCPMADD f11, B2, A4, f11
  997. LFPDUX A7, AO, INC4
  998. FXCSMADD f15, B2, A4, f15
  999. nop
  1000. ## 4 ##
  1001. FXCPMADD f0, B6, A9, f0
  1002. nop
  1003. FXCSMADD f4, B6, A9, f4
  1004. nop
  1005. FXCPMADD f8, B4, A9, f8
  1006. LFPDUX B2, BO2, INC4
  1007. FXCSMADD f12, B4, A9, f12
  1008. LFPDUX B5, BO, INC4
  1009. FXCPMADD f1, B6, A2, f1
  1010. nop
  1011. FXCSMADD f5, B6, A2, f5
  1012. LFPDUX A4, AO2, INC4
  1013. FXCPMADD f9, B4, A2, f9
  1014. LFPDUX A8, AO, INC4
  1015. FXCSMADD f13, B4, A2, f13
  1016. nop
  1017. FXCPMADD f2, B6, A10, f2
  1018. nop
  1019. FXCSMADD f6, B6, A10, f6
  1020. nop
  1021. FXCPMADD f10, B4, A10, f10
  1022. nop
  1023. FXCSMADD f14, B4, A10, f14
  1024. nop
  1025. FXCPMADD f3, B6, A4, f3
  1026. LFPDUX A2, AO2, INC4
  1027. FXCSMADD f7, B6, A4, f7
  1028. LFPDUX A9, AO, INC4
  1029. FXCPMADD f11, B4, A4, f11
  1030. nop
  1031. FXCSMADD f15, B4, A4, f15
  /* Decrement CTR and repeat while it is non-zero; otherwise fall into the drain. */
  1032. bdnz+ .L12
  1033. .align 4
  1034. .L13:
  /* Drain for the final group of four K steps: the same 4 x 16 multiply-add pattern as
     .L12, but only the loads still consumed inside this group remain; the loads that
     .L12 issues on behalf of a following iteration are replaced by nop. */
  1035. ## 1 ##
  1036. FXCPMADD f0, B1, A1, f0
  1037. nop
  1038. FXCSMADD f4, B1, A1, f4
  1039. nop
  1040. FXCPMADD f8, B2, A1, f8
  1041. LFPDUX B4, BO2, INC4
  1042. FXCSMADD f12, B2, A1, f12
  1043. LFPDUX B6, BO, INC4
  1044. FXCPMADD f1, B1, A2, f1
  1045. nop
  1046. FXCSMADD f5, B1, A2, f5
  1047. LFPDUX A4, AO2, INC4
  1048. FXCPMADD f9, B2, A2, f9
  1049. LFPDUX A10, AO, INC4
  1050. FXCSMADD f13, B2, A2, f13
  1051. nop
  1052. FXCPMADD f2, B1, A3, f2
  1053. nop
  1054. FXCSMADD f6, B1, A3, f6
  1055. nop
  1056. FXCPMADD f10, B2, A3, f10
  1057. nop
  1058. FXCSMADD f14, B2, A3, f14
  1059. nop
  1060. FXCPMADD f3, B1, A4, f3
  1061. nop
  1062. FXCSMADD f7, B1, A4, f7
  1063. LFPDUX A2, AO2, INC4
  1064. FXCPMADD f11, B2, A4, f11
  1065. nop
  1066. FXCSMADD f15, B2, A4, f15
  1067. nop
  1068. ## 2 ##
  1069. FXCPMADD f0, B3, A5, f0
  1070. nop
  1071. FXCSMADD f4, B3, A5, f4
  1072. nop
  1073. FXCPMADD f8, B4, A5, f8
  1074. LFPDUX B2, BO2, INC4
  1075. FXCSMADD f12, B4, A5, f12
  1076. nop
  1077. FXCPMADD f1, B3, A2, f1
  1078. nop
  1079. FXCSMADD f5, B3, A2, f5
  1080. LFPDUX A4, AO2, INC4
  1081. FXCPMADD f9, B4, A2, f9
  1082. nop
  1083. FXCSMADD f13, B4, A2, f13
  1084. nop
  1085. FXCPMADD f2, B3, A6, f2
  1086. nop
  1087. FXCSMADD f6, B3, A6, f6
  1088. nop
  1089. FXCPMADD f10, B4, A6, f10
  1090. nop
  1091. FXCSMADD f14, B4, A6, f14
  1092. nop
  1093. FXCPMADD f3, B3, A4, f3
  1094. nop
  1095. FXCSMADD f7, B3, A4, f7
  1096. LFPDUX A2, AO2, INC4
  1097. FXCPMADD f11, B4, A4, f11
  1098. nop
  1099. FXCSMADD f15, B4, A4, f15
  1100. nop
  1101. ## 3 ##
  1102. FXCPMADD f0, B5, A7, f0
  1103. nop
  1104. FXCSMADD f4, B5, A7, f4
  1105. nop
  1106. FXCPMADD f8, B2, A7, f8
  1107. LFPDUX B4, BO2, INC4
  1108. FXCSMADD f12, B2, A7, f12
  1109. nop
  1110. FXCPMADD f1, B5, A2, f1
  1111. nop
  1112. FXCSMADD f5, B5, A2, f5
  1113. LFPDUX A4, AO2, INC4
  1114. FXCPMADD f9, B2, A2, f9
  1115. nop
  1116. FXCSMADD f13, B2, A2, f13
  1117. nop
  1118. FXCPMADD f2, B5, A8, f2
  1119. nop
  1120. FXCSMADD f6, B5, A8, f6
  1121. nop
  1122. FXCPMADD f10, B2, A8, f10
  1123. nop
  1124. FXCSMADD f14, B2, A8, f14
  1125. nop
  1126. FXCPMADD f3, B5, A4, f3
  1127. nop
  1128. FXCSMADD f7, B5, A4, f7
  1129. LFPDUX A2, AO2, INC4
  1130. FXCPMADD f11, B2, A4, f11
  1131. nop
  1132. FXCSMADD f15, B2, A4, f15
  1133. nop
  1134. ## 4 ##
  1135. FXCPMADD f0, B6, A9, f0
  1136. nop
  1137. FXCSMADD f4, B6, A9, f4
  1138. nop
  1139. FXCPMADD f8, B4, A9, f8
  1140. nop
  1141. FXCSMADD f12, B4, A9, f12
  1142. nop
  1143. FXCPMADD f1, B6, A2, f1
  1144. nop
  1145. FXCSMADD f5, B6, A2, f5
  1146. LFPDUX A4, AO2, INC4
  1147. FXCPMADD f9, B4, A2, f9
  1148. nop
  1149. FXCSMADD f13, B4, A2, f13
  1150. nop
  1151. FXCPMADD f2, B6, A10, f2
  1152. nop
  1153. FXCSMADD f6, B6, A10, f6
  1154. nop
  1155. FXCPMADD f10, B4, A10, f10
  1156. nop
  1157. FXCSMADD f14, B4, A10, f14
  1158. nop
  1159. FXCPMADD f3, B6, A4, f3
  1160. nop
  1161. FXCSMADD f7, B6, A4, f7
  1162. nop
  1163. FXCPMADD f11, B4, A4, f11
  1164. nop
  1165. FXCSMADD f15, B4, A4, f15
  1166. nop
  1167. .align 4
  1168. .L14:
  /* K remainder: CTR = trip count & 3 (KK for LT/RN, TEMP = K - KK otherwise).
     andi. never yields a negative result, so "ble+" is taken exactly when the
     remainder is zero. */
  1169. #if defined(LT) || defined(RN)
  1170. andi. r0, KK, 3
  1171. mtspr CTR, r0
  1172. ble+ .L18
  1173. #else
  1174. andi. r0, TEMP, 3
  1175. mtspr CTR, r0
  1176. ble+ .L18
  1177. #endif
  1178. .L15:
  /* Preload the first remainder step: A2/A4 from AO/AO2, A10/B4 (used here as the two
     B operands) from BO/BO2. */
  1179. LFPDUX A2, AO, INC4
  1180. LFPDUX A4, AO2, INC4
  1181. LFPDUX A10, BO, INC4
  1182. LFPDUX B4, BO2, INC4
  /* Single remaining step: skip the loop and finish in .L17. */
  1183. bdz- .L17
  1184. .align 4
  1185. .L16:
  /* One K step per iteration: 16 multiply-adds into f0-f15. A2 and A4 are reloaded
     mid-step to supply the second half of the A operands, then all four operand
     registers are reloaded for the next step. */
  1186. FXCPMADD f0, A10, A2, f0
  1187. FXCSMADD f4, A10, A2, f4
  1188. FXCPMADD f8, B4, A2, f8
  1189. FXCSMADD f12, B4, A2, f12
  1190. LFPDUX A2, AO, INC4
  1191. FXCPMADD f1, A10, A4, f1
  1192. FXCSMADD f5, A10, A4, f5
  1193. FXCPMADD f9, B4, A4, f9
  1194. FXCSMADD f13, B4, A4, f13
  1195. LFPDUX A4, AO2, INC4
  1196. FXCPMADD f2, A10, A2, f2
  1197. FXCSMADD f6, A10, A2, f6
  1198. FXCPMADD f10, B4, A2, f10
  1199. FXCSMADD f14, B4, A2, f14
  1200. LFPDUX A2, AO, INC4
  1201. FXCPMADD f3, A10, A4, f3
  1202. FXCSMADD f7, A10, A4, f7
  1203. LFPDUX A10, BO, INC4
  1204. FXCPMADD f11, B4, A4, f11
  1205. FXCSMADD f15, B4, A4, f15
  1206. LFPDUX A4, AO2, INC4
  1207. LFPDUX B4, BO2, INC4
  1208. bdnz+ .L16
  1209. .align 4
  1210. .L17:
  /* Final remainder step: same arithmetic as .L16 without the loads for a next step. */
  1211. FXCPMADD f0, A10, A2, f0
  1212. FXCSMADD f4, A10, A2, f4
  1213. FXCPMADD f8, B4, A2, f8
  1214. FXCSMADD f12, B4, A2, f12
  1215. LFPDUX A2, AO, INC4
  1216. FXCPMADD f1, A10, A4, f1
  1217. FXCSMADD f5, A10, A4, f5
  1218. FXCPMADD f9, B4, A4, f9
  1219. FXCSMADD f13, B4, A4, f13
  1220. LFPDUX A4, AO2, INC4
  1221. FXCPMADD f2, A10, A2, f2
  1222. FXCSMADD f6, A10, A2, f6
  1223. FXCPMADD f10, B4, A2, f10
  1224. FXCSMADD f14, B4, A2, f14
  1225. FXCPMADD f3, A10, A4, f3
  1226. FXCSMADD f7, A10, A4, f7
  1227. FXCPMADD f11, B4, A4, f11
  1228. FXCSMADD f15, B4, A4, f15
  1229. .align 4
  1230. .L18:
  /* Fold the FXCSMADD partial sums (f4-f7, f12-f15) into the FXCPMADD partial sums,
     leaving the eight accumulated products in f0-f3 and f8-f11. */
  1231. fpadd f0, f0, f4
  1232. fpadd f8, f8, f12
  1233. fpadd f1, f1, f5
  1234. fpadd f9, f9, f13
  1235. fpadd f2, f2, f6
  1236. fpadd f10, f10, f14
  1237. fpadd f3, f3, f7
  1238. fpadd f11, f11, f15
  1239. #if defined(LN) || defined(RT)
  /* LN/RT: re-point AO/BO at the block being solved: index r0 = KK - 4 (LN) or
     KK - 2 (RT), scaled by 2 + ZBASE_SHIFT for A and 1 + ZBASE_SHIFT for B, then
     apply the same AO2/BO/BO2 bias as the loop setup. */
  1240. #ifdef LN
  1241. subi r0, KK, 4
  1242. #else
  1243. subi r0, KK, 2
  1244. #endif
  1245. slwi TEMP, r0, 2 + ZBASE_SHIFT
  1246. slwi r0, r0, 1 + ZBASE_SHIFT
  1247. add AO, AORIG, TEMP
  1248. add BO, B, r0
  1249. addi AO2, AO, 2 * SIZE
  1250. addi BO, BO, - 4 * SIZE
  1251. addi BO2, BO, 2 * SIZE
  1252. #endif
  /* Load the eight right-hand-side pairs into f16-f23 and rewind the pointers by
     16 * SIZE: from BO/BO2 for LN/LT, from AO/AO2 for RN/RT. Note the register
     assignment differs between the two paths (interleaved f16/f20/f17/... versus
     f16/f17/f18/...). */
  1253. #if defined(LN) || defined(LT)
  1254. LFPDUX f16, BO, INC4
  1255. LFPDUX f20, BO2, INC4
  1256. LFPDUX f17, BO, INC4
  1257. LFPDUX f21, BO2, INC4
  1258. LFPDUX f18, BO, INC4
  1259. LFPDUX f22, BO2, INC4
  1260. LFPDUX f19, BO, INC4
  1261. LFPDUX f23, BO2, INC4
  1262. subi BO, BO, 16 * SIZE
  1263. subi BO2, BO2, 16 * SIZE
  1264. #else
  1265. LFPDUX f16, AO, INC4
  1266. LFPDUX f17, AO2, INC4
  1267. LFPDUX f18, AO, INC4
  1268. LFPDUX f19, AO2, INC4
  1269. LFPDUX f20, AO, INC4
  1270. LFPDUX f21, AO2, INC4
  1271. LFPDUX f22, AO, INC4
  1272. LFPDUX f23, AO2, INC4
  1273. subi AO, AO, 16 * SIZE
  1274. subi AO2, AO2, 16 * SIZE
  1275. #endif
  /* Residual: each result register = loaded value - accumulated product. */
  1276. fpsub f0, f16, f0
  1277. fpsub f1, f17, f1
  1278. fpsub f2, f18, f2
  1279. fpsub f3, f19, f3
  1280. fpsub f8, f20, f8
  1281. fpsub f9, f21, f9
  1282. fpsub f10, f22, f10
  1283. fpsub f11, f23, f11
  1284. #ifdef LN
  /* Triangular substitution on the residuals in f0-f3 / f8-f11; one variant per
     LN/LT/RN/RT build flag. In every variant:
       - fxpmul + FXCXNPMA scales a pair by one coefficient (presumably a complex
         multiply by a stored diagonal entry, likely pre-inverted by the packing
         code -- TODO confirm);
       - fxcpnmsub + FXCXNSMA updates a not-yet-solved pair with the pair just solved
         (presumably a complex multiply-subtract -- confirm against the macro
         definitions, which are outside this chunk).
     LN: load ten coefficients A1-A10 from AO/AO2 (plain "add ptr, ptr, INC4" steps
     over entries that are not loaded), rewind both pointers by 32 * SIZE, then solve
     backwards from f3/f11 to f0/f8. */
  1285. LFPDUX A1, AO, INC4
  1286. add AO2, AO2, INC4
  1287. add AO, AO, INC4
  1288. add AO2, AO2, INC4
  1289. LFPDUX A2, AO, INC4
  1290. LFPDUX A3, AO2, INC4
  1291. add AO, AO, INC4
  1292. add AO2, AO2, INC4
  1293. LFPDUX A4, AO, INC4
  1294. LFPDUX A5, AO2, INC4
  1295. LFPDUX A6, AO, INC4
  1296. add AO2, AO2, INC4
  1297. LFPDUX A7, AO, INC4
  1298. LFPDUX A8, AO2, INC4
  1299. LFPDUX A9, AO, INC4
  1300. LFPDUX A10, AO2, INC4
  1301. subi AO, AO, 32 * SIZE
  1302. subi AO2, AO2, 32 * SIZE
  1303. fxpmul f4, A10, f3
  1304. fxpmul f5, A10, f11
  1305. FXCXNPMA f3, A10, f3, f4
  1306. FXCXNPMA f11, A10, f11, f5
  1307. fxcpnmsub f2, A9, f3, f2
  1308. fxcpnmsub f10, A9, f11, f10
  1309. FXCXNSMA f2, A9, f3, f2
  1310. FXCXNSMA f10, A9, f11, f10
  1311. fxcpnmsub f1, A8, f3, f1
  1312. fxcpnmsub f9, A8, f11, f9
  1313. FXCXNSMA f1, A8, f3, f1
  1314. FXCXNSMA f9, A8, f11, f9
  1315. fxcpnmsub f0, A7, f3, f0
  1316. fxcpnmsub f8, A7, f11, f8
  1317. FXCXNSMA f0, A7, f3, f0
  1318. FXCXNSMA f8, A7, f11, f8
  1319. fxpmul f4, A6, f2
  1320. fxpmul f5, A6, f10
  1321. FXCXNPMA f2, A6, f2, f4
  1322. FXCXNPMA f10, A6, f10, f5
  1323. fxcpnmsub f1, A5, f2, f1
  1324. fxcpnmsub f9, A5, f10, f9
  1325. FXCXNSMA f1, A5, f2, f1
  1326. FXCXNSMA f9, A5, f10, f9
  1327. fxcpnmsub f0, A4, f2, f0
  1328. fxcpnmsub f8, A4, f10, f8
  1329. FXCXNSMA f0, A4, f2, f0
  1330. FXCXNSMA f8, A4, f10, f8
  1331. fxpmul f4, A3, f1
  1332. fxpmul f5, A3, f9
  1333. FXCXNPMA f1, A3, f1, f4
  1334. FXCXNPMA f9, A3, f9, f5
  1335. fxcpnmsub f0, A2, f1, f0
  1336. fxcpnmsub f8, A2, f9, f8
  1337. FXCXNSMA f0, A2, f1, f0
  1338. FXCXNSMA f8, A2, f9, f8
  1339. fxpmul f4, A1, f0
  1340. fxpmul f5, A1, f8
  1341. FXCXNPMA f0, A1, f0, f4
  1342. FXCXNPMA f8, A1, f8, f5
  1343. #endif
  1344. #ifdef LT
  /* LT: load ten coefficients A1-A10 from AO/AO2 (skipping the unused entries),
     rewind by 32 * SIZE, then solve forwards from f0/f8 to f3/f11. */
  1345. LFPDUX A1, AO, INC4
  1346. LFPDUX A2, AO2, INC4
  1347. LFPDUX A3, AO, INC4
  1348. LFPDUX A4, AO2, INC4
  1349. add AO, AO, INC4
  1350. LFPDUX A5, AO2, INC4
  1351. LFPDUX A6, AO, INC4
  1352. LFPDUX A7, AO2, INC4
  1353. add AO, AO, INC4
  1354. add AO2, AO2, INC4
  1355. LFPDUX A8, AO, INC4
  1356. LFPDUX A9, AO2, INC4
  1357. add AO, AO, INC4
  1358. add AO2, AO2, INC4
  1359. add AO, AO, INC4
  1360. LFPDUX A10, AO2, INC4
  1361. subi AO, AO, 32 * SIZE
  1362. subi AO2, AO2, 32 * SIZE
  1363. fxpmul f4, A1, f0
  1364. fxpmul f5, A1, f8
  1365. FXCXNPMA f0, A1, f0, f4
  1366. FXCXNPMA f8, A1, f8, f5
  1367. fxcpnmsub f1, A2, f0, f1
  1368. fxcpnmsub f9, A2, f8, f9
  1369. FXCXNSMA f1, A2, f0, f1
  1370. FXCXNSMA f9, A2, f8, f9
  1371. fxcpnmsub f2, A3, f0, f2
  1372. fxcpnmsub f10, A3, f8, f10
  1373. FXCXNSMA f2, A3, f0, f2
  1374. FXCXNSMA f10, A3, f8, f10
  1375. fxcpnmsub f3, A4, f0, f3
  1376. fxcpnmsub f11, A4, f8, f11
  1377. FXCXNSMA f3, A4, f0, f3
  1378. FXCXNSMA f11, A4, f8, f11
  1379. fxpmul f6, A5, f1
  1380. fxpmul f7, A5, f9
  1381. FXCXNPMA f1, A5, f1, f6
  1382. FXCXNPMA f9, A5, f9, f7
  1383. fxcpnmsub f2, A6, f1, f2
  1384. fxcpnmsub f10, A6, f9, f10
  1385. FXCXNSMA f2, A6, f1, f2
  1386. FXCXNSMA f10, A6, f9, f10
  1387. fxcpnmsub f3, A7, f1, f3
  1388. fxcpnmsub f11, A7, f9, f11
  1389. FXCXNSMA f3, A7, f1, f3
  1390. FXCXNSMA f11, A7, f9, f11
  1391. fxpmul f4, A8, f2
  1392. fxpmul f5, A8, f10
  1393. FXCXNPMA f2, A8, f2, f4
  1394. FXCXNPMA f10, A8, f10, f5
  1395. fxcpnmsub f3, A9, f2, f3
  1396. fxcpnmsub f11, A9, f10, f11
  1397. FXCXNSMA f3, A9, f2, f3
  1398. FXCXNSMA f11, A9, f10, f11
  1399. fxpmul f6, A10, f3
  1400. fxpmul f7, A10, f11
  1401. FXCXNPMA f3, A10, f3, f6
  1402. FXCXNPMA f11, A10, f11, f7
  1403. #endif
  1404. #ifdef RN
  /* RN: load three coefficients A1-A3 from BO/BO2 (one BO entry skipped), rewind by
     8 * SIZE, solve f0-f3 with A1, eliminate into f8-f11 with A2, then solve f8-f11
     with A3. */
  1405. LFPDUX A1, BO, INC4
  1406. LFPDUX A2, BO2, INC4
  1407. add BO, BO, INC4
  1408. LFPDUX A3, BO2, INC4
  1409. subi BO, BO, 8 * SIZE
  1410. subi BO2, BO2, 8 * SIZE
  1411. fxpmul f4, A1, f0
  1412. fxpmul f5, A1, f1
  1413. fxpmul f6, A1, f2
  1414. fxpmul f7, A1, f3
  1415. FXCXNPMA f0, A1, f0, f4
  1416. FXCXNPMA f1, A1, f1, f5
  1417. FXCXNPMA f2, A1, f2, f6
  1418. FXCXNPMA f3, A1, f3, f7
  1419. fxcpnmsub f8, A2, f0, f8
  1420. fxcpnmsub f9, A2, f1, f9
  1421. fxcpnmsub f10, A2, f2, f10
  1422. fxcpnmsub f11, A2, f3, f11
  1423. FXCXNSMA f8, A2, f0, f8
  1424. FXCXNSMA f9, A2, f1, f9
  1425. FXCXNSMA f10, A2, f2, f10
  1426. FXCXNSMA f11, A2, f3, f11
  1427. fxpmul f4, A3, f8
  1428. fxpmul f5, A3, f9
  1429. fxpmul f6, A3, f10
  1430. fxpmul f7, A3, f11
  1431. FXCXNPMA f8, A3, f8, f4
  1432. FXCXNPMA f9, A3, f9, f5
  1433. FXCXNPMA f10, A3, f10, f6
  1434. FXCXNPMA f11, A3, f11, f7
  1435. #endif
  1436. #ifdef RT
  /* RT: load three coefficients A1-A3 from BO/BO2 (one BO2 entry skipped), rewind by
     8 * SIZE, solve f8-f11 with A3, eliminate into f0-f3 with A2, then solve f0-f3
     with A1 (the reverse order of RN). */
  1437. LFPDUX A1, BO, INC4
  1438. add BO2, BO2, INC4
  1439. LFPDUX A2, BO, INC4
  1440. LFPDUX A3, BO2, INC4
  1441. subi BO, BO, 8 * SIZE
  1442. subi BO2, BO2, 8 * SIZE
  1443. fxpmul f4, A3, f8
  1444. fxpmul f5, A3, f9
  1445. fxpmul f6, A3, f10
  1446. fxpmul f7, A3, f11
  1447. FXCXNPMA f8, A3, f8, f4
  1448. FXCXNPMA f9, A3, f9, f5
  1449. FXCXNPMA f10, A3, f10, f6
  1450. FXCXNPMA f11, A3, f11, f7
  1451. fxcpnmsub f0, A2, f8, f0
  1452. fxcpnmsub f1, A2, f9, f1
  1453. fxcpnmsub f2, A2, f10, f2
  1454. fxcpnmsub f3, A2, f11, f3
  1455. FXCXNSMA f0, A2, f8, f0
  1456. FXCXNSMA f1, A2, f9, f1
  1457. FXCXNSMA f2, A2, f10, f2
  1458. FXCXNSMA f3, A2, f11, f3
  1459. fxpmul f4, A1, f0
  1460. fxpmul f5, A1, f1
  1461. fxpmul f6, A1, f2
  1462. fxpmul f7, A1, f3
  1463. FXCXNPMA f0, A1, f0, f4
  1464. FXCXNPMA f1, A1, f1, f5
  1465. FXCXNPMA f2, A1, f2, f6
  1466. FXCXNPMA f3, A1, f3, f7
  1467. #endif
  1468. #ifdef LN
  /* LN walks C backwards: step CO1/CO2 back by 8 * SIZE before the stores below. */
  1469. subi CO1, CO1, 8 * SIZE
  1470. subi CO2, CO2, 8 * SIZE
  1471. #endif
  /* Write the solved values back over the right-hand side they were loaded from
     (BO/BO2 for LN/LT, AO/AO2 for RN/RT), using the same register interleave as the
     loads in .L18, then rewind the pointers by 16 * SIZE. */
  1472. #if defined(LN) || defined(LT)
  1473. STFPDUX f0, BO, INC4
  1474. STFPDUX f8, BO2, INC4
  1475. STFPDUX f1, BO, INC4
  1476. STFPDUX f9, BO2, INC4
  1477. STFPDUX f2, BO, INC4
  1478. STFPDUX f10, BO2, INC4
  1479. STFPDUX f3, BO, INC4
  1480. STFPDUX f11, BO2, INC4
  1481. subi BO, BO, 16 * SIZE
  1482. subi BO2, BO2, 16 * SIZE
  1483. #else
  1484. STFPDUX f0, AO, INC4
  1485. STFPDUX f1, AO2, INC4
  1486. STFPDUX f2, AO, INC4
  1487. STFPDUX f3, AO2, INC4
  1488. STFPDUX f8, AO, INC4
  1489. STFPDUX f9, AO2, INC4
  1490. STFPDUX f10, AO, INC4
  1491. STFPDUX f11, AO2, INC4
  1492. subi AO, AO, 16 * SIZE
  1493. subi AO2, AO2, 16 * SIZE
  1494. #endif
  /* Store the results to C: f0-f3 through CO1 and f8-f11 through CO2, each register as
     an STFDUX/STFSDUX pair stepping by INC (presumably the primary and secondary
     halves, i.e. real and imaginary parts -- confirm). */
  1495. STFDUX f0, CO1, INC
  1496. STFSDUX f0, CO1, INC
  1497. STFDUX f1, CO1, INC
  1498. STFSDUX f1, CO1, INC
  1499. STFDUX f2, CO1, INC
  1500. STFSDUX f2, CO1, INC
  1501. STFDUX f3, CO1, INC
  1502. STFSDUX f3, CO1, INC
  1503. STFDUX f8, CO2, INC
  1504. STFSDUX f8, CO2, INC
  1505. STFDUX f9, CO2, INC
  1506. STFSDUX f9, CO2, INC
  1507. STFDUX f10, CO2, INC
  1508. STFSDUX f10, CO2, INC
  1509. STFDUX f11, CO2, INC
  1510. STFSDUX f11, CO2, INC
  1511. #ifdef LN
  /* LN: undo the advance made by the stores so the next block again lies below. */
  1512. subi CO1, CO1, 8 * SIZE
  1513. subi CO2, CO2, 8 * SIZE
  1514. #endif
  1515. #ifdef RT
  /* RT: AORIG += K << (2 + ZBASE_SHIFT). */
  1516. slwi r0, K, 2 + ZBASE_SHIFT
  1517. add AORIG, AORIG, r0
  1518. #endif
  1519. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps that were not consumed: AO advances by
     (K - KK) << (2 + ZBASE_SHIFT), BO by (K - KK) << (1 + ZBASE_SHIFT). */
  1520. sub TEMP, K, KK
  1521. slwi r0, TEMP, 2 + ZBASE_SHIFT
  1522. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  1523. add AO, AO, r0
  1524. add BO, BO, TEMP
  1525. #endif
  1526. #ifdef LT
  1527. addi KK, KK, 4
  1528. #endif
  1529. #ifdef LN
  1530. subi KK, KK, 4
  1531. #endif
  /* I -= 1 (sets CR0), reload f0 from SP + FZERO for the next accumulator clear
     (li/lfpsx leave CR0 untouched), and loop while I > 0. */
  1532. addic. I, I, -1
  1533. li r0, FZERO
  1534. lfpsx f0, SP, r0
  1535. bgt+ .L11
  1536. .align 4
  1537. .L49:
  /* End of one two-column pass: advance B and KK according to the variant, then
     decrement J and loop to .L10 (outside this chunk) while J > 0. */
  1538. #ifdef LN
  /* LN: B += K << (1 + ZBASE_SHIFT). */
  1539. slwi r0, K, 1 + ZBASE_SHIFT
  1540. add B, B, r0
  1541. #endif
  1542. #if defined(LT) || defined(RN)
  /* LT/RN: B continues from where BO ended (undoing the -4 * SIZE bias). */
  1543. addi B, BO, 4 * SIZE
  1544. #endif
  1545. #ifdef RN
  1546. addi KK, KK, 2
  1547. #endif
  1548. #ifdef RT
  1549. subi KK, KK, 2
  1550. #endif
  1551. addic. J, J, -1
  1552. bgt+ .L10
  1553. .align 4
  1554. .L50:
  /* Single remaining column: J = N & 1; nothing to do (branch to .L999) when N is even. */
  1555. andi. J, N, 1
  1556. beq .L999
  1557. #ifdef RT
  /* RT: step B back by K << ZBASE_SHIFT and C back by LDC before using them. */
  1558. slwi r0, K, 0 + ZBASE_SHIFT
  1559. sub B, B, r0
  1560. sub C, C, LDC
  1561. #endif
  1562. mr CO1, C
  1563. #ifdef LN
  1564. add KK, M, OFFSET
  1565. #endif
  1566. #ifdef LT
  1567. mr KK, OFFSET
  1568. #endif
  /* A base pointer biased by -2 * SIZE (the loads in this section step by INC2). */
  1569. #if defined(LN) || defined(RT)
  1570. addi AORIG, A, -2 * SIZE
  1571. #else
  1572. addi AO, A, -2 * SIZE
  1573. #endif
  1574. #ifndef RT
  /* NOTE(review): CO2 is not written anywhere on this single-column path within view,
     so C is derived from whatever CO2 held on entry -- confirm this is intended
     (C does not appear to be read again in the visible remainder of this path). */
  1575. add C, CO2, LDC
  1576. #endif
  /* Reload f0 from SP + FZERO (used below to clear accumulators). */
  1577. li r0, FZERO
  1578. lfpsx f0, SP, r0
  /* Handle a single leftover row first: skip to .L60 when M & 1 == 0. */
  1579. andi. I, M, 1
  1580. beq .L60
  1581. #if defined(LT) || defined(RN)
  /* One row x one column product. LT/RN: BO = B - 2 * SIZE, clear f1-f3 from f0,
     CTR = KK >> 3 (groups of eight K steps); skip to the remainder at .L74 if <= 0. */
  1582. addi BO, B, - 2 * SIZE
  1583. fpmr f1, f0
  1584. fpmr f2, f0
  1585. fpmr f3, f0
  1586. srawi. r0, KK, 3
  1587. mtspr CTR, r0
  1588. ble .L74
  1589. #else
  1590. #ifdef LN
  /* LN: AORIG -= K << ZBASE_SHIFT. */
  1591. slwi r0, K, 0 + ZBASE_SHIFT
  1592. sub AORIG, AORIG, r0
  1593. #endif
  /* LN/RT: AO = AORIG + (KK << ZBASE_SHIFT), BO = B + (KK << ZBASE_SHIFT) - 2 * SIZE,
     TEMP = K - KK, CTR = TEMP >> 3. */
  1594. slwi TEMP, KK, 0 + ZBASE_SHIFT
  1595. add AO, AORIG, TEMP
  1596. add BO, B, TEMP
  1597. sub TEMP, K, KK
  1598. addi BO, BO, - 2 * SIZE
  1599. fpmr f1, f0
  1600. fpmr f2, f0
  1601. fpmr f3, f0
  1602. srawi. r0, TEMP, 3
  1603. mtspr CTR, r0
  1604. ble .L74
  1605. #endif
  /* Preload eight A operands (A1-A8) and eight B operands (B1-B6, with A9 and A10
     serving as the seventh and eighth B registers). */
  1606. LFPDUX A1, AO, INC2
  1607. LFPDUX B1, BO, INC2
  1608. LFPDUX A2, AO, INC2
  1609. LFPDUX B2, BO, INC2
  1610. LFPDUX A3, AO, INC2
  1611. LFPDUX B3, BO, INC2
  1612. LFPDUX A4, AO, INC2
  1613. LFPDUX B4, BO, INC2
  1614. LFPDUX A5, AO, INC2
  1615. LFPDUX B5, BO, INC2
  1616. LFPDUX A6, AO, INC2
  1617. LFPDUX B6, BO, INC2
  1618. LFPDUX A7, AO, INC2
  1619. LFPDUX A9, BO, INC2
  1620. LFPDUX A8, AO, INC2
  1621. LFPDUX A10, BO, INC2
  1622. bdz- .L73
  1623. .align 4
  1624. .L72:
  /* Eight K steps per iteration, alternating between accumulator pairs f0/f1 and
     f2/f3; each operand pair is reloaded right after it is consumed. */
  1625. FXCPMADD f0, B1, A1, f0
  1626. FXCSMADD f1, B1, A1, f1
  1627. LFPDUX A1, AO, INC2
  1628. LFPDUX B1, BO, INC2
  1629. FXCPMADD f2, B2, A2, f2
  1630. FXCSMADD f3, B2, A2, f3
  1631. LFPDUX A2, AO, INC2
  1632. LFPDUX B2, BO, INC2
  1633. FXCPMADD f0, B3, A3, f0
  1634. FXCSMADD f1, B3, A3, f1
  1635. LFPDUX A3, AO, INC2
  1636. LFPDUX B3, BO, INC2
  1637. FXCPMADD f2, B4, A4, f2
  1638. FXCSMADD f3, B4, A4, f3
  1639. LFPDUX A4, AO, INC2
  1640. LFPDUX B4, BO, INC2
  1641. FXCPMADD f0, B5, A5, f0
  1642. FXCSMADD f1, B5, A5, f1
  1643. LFPDUX A5, AO, INC2
  1644. LFPDUX B5, BO, INC2
  1645. FXCPMADD f2, B6, A6, f2
  1646. FXCSMADD f3, B6, A6, f3
  1647. LFPDUX A6, AO, INC2
  1648. LFPDUX B6, BO, INC2
  1649. FXCPMADD f0, A9, A7, f0
  1650. FXCSMADD f1, A9, A7, f1
  1651. LFPDUX A7, AO, INC2
  1652. LFPDUX A9, BO, INC2
  1653. FXCPMADD f2, A10, A8, f2
  1654. FXCSMADD f3, A10, A8, f3
  1655. LFPDUX A8, AO, INC2
  1656. LFPDUX A10, BO, INC2
  1657. bdnz+ .L72
  1658. .align 4
  1659. .L73:
  /* Drain: the last eight K steps using the operands already loaded, with no reloads. */
  1660. FXCPMADD f0, B1, A1, f0
  1661. FXCSMADD f1, B1, A1, f1
  1662. FXCPMADD f2, B2, A2, f2
  1663. FXCSMADD f3, B2, A2, f3
  1664. FXCPMADD f0, B3, A3, f0
  1665. FXCSMADD f1, B3, A3, f1
  1666. FXCPMADD f2, B4, A4, f2
  1667. FXCSMADD f3, B4, A4, f3
  1668. FXCPMADD f0, B5, A5, f0
  1669. FXCSMADD f1, B5, A5, f1
  1670. FXCPMADD f2, B6, A6, f2
  1671. FXCSMADD f3, B6, A6, f3
  1672. FXCPMADD f0, A9, A7, f0
  1673. FXCSMADD f1, A9, A7, f1
  1674. FXCPMADD f2, A10, A8, f2
  1675. FXCSMADD f3, A10, A8, f3
  1676. .align 4
  1677. .L74:
  /* K remainder for the 1 x 1 case: CTR = trip count & 7 (KK for LT/RN, TEMP
     otherwise); go to .L78 when the remainder is zero. */
  1678. #if defined(LT) || defined(RN)
  1679. andi. r0, KK, 7
  1680. mtspr CTR, r0
  1681. ble+ .L78
  1682. #else
  1683. andi. r0, TEMP, 7
  1684. mtspr CTR, r0
  1685. ble+ .L78
  1686. #endif
  /* Preload the first operand pair; a single remaining step goes straight to .L77. */
  1687. LFPDUX A1, AO, INC2
  1688. LFPDUX B1, BO, INC2
  1689. bdz- .L77
  1690. .align 4
  1691. .L76:
  /* One K step per iteration, reloading A1/B1 for the next. */
  1692. FXCPMADD f0, B1, A1, f0
  1693. FXCSMADD f1, B1, A1, f1
  1694. LFPDUX A1, AO, INC2
  1695. LFPDUX B1, BO, INC2
  1696. bdnz+ .L76
  1697. .align 4
  1698. .L77:
  /* Final remainder step, no reload. */
  1699. FXCPMADD f0, B1, A1, f0
  1700. FXCSMADD f1, B1, A1, f1
  1701. .align 4
  1702. .L78:
  /* Collapse the four partial accumulators into f0: f0 += f2, f1 += f3, f0 += f1. */
  1703. fpadd f0, f0, f2
  1704. fpadd f1, f1, f3
  1705. fpadd f0, f0, f1
  1706. #if defined(LN) || defined(RT)
  /* LN/RT: re-point AO/BO at entry KK - 1 (both arms of the inner #ifdef are
     identical here), scaled by ZBASE_SHIFT; BO gets the usual -2 * SIZE bias. */
  1707. #ifdef LN
  1708. subi r0, KK, 1
  1709. #else
  1710. subi r0, KK, 1
  1711. #endif
  1712. slwi TEMP, r0, 0 + ZBASE_SHIFT
  1713. add AO, AORIG, TEMP
  1714. add BO, B, TEMP
  1715. addi BO, BO, - 2 * SIZE
  1716. #endif
  /* Load the right-hand side into f16 with LFPDX (no pointer rewind follows, so this
     is presumably the non-updating indexed form): from BO for LN/LT, AO for RN/RT. */
  1717. #if defined(LN) || defined(LT)
  1718. LFPDX f16, BO, INC2
  1719. #else
  1720. LFPDX f16, AO, INC2
  1721. #endif
  /* Residual: f0 = f16 - f0. */
  1722. fpsub f0, f16, f0
  /* All four variants perform the same fxpmul + FXCXNPMA scaling of f0 by a single
     coefficient A1 (presumably a complex multiply by the stored diagonal entry --
     confirm); they differ only in whether A1 is read via AO (LN/LT) or BO (RN/RT). */
  1723. #ifdef LN
  1724. LFPDX A1, AO, INC2
  1725. fxpmul f4, A1, f0
  1726. FXCXNPMA f0, A1, f0, f4
  1727. #endif
  1728. #ifdef LT
  1729. LFPDX A1, AO, INC2
  1730. fxpmul f4, A1, f0
  1731. FXCXNPMA f0, A1, f0, f4
  1732. #endif
  1733. #ifdef RN
  1734. LFPDX A1, BO, INC2
  1735. fxpmul f4, A1, f0
  1736. FXCXNPMA f0, A1, f0, f4
  1737. #endif
  1738. #ifdef RT
  1739. LFPDX A1, BO, INC2
  1740. fxpmul f4, A1, f0
  1741. FXCXNPMA f0, A1, f0, f4
  1742. #endif
  1743. #ifdef LN
  /* LN walks C backwards: step CO1 back by 2 * SIZE before storing. */
  1744. subi CO1, CO1, 2 * SIZE
  1745. #endif
  /* Write the solved value back to the buffer it was loaded from. */
  1746. #if defined(LN) || defined(LT)
  1747. STFPDX f0, BO, INC2
  1748. #else
  1749. STFPDX f0, AO, INC2
  1750. #endif
  /* Store the result to C as an STFDUX/STFSDUX pair stepping by INC. */
  1751. STFDUX f0, CO1, INC
  1752. STFSDUX f0, CO1, INC
  1753. #ifdef LN
  1754. subi CO1, CO1, 2 * SIZE
  1755. #endif
  1756. #ifdef RT
  /* RT: AORIG += K << ZBASE_SHIFT. */
  1757. slwi r0, K, 0 + ZBASE_SHIFT
  1758. add AORIG, AORIG, r0
  1759. #endif
  1760. #if defined(LT) || defined(RN)
  /* LT/RN: advance AO and BO past the K - KK steps that were not consumed. */
  1761. sub TEMP, K, KK
  1762. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  1763. add AO, AO, TEMP
  1764. add BO, BO, TEMP
  1765. #endif
  1766. #ifdef LT
  1767. addi KK, KK, 1
  1768. #endif
  1769. #ifdef LN
  1770. subi KK, KK, 1
  1771. #endif
  /* Reload f0 from SP + FZERO for the next accumulator clear. */
  1772. li r0, FZERO
  1773. lfpsx f0, SP, r0
  1774. .align 4
  1775. .L60:
  /* Two leftover rows x one column: skip to .L70 when M & 2 == 0. */
  1776. andi. I, M, 2
  1777. beq .L70
  1778. #if defined(LT) || defined(RN)
  /* LT/RN: clear f1-f3 from f0, BO = B - 2 * SIZE, CTR = KK >> 2. */
  1779. fpmr f1, f0
  1780. addi BO, B, - 2 * SIZE
  1781. fpmr f2, f0
  1782. fpmr f3, f0
  1783. srawi. r0, KK, 2
  1784. mtspr CTR, r0
  1785. ble .L64
  1786. #else
  1787. #ifdef LN
  /* LN: AORIG -= K << (1 + ZBASE_SHIFT). */
  1788. slwi r0, K, 1 + ZBASE_SHIFT
  1789. sub AORIG, AORIG, r0
  1790. #endif
  /* LN/RT: AO = AORIG + (KK << (1 + ZBASE_SHIFT)), BO = B + (KK << ZBASE_SHIFT)
     - 2 * SIZE, TEMP = K - KK, CTR = TEMP >> 2. */
  1791. slwi r0 , KK, 1 + ZBASE_SHIFT
  1792. slwi TEMP, KK, 0 + ZBASE_SHIFT
  1793. add AO, AORIG, r0
  1794. add BO, B, TEMP
  1795. sub TEMP, K, KK
  1796. fpmr f1, f0
  1797. addi BO, BO, - 2 * SIZE
  1798. fpmr f2, f0
  1799. fpmr f3, f0
  1800. srawi. r0, TEMP, 2
  1801. mtspr CTR, r0
  1802. ble .L64
  1803. #endif
  /* Preload four K steps: one B operand and two A operands per step. */
  1804. LFPDUX B1, BO, INC2
  1805. LFPDUX A1, AO, INC2
  1806. LFPDUX A2, AO, INC2
  1807. LFPDUX B2, BO, INC2
  1808. LFPDUX A3, AO, INC2
  1809. LFPDUX A4, AO, INC2
  1810. LFPDUX B3, BO, INC2
  1811. LFPDUX A5, AO, INC2
  1812. LFPDUX A6, AO, INC2
  1813. LFPDUX B4, BO, INC2
  1814. LFPDUX A7, AO, INC2
  1815. LFPDUX A8, AO, INC2
  1816. bdz- .L63
  1817. .align 4
  1818. .L62:
  /* Four K steps per iteration; each step accumulates into f0/f2 (first A operand)
     and f1/f3 (second A operand) and reloads its operands for the next iteration. */
  1819. FXCPMADD f0, B1, A1, f0
  1820. FXCSMADD f2, B1, A1, f2
  1821. LFPDUX A1, AO, INC2
  1822. FXCPMADD f1, B1, A2, f1
  1823. FXCSMADD f3, B1, A2, f3
  1824. LFPDUX A2, AO, INC2
  1825. LFPDUX B1, BO, INC2
  1826. FXCPMADD f0, B2, A3, f0
  1827. FXCSMADD f2, B2, A3, f2
  1828. LFPDUX A3, AO, INC2
  1829. FXCPMADD f1, B2, A4, f1
  1830. FXCSMADD f3, B2, A4, f3
  1831. LFPDUX A4, AO, INC2
  1832. LFPDUX B2, BO, INC2
  1833. FXCPMADD f0, B3, A5, f0
  1834. FXCSMADD f2, B3, A5, f2
  1835. LFPDUX A5, AO, INC2
  1836. FXCPMADD f1, B3, A6, f1
  1837. FXCSMADD f3, B3, A6, f3
  1838. LFPDUX A6, AO, INC2
  1839. LFPDUX B3, BO, INC2
  1840. FXCPMADD f0, B4, A7, f0
  1841. FXCSMADD f2, B4, A7, f2
  1842. LFPDUX A7, AO, INC2
  1843. FXCPMADD f1, B4, A8, f1
  1844. FXCSMADD f3, B4, A8, f3
  1845. LFPDUX A8, AO, INC2
  1846. LFPDUX B4, BO, INC2
  1847. bdnz+ .L62
  1848. .align 4
  1849. .L63:
  /* Drain: the last four K steps from the operands already loaded, with no reloads. */
  1850. FXCPMADD f0, B1, A1, f0
  1851. FXCSMADD f2, B1, A1, f2
  1852. FXCPMADD f1, B1, A2, f1
  1853. FXCSMADD f3, B1, A2, f3
  1854. FXCPMADD f0, B2, A3, f0
  1855. FXCSMADD f2, B2, A3, f2
  1856. FXCPMADD f1, B2, A4, f1
  1857. FXCSMADD f3, B2, A4, f3
  1858. FXCPMADD f0, B3, A5, f0
  1859. FXCSMADD f2, B3, A5, f2
  1860. FXCPMADD f1, B3, A6, f1
  1861. FXCSMADD f3, B3, A6, f3
  1862. FXCPMADD f0, B4, A7, f0
  1863. FXCSMADD f2, B4, A7, f2
  1864. FXCPMADD f1, B4, A8, f1
  1865. FXCSMADD f3, B4, A8, f3
  1866. .align 4
  1867. .L64:
  /* K remainder for the 2 x 1 case: CTR = trip count & 3 (KK for LT/RN, TEMP
     otherwise); go to .L68 when the remainder is zero. */
  1868. #if defined(LT) || defined(RN)
  1869. andi. r0, KK, 3
  1870. mtspr CTR, r0
  1871. ble+ .L68
  1872. #else
  1873. andi. r0, TEMP, 3
  1874. mtspr CTR, r0
  1875. ble+ .L68
  1876. #endif
  /* Preload one step (two A operands, one B operand); a single remaining step goes
     straight to .L67. */
  1877. LFPDUX A1, AO, INC2
  1878. LFPDUX B1, BO, INC2
  1879. LFPDUX A2, AO, INC2
  1880. bdz- .L67
  1881. .align 4
  1882. .L66:
  /* One K step per iteration, reloading A1, B1 and A2 for the next. */
  1883. FXCPMADD f0, B1, A1, f0
  1884. FXCSMADD f2, B1, A1, f2
  1885. LFPDUX A1, AO, INC2
  1886. FXCPMADD f1, B1, A2, f1
  1887. FXCSMADD f3, B1, A2, f3
  1888. LFPDUX B1, BO, INC2
  1889. LFPDUX A2, AO, INC2
  1890. bdnz+ .L66
  1891. .align 4
  1892. .L67:
  /* Final remainder step, no reload. */
  1893. FXCPMADD f0, B1, A1, f0
  1894. FXCSMADD f2, B1, A1, f2
  1895. FXCPMADD f1, B1, A2, f1
  1896. FXCSMADD f3, B1, A2, f3
  1897. .align 4
  1898. .L68:
  /* Fold the FXCSMADD partial sums into the FXCPMADD ones: f0 += f2, f1 += f3. */
  1899. fpadd f0, f0, f2
  1900. fpadd f1, f1, f3
  1901. #if defined(LN) || defined(RT)
  /* LN/RT: re-point AO/BO at the block being solved: index r0 = KK - 2 (LN) or
     KK - 1 (RT), scaled by 1 + ZBASE_SHIFT for A and ZBASE_SHIFT for B; BO gets the
     usual -2 * SIZE bias. */
  1902. #ifdef LN
  1903. subi r0, KK, 2
  1904. #else
  1905. subi r0, KK, 1
  1906. #endif
  1907. slwi TEMP, r0, 1 + ZBASE_SHIFT
  1908. slwi r0, r0, 0 + ZBASE_SHIFT
  1909. add AO, AORIG, TEMP
  1910. add BO, B, r0
  1911. addi BO, BO, - 2 * SIZE
  1912. #endif
  /* Load the two right-hand-side pairs into f16/f17 and rewind the pointer by
     4 * SIZE: from BO for LN/LT, from AO for RN/RT. */
  1913. #if defined(LN) || defined(LT)
  1914. LFPDUX f16, BO, INC2
  1915. LFPDUX f17, BO, INC2
  1916. subi BO, BO, 4 * SIZE
  1917. #else
  1918. LFPDUX f16, AO, INC2
  1919. LFPDUX f17, AO, INC2
  1920. subi AO, AO, 4 * SIZE
  1921. #endif
  /* Residuals: f0 = f16 - f0, f1 = f17 - f1. */
  1922. fpsub f0, f16, f0
  1923. fpsub f1, f17, f1
  1924. #ifdef LN
  /* LN: load three coefficients A1-A3 from AO (one entry skipped), rewind by
     8 * SIZE, then solve f1 with A3, eliminate into f0 with A2, solve f0 with A1.
     The fxpmul + FXCXNPMA and fxcpnmsub + FXCXNSMA pairs are presumably a complex
     multiply and a complex multiply-subtract respectively -- confirm against the
     macro definitions outside this chunk. */
  1925. LFPDUX A1, AO, INC2
  1926. add AO, AO, INC2
  1927. LFPDUX A2, AO, INC2
  1928. LFPDUX A3, AO, INC2
  1929. subi AO, AO, 8 * SIZE
  1930. fxpmul f4, A3, f1
  1931. FXCXNPMA f1, A3, f1, f4
  1932. fxcpnmsub f0, A2, f1, f0
  1933. FXCXNSMA f0, A2, f1, f0
  1934. fxpmul f4, A1, f0
  1935. FXCXNPMA f0, A1, f0, f4
  1936. #endif
  1937. #ifdef LT
  /* LT: load A1-A3 from AO (one entry skipped), rewind by 8 * SIZE, then solve f0
     with A1, eliminate into f1 with A2, solve f1 with A3. */
  1938. LFPDUX A1, AO, INC2
  1939. LFPDUX A2, AO, INC2
  1940. add AO, AO, INC2
  1941. LFPDUX A3, AO, INC2
  1942. subi AO, AO, 8 * SIZE
  1943. fxpmul f4, A1, f0
  1944. FXCXNPMA f0, A1, f0, f4
  1945. fxcpnmsub f1, A2, f0, f1
  1946. FXCXNSMA f1, A2, f0, f1
  1947. fxpmul f6, A3, f1
  1948. FXCXNPMA f1, A3, f1, f6
  1949. #endif
  1950. #ifdef RN
  /* RN: a single coefficient A1 read from BO scales both f0 and f1. */
  1951. LFPDX A1, BO, INC2
  1952. fxpmul f4, A1, f0
  1953. fxpmul f5, A1, f1
  1954. FXCXNPMA f0, A1, f0, f4
  1955. FXCXNPMA f1, A1, f1, f5
  1956. #endif
  1957. #ifdef RT
  /* RT: identical instruction sequence to RN for this one-column case. */
  1958. LFPDX A1, BO, INC2
  1959. fxpmul f4, A1, f0
  1960. fxpmul f5, A1, f1
  1961. FXCXNPMA f0, A1, f0, f4
  1962. FXCXNPMA f1, A1, f1, f5
  1963. #endif
  1964. #ifdef LN
  /* LN walks C backwards: step CO1 back by 4 * SIZE before storing. */
  1965. subi CO1, CO1, 4 * SIZE
  1966. #endif
  /* Write the solved values back to the buffer they were loaded from and rewind. */
  1967. #if defined(LN) || defined(LT)
  1968. STFPDUX f0, BO, INC2
  1969. STFPDUX f1, BO, INC2
  1970. subi BO, BO, 4 * SIZE
  1971. #else
  1972. STFPDUX f0, AO, INC2
  1973. STFPDUX f1, AO, INC2
  1974. subi AO, AO, 4 * SIZE
  1975. #endif
  /* Store the results to C, each register as an STFDUX/STFSDUX pair stepping by INC. */
  1976. STFDUX f0, CO1, INC
  1977. STFSDUX f0, CO1, INC
  1978. STFDUX f1, CO1, INC
  1979. STFSDUX f1, CO1, INC
  1980. #ifdef LN
  1981. subi CO1, CO1, 4 * SIZE
  1982. #endif
  1983. #ifdef RT
  /* RT: AORIG += K << (1 + ZBASE_SHIFT). */
  1984. slwi r0, K, 1 + ZBASE_SHIFT
  1985. add AORIG, AORIG, r0
  1986. #endif
  1987. #if defined(LT) || defined(RN)
  /* LT/RN: advance AO by (K - KK) << (1 + ZBASE_SHIFT) and BO by
     (K - KK) << ZBASE_SHIFT to skip the steps that were not consumed. */
  1988. sub TEMP, K, KK
  1989. slwi r0, TEMP, 1 + ZBASE_SHIFT
  1990. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  1991. add AO, AO, r0
  1992. add BO, BO, TEMP
  1993. #endif
  1994. #ifdef LT
  1995. addi KK, KK, 2
  1996. #endif
  1997. #ifdef LN
  1998. subi KK, KK, 2
  1999. #endif
  /* Reload f0 from SP + FZERO for the next accumulator clear. */
  2000. li r0, FZERO
  2001. lfpsx f0, SP, r0
  2002. .align 4
  2003. .L70:
  /* Four-row blocks of the single-column pass: I = M >> 2; skip to .L89 (outside this
     chunk) when I <= 0, otherwise fall into the loop at .L51. */
  2004. srawi. I, M, 2
  2005. ble .L89
  2006. .align 4
  /*
   * .L51: set-up for one pass of the loop that is closed by "bgt+ .L51".
   * Copies f0 into f1..f7 (f0 is reloaded from SP + FZERO before each
   * pass; presumably that slot holds zero -- confirm), positions AO/BO,
   * loads CTR with the trip count of the 4-way unrolled loop and
   * preloads the first operands.
   */
  2007. .L51:
  2008. #if defined(LT) || defined(RN)
  /* LT/RN: BO = B - 2 * SIZE; CTR = KK >> 2; go to .L54 when that is <= 0. */
  2009. fpmr f4, f0
  2010. addi BO, B, - 2 * SIZE
  2011. fpmr f1, f0
  2012. fpmr f5, f0
  2013. fpmr f2, f0
  2014. fpmr f6, f0
  2015. fpmr f3, f0
  2016. fpmr f7, f0
  2017. srawi. r0, KK, 2
  2018. mtspr CTR, r0
  2019. ble .L54
  2020. #else
  2021. #ifdef LN
  /* LN only: AORIG -= K << (2 + ZBASE_SHIFT). */
  2022. slwi r0, K, 2 + ZBASE_SHIFT
  2023. sub AORIG, AORIG, r0
  2024. #endif
  /*
   * AO = AORIG + (KK << (2 + ZBASE_SHIFT)); BO = B + (KK << ZBASE_SHIFT);
   * TEMP = K - KK. BO is then moved back by 2 * SIZE.
   * CTR = TEMP >> 2; go to .L54 when that is <= 0.
   */
  2025. slwi r0 , KK, 2 + ZBASE_SHIFT
  2026. slwi TEMP, KK, 0 + ZBASE_SHIFT
  2027. add AO, AORIG, r0
  2028. add BO, B, TEMP
  2029. sub TEMP, K, KK
  2030. fpmr f4, f0
  2031. addi BO, BO, - 2 * SIZE
  2032. fpmr f1, f0
  2033. fpmr f5, f0
  2034. fpmr f2, f0
  2035. fpmr f6, f0
  2036. fpmr f3, f0
  2037. fpmr f7, f0
  2038. srawi. r0, TEMP, 2
  2039. mtspr CTR, r0
  2040. ble .L54
  2041. #endif
  /*
   * Preload B1..B3 through BO and A1..A8 through AO, each load using INC2
   * as the index (presumably update-form, advancing the pointer -- confirm
   * against the LFPDUX macro definition).
   */
  2042. LFPDUX B1, BO, INC2
  2043. LFPDUX A1, AO, INC2
  2044. LFPDUX A2, AO, INC2
  2045. LFPDUX B2, BO, INC2
  2046. LFPDUX A3, AO, INC2
  2047. LFPDUX A4, AO, INC2
  2048. LFPDUX B3, BO, INC2
  2049. LFPDUX A5, AO, INC2
  2050. LFPDUX A6, AO, INC2
  2051. LFPDUX A7, AO, INC2
  2052. LFPDUX A8, AO, INC2
  /* Decrement CTR; when it reaches zero go straight to the drain at .L53. */
  2053. bdz- .L53
  2054. .align 4
  /*
   * .L52: main loop body. Each trip issues four groups of eight
   * multiply-add macros: (B1 x A1..A4), (B2 x A5..A8), (B3 x A1..A4),
   * (B4 x A5..A8). FXCPMADD accumulates into f0..f3 and FXCSMADD into
   * f4..f7. The interleaved LFPDUX loads refill B and A registers for a
   * later group; nop occupies the slots that have no load (presumably to
   * keep the instruction pairing regular -- confirm).
   */
  2055. .L52:
  /* Group 1: B1 with A1..A4; B4 is fetched, A1..A4 are reloaded. */
  2056. FXCPMADD f0, B1, A1, f0
  2057. LFPDUX B4, BO, INC2
  2058. FXCSMADD f4, B1, A1, f4
  2059. LFPDUX A1, AO, INC2
  2060. FXCPMADD f1, B1, A2, f1
  2061. nop
  2062. FXCSMADD f5, B1, A2, f5
  2063. LFPDUX A2, AO, INC2
  2064. FXCPMADD f2, B1, A3, f2
  2065. nop
  2066. FXCSMADD f6, B1, A3, f6
  2067. LFPDUX A3, AO, INC2
  2068. FXCPMADD f3, B1, A4, f3
  2069. nop
  2070. FXCSMADD f7, B1, A4, f7
  2071. LFPDUX A4, AO, INC2
  /* Group 2: B2 with A5..A8; B1 and A5..A8 are reloaded. */
  2072. FXCPMADD f0, B2, A5, f0
  2073. LFPDUX B1, BO, INC2
  2074. FXCSMADD f4, B2, A5, f4
  2075. LFPDUX A5, AO, INC2
  2076. FXCPMADD f1, B2, A6, f1
  2077. nop
  2078. FXCSMADD f5, B2, A6, f5
  2079. LFPDUX A6, AO, INC2
  2080. FXCPMADD f2, B2, A7, f2
  2081. nop
  2082. FXCSMADD f6, B2, A7, f6
  2083. LFPDUX A7, AO, INC2
  2084. FXCPMADD f3, B2, A8, f3
  2085. nop
  2086. FXCSMADD f7, B2, A8, f7
  2087. LFPDUX A8, AO, INC2
  /* Group 3: B3 with A1..A4; B2 and A1..A4 are reloaded. */
  2088. FXCPMADD f0, B3, A1, f0
  2089. LFPDUX B2, BO, INC2
  2090. FXCSMADD f4, B3, A1, f4
  2091. LFPDUX A1, AO, INC2
  2092. FXCPMADD f1, B3, A2, f1
  2093. nop
  2094. FXCSMADD f5, B3, A2, f5
  2095. LFPDUX A2, AO, INC2
  2096. FXCPMADD f2, B3, A3, f2
  2097. nop
  2098. FXCSMADD f6, B3, A3, f6
  2099. LFPDUX A3, AO, INC2
  2100. FXCPMADD f3, B3, A4, f3
  2101. nop
  2102. FXCSMADD f7, B3, A4, f7
  2103. LFPDUX A4, AO, INC2
  /* Group 4: B4 with A5..A8; B3 and A5..A8 are reloaded. */
  2104. FXCPMADD f0, B4, A5, f0
  2105. LFPDUX B3, BO, INC2
  2106. FXCSMADD f4, B4, A5, f4
  2107. LFPDUX A5, AO, INC2
  2108. FXCPMADD f1, B4, A6, f1
  2109. nop
  2110. FXCSMADD f5, B4, A6, f5
  2111. LFPDUX A6, AO, INC2
  2112. FXCPMADD f2, B4, A7, f2
  2113. nop
  2114. FXCSMADD f6, B4, A7, f6
  2115. LFPDUX A7, AO, INC2
  2116. FXCPMADD f3, B4, A8, f3
  2117. nop
  2118. FXCSMADD f7, B4, A8, f7
  2119. LFPDUX A8, AO, INC2
  /* Decrement CTR and repeat while it is non-zero. */
  2120. bdnz+ .L52
  2121. .align 4
  /*
   * .L53: drain of the unrolled loop. Same four groups of multiply-add
   * macros as .L52, but only B4 and one further set of A1..A8 are
   * loaded; B1..B3 are not refilled, and the last two groups run
   * without any loads. Falls through to .L54.
   */
  2122. .L53:
  /* Group 1: B1 with A1..A4; B4 is fetched, A1..A4 are reloaded. */
  2123. FXCPMADD f0, B1, A1, f0
  2124. LFPDUX B4, BO, INC2
  2125. FXCSMADD f4, B1, A1, f4
  2126. LFPDUX A1, AO, INC2
  2127. FXCPMADD f1, B1, A2, f1
  2128. nop
  2129. FXCSMADD f5, B1, A2, f5
  2130. LFPDUX A2, AO, INC2
  2131. FXCPMADD f2, B1, A3, f2
  2132. nop
  2133. FXCSMADD f6, B1, A3, f6
  2134. LFPDUX A3, AO, INC2
  2135. FXCPMADD f3, B1, A4, f3
  2136. nop
  2137. FXCSMADD f7, B1, A4, f7
  2138. LFPDUX A4, AO, INC2
  /* Group 2: B2 with A5..A8; only A5..A8 are reloaded. */
  2139. FXCPMADD f0, B2, A5, f0
  2140. nop
  2141. FXCSMADD f4, B2, A5, f4
  2142. LFPDUX A5, AO, INC2
  2143. FXCPMADD f1, B2, A6, f1
  2144. nop
  2145. FXCSMADD f5, B2, A6, f5
  2146. LFPDUX A6, AO, INC2
  2147. FXCPMADD f2, B2, A7, f2
  2148. nop
  2149. FXCSMADD f6, B2, A7, f6
  2150. LFPDUX A7, AO, INC2
  2151. FXCPMADD f3, B2, A8, f3
  2152. nop
  2153. FXCSMADD f7, B2, A8, f7
  2154. LFPDUX A8, AO, INC2
  /* Group 3: B3 with A1..A4, no loads. */
  2155. FXCPMADD f0, B3, A1, f0
  2156. FXCSMADD f4, B3, A1, f4
  2157. FXCPMADD f1, B3, A2, f1
  2158. FXCSMADD f5, B3, A2, f5
  2159. FXCPMADD f2, B3, A3, f2
  2160. FXCSMADD f6, B3, A3, f6
  2161. FXCPMADD f3, B3, A4, f3
  2162. FXCSMADD f7, B3, A4, f7
  /* Group 4: B4 with A5..A8, no loads. */
  2163. FXCPMADD f0, B4, A5, f0
  2164. FXCSMADD f4, B4, A5, f4
  2165. FXCPMADD f1, B4, A6, f1
  2166. FXCSMADD f5, B4, A6, f5
  2167. FXCPMADD f2, B4, A7, f2
  2168. FXCSMADD f6, B4, A7, f6
  2169. FXCPMADD f3, B4, A8, f3
  2170. FXCSMADD f7, B4, A8, f7
  2171. .align 4
  /*
   * .L54: remainder set-up. CTR = count & 3, where count is KK for LT/RN
   * and TEMP otherwise (TEMP was set to K - KK in .L51). When the
   * remainder is zero control goes to .L58; otherwise the first operands
   * of the remainder loop are preloaded.
   */
  2172. .L54:
  2173. #if defined(LT) || defined(RN)
  2174. andi. r0, KK, 3
  2175. mtspr CTR, r0
  2176. ble+ .L58
  2177. #else
  2178. andi. r0, TEMP, 3
  2179. mtspr CTR, r0
  2180. ble+ .L58
  2181. #endif
  /* Preload A1..A4 through AO and B1 through BO. */
  2182. LFPDUX A1, AO, INC2
  2183. LFPDUX B1, BO, INC2
  2184. LFPDUX A2, AO, INC2
  2185. LFPDUX A3, AO, INC2
  2186. LFPDUX A4, AO, INC2
  /* Decrement CTR; when it reaches zero only the drain at .L57 is needed. */
  2187. bdz- .L57
  2188. .align 4
  /*
   * .L56: remainder loop, one step per trip. Eight multiply-add macros
   * combine B1 with A1..A4 (FXCPMADD into f0..f3, FXCSMADD into f4..f7);
   * A1..A4 and B1 are reloaded for the next step.
   */
  2189. .L56:
  2190. FXCPMADD f0, B1, A1, f0
  2191. FXCSMADD f4, B1, A1, f4
  2192. LFPDUX A1, AO, INC2
  2193. FXCPMADD f1, B1, A2, f1
  2194. FXCSMADD f5, B1, A2, f5
  2195. LFPDUX A2, AO, INC2
  2196. FXCPMADD f2, B1, A3, f2
  2197. FXCSMADD f6, B1, A3, f6
  2198. LFPDUX A3, AO, INC2
  2199. FXCPMADD f3, B1, A4, f3
  2200. FXCSMADD f7, B1, A4, f7
  2201. LFPDUX A4, AO, INC2
  /* B1 is reloaded last, after every use of the old value above. */
  2202. LFPDUX B1, BO, INC2
  /* Decrement CTR and repeat while it is non-zero. */
  2203. bdnz+ .L56
  2204. .align 4
  /*
   * .L57: last remainder step. Same eight multiply-add macros as one trip
   * of .L56, using the operands already loaded; no further loads.
   * Falls through to .L58.
   */
  2205. .L57:
  2206. FXCPMADD f0, B1, A1, f0
  2207. FXCSMADD f4, B1, A1, f4
  2208. FXCPMADD f1, B1, A2, f1
  2209. FXCSMADD f5, B1, A2, f5
  2210. FXCPMADD f2, B1, A3, f2
  2211. FXCSMADD f6, B1, A3, f6
  2212. FXCPMADD f3, B1, A4, f3
  2213. FXCSMADD f7, B1, A4, f7
  2214. .align 4
  /*
   * .L58: finish one pass of the .L51 loop. Folds the two accumulator
   * sets together, subtracts the result from values loaded from the
   * packed buffer, applies the variant-specific solve step (LN / LT /
   * RN / RT), stores the results back to the packed buffer and to CO1,
   * updates the pointers and KK, and loops back to .L51 while I > 0.
   */
  2215. .L58:
  /* f0..f3 += f4..f7. */
  2216. fpadd f0, f0, f4
  2217. fpadd f1, f1, f5
  2218. fpadd f2, f2, f6
  2219. fpadd f3, f3, f7
  2220. #if defined(LN) || defined(RT)
  /*
   * LN/RT: r0 = KK - 4 (LN) or KK - 1 (RT);
   * AO = AORIG + (r0 << (2 + ZBASE_SHIFT));
   * BO = B + (r0 << ZBASE_SHIFT) - 2 * SIZE.
   */
  2221. #ifdef LN
  2222. subi r0, KK, 4
  2223. #else
  2224. subi r0, KK, 1
  2225. #endif
  2226. slwi TEMP, r0, 2 + ZBASE_SHIFT
  2227. slwi r0, r0, 0 + ZBASE_SHIFT
  2228. add AO, AORIG, TEMP
  2229. add BO, B, r0
  2230. addi BO, BO, - 2 * SIZE
  2231. #endif
  /*
   * Load f16..f19 through BO (LN/LT) or through AO (otherwise), then move
   * that pointer back by 8 * SIZE.
   */
  2232. #if defined(LN) || defined(LT)
  2233. LFPDUX f16, BO, INC2
  2234. LFPDUX f17, BO, INC2
  2235. LFPDUX f18, BO, INC2
  2236. LFPDUX f19, BO, INC2
  2237. subi BO, BO, 8 * SIZE
  2238. #else
  2239. LFPDUX f16, AO, INC2
  2240. LFPDUX f17, AO, INC2
  2241. LFPDUX f18, AO, INC2
  2242. LFPDUX f19, AO, INC2
  2243. subi AO, AO, 8 * SIZE
  2244. #endif
  /* f0..f3 = f16..f19 - f0..f3. */
  2245. fpsub f0, f16, f0
  2246. fpsub f1, f17, f1
  2247. fpsub f2, f18, f2
  2248. fpsub f3, f19, f3
  2249. #ifdef LN
  /*
   * LN: load A1..A10 through AO in the pattern 1 load + 3 skips,
   * 2 loads + 2 skips, 3 loads + 1 skip, 4 loads (each skip is
   * "add AO, AO, INC2"), then move AO back by 32 * SIZE.
   * Presumably this picks one triangle of a 4x4 block -- confirm
   * against the packing routine.
   */
  2250. LFPDUX A1, AO, INC2
  2251. add AO, AO, INC2
  2252. add AO, AO, INC2
  2253. add AO, AO, INC2
  2254. LFPDUX A2, AO, INC2
  2255. LFPDUX A3, AO, INC2
  2256. add AO, AO, INC2
  2257. add AO, AO, INC2
  2258. LFPDUX A4, AO, INC2
  2259. LFPDUX A5, AO, INC2
  2260. LFPDUX A6, AO, INC2
  2261. add AO, AO, INC2
  2262. LFPDUX A7, AO, INC2
  2263. LFPDUX A8, AO, INC2
  2264. LFPDUX A9, AO, INC2
  2265. LFPDUX A10, AO, INC2
  2266. subi AO, AO, 32 * SIZE
  /*
   * Processing order f3, f2, f1, f0: each register is first combined with
   * its own coefficient (A10, A6, A3, A1) via fxpmul + FXCXNPMA, then the
   * fxcpnmsub + FXCXNSMA pairs update the registers not yet processed.
   */
  2267. fxpmul f4, A10, f3
  2268. FXCXNPMA f3, A10, f3, f4
  2269. fxcpnmsub f2, A9, f3, f2
  2270. FXCXNSMA f2, A9, f3, f2
  2271. fxcpnmsub f1, A8, f3, f1
  2272. FXCXNSMA f1, A8, f3, f1
  2273. fxcpnmsub f0, A7, f3, f0
  2274. FXCXNSMA f0, A7, f3, f0
  2275. fxpmul f4, A6, f2
  2276. FXCXNPMA f2, A6, f2, f4
  2277. fxcpnmsub f1, A5, f2, f1
  2278. FXCXNSMA f1, A5, f2, f1
  2279. fxcpnmsub f0, A4, f2, f0
  2280. FXCXNSMA f0, A4, f2, f0
  2281. fxpmul f4, A3, f1
  2282. FXCXNPMA f1, A3, f1, f4
  2283. fxcpnmsub f0, A2, f1, f0
  2284. FXCXNSMA f0, A2, f1, f0
  2285. fxpmul f4, A1, f0
  2286. FXCXNPMA f0, A1, f0, f4
  2287. #endif
  2288. #ifdef LT
  /*
   * LT: load A1..A10 through AO in the pattern 4 loads + 1 skip,
   * 3 loads + 2 skips, 2 loads + 3 skips, 1 load, then move AO back by
   * 32 * SIZE. Presumably the opposite triangle of the 4x4 block --
   * confirm against the packing routine.
   */
  2289. LFPDUX A1, AO, INC2
  2290. LFPDUX A2, AO, INC2
  2291. LFPDUX A3, AO, INC2
  2292. LFPDUX A4, AO, INC2
  2293. add AO, AO, INC2
  2294. LFPDUX A5, AO, INC2
  2295. LFPDUX A6, AO, INC2
  2296. LFPDUX A7, AO, INC2
  2297. add AO, AO, INC2
  2298. add AO, AO, INC2
  2299. LFPDUX A8, AO, INC2
  2300. LFPDUX A9, AO, INC2
  2301. add AO, AO, INC2
  2302. add AO, AO, INC2
  2303. add AO, AO, INC2
  2304. LFPDUX A10, AO, INC2
  2305. subi AO, AO, 32 * SIZE
  /*
   * Processing order f0, f1, f2, f3, with own coefficients A1, A5, A8,
   * A10; the scratch register alternates between f4 and f6.
   */
  2306. fxpmul f4, A1, f0
  2307. FXCXNPMA f0, A1, f0, f4
  2308. fxcpnmsub f1, A2, f0, f1
  2309. FXCXNSMA f1, A2, f0, f1
  2310. fxcpnmsub f2, A3, f0, f2
  2311. FXCXNSMA f2, A3, f0, f2
  2312. fxcpnmsub f3, A4, f0, f3
  2313. FXCXNSMA f3, A4, f0, f3
  2314. fxpmul f6, A5, f1
  2315. FXCXNPMA f1, A5, f1, f6
  2316. fxcpnmsub f2, A6, f1, f2
  2317. FXCXNSMA f2, A6, f1, f2
  2318. fxcpnmsub f3, A7, f1, f3
  2319. FXCXNSMA f3, A7, f1, f3
  2320. fxpmul f4, A8, f2
  2321. FXCXNPMA f2, A8, f2, f4
  2322. fxcpnmsub f3, A9, f2, f3
  2323. FXCXNSMA f3, A9, f2, f3
  2324. fxpmul f6, A10, f3
  2325. FXCXNPMA f3, A10, f3, f6
  2326. #endif
  2327. #ifdef RN
  /*
   * RN: a single coefficient A1 is loaded through BO with LFPDX
   * (presumably without pointer update -- confirm) and applied to all of
   * f0..f3 via fxpmul + FXCXNPMA.
   */
  2328. LFPDX A1, BO, INC2
  2329. fxpmul f4, A1, f0
  2330. fxpmul f5, A1, f1
  2331. fxpmul f6, A1, f2
  2332. fxpmul f7, A1, f3
  2333. FXCXNPMA f0, A1, f0, f4
  2334. FXCXNPMA f1, A1, f1, f5
  2335. FXCXNPMA f2, A1, f2, f6
  2336. FXCXNPMA f3, A1, f3, f7
  2337. #endif
  2338. #ifdef RT
  /* RT: same instruction sequence as the RN case. */
  2339. LFPDX A1, BO, INC2
  2340. fxpmul f4, A1, f0
  2341. fxpmul f5, A1, f1
  2342. fxpmul f6, A1, f2
  2343. fxpmul f7, A1, f3
  2344. FXCXNPMA f0, A1, f0, f4
  2345. FXCXNPMA f1, A1, f1, f5
  2346. FXCXNPMA f2, A1, f2, f6
  2347. FXCXNPMA f3, A1, f3, f7
  2348. #endif
  2349. #ifdef LN
  /* LN: move CO1 back by 8 * SIZE before the stores to C. */
  2350. subi CO1, CO1, 8 * SIZE
  2351. #endif
  /*
   * Store f0..f3 back through BO (LN/LT) or through AO (otherwise), then
   * move that pointer back by 8 * SIZE.
   */
  2352. #if defined(LN) || defined(LT)
  2353. STFPDUX f0, BO, INC2
  2354. STFPDUX f1, BO, INC2
  2355. STFPDUX f2, BO, INC2
  2356. STFPDUX f3, BO, INC2
  2357. subi BO, BO, 8 * SIZE
  2358. #else
  2359. STFPDUX f0, AO, INC2
  2360. STFPDUX f1, AO, INC2
  2361. STFPDUX f2, AO, INC2
  2362. STFPDUX f3, AO, INC2
  2363. subi AO, AO, 8 * SIZE
  2364. #endif
  /*
   * Store f0..f3 through CO1 with index INC: one STFDUX and one STFSDUX
   * per register (presumably the primary and secondary halves of the
   * paired register -- confirm against the macro definitions).
   */
  2365. STFDUX f0, CO1, INC
  2366. STFSDUX f0, CO1, INC
  2367. STFDUX f1, CO1, INC
  2368. STFSDUX f1, CO1, INC
  2369. STFDUX f2, CO1, INC
  2370. STFSDUX f2, CO1, INC
  2371. STFDUX f3, CO1, INC
  2372. STFSDUX f3, CO1, INC
  2373. #ifdef LN
  /* LN: move CO1 back by 8 * SIZE again after the stores. */
  2374. subi CO1, CO1, 8 * SIZE
  2375. #endif
  2376. #ifdef RT
  /* RT: AORIG += K << (2 + ZBASE_SHIFT). */
  2377. slwi r0, K, 2 + ZBASE_SHIFT
  2378. add AORIG, AORIG, r0
  2379. #endif
  2380. #if defined(LT) || defined(RN)
  /*
   * LT/RN: TEMP = K - KK; AO += TEMP << (2 + ZBASE_SHIFT);
   * BO += TEMP << ZBASE_SHIFT.
   */
  2381. sub TEMP, K, KK
  2382. slwi r0, TEMP, 2 + ZBASE_SHIFT
  2383. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  2384. add AO, AO, r0
  2385. add BO, BO, TEMP
  2386. #endif
  2387. #ifdef LT
  2388. addi KK, KK, 4
  2389. #endif
  2390. #ifdef LN
  2391. subi KK, KK, 4
  2392. #endif
  /*
   * I -= 1 (record form sets CR0). f0 is reloaded from SP + FZERO for the
   * next pass; the branch below still tests the CR0 value produced by
   * addic., since neither li nor the load modifies it.
   */
  2393. addic. I, I, -1
  2394. li r0, FZERO
  2395. lfpsx f0, SP, r0
  2396. bgt+ .L51
  2397. .align 4
  /*
   * .L89: bookkeeping after the .L51 loop.
   *   LN:    B += K << ZBASE_SHIFT
   *   LT/RN: B  = BO + 2 * SIZE
   *   RN:    KK += 1
   *   RT:    KK -= 1
   * Falls through to the epilogue at .L999.
   */
  2398. .L89:
  2399. #ifdef LN
  2400. slwi r0, K, 0 + ZBASE_SHIFT
  2401. add B, B, r0
  2402. #endif
  2403. #if defined(LT) || defined(RN)
  2404. addi B, BO, 2 * SIZE
  2405. #endif
  2406. #ifdef RN
  2407. addi KK, KK, 1
  2408. #endif
  2409. #ifdef RT
  2410. subi KK, KK, 1
  2411. #endif
  2412. .align 4
  /*
   * .L999: common exit. Restores r14..r31 and f14..f31 from the stack
   * frame and returns with blr. The frame layout itself is created by the
   * prologue, which is not visible here -- the offsets below must mirror
   * it.
   */
  2413. .L999:
  /* Position SP so that the first lwzu (which pre-increments by 4) hits the r14 slot. */
  2414. addi SP, SP, 20
  /* Restore r14..r31; each lwzu loads from SP + 4 and updates SP to that address. */
  2415. lwzu r14, 4(SP)
  2416. lwzu r15, 4(SP)
  2417. lwzu r16, 4(SP)
  2418. lwzu r17, 4(SP)
  2419. lwzu r18, 4(SP)
  2420. lwzu r19, 4(SP)
  2421. lwzu r20, 4(SP)
  2422. lwzu r21, 4(SP)
  2423. lwzu r22, 4(SP)
  2424. lwzu r23, 4(SP)
  2425. lwzu r24, 4(SP)
  2426. lwzu r25, 4(SP)
  2427. lwzu r26, 4(SP)
  2428. lwzu r27, 4(SP)
  2429. lwzu r28, 4(SP)
  2430. lwzu r29, 4(SP)
  2431. lwzu r30, 4(SP)
  2432. lwzu r31, 4(SP)
  /* Re-position SP for the floating-point restores; r0 = 16 is their index step. */
  2433. subi SP, SP, 12
  2434. li r0, 16
  /* Restore f31 down to f14, one paired register per 16-byte step of SP. */
  2435. lfpdux f31, SP, r0
  2436. lfpdux f30, SP, r0
  2437. lfpdux f29, SP, r0
  2438. lfpdux f28, SP, r0
  2439. lfpdux f27, SP, r0
  2440. lfpdux f26, SP, r0
  2441. lfpdux f25, SP, r0
  2442. lfpdux f24, SP, r0
  2443. lfpdux f23, SP, r0
  2444. lfpdux f22, SP, r0
  2445. lfpdux f21, SP, r0
  2446. lfpdux f20, SP, r0
  2447. lfpdux f19, SP, r0
  2448. lfpdux f18, SP, r0
  2449. lfpdux f17, SP, r0
  2450. lfpdux f16, SP, r0
  2451. lfpdux f15, SP, r0
  2452. lfpdux f14, SP, r0
  /* Step SP past the last restored slot, then return to the caller. */
  2453. addi SP, SP, 16
  2454. blr
  2455. .align 4
  2456. EPILOGUE
  2457. #endif