You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_ncopy_hummer_8.S 22 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Register aliases for this routine. PROLOGUE, PROFCODE, EPILOGUE, SP, */
  /* SIZE, BASE_SHIFT and the upper-case load/store mnemonics are not */
  /* defined in this file; presumably they come from common.h. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Incoming arguments. M counts elements walked along each source */
  /* pointer, N counts LDA-strided source vectors (handled 8/4/2/1 at a */
  /* time), A is the source, LDA its stride, B the packed destination. */
  40. #define M r3
  41. #define N r4
  42. #define A r5
  43. #define LDA r6
  44. #define B r7
  /* AO1..AO8: per-vector read pointers, each one LDA beyond the previous. */
  /* J: counter for the groups-of-8 outer loop. */
  45. #define AO1 r8
  46. #define AO2 r9
  47. #define AO3 r10
  48. #define AO4 r11
  49. #define J r12
  /* r26..r31 are saved in the prologue and restored in the epilogue. */
  50. #define AO5 r26
  51. #define AO6 r27
  52. #define AO7 r28
  53. #define AO8 r29
  /* INC / INC2: index-register steps of one and two elements. */
  54. #define INC r30
  55. #define INC2 r31
  /* c01..c32: data registers, mapped one-to-one onto f0..f31. */
  56. #define c01 f0
  57. #define c02 f1
  58. #define c03 f2
  59. #define c04 f3
  60. #define c05 f4
  61. #define c06 f5
  62. #define c07 f6
  63. #define c08 f7
  64. #define c09 f8
  65. #define c10 f9
  66. #define c11 f10
  67. #define c12 f11
  68. #define c13 f12
  69. #define c14 f13
  70. #define c15 f14
  71. #define c16 f15
  72. #define c17 f16
  73. #define c18 f17
  74. #define c19 f18
  75. #define c20 f19
  76. #define c21 f20
  77. #define c22 f21
  78. #define c23 f22
  79. #define c24 f23
  80. #define c25 f24
  81. #define c26 f25
  82. #define c27 f26
  83. #define c28 f27
  84. #define c29 f28
  85. #define c30 f29
  86. #define c31 f30
  87. #define c32 f31
  /* Selector registers for fpsel. They share f30/f31 with c31/c32, so */
  /* code that loads c31/c32 destroys the selectors. */
  88. #define sel_p f30
  89. #define sel_s f31
  /* Entry: save registers, build the fpsel selectors, reject empty */
  /* input, then choose the aligned or unaligned copy path. */
  90. PROLOGUE
  91. PROFCODE
  /* Push f14..f31 below SP, 16 bytes per register (18 * 16 = 288 bytes). */
  92. li r0, -16
  93. stfpdux f14, SP, r0
  94. stfpdux f15, SP, r0
  95. stfpdux f16, SP, r0
  96. stfpdux f17, SP, r0
  97. stfpdux f18, SP, r0
  98. stfpdux f19, SP, r0
  99. stfpdux f20, SP, r0
  100. stfpdux f21, SP, r0
  101. stfpdux f22, SP, r0
  102. stfpdux f23, SP, r0
  103. stfpdux f24, SP, r0
  104. stfpdux f25, SP, r0
  105. stfpdux f26, SP, r0
  106. stfpdux f27, SP, r0
  107. stfpdux f28, SP, r0
  108. stfpdux f29, SP, r0
  109. stfpdux f30, SP, r0
  110. stfpdux f31, SP, r0
  /* Push r31..r26 (6 * 4 = 24 bytes); these back AO5..AO8, INC, INC2. */
  111. stwu r31, -4(SP)
  112. stwu r30, -4(SP)
  113. stwu r29, -4(SP)
  114. stwu r28, -4(SP)
  115. stwu r27, -4(SP)
  116. stwu r26, -4(SP)
  /* 0x3f800000 and 0xbf800000 are single-precision +1.0 and -1.0. */
  /* r9 and r10 are used here as scratch, before they serve as AO2/AO3. */
  117. lis r9, 0x3f80
  118. lis r10, 0xbf80
  /* Four words pushed; in ascending address order from the final SP */
  /* they read +1.0, -1.0, -1.0, +1.0. */
  119. stwu r9, -4(SP)
  120. stwu r10, -4(SP)
  121. stwu r10, -4(SP)
  122. stwu r9, -4(SP)
  /* Scale LDA by BASE_SHIFT (presumably elements to bytes - confirm in common.h). */
  123. slwi LDA, LDA, BASE_SHIFT
  /* sel_p is loaded from offset 0 (+1.0, -1.0), sel_s from offset 8 */
  /* (-1.0, +1.0). */
  /* NOTE(review): lfpsux looks like an update-form load, which would */
  /* leave SP 8 bytes above the constants after the second load; the */
  /* offsets used at .L999 appear to rely on that. */
  124. li r0, 0
  125. lfpsux sel_p, SP, r0
  126. li r0, 8
  127. lfpsux sel_s, SP, r0
  /* Nothing to copy when M <= 0 or N <= 0. */
  128. cmpwi cr0, M, 0
  129. ble- .L999
  130. cmpwi cr0, N, 0
  131. ble- .L999
  132. li INC, 1 * SIZE
  133. li INC2, 2 * SIZE
  /* B is biased down by two elements; every store below addresses it */
  /* through the INC2 index register. */
  134. subi B, B, 2 * SIZE
  /* Go to .L100 unless both A and the scaled LDA are multiples of 2 * SIZE. */
  135. andi. r0, A, 2 * SIZE - 1
  136. bne .L100
  137. andi. r0, LDA, 2 * SIZE - 1
  138. bne .L100
  /* Aligned path: bias A down by two elements to match the INC2 loads. */
  139. subi A, A, 2 * SIZE
  /* J = N / 8; skip the 8-wide section when it is zero. */
  140. srawi. J, N, 3
  141. ble .L20
  /* Aligned path, groups of 8 source vectors. Runs J = N / 8 times. */
  142. .align 4
  143. .L11:
  /* AO1..AO8 address eight consecutive vectors; A advances past them. */
  144. mr AO1, A
  145. add AO2, A, LDA
  146. add AO3, AO2, LDA
  147. add AO4, AO3, LDA
  148. add AO5, AO4, LDA
  149. add AO6, AO5, LDA
  150. add AO7, AO6, LDA
  151. add AO8, AO7, LDA
  152. add A, AO8, LDA
  /* CTR = M / 4 main-loop iterations. */
  153. srawi. r0, M, 2
  154. mtspr CTR, r0
  155. ble .L15
  156. .align 4
  /* Main loop: each pass issues two INC2 loads per pointer (4 elements */
  /* from each of the 8 vectors) and 16 two-element stores to B. */
  /* Odd pointers use LFPDUX, even pointers LFXDUX; fpsel with sel_p or */
  /* sel_s then merges one half from each register of a pair. */
  /* Presumably the X forms are the cross (half-swapped) variants, so */
  /* each stored pair holds the same index from two adjacent vectors - */
  /* confirm against the FP2 instruction reference. */
  157. .L12:
  158. LFPDUX c01, AO1, INC2
  159. LFXDUX c02, AO2, INC2
  160. LFPDUX c03, AO3, INC2
  161. LFXDUX c04, AO4, INC2
  162. LFPDUX c05, AO5, INC2
  163. LFXDUX c06, AO6, INC2
  164. LFPDUX c07, AO7, INC2
  165. LFXDUX c08, AO8, INC2
  166. LFPDUX c09, AO1, INC2
  167. LFXDUX c10, AO2, INC2
  168. LFPDUX c11, AO3, INC2
  169. LFXDUX c12, AO4, INC2
  170. fpsel c17, sel_p, c01, c02
  171. LFPDUX c13, AO5, INC2
  172. fpsel c18, sel_p, c03, c04
  173. LFXDUX c14, AO6, INC2
  174. fpsel c19, sel_p, c05, c06
  175. LFPDUX c15, AO7, INC2
  176. fpsel c20, sel_p, c07, c08
  177. LFXDUX c16, AO8, INC2
  178. fpsel c21, sel_s, c01, c02
  179. fpsel c22, sel_s, c03, c04
  180. STFPDUX c17, B, INC2
  181. fpsel c23, sel_s, c05, c06
  182. STFPDUX c18, B, INC2
  183. fpsel c24, sel_s, c07, c08
  184. STFPDUX c19, B, INC2
  185. fpsel c01, sel_p, c09, c10
  186. STFPDUX c20, B, INC2
  187. fpsel c02, sel_p, c11, c12
  188. STFXDUX c21, B, INC2
  189. fpsel c03, sel_p, c13, c14
  190. STFXDUX c22, B, INC2
  191. fpsel c04, sel_p, c15, c16
  192. STFXDUX c23, B, INC2
  193. fpsel c05, sel_s, c09, c10
  194. STFXDUX c24, B, INC2
  195. fpsel c06, sel_s, c11, c12
  196. STFPDUX c01, B, INC2
  197. fpsel c07, sel_s, c13, c14
  198. STFPDUX c02, B, INC2
  199. fpsel c08, sel_s, c15, c16
  200. STFPDUX c03, B, INC2
  201. STFPDUX c04, B, INC2
  202. STFXDUX c05, B, INC2
  203. STFXDUX c06, B, INC2
  204. STFXDUX c07, B, INC2
  205. STFXDUX c08, B, INC2
  206. bdnz .L12
  207. .align 4
  /* Remainder of M modulo 4. The andi. result is never negative, so */
  /* ble is taken only when it is zero. */
  208. .L15:
  209. andi. r0, M, 3
  210. ble .L19
  /* M & 2: one INC2 load per pointer, 8 two-element stores. */
  211. andi. r0, M, 2
  212. beq .L17
  213. LFPDUX c01, AO1, INC2
  214. LFXDUX c02, AO2, INC2
  215. LFPDUX c03, AO3, INC2
  216. LFXDUX c04, AO4, INC2
  217. LFPDUX c05, AO5, INC2
  218. fpsel c09, sel_p, c01, c02
  219. LFXDUX c06, AO6, INC2
  220. fpsel c10, sel_p, c03, c04
  221. LFPDUX c07, AO7, INC2
  222. fpsel c11, sel_p, c05, c06
  223. LFXDUX c08, AO8, INC2
  224. fpsel c12, sel_p, c07, c08
  225. fpsel c13, sel_s, c01, c02
  226. fpsel c14, sel_s, c03, c04
  227. STFPDUX c09, B, INC2
  228. fpsel c15, sel_s, c05, c06
  229. STFPDUX c10, B, INC2
  230. fpsel c16, sel_s, c07, c08
  231. STFPDUX c11, B, INC2
  232. STFPDUX c12, B, INC2
  233. STFXDUX c13, B, INC2
  234. STFXDUX c14, B, INC2
  235. STFXDUX c15, B, INC2
  236. STFXDUX c16, B, INC2
  237. .align 4
  /* M & 1: one load per pointer. LFDUX fills a register from an odd */
  /* pointer, LFSDUX adds the matching even pointer (presumably into the */
  /* other half of the same register), then 4 paired stores. */
  238. .L17:
  239. andi. r0, M, 1
  240. beq .L19
  241. LFDUX c01, AO1, INC2
  242. LFDUX c02, AO3, INC2
  243. LFDUX c03, AO5, INC2
  244. LFDUX c04, AO7, INC2
  245. LFSDUX c01, AO2, INC2
  246. LFSDUX c02, AO4, INC2
  247. LFSDUX c03, AO6, INC2
  248. LFSDUX c04, AO8, INC2
  249. STFPDUX c01, B, INC2
  250. STFPDUX c02, B, INC2
  251. STFPDUX c03, B, INC2
  252. STFPDUX c04, B, INC2
  253. .align 4
  /* Next group of 8 vectors while J remains positive. */
  254. .L19:
  255. addic. J, J, -1
  256. bgt .L11
  /* Aligned path, 4-vector remainder: runs once when N & 4 is set. */
  257. .align 4
  258. .L20:
  259. andi. J, N, 4
  260. ble .L30
  261. .align 4
  262. .L21:
  /* AO1..AO4 address the four vectors; A advances past them. */
  263. mr AO1, A
  264. add AO2, A, LDA
  265. add AO3, AO2, LDA
  266. add AO4, AO3, LDA
  267. add A, AO4, LDA
  /* CTR = M / 8 main-loop iterations. */
  268. srawi. r0, M, 3
  269. mtspr CTR, r0
  270. ble .L25
  271. .align 4
  /* Main loop: four INC2 loads per pointer (8 elements from each of the */
  /* 4 vectors) and 16 two-element stores per pass. Stores alternate */
  /* two STFPDUX (sel_p merges) with two STFXDUX (sel_s merges). */
  272. .L22:
  273. LFPDUX c01, AO1, INC2
  274. LFXDUX c02, AO2, INC2
  275. LFPDUX c03, AO3, INC2
  276. LFXDUX c04, AO4, INC2
  277. LFPDUX c05, AO1, INC2
  278. LFXDUX c06, AO2, INC2
  279. LFPDUX c07, AO3, INC2
  280. LFXDUX c08, AO4, INC2
  281. LFPDUX c09, AO1, INC2
  282. LFXDUX c10, AO2, INC2
  283. LFPDUX c11, AO3, INC2
  284. LFXDUX c12, AO4, INC2
  285. fpsel c17, sel_p, c01, c02
  286. LFPDUX c13, AO1, INC2
  287. fpsel c18, sel_p, c03, c04
  288. LFXDUX c14, AO2, INC2
  289. fpsel c19, sel_s, c01, c02
  290. LFPDUX c15, AO3, INC2
  291. fpsel c20, sel_s, c03, c04
  292. LFXDUX c16, AO4, INC2
  293. fpsel c21, sel_p, c05, c06
  294. fpsel c22, sel_p, c07, c08
  295. STFPDUX c17, B, INC2
  296. fpsel c23, sel_s, c05, c06
  297. STFPDUX c18, B, INC2
  298. fpsel c24, sel_s, c07, c08
  299. STFXDUX c19, B, INC2
  300. fpsel c01, sel_p, c09, c10
  301. STFXDUX c20, B, INC2
  302. fpsel c02, sel_p, c11, c12
  303. STFPDUX c21, B, INC2
  304. fpsel c03, sel_s, c09, c10
  305. STFPDUX c22, B, INC2
  306. fpsel c04, sel_s, c11, c12
  307. STFXDUX c23, B, INC2
  308. fpsel c05, sel_p, c13, c14
  309. STFXDUX c24, B, INC2
  310. fpsel c06, sel_p, c15, c16
  311. STFPDUX c01, B, INC2
  312. fpsel c07, sel_s, c13, c14
  313. STFPDUX c02, B, INC2
  314. fpsel c08, sel_s, c15, c16
  315. STFXDUX c03, B, INC2
  316. STFXDUX c04, B, INC2
  317. STFPDUX c05, B, INC2
  318. STFPDUX c06, B, INC2
  319. STFXDUX c07, B, INC2
  320. STFXDUX c08, B, INC2
  321. bdnz .L22
  322. .align 4
  /* Remainder of M modulo 8, handled as 4, then 2, then 1. */
  323. .L25:
  324. andi. r0, M, 7
  325. ble .L30
  /* M & 4: two INC2 loads per pointer, 8 stores. */
  326. andi. r0, M, 4
  327. beq .L26
  328. LFPDUX c01, AO1, INC2
  329. LFXDUX c02, AO2, INC2
  330. LFPDUX c03, AO3, INC2
  331. LFXDUX c04, AO4, INC2
  332. LFPDUX c05, AO1, INC2
  333. fpsel c09, sel_p, c01, c02
  334. LFXDUX c06, AO2, INC2
  335. fpsel c10, sel_p, c03, c04
  336. LFPDUX c07, AO3, INC2
  337. fpsel c11, sel_s, c01, c02
  338. LFXDUX c08, AO4, INC2
  339. fpsel c12, sel_s, c03, c04
  340. fpsel c13, sel_p, c05, c06
  341. fpsel c14, sel_p, c07, c08
  342. STFPDUX c09, B, INC2
  343. fpsel c15, sel_s, c05, c06
  344. STFPDUX c10, B, INC2
  345. fpsel c16, sel_s, c07, c08
  346. STFXDUX c11, B, INC2
  347. STFXDUX c12, B, INC2
  348. STFPDUX c13, B, INC2
  349. STFPDUX c14, B, INC2
  350. STFXDUX c15, B, INC2
  351. STFXDUX c16, B, INC2
  352. .align 4
  /* M & 2: one INC2 load per pointer, 4 stores. */
  353. .L26:
  354. andi. r0, M, 2
  355. beq .L27
  356. LFPDUX c01, AO1, INC2
  357. LFXDUX c02, AO2, INC2
  358. LFPDUX c03, AO3, INC2
  359. LFXDUX c04, AO4, INC2
  360. fpsel c05, sel_p, c01, c02
  361. fpsel c06, sel_p, c03, c04
  362. fpsel c07, sel_s, c01, c02
  363. fpsel c08, sel_s, c03, c04
  364. STFPDUX c05, B, INC2
  365. STFPDUX c06, B, INC2
  366. STFXDUX c07, B, INC2
  367. STFXDUX c08, B, INC2
  368. .align 4
  /* M & 1: one scalar load per pointer; fsmfp combines c02 into c01 and */
  /* c04 into c03 (presumably primary-to-secondary moves) so two paired */
  /* stores write all four values. */
  369. .L27:
  370. andi. r0, M, 1
  371. beq .L30
  372. LFDUX c01, AO1, INC2
  373. LFDUX c02, AO2, INC2
  374. LFDUX c03, AO3, INC2
  375. LFDUX c04, AO4, INC2
  376. fsmfp c01, c02
  377. fsmfp c03, c04
  378. STFPDUX c01, B, INC2
  379. STFPDUX c03, B, INC2
  /* Aligned path, 2-vector remainder: runs once when N & 2 is set. */
  380. .align 4
  381. .L30:
  382. andi. J, N, 2
  383. ble .L40
  384. mr AO1, A
  385. add AO2, A, LDA
  386. add A, AO2, LDA
  /* CTR = M / 8 main-loop iterations. */
  387. srawi. r0, M, 3
  388. mtspr CTR, r0
  389. ble .L35
  390. .align 4
  /* Main loop: four INC2 loads per pointer (8 elements from each */
  /* vector); each AO1/AO2 register pair is merged with sel_p and sel_s */
  /* and stored as one STFPDUX followed by one STFXDUX. */
  391. .L32:
  392. LFPDUX c01, AO1, INC2
  393. LFXDUX c05, AO2, INC2
  394. LFPDUX c02, AO1, INC2
  395. LFXDUX c06, AO2, INC2
  396. LFPDUX c03, AO1, INC2
  397. fpsel c09, sel_p, c01, c05
  398. LFXDUX c07, AO2, INC2
  399. fpsel c10, sel_s, c01, c05
  400. LFPDUX c04, AO1, INC2
  401. fpsel c11, sel_p, c02, c06
  402. LFXDUX c08, AO2, INC2
  403. fpsel c12, sel_s, c02, c06
  404. fpsel c13, sel_p, c03, c07
  405. fpsel c14, sel_s, c03, c07
  406. STFPDUX c09, B, INC2
  407. fpsel c15, sel_p, c04, c08
  408. STFXDUX c10, B, INC2
  409. fpsel c16, sel_s, c04, c08
  410. STFPDUX c11, B, INC2
  411. STFXDUX c12, B, INC2
  412. STFPDUX c13, B, INC2
  413. STFXDUX c14, B, INC2
  414. STFPDUX c15, B, INC2
  415. STFXDUX c16, B, INC2
  416. bdnz .L32
  417. .align 4
  /* Remainder of M modulo 8, handled as 4, then 2, then 1. */
  418. .L35:
  419. andi. r0, M, 7
  420. ble .L40
  421. andi. r0, M, 4
  422. beq .L36
  423. LFPDUX c01, AO1, INC2
  424. LFXDUX c03, AO2, INC2
  425. LFPDUX c02, AO1, INC2
  426. LFXDUX c04, AO2, INC2
  427. fpsel c05, sel_p, c01, c03
  428. fpsel c06, sel_s, c01, c03
  429. fpsel c07, sel_p, c02, c04
  430. fpsel c08, sel_s, c02, c04
  431. STFPDUX c05, B, INC2
  432. STFXDUX c06, B, INC2
  433. STFPDUX c07, B, INC2
  434. STFXDUX c08, B, INC2
  435. .align 4
  436. .L36:
  437. andi. r0, M, 2
  438. beq .L37
  439. LFPDUX c01, AO1, INC2
  440. LFXDUX c02, AO2, INC2
  441. fpsel c03, sel_p, c01, c02
  442. fpsel c04, sel_s, c01, c02
  443. STFPDUX c03, B, INC2
  444. STFXDUX c04, B, INC2
  445. .align 4
  /* M & 1: one scalar from each vector, combined by fsmfp into a single */
  /* paired store. */
  446. .L37:
  447. andi. r0, M, 1
  448. beq .L40
  449. LFDUX c01, AO1, INC2
  450. LFDUX c02, AO2, INC2
  451. fsmfp c01, c02
  452. STFPDUX c01, B, INC2
  /* Aligned path, last single vector: runs when N & 1 is set. The data */
  /* is copied in two-element pairs with no merging step. */
  453. .align 4
  454. .L40:
  455. andi. J, N, 1
  456. ble .L999
  457. mr AO1, A
  /* CTR = M / 8 main-loop iterations; each pass moves 4 pairs. */
  458. srawi. r0, M, 3
  459. mtspr CTR, r0
  460. ble .L45
  461. .align 4
  462. .L42:
  463. LFPDUX c01, AO1, INC2
  464. LFPDUX c02, AO1, INC2
  465. LFPDUX c03, AO1, INC2
  466. LFPDUX c04, AO1, INC2
  467. STFPDUX c01, B, INC2
  468. STFPDUX c02, B, INC2
  469. STFPDUX c03, B, INC2
  470. STFPDUX c04, B, INC2
  471. bdnz .L42
  472. .align 4
  /* Remainder of M modulo 8, handled as 4, then 2, then 1. */
  473. .L45:
  474. andi. r0, M, 7
  475. ble .L999
  476. andi. r0, M, 4
  477. beq .L46
  478. LFPDUX c01, AO1, INC2
  479. LFPDUX c02, AO1, INC2
  480. STFPDUX c01, B, INC2
  481. STFPDUX c02, B, INC2
  482. .align 4
  483. .L46:
  484. andi. r0, M, 2
  485. beq .L47
  486. LFPDUX c01, AO1, INC2
  487. STFPDUX c01, B, INC2
  488. .align 4
  /* M & 1: final element, using the non-update LFDX/STFDX forms. */
  489. .L47:
  490. andi. r0, M, 1
  491. beq .L999
  492. LFDX c01, AO1, INC2
  493. STFDX c01, B, INC2
  /* Skip over the unaligned path to the common exit. */
  494. b .L999
  /* Unaligned path: entered when A or the scaled LDA is not a multiple */
  /* of 2 * SIZE. All loads here are single-element with step INC; */
  /* stores to B remain two-element with step INC2. */
  495. .align 4
  496. .L100:
  /* Bias A down by one element to match the INC loads. */
  497. subi A, A, 1 * SIZE
  /* J = N / 8; skip the 8-wide section when it is zero. */
  498. srawi. J, N, 3
  499. ble .L120
  500. .align 4
  /* Groups of 8 source vectors; runs J times. */
  501. .L111:
  502. mr AO1, A
  503. add AO2, A, LDA
  504. add AO3, AO2, LDA
  505. add AO4, AO3, LDA
  506. add AO5, AO4, LDA
  507. add AO6, AO5, LDA
  508. add AO7, AO6, LDA
  509. add AO8, AO7, LDA
  510. add A, AO8, LDA
  /* CTR = M / 8 main-loop iterations. */
  511. srawi. r0, M, 3
  512. mtspr CTR, r0
  513. ble .L115
  514. .align 4
  /* Main loop: 8 elements from each of the 8 pointers, 32 paired stores. */
  /* Register c(4k+j) gets element k of pointer 2j-1 via LFDUX and */
  /* element k of pointer 2j via LFSDUX (presumably the two halves of */
  /* the register), for k = 0..7 and j = 1..4. Stores go out in */
  /* ascending register order c01..c32. */
  /* c31 and c32 overwrite sel_p and sel_s (same f30/f31); no fpsel is */
  /* executed on this path. */
  515. .L112:
  516. LFDUX c01, AO1, INC
  517. LFDUX c05, AO1, INC
  518. LFDUX c09, AO1, INC
  519. LFDUX c13, AO1, INC
  520. LFDUX c17, AO1, INC
  521. LFDUX c21, AO1, INC
  522. LFDUX c25, AO1, INC
  523. LFDUX c29, AO1, INC
  524. LFSDUX c01, AO2, INC
  525. LFSDUX c05, AO2, INC
  526. LFSDUX c09, AO2, INC
  527. LFSDUX c13, AO2, INC
  528. LFSDUX c17, AO2, INC
  529. LFSDUX c21, AO2, INC
  530. LFSDUX c25, AO2, INC
  531. LFSDUX c29, AO2, INC
  532. LFDUX c02, AO3, INC
  533. LFDUX c06, AO3, INC
  534. LFDUX c10, AO3, INC
  535. LFDUX c14, AO3, INC
  536. LFDUX c18, AO3, INC
  537. LFDUX c22, AO3, INC
  538. LFDUX c26, AO3, INC
  539. LFDUX c30, AO3, INC
  540. LFSDUX c02, AO4, INC
  541. LFSDUX c06, AO4, INC
  542. LFSDUX c10, AO4, INC
  543. LFSDUX c14, AO4, INC
  544. LFSDUX c18, AO4, INC
  545. LFSDUX c22, AO4, INC
  546. LFSDUX c26, AO4, INC
  547. LFSDUX c30, AO4, INC
  548. LFDUX c03, AO5, INC
  549. LFDUX c07, AO5, INC
  550. LFDUX c11, AO5, INC
  551. LFDUX c15, AO5, INC
  552. LFDUX c19, AO5, INC
  553. LFDUX c23, AO5, INC
  554. LFDUX c27, AO5, INC
  555. LFDUX c31, AO5, INC
  556. LFSDUX c03, AO6, INC
  557. LFSDUX c07, AO6, INC
  558. LFSDUX c11, AO6, INC
  559. LFSDUX c15, AO6, INC
  560. LFSDUX c19, AO6, INC
  561. LFSDUX c23, AO6, INC
  562. LFSDUX c27, AO6, INC
  563. LFSDUX c31, AO6, INC
  564. LFDUX c04, AO7, INC
  565. LFDUX c08, AO7, INC
  566. LFDUX c12, AO7, INC
  567. LFDUX c16, AO7, INC
  568. LFDUX c20, AO7, INC
  569. LFDUX c24, AO7, INC
  570. LFDUX c28, AO7, INC
  571. LFDUX c32, AO7, INC
  572. LFSDUX c04, AO8, INC
  573. LFSDUX c08, AO8, INC
  574. LFSDUX c12, AO8, INC
  575. LFSDUX c16, AO8, INC
  576. LFSDUX c20, AO8, INC
  577. LFSDUX c24, AO8, INC
  578. LFSDUX c28, AO8, INC
  579. LFSDUX c32, AO8, INC
  580. STFPDUX c01, B, INC2
  581. STFPDUX c02, B, INC2
  582. STFPDUX c03, B, INC2
  583. STFPDUX c04, B, INC2
  584. STFPDUX c05, B, INC2
  585. STFPDUX c06, B, INC2
  586. STFPDUX c07, B, INC2
  587. STFPDUX c08, B, INC2
  588. STFPDUX c09, B, INC2
  589. STFPDUX c10, B, INC2
  590. STFPDUX c11, B, INC2
  591. STFPDUX c12, B, INC2
  592. STFPDUX c13, B, INC2
  593. STFPDUX c14, B, INC2
  594. STFPDUX c15, B, INC2
  595. STFPDUX c16, B, INC2
  596. STFPDUX c17, B, INC2
  597. STFPDUX c18, B, INC2
  598. STFPDUX c19, B, INC2
  599. STFPDUX c20, B, INC2
  600. STFPDUX c21, B, INC2
  601. STFPDUX c22, B, INC2
  602. STFPDUX c23, B, INC2
  603. STFPDUX c24, B, INC2
  604. STFPDUX c25, B, INC2
  605. STFPDUX c26, B, INC2
  606. STFPDUX c27, B, INC2
  607. STFPDUX c28, B, INC2
  608. STFPDUX c29, B, INC2
  609. STFPDUX c30, B, INC2
  610. STFPDUX c31, B, INC2
  611. STFPDUX c32, B, INC2
  612. bdnz .L112
  613. .align 4
  /* Remainder of M modulo 8, handled as 4, then 2, then 1. */
  614. .L115:
  615. andi. r0, M, 7
  616. ble .L119
  /* M & 4: 4 elements from each pointer, 16 paired stores. */
  617. andi. r0, M, 4
  618. beq .L116
  619. LFDUX c01, AO1, INC
  620. LFDUX c05, AO1, INC
  621. LFDUX c09, AO1, INC
  622. LFDUX c13, AO1, INC
  623. LFSDUX c01, AO2, INC
  624. LFSDUX c05, AO2, INC
  625. LFSDUX c09, AO2, INC
  626. LFSDUX c13, AO2, INC
  627. LFDUX c02, AO3, INC
  628. LFDUX c06, AO3, INC
  629. LFDUX c10, AO3, INC
  630. LFDUX c14, AO3, INC
  631. LFSDUX c02, AO4, INC
  632. LFSDUX c06, AO4, INC
  633. LFSDUX c10, AO4, INC
  634. LFSDUX c14, AO4, INC
  635. LFDUX c03, AO5, INC
  636. LFDUX c07, AO5, INC
  637. LFDUX c11, AO5, INC
  638. LFDUX c15, AO5, INC
  639. LFSDUX c03, AO6, INC
  640. LFSDUX c07, AO6, INC
  641. LFSDUX c11, AO6, INC
  642. LFSDUX c15, AO6, INC
  643. LFDUX c04, AO7, INC
  644. LFDUX c08, AO7, INC
  645. LFDUX c12, AO7, INC
  646. LFDUX c16, AO7, INC
  647. LFSDUX c04, AO8, INC
  648. LFSDUX c08, AO8, INC
  649. LFSDUX c12, AO8, INC
  650. LFSDUX c16, AO8, INC
  651. STFPDUX c01, B, INC2
  652. STFPDUX c02, B, INC2
  653. STFPDUX c03, B, INC2
  654. STFPDUX c04, B, INC2
  655. STFPDUX c05, B, INC2
  656. STFPDUX c06, B, INC2
  657. STFPDUX c07, B, INC2
  658. STFPDUX c08, B, INC2
  659. STFPDUX c09, B, INC2
  660. STFPDUX c10, B, INC2
  661. STFPDUX c11, B, INC2
  662. STFPDUX c12, B, INC2
  663. STFPDUX c13, B, INC2
  664. STFPDUX c14, B, INC2
  665. STFPDUX c15, B, INC2
  666. STFPDUX c16, B, INC2
  667. .align 4
  /* M & 2: 2 elements from each pointer, 8 paired stores. */
  668. .L116:
  669. andi. r0, M, 2
  670. beq .L117
  671. LFDUX c01, AO1, INC
  672. LFDUX c05, AO1, INC
  673. LFDUX c02, AO3, INC
  674. LFDUX c06, AO3, INC
  675. LFSDUX c01, AO2, INC
  676. LFSDUX c05, AO2, INC
  677. LFSDUX c02, AO4, INC
  678. LFSDUX c06, AO4, INC
  679. LFDUX c03, AO5, INC
  680. LFDUX c07, AO5, INC
  681. LFDUX c04, AO7, INC
  682. LFDUX c08, AO7, INC
  683. LFSDUX c03, AO6, INC
  684. LFSDUX c07, AO6, INC
  685. LFSDUX c04, AO8, INC
  686. LFSDUX c08, AO8, INC
  687. STFPDUX c01, B, INC2
  688. STFPDUX c02, B, INC2
  689. STFPDUX c03, B, INC2
  690. STFPDUX c04, B, INC2
  691. STFPDUX c05, B, INC2
  692. STFPDUX c06, B, INC2
  693. STFPDUX c07, B, INC2
  694. STFPDUX c08, B, INC2
  695. .align 4
  /* M & 1: 1 element from each pointer, 4 paired stores. */
  696. .L117:
  697. andi. r0, M, 1
  698. beq .L119
  699. LFDUX c01, AO1, INC
  700. LFDUX c02, AO3, INC
  701. LFDUX c03, AO5, INC
  702. LFDUX c04, AO7, INC
  703. LFSDUX c01, AO2, INC
  704. LFSDUX c02, AO4, INC
  705. LFSDUX c03, AO6, INC
  706. LFSDUX c04, AO8, INC
  707. STFPDUX c01, B, INC2
  708. STFPDUX c02, B, INC2
  709. STFPDUX c03, B, INC2
  710. STFPDUX c04, B, INC2
  711. .align 4
  /* Next group of 8 vectors while J remains positive. */
  712. .L119:
  713. addic. J, J, -1
  714. bgt .L111
  /* Unaligned path, 4-vector remainder: runs once when N & 4 is set. */
  715. .align 4
  716. .L120:
  717. andi. J, N, 4
  718. ble .L130
  719. .align 4
  720. .L121:
  721. mr AO1, A
  722. add AO2, A, LDA
  723. add AO3, AO2, LDA
  724. add AO4, AO3, LDA
  725. add A, AO4, LDA
  /* CTR = M / 8 main-loop iterations. */
  726. srawi. r0, M, 3
  727. mtspr CTR, r0
  728. ble .L125
  729. .align 4
  /* Main loop: 8 elements from each of the 4 pointers, 16 paired stores. */
  /* AO1/AO2 fill c01..c04 and c09..c12; AO3/AO4 fill c05..c08 and */
  /* c13..c16. Stores alternate an AO1/AO2 register with the matching */
  /* AO3/AO4 register (c01, c05, c02, c06, ...). */
  730. .L122:
  731. LFDUX c01, AO1, INC
  732. LFDUX c02, AO1, INC
  733. LFDUX c03, AO1, INC
  734. LFDUX c04, AO1, INC
  735. LFDUX c09, AO1, INC
  736. LFDUX c10, AO1, INC
  737. LFDUX c11, AO1, INC
  738. LFDUX c12, AO1, INC
  739. LFSDUX c01, AO2, INC
  740. LFSDUX c02, AO2, INC
  741. LFSDUX c03, AO2, INC
  742. LFSDUX c04, AO2, INC
  743. LFSDUX c09, AO2, INC
  744. LFSDUX c10, AO2, INC
  745. LFSDUX c11, AO2, INC
  746. LFSDUX c12, AO2, INC
  747. LFDUX c05, AO3, INC
  748. LFDUX c06, AO3, INC
  749. LFDUX c07, AO3, INC
  750. LFDUX c08, AO3, INC
  751. LFDUX c13, AO3, INC
  752. LFDUX c14, AO3, INC
  753. LFDUX c15, AO3, INC
  754. LFDUX c16, AO3, INC
  755. LFSDUX c05, AO4, INC
  756. LFSDUX c06, AO4, INC
  757. LFSDUX c07, AO4, INC
  758. LFSDUX c08, AO4, INC
  759. LFSDUX c13, AO4, INC
  760. LFSDUX c14, AO4, INC
  761. LFSDUX c15, AO4, INC
  762. LFSDUX c16, AO4, INC
  763. STFPDUX c01, B, INC2
  764. STFPDUX c05, B, INC2
  765. STFPDUX c02, B, INC2
  766. STFPDUX c06, B, INC2
  767. STFPDUX c03, B, INC2
  768. STFPDUX c07, B, INC2
  769. STFPDUX c04, B, INC2
  770. STFPDUX c08, B, INC2
  771. STFPDUX c09, B, INC2
  772. STFPDUX c13, B, INC2
  773. STFPDUX c10, B, INC2
  774. STFPDUX c14, B, INC2
  775. STFPDUX c11, B, INC2
  776. STFPDUX c15, B, INC2
  777. STFPDUX c12, B, INC2
  778. STFPDUX c16, B, INC2
  779. bdnz .L122
  780. .align 4
  /* Remainder of M modulo 8, handled as 4, then 2, then 1. */
  781. .L125:
  782. andi. r0, M, 7
  783. ble .L130
  784. andi. r0, M, 4
  785. beq .L126
  786. LFDUX c01, AO1, INC
  787. LFDUX c02, AO1, INC
  788. LFDUX c03, AO1, INC
  789. LFDUX c04, AO1, INC
  790. LFSDUX c01, AO2, INC
  791. LFSDUX c02, AO2, INC
  792. LFSDUX c03, AO2, INC
  793. LFSDUX c04, AO2, INC
  794. LFDUX c05, AO3, INC
  795. LFDUX c06, AO3, INC
  796. LFDUX c07, AO3, INC
  797. LFDUX c08, AO3, INC
  798. LFSDUX c05, AO4, INC
  799. LFSDUX c06, AO4, INC
  800. LFSDUX c07, AO4, INC
  801. LFSDUX c08, AO4, INC
  802. STFPDUX c01, B, INC2
  803. STFPDUX c05, B, INC2
  804. STFPDUX c02, B, INC2
  805. STFPDUX c06, B, INC2
  806. STFPDUX c03, B, INC2
  807. STFPDUX c07, B, INC2
  808. STFPDUX c04, B, INC2
  809. STFPDUX c08, B, INC2
  810. .align 4
  811. .L126:
  812. andi. r0, M, 2
  813. beq .L127
  814. LFDUX c01, AO1, INC
  815. LFDUX c02, AO1, INC
  816. LFSDUX c01, AO2, INC
  817. LFSDUX c02, AO2, INC
  818. LFDUX c05, AO3, INC
  819. LFDUX c06, AO3, INC
  820. LFSDUX c05, AO4, INC
  821. LFSDUX c06, AO4, INC
  822. STFPDUX c01, B, INC2
  823. STFPDUX c05, B, INC2
  824. STFPDUX c02, B, INC2
  825. STFPDUX c06, B, INC2
  826. .align 4
  /* M & 1: one element from each of the 4 pointers, 2 paired stores. */
  827. .L127:
  828. andi. r0, M, 1
  829. beq .L130
  830. LFDUX c01, AO1, INC
  831. LFDUX c05, AO3, INC
  /* NOTE(review): the purpose of these two nops is not evident from */
  /* this file (possibly scheduling padding). */
  832. nop
  833. nop
  834. LFSDUX c01, AO2, INC
  835. LFSDUX c05, AO4, INC
  836. STFPDUX c01, B, INC2
  837. STFPDUX c05, B, INC2
  /* Unaligned path, 2-vector remainder: runs once when N & 2 is set. */
  838. .align 4
  839. .L130:
  840. andi. J, N, 2
  841. ble .L140
  842. mr AO1, A
  843. add AO2, A, LDA
  844. add A, AO2, LDA
  /* CTR = M / 8 main-loop iterations. */
  845. srawi. r0, M, 3
  846. mtspr CTR, r0
  847. ble .L135
  848. .align 4
  /* Main loop: 8 elements from AO1 via LFDUX and 8 from AO2 via LFSDUX */
  /* into the same eight registers, then 8 paired stores. */
  849. .L132:
  850. LFDUX c01, AO1, INC
  851. LFDUX c02, AO1, INC
  852. LFDUX c03, AO1, INC
  853. LFDUX c04, AO1, INC
  854. LFDUX c09, AO1, INC
  855. LFDUX c10, AO1, INC
  856. LFDUX c11, AO1, INC
  857. LFDUX c12, AO1, INC
  858. LFSDUX c01, AO2, INC
  859. LFSDUX c02, AO2, INC
  860. LFSDUX c03, AO2, INC
  861. LFSDUX c04, AO2, INC
  862. LFSDUX c09, AO2, INC
  863. LFSDUX c10, AO2, INC
  864. LFSDUX c11, AO2, INC
  865. LFSDUX c12, AO2, INC
  866. STFPDUX c01, B, INC2
  867. STFPDUX c02, B, INC2
  868. STFPDUX c03, B, INC2
  869. STFPDUX c04, B, INC2
  870. STFPDUX c09, B, INC2
  871. STFPDUX c10, B, INC2
  872. STFPDUX c11, B, INC2
  873. STFPDUX c12, B, INC2
  874. bdnz .L132
  875. .align 4
  /* Remainder of M modulo 8, handled as 4, then 2, then 1. */
  876. .L135:
  877. andi. r0, M, 7
  878. ble .L140
  879. andi. r0, M, 4
  880. beq .L136
  881. LFDUX c01, AO1, INC
  882. LFDUX c02, AO1, INC
  883. LFDUX c03, AO1, INC
  884. LFDUX c04, AO1, INC
  885. LFSDUX c01, AO2, INC
  886. LFSDUX c02, AO2, INC
  887. LFSDUX c03, AO2, INC
  888. LFSDUX c04, AO2, INC
  889. STFPDUX c01, B, INC2
  890. STFPDUX c02, B, INC2
  891. STFPDUX c03, B, INC2
  892. STFPDUX c04, B, INC2
  893. .align 4
  894. .L136:
  895. andi. r0, M, 2
  896. beq .L137
  897. LFDUX c01, AO1, INC
  898. LFDUX c02, AO1, INC
  899. LFSDUX c01, AO2, INC
  900. LFSDUX c02, AO2, INC
  901. STFPDUX c01, B, INC2
  902. STFPDUX c02, B, INC2
  903. .align 4
  /* M & 1: one scalar from each vector, combined by fsmfp into a single */
  /* paired store. */
  904. .L137:
  905. andi. r0, M, 1
  906. beq .L140
  907. LFDUX c01, AO1, INC
  908. LFDUX c02, AO2, INC
  909. fsmfp c01, c02
  910. STFPDUX c01, B, INC2
  /* Unaligned path, last single vector: runs when N & 1 is set. */
  /* Elements are loaded one at a time and paired up with fsmfp before */
  /* each two-element store. */
  911. .align 4
  912. .L140:
  913. andi. J, N, 1
  914. ble .L999
  915. mr AO1, A
  /* CTR = M / 8 main-loop iterations. */
  916. srawi. r0, M, 3
  917. mtspr CTR, r0
  918. ble .L145
  919. .align 4
  /* Main loop: 8 scalar loads, 4 fsmfp merges (each even register into */
  /* the preceding odd one), 4 paired stores. */
  920. .L142:
  921. LFDUX c01, AO1, INC
  922. LFDUX c02, AO1, INC
  923. LFDUX c03, AO1, INC
  924. LFDUX c04, AO1, INC
  925. LFDUX c05, AO1, INC
  926. LFDUX c06, AO1, INC
  927. LFDUX c07, AO1, INC
  928. LFDUX c08, AO1, INC
  929. fsmfp c01, c02
  930. fsmfp c03, c04
  931. fsmfp c05, c06
  932. fsmfp c07, c08
  933. STFPDUX c01, B, INC2
  934. STFPDUX c03, B, INC2
  935. STFPDUX c05, B, INC2
  936. STFPDUX c07, B, INC2
  937. bdnz .L142
  938. .align 4
  /* Remainder of M modulo 8, handled as 4, then 2, then 1. */
  939. .L145:
  940. andi. r0, M, 7
  941. ble .L999
  942. andi. r0, M, 4
  943. beq .L146
  944. LFDUX c01, AO1, INC
  945. LFDUX c02, AO1, INC
  946. LFDUX c03, AO1, INC
  947. LFDUX c04, AO1, INC
  948. fsmfp c01, c02
  949. fsmfp c03, c04
  950. STFPDUX c01, B, INC2
  951. STFPDUX c03, B, INC2
  952. .align 4
  953. .L146:
  954. andi. r0, M, 2
  955. beq .L147
  956. LFDUX c01, AO1, INC
  957. LFDUX c02, AO1, INC
  958. fsmfp c01, c02
  959. STFPDUX c01, B, INC2
  960. .align 4
  /* M & 1: final element, using the non-update LFDX/STFDX forms; */
  /* execution then falls through to the exit code. */
  961. .L147:
  962. andi. r0, M, 1
  963. beq .L999
  964. LFDX c01, AO1, INC
  965. STFDX c01, B, INC2
  /* Common exit: restore r26..r31 and f14..f31 in reverse order of the */
  /* prologue pushes, release the stack area, and return. */
  966. .align 4
  967. .L999:
  /* NOTE(review): this +4 plus the first lwzu's +4 step over the */
  /* remaining 8 bytes of selector constants, assuming SP was left 8 */
  /* bytes above them by the second selector load - confirm. */
  968. addi SP, SP, 4
  969. lwzu r26, 4(SP)
  970. lwzu r27, 4(SP)
  971. lwzu r28, 4(SP)
  972. lwzu r29, 4(SP)
  973. lwzu r30, 4(SP)
  974. lwzu r31, 4(SP)
  /* Back SP up by 12 so that the first +16 update-form load lands 4 */
  /* bytes past the r31 slot, i.e. on the saved f31. */
  975. subi SP, SP, 12
  976. li r0, 16
  977. lfpdux f31, SP, r0
  978. lfpdux f30, SP, r0
  979. lfpdux f29, SP, r0
  980. lfpdux f28, SP, r0
  981. lfpdux f27, SP, r0
  982. lfpdux f26, SP, r0
  983. lfpdux f25, SP, r0
  984. lfpdux f24, SP, r0
  985. lfpdux f23, SP, r0
  986. lfpdux f22, SP, r0
  987. lfpdux f21, SP, r0
  988. lfpdux f20, SP, r0
  989. lfpdux f19, SP, r0
  990. lfpdux f18, SP, r0
  991. lfpdux f17, SP, r0
  992. lfpdux f16, SP, r0
  993. lfpdux f15, SP, r0
  994. lfpdux f14, SP, r0
  /* Step past the f14 slot so SP returns to its value on entry. */
  995. addi SP, SP, 16
  996. blr
  997. EPILOGUE