
zgemm_kernel_altivec_g4.S 33 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
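
/* AltiVec complex GEMM micro-kernel for the PowerPC G4. It appears to */
/* compute C += alpha * A * B on single-precision complex data, working */
/* on 8x2 blocks of C, with the conjugation variants selected through */
/* the NN/NT/.../CC preprocessor macros. */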
#define ASSEMBLER
#include "common.h"
#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif
#ifdef __64BIT__
#define STACKSIZE 360
#else
#define STACKSIZE 272
#endif
#define ALIGN_SIZE 0xffff
#define SWAP 0
#define NEG 16
#define ALPHA_R 32
#define ALPHA_I 48
#define FZERO 64
#define M r3
#define N r4
#define K r5
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#else
#define A r8
#define B r9
#define C r10
#define LDC r6
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A r10
#define B r6
#define C r7
#define LDC r8
#else
#define A r8
#define B r9
#define C r10
#define LDC r6
#endif
#endif
#define STACK r11
#define I r21
#define J r22
#define AO r23
#define BO r24
#define CO1 r25
#define CO2 r26
#define PREA r29
#define PREB r29
#define PREC r30
#define VREG r31
#define LOAD_A lvx
#define LOAD_B lvx
#define OFFSET_0 0
#define OFFSET_1 r14
#define OFFSET_2 r15
#define OFFSET_3 r16
#define OFFSET_4 r17
#define OFFSET_5 r18
#define OFFSET_6 r19
#define OFFSET_7 r20
#define c01 v0
#define c02 v1
#define c03 v2
#define c04 v3
#define c05 v4
#define c06 v5
#define c07 v6
#define c08 v7
#define c09 v8
#define c10 v9
#define c11 v10
#define c12 v11
#define c13 v12
#define c14 v13
#define c15 v14
#define c16 v15
#define a1 v16
#define a2 v17
#define a3 v18
#define a4 v19
#define a5 v20
#define a6 v21
#define a7 v22
#define a8 v23
#define b1 v24
#define b2 v25
#define bp1 v26
#define bp2 v27
#define C1 v16
#define C2 v17
#define C3 v18
#define C4 v19
#define C5 v20
#define c00 v24
#define VZERO v25
#define PERMRSHIFT1 v26
#define PERMRSHIFT2 v27
#define swap v28
#define neg v29
#define alpha_r v30
#define alpha_i v31
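
/* Vector register roles: c01-c16 are the accumulators, a1-a8 hold the */
/* A panel, b1/b2 hold B and bp1/bp2 a splatted B element. The C1-C5, */
/* c00, VZERO and PERMRSHIFT names alias the same registers for the */
/* store path once accumulation is finished; v28-v31 keep the swap, */
/* sign-mask and alpha constants. */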
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
        addi SP, SP, -STACKSIZE
        mr STACK, SP
        li r0, 0 * 16
        stvx v20, SP, r0
        li r0, 1 * 16
        stvx v21, SP, r0
        li r0, 2 * 16
        stvx v22, SP, r0
        li r0, 3 * 16
        stvx v23, SP, r0
        li r0, 4 * 16
        stvx v24, SP, r0
        li r0, 5 * 16
        stvx v25, SP, r0
        li r0, 6 * 16
        stvx v26, SP, r0
        li r0, 7 * 16
        stvx v27, SP, r0
        li r0, 8 * 16
        stvx v28, SP, r0
        li r0, 9 * 16
        stvx v29, SP, r0
        li r0, 10 * 16
        stvx v30, SP, r0
        li r0, 11 * 16
        stvx v31, SP, r0
#ifdef __64BIT__
        std r31, 192(SP)
        std r30, 200(SP)
        std r29, 208(SP)
        std r28, 216(SP)
        std r27, 224(SP)
        std r26, 232(SP)
        std r25, 240(SP)
        std r24, 248(SP)
        std r23, 256(SP)
        std r22, 264(SP)
        std r21, 272(SP)
        std r20, 280(SP)
        std r19, 288(SP)
        std r18, 296(SP)
        std r17, 304(SP)
        std r16, 312(SP)
        std r15, 320(SP)
        std r14, 328(SP)
#else
        stw r31, 192(SP)
        stw r30, 196(SP)
        stw r29, 200(SP)
        stw r28, 204(SP)
        stw r27, 208(SP)
        stw r26, 212(SP)
        stw r25, 216(SP)
        stw r24, 220(SP)
        stw r23, 224(SP)
        stw r22, 228(SP)
        stw r21, 232(SP)
        stw r20, 236(SP)
        stw r19, 240(SP)
        stw r18, 244(SP)
        stw r17, 248(SP)
        stw r16, 252(SP)
        stw r15, 256(SP)
        stw r14, 260(SP)
#endif
#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
        ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
        ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#else
#ifdef DOUBLE
        lwz B, FRAMESLOT(0) + STACKSIZE(SP)
        lwz C, FRAMESLOT(1) + STACKSIZE(SP)
        lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
#else
        lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#endif
        li r0, -1
        mfspr VREG, VRsave
        mtspr VRsave, r0
        addi SP, SP, -128
        li r0, -8192
        and SP, SP, r0
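
/* SP has been pushed down by 128 bytes and rounded to an 8 KB boundary; */
/* the aligned block at SP (offsets SWAP/NEG/ALPHA_R/ALPHA_I/FZERO) */
/* receives the permute mask, sign mask and splatted alpha built below. */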
        fneg f3, f1
        fneg f4, f2
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
        defined(NC) || defined(TC) || defined(NR) || defined(TR)
        stfs f1, ALPHA_R + 0(SP)
        stfs f1, ALPHA_R + 4(SP)
        stfs f1, ALPHA_R + 8(SP)
        stfs f1, ALPHA_R + 12(SP)
        stfs f4, ALPHA_I + 0(SP)
        stfs f2, ALPHA_I + 4(SP)
        stfs f4, ALPHA_I + 8(SP)
        stfs f2, ALPHA_I + 12(SP)
#else
        stfs f1, ALPHA_R + 0(SP)
        stfs f3, ALPHA_R + 4(SP)
        stfs f1, ALPHA_R + 8(SP)
        stfs f3, ALPHA_R + 12(SP)
        stfs f2, ALPHA_I + 0(SP)
        stfs f2, ALPHA_I + 4(SP)
        stfs f2, ALPHA_I + 8(SP)
        stfs f2, ALPHA_I + 12(SP)
#endif
        li I, Address_L(0x04050607)
        addis I, I, Address_H(0x04050607)
        stw I, SWAP + 0(SP)
        li I, Address_L(0x00010203)
        addis I, I, Address_H(0x00010203)
        stw I, SWAP + 4(SP)
        li I, Address_L(0x0c0d0e0f)
        addis I, I, Address_H(0x0c0d0e0f)
        stw I, SWAP + 8(SP)
        li I, Address_L(0x08090a0b)
        addis I, I, Address_H(0x08090a0b)
        stw I, SWAP + 12(SP)
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
        defined(RR) || defined(RC) || defined(CR) || defined(CC)
        lis I, 0x8000
        stw I, NEG + 0(SP)
        stw I, NEG + 8(SP)
        li I, 0
        stw I, NEG + 4(SP)
        stw I, NEG + 12(SP)
#else
        li I, 0
        stw I, NEG + 0(SP)
        stw I, NEG + 8(SP)
        lis I, 0x8000
        stw I, NEG + 4(SP)
        stw I, NEG + 12(SP)
#endif
        li r0, 0
        stw r0, FZERO(SP)
        slwi LDC, LDC, ZBASE_SHIFT
        li PREC, (15 * SIZE)
        li PREB, (25 * 8 * SIZE)
        li OFFSET_1, 4 * SIZE
        li OFFSET_2, 8 * SIZE
        li OFFSET_3, 12 * SIZE
        li OFFSET_4, 16 * SIZE
        li OFFSET_5, 20 * SIZE
        li OFFSET_6, 24 * SIZE
        li OFFSET_7, 28 * SIZE
        cmpwi cr0, M, 0
        ble LL(999)
        cmpwi cr0, N, 0
        ble LL(999)
        cmpwi cr0, K, 0
        ble LL(999)
        srawi. J, N, 1
        ble LL(50)
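
/* Outer loop: J counts pairs of C columns (N / 2). */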
        .align 4
LL(01):
        mr CO1, C
        add CO2, C, LDC
        add C, CO2, LDC
        mr AO, A
        srawi. I, M, 3
        ble LL(20)
        .align 4
LL(11):
        vxor c01, c01, c01
        LOAD_B b1, OFFSET_0, B
        vxor c02, c02, c02
        LOAD_A a1, OFFSET_0, AO
        vxor c03, c03, c03
        LOAD_A a2, OFFSET_1, AO
        vxor c04, c04, c04
        LOAD_A a3, OFFSET_2, AO
        vxor c05, c05, c05
        LOAD_A a4, OFFSET_3, AO
        vxor c06, c06, c06
        LOAD_B b2, OFFSET_2, B
        vxor c07, c07, c07
        LOAD_A a5, OFFSET_4, AO
        vxor c08, c08, c08
        LOAD_A a6, OFFSET_5, AO
        vxor c09, c09, c09
        dcbtst CO1, PREC
        vxor c10, c10, c10
        dcbtst CO2, PREC
        vxor c11, c11, c11
        vxor c12, c12, c12
        vxor c13, c13, c13
        mr BO, B
        vxor c14, c14, c14
        srawi. r0, K, 2
        vxor c15, c15, c15
        mtspr CTR, r0
        vxor c16, c16, c16
        vspltw bp1, b1, 0
        ble LL(15)
        .align 4
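
/* Inner product loop for an 8x2 block of C, unrolled four times over K, */
/* with the vmaddfp accumulation interleaved with loads and prefetches. */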
LL(12):
        /* 1 */
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        addi AO, AO, 8 * SIZE
        vmaddfp c03, a3, bp1, c03
        LOAD_A a7, OFFSET_4, AO
        vmaddfp c04, a4, bp1, c04
        LOAD_A a8, OFFSET_5, AO
        /* 2 */
        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        dcbt BO, PREB
        vmaddfp c07, a3, bp2, c07
        dcbt AO, PREB
        vmaddfp c08, a4, bp2, c08
        addi AO, AO, 8 * SIZE
        /* 3 */
        vmaddfp c09, a1, bp1, c09
        vspltw bp2, b1, 3
        vmaddfp c10, a2, bp1, c10
        LOAD_B b1, OFFSET_1, BO
        vmaddfp c11, a3, bp1, c11
        dcbt AO, PREB
        vmaddfp c12, a4, bp1, c12
        addi AO, AO, 8 * SIZE
        /* 4 */
        vmaddfp c13, a1, bp2, c13
        vspltw bp1, b1, 0
        vmaddfp c14, a2, bp2, c14
        LOAD_A a1, OFFSET_2, AO
        vmaddfp c15, a3, bp2, c15
        dcbt AO, PREB
        vmaddfp c16, a4, bp2, c16
        addi AO, AO, 8 * SIZE
        /* 5 */
        vmaddfp c01, a5, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c02, a6, bp1, c02
        LOAD_A a2, OFFSET_1, AO
        vmaddfp c03, a7, bp1, c03
        LOAD_A a3, OFFSET_2, AO
        vmaddfp c04, a8, bp1, c04
        LOAD_A a4, OFFSET_3, AO
        /* 6 */
        vmaddfp c05, a5, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c06, a6, bp2, c06
        nop
        vmaddfp c07, a7, bp2, c07
        dcbt AO, PREA
        vmaddfp c08, a8, bp2, c08
        addi AO, AO, 8 * SIZE
        /* 7 */
        vmaddfp c09, a5, bp1, c09
        vspltw bp2, b1, 3
        vmaddfp c10, a6, bp1, c10
        LOAD_B b1, OFFSET_4, BO
        vmaddfp c11, a7, bp1, c11
        nop
        vmaddfp c12, a8, bp1, c12
        nop
        /* 8 */
        vmaddfp c13, a5, bp2, c13
        vspltw bp1, b2, 0
        vmaddfp c14, a6, bp2, c14
        LOAD_A a5, OFFSET_2, AO
        vmaddfp c15, a7, bp2, c15
        LOAD_A a6, OFFSET_3, AO
        vmaddfp c16, a8, bp2, c16
        LOAD_A a7, OFFSET_4, AO
        /* 9 */
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b2, 1
        vmaddfp c02, a2, bp1, c02
        LOAD_A a8, OFFSET_5, AO
        vmaddfp c03, a3, bp1, c03
        addi BO, BO, 8 * SIZE
        vmaddfp c04, a4, bp1, c04
        nop
        /* 10 */
        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b2, 2
        vmaddfp c06, a2, bp2, c06
        nop
        vmaddfp c07, a3, bp2, c07
        nop
        vmaddfp c08, a4, bp2, c08
        nop
        /* 11 */
        vmaddfp c09, a1, bp1, c09
        vspltw bp2, b2, 3
        vmaddfp c10, a2, bp1, c10
        LOAD_B b2, OFFSET_1, BO
        vmaddfp c11, a3, bp1, c11
        dcbt AO, PREA
        vmaddfp c12, a4, bp1, c12
        addi AO, AO, 8 * SIZE
        /* 12 */
        vmaddfp c13, a1, bp2, c13
        vspltw bp1, b2, 0
        vmaddfp c14, a2, bp2, c14
        LOAD_A a1, OFFSET_4, AO
        vmaddfp c15, a3, bp2, c15
        LOAD_A a2, OFFSET_5, AO
        vmaddfp c16, a4, bp2, c16
        LOAD_A a3, OFFSET_6, AO
        /* 13 */
        vmaddfp c01, a5, bp1, c01
        vspltw bp2, b2, 1
        vmaddfp c02, a6, bp1, c02
        LOAD_A a4, OFFSET_7, AO
        vmaddfp c03, a7, bp1, c03
        dcbt AO, PREA
        vmaddfp c04, a8, bp1, c04
        addi AO, AO, 8 * SIZE
        /* 14 */
        vmaddfp c05, a5, bp2, c05
        vspltw bp1, b2, 2
        vmaddfp c06, a6, bp2, c06
        nop
        vmaddfp c07, a7, bp2, c07
        dcbt AO, PREA
        vmaddfp c08, a8, bp2, c08
        addi AO, AO, 8 * SIZE
        /* 15 */
        vmaddfp c09, a5, bp1, c09
        vspltw bp2, b2, 3
        vmaddfp c10, a6, bp1, c10
        LOAD_B b2, OFFSET_4, BO
        vmaddfp c11, a7, bp1, c11
        dcbt AO, PREA
        vmaddfp c12, a8, bp1, c12
        addi BO, BO, 8 * SIZE
        /* 16 */
        vmaddfp c13, a5, bp2, c13
        vspltw bp1, b1, 0
        vmaddfp c14, a6, bp2, c14
        LOAD_A a5, OFFSET_4, AO
        vmaddfp c15, a7, bp2, c15
        LOAD_A a6, OFFSET_5, AO
        vmaddfp c16, a8, bp2, c16
        bdnz+ LL(12)
        .align 4
LL(15):
        lvx swap, OFFSET_0, SP
        lvx neg, OFFSET_1, SP
        lvx alpha_r, OFFSET_2, SP
        lvx alpha_i, OFFSET_3, SP
        andi. r0, K, 3
        mtspr CTR, r0
        ble+ LL(18)
        .align 4
LL(16):
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        nop
        vmaddfp c03, a3, bp1, c03
        nop
        vmaddfp c04, a4, bp1, c04
        nop
        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        nop
        vmaddfp c07, a3, bp2, c07
        nop
        vmaddfp c08, a4, bp2, c08
        nop
        vmaddfp c09, a1, bp1, c09
        vspltw bp2, b1, 3
        vmaddfp c10, a2, bp1, c10
        LOAD_B b1, OFFSET_1, BO
        vmaddfp c11, a3, bp1, c11
        addi AO, AO, 16 * SIZE
        vmaddfp c12, a4, bp1, c12
        addi BO, BO, 4 * SIZE
        vmaddfp c13, a1, bp2, c13
        vspltw bp1, b1, 0
        vmaddfp c14, a2, bp2, c14
        LOAD_A a1, OFFSET_0, AO
        vmaddfp c15, a3, bp2, c15
        LOAD_A a2, OFFSET_1, AO
        vmaddfp c16, a4, bp2, c16
        LOAD_A a3, OFFSET_2, AO
        LOAD_A a4, OFFSET_3, AO
        bdnz+ LL(16)
        .align 4
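
/* Combine the partial sums into complex results: swap the real and */
/* imaginary lanes, apply the sign mask selected by the conjugation */
/* mode, fold into c01-c04/c09-c12, scale by alpha and add into C */
/* through lvsr/vperm to handle unaligned C pointers. */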
LL(18):
        vxor VZERO, VZERO, VZERO
        vperm c05, c05, c05, swap
        vperm c06, c06, c06, swap
        vperm c07, c07, c07, swap
        vperm c08, c08, c08, swap
        vperm c13, c13, c13, swap
        vperm c14, c14, c14, swap
        vperm c15, c15, c15, swap
        vperm c16, c16, c16, swap
        vxor c05, c05, neg
        vxor c06, c06, neg
        vxor c07, c07, neg
        vxor c08, c08, neg
        vxor c13, c13, neg
        vxor c14, c14, neg
        vxor c15, c15, neg
        vxor c16, c16, neg
        vaddfp c01, c01, c05
        vaddfp c02, c02, c06
        vaddfp c03, c03, c07
        vaddfp c04, c04, c08
        vaddfp c09, c09, c13
        vaddfp c10, c10, c14
        vaddfp c11, c11, c15
        vaddfp c12, c12, c16
        vperm c05, c01, c01, swap
        vperm c06, c02, c02, swap
        vperm c07, c03, c03, swap
        vperm c08, c04, c04, swap
        vperm c13, c09, c09, swap
        vperm c14, c10, c10, swap
        vperm c15, c11, c11, swap
        vperm c16, c12, c12, swap
        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c02, alpha_r, c02, VZERO
        vmaddfp c03, alpha_r, c03, VZERO
        vmaddfp c04, alpha_r, c04, VZERO
        vmaddfp c01, alpha_i, c05, c01
        vmaddfp c02, alpha_i, c06, c02
        vmaddfp c03, alpha_i, c07, c03
        vmaddfp c04, alpha_i, c08, c04
        vmaddfp c09, alpha_r, c09, VZERO
        vmaddfp c10, alpha_r, c10, VZERO
        vmaddfp c11, alpha_r, c11, VZERO
        vmaddfp c12, alpha_r, c12, VZERO
        vmaddfp c09, alpha_i, c13, c09
        vmaddfp c10, alpha_i, c14, c10
        vmaddfp c11, alpha_i, c15, c11
        vmaddfp c12, alpha_i, c16, c12
        lvx C1, OFFSET_0, CO1
        lvx C2, OFFSET_1, CO1
        lvx C3, OFFSET_2, CO1
        lvx C4, OFFSET_3, CO1
        lvx C5, OFFSET_4, CO1
        lvsr PERMRSHIFT1, 0, CO1
        lvsr PERMRSHIFT2, 0, CO2
        vperm c00, VZERO, c01, PERMRSHIFT1
        vperm c01, c01, c02, PERMRSHIFT1
        vperm c02, c02, c03, PERMRSHIFT1
        vperm c03, c03, c04, PERMRSHIFT1
        vperm c04, c04, VZERO, PERMRSHIFT1
        vaddfp c00, c00, C1
        vaddfp c01, c01, C2
        vaddfp c02, c02, C3
        vaddfp c03, c03, C4
        vaddfp c04, c04, C5
        stvx c00, OFFSET_0, CO1
        stvx c01, OFFSET_1, CO1
        stvx c02, OFFSET_2, CO1
        stvx c03, OFFSET_3, CO1
        stvx c04, OFFSET_4, CO1
        lvx C1, OFFSET_0, CO2
        lvx C2, OFFSET_1, CO2
        lvx C3, OFFSET_2, CO2
        lvx C4, OFFSET_3, CO2
        lvx C5, OFFSET_4, CO2
        vperm c00, VZERO, c09, PERMRSHIFT2
        vperm c09, c09, c10, PERMRSHIFT2
        vperm c10, c10, c11, PERMRSHIFT2
        vperm c11, c11, c12, PERMRSHIFT2
        vperm c12, c12, VZERO, PERMRSHIFT2
        vaddfp c00, c00, C1
        vaddfp c09, c09, C2
        vaddfp c10, c10, C3
        vaddfp c11, c11, C4
        vaddfp c12, c12, C5
        stvx c00, OFFSET_0, CO2
        stvx c09, OFFSET_1, CO2
        stvx c10, OFFSET_2, CO2
        stvx c11, OFFSET_3, CO2
        stvx c12, OFFSET_4, CO2
        addi CO1, CO1, 16 * SIZE
        addi CO2, CO2, 16 * SIZE
        addic. I, I, -1
        bgt+ LL(11)
        .align 4
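
/* M remainder for this column pair: blocks of 4, then 2 rows use */
/* narrower vector code; a final odd row falls back to the scalar FPU. */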
LL(20):
        andi. I, M, 4
        ble LL(30)
        vxor c01, c01, c01
        LOAD_A a1, OFFSET_0, AO
        vxor c02, c02, c02
        LOAD_A a2, OFFSET_1, AO
        vxor c05, c05, c05
        LOAD_A a3, OFFSET_2, AO
        vxor c06, c06, c06
        LOAD_A a4, OFFSET_3, AO
        vxor c09, c09, c09
        LOAD_B b1, OFFSET_0, B
        vxor c10, c10, c10
        LOAD_B b2, OFFSET_1, B
        vxor c13, c13, c13
        vxor c14, c14, c14
        mr BO, B
        vspltw bp1, b1, 0
        srawi. r0, K, 1
        mtspr CTR, r0
        ble LL(25)
        .align 4
LL(22):
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        addi AO, AO, 16 * SIZE
        vmaddfp c02, a2, bp1, c02
        addi BO, BO, 8 * SIZE
        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        vmaddfp c09, a1, bp1, c09
        vspltw bp2, b1, 3
        LOAD_B b1, OFFSET_0, BO
        vmaddfp c10, a2, bp1, c10
        vmaddfp c13, a1, bp2, c13
        LOAD_A a1, OFFSET_0, AO
        vspltw bp1, b2, 0
        vmaddfp c14, a2, bp2, c14
        LOAD_A a2, OFFSET_1, AO
        vmaddfp c01, a3, bp1, c01
        vspltw bp2, b2, 1
        vmaddfp c02, a4, bp1, c02
        vmaddfp c05, a3, bp2, c05
        vspltw bp1, b2, 2
        vmaddfp c06, a4, bp2, c06
        vmaddfp c09, a3, bp1, c09
        vspltw bp2, b2, 3
        LOAD_B b2, OFFSET_1, BO
        vmaddfp c10, a4, bp1, c10
        vmaddfp c13, a3, bp2, c13
        LOAD_A a3, OFFSET_2, AO
        vmaddfp c14, a4, bp2, c14
        LOAD_A a4, OFFSET_3, AO
        vspltw bp1, b1, 0
        bdnz LL(22)
        .align 4
LL(25):
        andi. r0, K, 1
        ble+ LL(28)
        .align 4
LL(26):
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        nop
        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        nop
        vmaddfp c09, a1, bp1, c09
        vspltw bp2, b1, 3
        vmaddfp c10, a2, bp1, c10
        addi AO, AO, 8 * SIZE
        vmaddfp c13, a1, bp2, c13
        addi BO, BO, 4 * SIZE
        vmaddfp c14, a2, bp2, c14
        nop
        .align 4
LL(28):
        vxor VZERO, VZERO, VZERO
        lvx swap, OFFSET_0, SP
        lvx neg, OFFSET_1, SP
        lvx alpha_r, OFFSET_2, SP
        lvx alpha_i, OFFSET_3, SP
        vperm c05, c05, c05, swap
        vperm c06, c06, c06, swap
        vperm c13, c13, c13, swap
        vperm c14, c14, c14, swap
        vxor c05, c05, neg
        vxor c06, c06, neg
        vxor c13, c13, neg
        vxor c14, c14, neg
        vaddfp c01, c01, c05
        vaddfp c02, c02, c06
        vaddfp c09, c09, c13
        vaddfp c10, c10, c14
        vperm c05, c01, c01, swap
        vperm c06, c02, c02, swap
        vperm c13, c09, c09, swap
        vperm c14, c10, c10, swap
        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c02, alpha_r, c02, VZERO
        vmaddfp c01, alpha_i, c05, c01
        vmaddfp c02, alpha_i, c06, c02
        vmaddfp c09, alpha_r, c09, VZERO
        vmaddfp c10, alpha_r, c10, VZERO
        vmaddfp c09, alpha_i, c13, c09
        vmaddfp c10, alpha_i, c14, c10
        lvx C1, OFFSET_0, CO1
        lvx C2, OFFSET_1, CO1
        lvx C3, OFFSET_2, CO1
        lvsr PERMRSHIFT1, 0, CO1
        lvsr PERMRSHIFT2, 0, CO2
        vperm c00, VZERO, c01, PERMRSHIFT1
        vperm c01, c01, c02, PERMRSHIFT1
        vperm c02, c02, VZERO, PERMRSHIFT1
        vaddfp c00, c00, C1
        vaddfp c01, c01, C2
        vaddfp c02, c02, C3
        stvx c00, OFFSET_0, CO1
        stvx c01, OFFSET_1, CO1
        stvx c02, OFFSET_2, CO1
        lvx C1, OFFSET_0, CO2
        lvx C2, OFFSET_1, CO2
        lvx C3, OFFSET_2, CO2
        vperm c00, VZERO, c09, PERMRSHIFT2
        vperm c09, c09, c10, PERMRSHIFT2
        vperm c10, c10, VZERO, PERMRSHIFT2
        vaddfp c00, c00, C1
        vaddfp c09, c09, C2
        vaddfp c10, c10, C3
        stvx c00, OFFSET_0, CO2
        stvx c09, OFFSET_1, CO2
        stvx c10, OFFSET_2, CO2
        addi CO1, CO1, 8 * SIZE
        addi CO2, CO2, 8 * SIZE
        .align 4
LL(30):
        andi. I, M, 2
        ble LL(40)
        vxor c01, c01, c01
        LOAD_A a1, OFFSET_0, AO
        vxor c02, c02, c02
        LOAD_A a2, OFFSET_1, AO
        vxor c05, c05, c05
        LOAD_B b1, OFFSET_0, B
        vxor c06, c06, c06
        LOAD_B b2, OFFSET_1, B
        vxor c09, c09, c09
        vxor c10, c10, c10
        vxor c13, c13, c13
        vxor c14, c14, c14
        vspltw bp1, b1, 0
        mr BO, B
        srawi. r0, K, 1
        mtspr CTR, r0
        ble LL(35)
        .align 4
LL(32):
        vmaddfp c01, a1, bp1, c01
        addi AO, AO, 8 * SIZE
        vspltw bp2, b1, 1
        vmaddfp c05, a1, bp2, c05
        addi BO, BO, 8 * SIZE
        vspltw bp1, b1, 2
        vmaddfp c09, a1, bp1, c09
        vspltw bp2, b1, 3
        vmaddfp c13, a1, bp2, c13
        LOAD_A a1, OFFSET_0, AO
        vspltw bp1, b2, 0
        LOAD_B b1, OFFSET_0, BO
        vmaddfp c02, a2, bp1, c02
        vspltw bp2, b2, 1
        vmaddfp c06, a2, bp2, c06
        vspltw bp1, b2, 2
        vmaddfp c10, a2, bp1, c10
        vspltw bp2, b2, 3
        LOAD_B b2, OFFSET_1, BO
        vmaddfp c14, a2, bp2, c14
        LOAD_A a2, OFFSET_1, AO
        vspltw bp1, b1, 0
        bdnz LL(32)
        .align 4
LL(35):
        andi. r0, K, 1
        ble+ LL(38)
        .align 4
LL(36):
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c09, a1, bp1, c09
        vspltw bp2, b1, 3
        vmaddfp c13, a1, bp2, c13
        addi AO, AO, 4 * SIZE
        addi BO, BO, 4 * SIZE
        .align 4
LL(38):
        vaddfp c01, c01, c02
        vaddfp c05, c05, c06
        vaddfp c09, c09, c10
        vaddfp c13, c13, c14
        vxor VZERO, VZERO, VZERO
        lvx swap, OFFSET_0, SP
        lvx neg, OFFSET_1, SP
        lvx alpha_r, OFFSET_2, SP
        lvx alpha_i, OFFSET_3, SP
        vperm c05, c05, c05, swap
        vperm c13, c13, c13, swap
        vxor c05, c05, neg
        vxor c13, c13, neg
        vaddfp c01, c01, c05
        vaddfp c09, c09, c13
        vperm c05, c01, c01, swap
        vperm c13, c09, c09, swap
        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c01, alpha_i, c05, c01
        vmaddfp c09, alpha_r, c09, VZERO
        vmaddfp c09, alpha_i, c13, c09
        lvx C1, OFFSET_0, CO1
        lvx C2, OFFSET_1, CO1
        lvsr PERMRSHIFT1, 0, CO1
        lvsr PERMRSHIFT2, 0, CO2
        vperm c00, VZERO, c01, PERMRSHIFT1
        vperm c01, c01, VZERO, PERMRSHIFT1
        vaddfp c00, c00, C1
        vaddfp c01, c01, C2
        stvx c00, OFFSET_0, CO1
        stvx c01, OFFSET_1, CO1
        lvx C1, OFFSET_0, CO2
        lvx C2, OFFSET_1, CO2
        vperm c00, VZERO, c09, PERMRSHIFT2
        vperm c09, c09, VZERO, PERMRSHIFT2
        vaddfp c00, c00, C1
        vaddfp c09, c09, C2
        stvx c00, OFFSET_0, CO2
        stvx c09, OFFSET_1, CO2
        addi CO1, CO1, 4 * SIZE
        addi CO2, CO2, 4 * SIZE
        .align 4
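
/* M remainder of 1: scalar FPU path computing one complex element in */
/* each of the two C columns. */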
LL(40):
        andi. I, M, 1
        ble LL(49)
        mr BO, B
        LFD f8, 0 * SIZE(AO)
        LFD f9, 1 * SIZE(AO)
        LFD f10, 0 * SIZE(BO)
        LFD f11, 1 * SIZE(BO)
        LFD f12, 2 * SIZE(BO)
        LFD f13, 3 * SIZE(BO)
        lfs f0, FZERO(SP)
        fmr f1, f0
        fmr f2, f0
        fmr f3, f0
        fmr f4, f0
        fmr f5, f0
        fmr f6, f0
        fmr f7, f0
        srawi. r0, K, 1
        mtspr CTR, r0
        ble LL(45)
        .align 4
LL(42):
        fmadd f0, f8, f10, f0
        fmadd f2, f8, f11, f2
        fmadd f4, f8, f12, f4
        fmadd f6, f8, f13, f6
        fmadd f1, f9, f10, f1
        fmadd f3, f9, f11, f3
        fmadd f5, f9, f12, f5
        fmadd f7, f9, f13, f7
        LFD f8, 2 * SIZE(AO)
        LFD f9, 3 * SIZE(AO)
        LFD f10, 4 * SIZE(BO)
        LFD f11, 5 * SIZE(BO)
        LFD f12, 6 * SIZE(BO)
        LFD f13, 7 * SIZE(BO)
        fmadd f0, f8, f10, f0
        fmadd f2, f8, f11, f2
        fmadd f4, f8, f12, f4
        fmadd f6, f8, f13, f6
        fmadd f1, f9, f10, f1
        fmadd f3, f9, f11, f3
        fmadd f5, f9, f12, f5
        fmadd f7, f9, f13, f7
        LFD f8, 4 * SIZE(AO)
        LFD f9, 5 * SIZE(AO)
        LFD f10, 8 * SIZE(BO)
        LFD f11, 9 * SIZE(BO)
        LFD f12, 10 * SIZE(BO)
        LFD f13, 11 * SIZE(BO)
        addi AO, AO, 4 * SIZE
        addi BO, BO, 8 * SIZE
        bdnz LL(42)
        .align 4
LL(45):
        andi. r0, K, 1
        ble LL(48)
        .align 4
LL(46):
        fmadd f0, f8, f10, f0
        fmadd f2, f8, f11, f2
        fmadd f4, f8, f12, f4
        fmadd f6, f8, f13, f6
        fmadd f1, f9, f10, f1
        fmadd f3, f9, f11, f3
        fmadd f5, f9, f12, f5
        fmadd f7, f9, f13, f7
        addi AO, AO, 2 * SIZE
        addi BO, BO, 4 * SIZE
        .align 4
LL(48):
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
        fsub f0, f0, f3
        fadd f1, f1, f2
        fsub f4, f4, f7
        fadd f5, f5, f6
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
        fadd f0, f0, f3
        fsub f1, f1, f2
        fadd f4, f4, f7
        fsub f5, f5, f6
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
        fadd f0, f0, f3
        fsub f1, f2, f1
        fadd f4, f4, f7
        fsub f5, f6, f5
#else /* RR, RC, CR, CC */
        fsub f0, f0, f3
        fadd f1, f1, f2
        fsub f4, f4, f7
        fadd f5, f5, f6
#endif
        LFD f8, 0 * SIZE(CO1)
        LFD f9, 1 * SIZE(CO1)
        LFD f10, 0 * SIZE(CO2)
        LFD f11, 1 * SIZE(CO2)
        lfs f12, ALPHA_R + 0(SP)
        lfs f13, ALPHA_I + 4(SP)
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
        fmadd f8, f12, f0, f8
        fnmsub f9, f12, f1, f9
        fmadd f10, f12, f4, f10
        fnmsub f11, f12, f5, f11
        fmadd f8, f13, f1, f8
        fmadd f9, f13, f0, f9
        fmadd f10, f13, f5, f10
        fmadd f11, f13, f4, f11
#else
        fmadd f8, f12, f0, f8
        fmadd f9, f12, f1, f9
        fmadd f10, f12, f4, f10
        fmadd f11, f12, f5, f11
        fnmsub f8, f13, f1, f8
        fmadd f9, f13, f0, f9
        fnmsub f10, f13, f5, f10
        fmadd f11, f13, f4, f11
#endif
        STFD f8, 0 * SIZE(CO1)
        STFD f9, 1 * SIZE(CO1)
        STFD f10, 0 * SIZE(CO2)
        STFD f11, 1 * SIZE(CO2)
LL(49):
        mr B, BO
        addic. J, J, -1
        bgt LL(01)
        .align 4
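
/* Tail when N is odd: one remaining C column, processed with the same */
/* 8/4/2/1 row cascade but a single B value per K step. */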
LL(50):
        andi. J, N, 1
        ble LL(999)
        mr CO1, C
        mr AO, A
        srawi. I, M, 3
        ble LL(70)
        .align 4
LL(61):
        vxor c01, c01, c01
        LOAD_B b1, OFFSET_0, B
        vxor c02, c02, c02
        vxor c03, c03, c03
        LOAD_A a1, OFFSET_0, AO
        vxor c04, c04, c04
        LOAD_A a2, OFFSET_1, AO
        vxor c05, c05, c05
        LOAD_A a3, OFFSET_2, AO
        vxor c06, c06, c06
        LOAD_A a4, OFFSET_3, AO
        vxor c07, c07, c07
        vxor c08, c08, c08
        mr BO, B
        dcbtst CO1, PREC
        dcbtst CO2, PREC
        vspltw bp1, b1, 0
        srawi. r0, K, 1
        mtspr CTR, r0
        ble LL(65)
        .align 4
LL(62):
        LOAD_A a5, OFFSET_4, AO
        LOAD_A a6, OFFSET_5, AO
        LOAD_A a7, OFFSET_6, AO
        LOAD_A a8, OFFSET_7, AO
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        vmaddfp c03, a3, bp1, c03
        vmaddfp c04, a4, bp1, c04
        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        vmaddfp c07, a3, bp2, c07
        vmaddfp c08, a4, bp2, c08
        vmaddfp c01, a5, bp1, c01
        vspltw bp2, b1, 3
        vmaddfp c02, a6, bp1, c02
        vmaddfp c03, a7, bp1, c03
        vmaddfp c04, a8, bp1, c04
        LOAD_B b1, OFFSET_1, BO
        vspltw bp1, b1, 0
        vmaddfp c05, a5, bp2, c05
        vmaddfp c06, a6, bp2, c06
        vmaddfp c07, a7, bp2, c07
        vmaddfp c08, a8, bp2, c08
        addi AO, AO, 32 * SIZE
        addi BO, BO, 4 * SIZE
        LOAD_A a1, OFFSET_0, AO
        LOAD_A a2, OFFSET_1, AO
        LOAD_A a3, OFFSET_2, AO
        LOAD_A a4, OFFSET_3, AO
        bdnz LL(62)
        .align 4
LL(65):
        andi. r0, K, 1
        ble+ LL(68)
        .align 4
LL(66):
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        addi AO, AO, 16 * SIZE
        vmaddfp c03, a3, bp1, c03
        addi BO, BO, 2 * SIZE
        vmaddfp c04, a4, bp1, c04
        nop
        vmaddfp c05, a1, bp2, c05
        vmaddfp c06, a2, bp2, c06
        vmaddfp c07, a3, bp2, c07
        vmaddfp c08, a4, bp2, c08
        .align 4
LL(68):
        vxor VZERO, VZERO, VZERO
        lvx swap, OFFSET_0, SP
        lvx neg, OFFSET_1, SP
        lvx alpha_r, OFFSET_2, SP
        lvx alpha_i, OFFSET_3, SP
        vperm c05, c05, c05, swap
        vperm c06, c06, c06, swap
        vperm c07, c07, c07, swap
        vperm c08, c08, c08, swap
        vxor c05, c05, neg
        vxor c06, c06, neg
        vxor c07, c07, neg
        vxor c08, c08, neg
        vaddfp c01, c01, c05
        vaddfp c02, c02, c06
        vaddfp c03, c03, c07
        vaddfp c04, c04, c08
        vperm c05, c01, c01, swap
        vperm c06, c02, c02, swap
        vperm c07, c03, c03, swap
        vperm c08, c04, c04, swap
        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c02, alpha_r, c02, VZERO
        vmaddfp c03, alpha_r, c03, VZERO
        vmaddfp c04, alpha_r, c04, VZERO
        vmaddfp c01, alpha_i, c05, c01
        vmaddfp c02, alpha_i, c06, c02
        vmaddfp c03, alpha_i, c07, c03
        vmaddfp c04, alpha_i, c08, c04
        lvx C1, OFFSET_0, CO1
        lvx C2, OFFSET_1, CO1
        lvx C3, OFFSET_2, CO1
        lvx C4, OFFSET_3, CO1
        lvx C5, OFFSET_4, CO1
        lvsr PERMRSHIFT1, 0, CO1
        vperm c00, VZERO, c01, PERMRSHIFT1
        vperm c01, c01, c02, PERMRSHIFT1
        vperm c02, c02, c03, PERMRSHIFT1
        vperm c03, c03, c04, PERMRSHIFT1
        vperm c04, c04, VZERO, PERMRSHIFT1
        vaddfp c00, c00, C1
        vaddfp c01, c01, C2
        vaddfp c02, c02, C3
        vaddfp c03, c03, C4
        vaddfp c04, c04, C5
        stvx c00, OFFSET_0, CO1
        stvx c01, OFFSET_1, CO1
        stvx c02, OFFSET_2, CO1
        stvx c03, OFFSET_3, CO1
        stvx c04, OFFSET_4, CO1
        addi CO1, CO1, 16 * SIZE
        addic. I, I, -1
        bgt+ LL(61)
        .align 4
LL(70):
        andi. I, M, 4
        ble LL(80)
        vxor c01, c01, c01
        LOAD_B b1, OFFSET_0, B
        vxor c02, c02, c02
        vxor c03, c03, c03
        LOAD_A a1, OFFSET_0, AO
        vxor c04, c04, c04
        LOAD_A a2, OFFSET_1, AO
        vxor c05, c05, c05
        LOAD_A a3, OFFSET_2, AO
        vxor c06, c06, c06
        LOAD_A a4, OFFSET_3, AO
        vxor c07, c07, c07
        vxor c08, c08, c08
        mr BO, B
        vspltw bp1, b1, 0
        srawi. r0, K, 1
        mtspr CTR, r0
        ble LL(75)
        .align 4
LL(72):
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        vmaddfp c03, a3, bp1, c03
        vspltw bp2, b1, 3
        vmaddfp c04, a4, bp1, c04
        LOAD_B b1, OFFSET_1, BO
        vspltw bp1, b1, 0
        vmaddfp c07, a3, bp2, c07
        vmaddfp c08, a4, bp2, c08
        addi AO, AO, 16 * SIZE
        addi BO, BO, 4 * SIZE
        LOAD_A a1, OFFSET_0, AO
        LOAD_A a2, OFFSET_1, AO
        LOAD_A a3, OFFSET_2, AO
        LOAD_A a4, OFFSET_3, AO
        bdnz LL(72)
        .align 4
LL(75):
        andi. r0, K, 1
        ble+ LL(78)
        .align 4
LL(76):
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        addi AO, AO, 8 * SIZE
        vmaddfp c05, a1, bp2, c05
        addi BO, BO, 2 * SIZE
        vmaddfp c06, a2, bp2, c06
        .align 4
LL(78):
        vaddfp c01, c01, c03
        vaddfp c02, c02, c04
        vaddfp c05, c05, c07
        vaddfp c06, c06, c08
        vxor VZERO, VZERO, VZERO
        lvx swap, OFFSET_0, SP
        lvx neg, OFFSET_1, SP
        lvx alpha_r, OFFSET_2, SP
        lvx alpha_i, OFFSET_3, SP
        vperm c05, c05, c05, swap
        vperm c06, c06, c06, swap
        vxor c05, c05, neg
        vxor c06, c06, neg
        vaddfp c01, c01, c05
        vaddfp c02, c02, c06
        vperm c05, c01, c01, swap
        vperm c06, c02, c02, swap
        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c02, alpha_r, c02, VZERO
        vmaddfp c01, alpha_i, c05, c01
        vmaddfp c02, alpha_i, c06, c02
        lvx C1, OFFSET_0, CO1
        lvx C2, OFFSET_1, CO1
        lvx C3, OFFSET_2, CO1
        lvsr PERMRSHIFT1, 0, CO1
        vperm c00, VZERO, c01, PERMRSHIFT1
        vperm c01, c01, c02, PERMRSHIFT1
        vperm c02, c02, VZERO, PERMRSHIFT1
        vaddfp c00, c00, C1
        vaddfp c01, c01, C2
        vaddfp c02, c02, C3
        stvx c00, OFFSET_0, CO1
        stvx c01, OFFSET_1, CO1
        stvx c02, OFFSET_2, CO1
        addi CO1, CO1, 8 * SIZE
        .align 4
LL(80):
        andi. I, M, 2
        ble LL(90)
        vxor c01, c01, c01
        LOAD_B b1, OFFSET_0, B
        vxor c02, c02, c02
        LOAD_A a1, OFFSET_0, AO
        LOAD_A a2, OFFSET_1, AO
        vxor c05, c05, c05
        vxor c06, c06, c06
        mr BO, B
        vspltw bp1, b1, 0
        srawi. r0, K, 1
        mtspr CTR, r0
        ble LL(85)
        .align 4
LL(82):
        vmaddfp c01, a1, bp1, c01
        vspltw bp2, b1, 1
        vmaddfp c05, a1, bp2, c05
        vspltw bp1, b1, 2
        vmaddfp c02, a2, bp1, c02
        vspltw bp2, b1, 3
        LOAD_B b1, OFFSET_1, BO
        vspltw bp1, b1, 0
        vmaddfp c06, a2, bp2, c06
        addi AO, AO, 8 * SIZE
        addi BO, BO, 4 * SIZE
        LOAD_A a1, OFFSET_0, AO
        LOAD_A a2, OFFSET_1, AO
        bdnz LL(82)
        .align 4
LL(85):
        andi. r0, K, 1
        ble+ LL(88)
        .align 4
LL(86):
        vspltw bp2, b1, 1
        vmaddfp c01, a1, bp1, c01
        vmaddfp c05, a1, bp2, c05
        addi AO, AO, 4 * SIZE
        addi BO, BO, 2 * SIZE
        .align 4
LL(88):
        vaddfp c01, c01, c02
        vaddfp c05, c05, c06
        vaddfp c09, c09, c10
        vaddfp c13, c13, c14
        vxor VZERO, VZERO, VZERO
        lvx swap, OFFSET_0, SP
        lvx neg, OFFSET_1, SP
        lvx alpha_r, OFFSET_2, SP
        lvx alpha_i, OFFSET_3, SP
        vperm c05, c05, c05, swap
        vxor c05, c05, neg
        vaddfp c01, c01, c05
        vperm c05, c01, c01, swap
        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c01, alpha_i, c05, c01
        lvx C1, OFFSET_0, CO1
        lvx C2, OFFSET_1, CO1
        lvsr PERMRSHIFT1, 0, CO1
        vperm c00, VZERO, c01, PERMRSHIFT1
        vperm c01, c01, VZERO, PERMRSHIFT1
        vaddfp c00, c00, C1
        vaddfp c01, c01, C2
        stvx c00, OFFSET_0, CO1
        stvx c01, OFFSET_1, CO1
        addi CO1, CO1, 4 * SIZE
        .align 4
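
/* Final 1x1 tail: one complex element of the last column on the */
/* scalar FPU. */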
LL(90):
        andi. I, M, 1
        ble LL(999)
        mr BO, B
        LFD f8, 0 * SIZE(AO)
        LFD f9, 1 * SIZE(AO)
        LFD f10, 0 * SIZE(BO)
        LFD f11, 1 * SIZE(BO)
        LFD f12, 2 * SIZE(BO)
        LFD f13, 3 * SIZE(BO)
        lfs f0, FZERO(SP)
        fmr f1, f0
        fmr f2, f0
        fmr f3, f0
        srawi. r0, K, 1
        mtspr CTR, r0
        ble LL(95)
        .align 4
LL(92):
        fmadd f0, f8, f10, f0
        fmadd f2, f8, f11, f2
        fmadd f1, f9, f10, f1
        fmadd f3, f9, f11, f3
        LFD f8, 2 * SIZE(AO)
        LFD f9, 3 * SIZE(AO)
        LFD f10, 4 * SIZE(BO)
        LFD f11, 5 * SIZE(BO)
        fmadd f0, f8, f12, f0
        fmadd f2, f8, f13, f2
        fmadd f1, f9, f12, f1
        fmadd f3, f9, f13, f3
        LFD f8, 4 * SIZE(AO)
        LFD f9, 5 * SIZE(AO)
        LFD f12, 6 * SIZE(BO)
        LFD f13, 7 * SIZE(BO)
        addi AO, AO, 4 * SIZE
        addi BO, BO, 4 * SIZE
        bdnz LL(92)
        .align 4
LL(95):
        andi. r0, K, 1
        ble LL(98)
        .align 4
LL(96):
        fmadd f0, f8, f10, f0
        fmadd f2, f8, f11, f2
        fmadd f1, f9, f10, f1
        fmadd f3, f9, f11, f3
        .align 4
LL(98):
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
        fsub f0, f0, f3
        fadd f1, f1, f2
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
        fadd f0, f0, f3
        fsub f1, f1, f2
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
        fadd f0, f0, f3
        fsub f1, f2, f1
#else /* RR, RC, CR, CC */
        fsub f0, f0, f3
        fadd f1, f1, f2
#endif
        LFD f8, 0 * SIZE(CO1)
        LFD f9, 1 * SIZE(CO1)
        lfs f12, ALPHA_R + 0(SP)
        lfs f13, ALPHA_I + 4(SP)
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
        fmadd f8, f12, f0, f8
        fnmsub f9, f12, f1, f9
        fmadd f8, f13, f1, f8
        fmadd f9, f13, f0, f9
#else
        fmadd f8, f12, f0, f8
        fmadd f9, f12, f1, f9
        fnmsub f8, f13, f1, f8
        fmadd f9, f13, f0, f9
#endif
        STFD f8, 0 * SIZE(CO1)
        STFD f9, 1 * SIZE(CO1)
        .align 4
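
/* Epilogue: restore v20-v31 from the save area, restore VRsave and the */
/* non-volatile GPRs, release the stack frame and return. */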
LL(999):
        mr SP, STACK
        li r0, 0 * 16
        lvx v20, SP, r0
        li r0, 1 * 16
        lvx v21, SP, r0
        li r0, 2 * 16
        lvx v22, SP, r0
        li r0, 3 * 16
        lvx v23, SP, r0
        li r0, 4 * 16
        lvx v24, SP, r0
        li r0, 5 * 16
        lvx v25, SP, r0
        li r0, 6 * 16
        lvx v26, SP, r0
        li r0, 7 * 16
        lvx v27, SP, r0
        li r0, 8 * 16
        lvx v28, SP, r0
        li r0, 9 * 16
        lvx v29, SP, r0
        li r0, 10 * 16
        lvx v30, SP, r0
        li r0, 11 * 16
        lvx v31, SP, r0
        mtspr VRsave, VREG
#ifdef __64BIT__
        ld r31, 192(SP)
        ld r30, 200(SP)
        ld r29, 208(SP)
        ld r28, 216(SP)
        ld r27, 224(SP)
        ld r26, 232(SP)
        ld r25, 240(SP)
        ld r24, 248(SP)
        ld r23, 256(SP)
        ld r22, 264(SP)
        ld r21, 272(SP)
        ld r20, 280(SP)
        ld r19, 288(SP)
        ld r18, 296(SP)
        ld r17, 304(SP)
        ld r16, 312(SP)
        ld r15, 320(SP)
        ld r14, 328(SP)
#else
        lwz r31, 192(SP)
        lwz r30, 196(SP)
        lwz r29, 200(SP)
        lwz r28, 204(SP)
        lwz r27, 208(SP)
        lwz r26, 212(SP)
        lwz r25, 216(SP)
        lwz r24, 220(SP)
        lwz r23, 224(SP)
        lwz r22, 228(SP)
        lwz r21, 232(SP)
        lwz r20, 236(SP)
        lwz r19, 240(SP)
        lwz r18, 244(SP)
        lwz r17, 248(SP)
        lwz r16, 252(SP)
        lwz r15, 256(SP)
        lwz r14, 260(SP)
#endif
        addi SP, SP, STACKSIZE
        blr
EPILOGUE
#endif