You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

gemv_n_ppc440.S 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Mark this file as assembly before including common.h. The helper
   * macros used further down (PROLOGUE, LL, LFDU, FMADD, SIZE, BASE_SHIFT,
   * FRAMESLOT, SP, ...) are not defined in this file; presumably they come
   * from that header - confirm there. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Argument register names for linux / FreeBSD builds. The prologue
   * loads INCY (32-bit) or Y and INCY (64-bit) from the caller's frame
   * into the registers named here; BUFFER is always loaded from the frame. */
  40. #if defined(linux) || defined(__FreeBSD__)
  41. #ifndef __64BIT__
  42. #define M r3
  43. #define N r4
  44. #define A r6
  45. #define LDA r7
  46. #define X r8
  47. #define INCX r9
  48. #define Y r10
  49. #define INCY r5
  50. #else
  51. #define M r3
  52. #define N r4
  53. #define A r7
  54. #define LDA r8
  55. #define X r9
  56. #define INCX r10
  57. #define Y r5
  58. #define INCY r6
  59. #endif
  60. #endif
  /* Argument register names for AIX / Apple builds. For 32-bit DOUBLE the
   * prologue loads INCX, Y and INCY from the caller's frame; in the other
   * configurations it loads Y and INCY. */
  61. #if defined(_AIX) || defined(__APPLE__)
  62. #if !defined(__64BIT__) && defined(DOUBLE)
  63. #define M r3
  64. #define N r4
  65. #define A r8
  66. #define LDA r9
  67. #define X r10
  68. #define INCX r5
  69. #define Y r6
  70. #define INCY r7
  71. #else
  72. #define M r3
  73. #define N r4
  74. #define A r7
  75. #define LDA r8
  76. #define X r9
  77. #define INCX r10
  78. #define Y r5
  79. #define INCY r6
  80. #endif
  81. #endif
  /* Integer work registers. r14-r23 are saved in the prologue and restored
   * in the epilogue. I and LDA8 are named here but not referenced by the
   * routine below.
   *   J        column-group counter (also scratch in the final y update)
   *   AO1..AO4 pointers into up to four columns of A
   *   Y1 / Y2  read / write cursors over the accumulator
   *   PREA     byte offset used by the dcbt / dcbtst prefetch hints
   *   YY       base of the accumulator (y itself or BUFFER)
   *   BUFFER   scratch vector supplied by the caller */
  82. #define I r11
  83. #define J r12
  84. #define AO1 r14
  85. #define AO2 r15
  86. #define AO3 r16
  87. #define AO4 r17
  88. #define LDA8 r18
  89. #define Y1 r19
  90. #define Y2 r20
  91. #define PREA r21
  92. #define YY r22
  93. #define BUFFER r23
  /* Floating-point registers: y01..y16 hold y values and partial sums. */
  94. #define y01 f0
  95. #define y02 f1
  96. #define y03 f2
  97. #define y04 f3
  98. #define y05 f4
  99. #define y06 f5
  100. #define y07 f6
  101. #define y08 f7
  102. #define y09 f8
  103. #define y10 f9
  104. #define y11 f10
  105. #define y12 f11
  106. #define y13 f12
  107. #define y14 f13
  108. #define y15 f14
  109. #define y16 f15
  /* alpha1..alpha4 hold alpha times the x elements of the current columns. */
  110. #define alpha1 f16
  111. #define alpha2 f17
  112. #define alpha3 f18
  113. #define alpha4 f19
  /* a1..a8 hold elements of A. */
  114. #define a1 f20
  115. #define a2 f21
  116. #define a3 f22
  117. #define a4 f23
  118. #define a5 f24
  119. #define a6 f25
  120. #define a7 f26
  121. #define a8 f27
  /* CAUTION: alpha shares f27 with a8. Every load into a8 destroys alpha,
   * which is why the routine keeps a copy in the ALPHA stack slot and
   * reloads it with "lfd alpha, ALPHA" before scaling the next x values. */
  122. #define alpha f27
  /* Prefetch distance in elements. It is only defined for PPC440, PPCG4
   * and POWER6; the routine's "li PREA, PREFETCHSIZE_A * SIZE" needs one of
   * these targets to be selected. */
  123. #if defined(PPC440)
  124. #define PREFETCHSIZE_A (3 * 4)
  125. #endif
  126. #if defined(PPCG4)
  127. #define PREFETCHSIZE_A (3 * 4)
  128. #endif
  129. #if defined(POWER6)
  130. #define PREFETCHSIZE_A (3 * 4)
  131. #endif
  /* The routine (up to the final #endif of the file) is assembled only
   * when NEEDPARAM is not defined. STACKSIZE is the size of the local
   * frame; ALPHA and FZERO are 8-byte scratch slots near its top. */
  132. #ifndef NEEDPARAM
  133. #ifndef __64BIT__
  134. #define STACKSIZE 224
  135. #define ALPHA 200(SP)
  136. #define FZERO 208(SP)
  137. #else
  138. #define STACKSIZE 280
  139. #define ALPHA 256(SP)
  140. #define FZERO 264(SP)
  141. #endif
  /* Entry point. Computes y += alpha * A * x, a group of columns at a time:
   * each pass scales up to four x elements by alpha and adds the scaled
   * columns of A into a contiguous accumulator. The accumulator is y itself
   * when INCY equals SIZE, otherwise BUFFER, which is added into y at
   * LL(990).
   * NOTE(review): PROLOGUE / PROFCODE and the upper-case LFDU, LFDUX,
   * STFDU, FMADD, FMUL mnemonics are macros defined elsewhere; the comments
   * here assume they are the update-form load/store and multiply-add
   * instructions for the selected precision - confirm in common.h. */
  142. PROLOGUE
  143. PROFCODE
  /* Allocate the frame, then save f14-f27 at 0..104(SP). r0 = 0 is kept
   * for initialising the FZERO slot below. */
  144. addi SP, SP, -STACKSIZE
  145. li r0, 0
  146. stfd f14, 0(SP)
  147. stfd f15, 8(SP)
  148. stfd f16, 16(SP)
  149. stfd f17, 24(SP)
  150. stfd f18, 32(SP)
  151. stfd f19, 40(SP)
  152. stfd f20, 48(SP)
  153. stfd f21, 56(SP)
  154. stfd f22, 64(SP)
  155. stfd f23, 72(SP)
  156. stfd f24, 80(SP)
  157. stfd f25, 88(SP)
  158. stfd f26, 96(SP)
  159. stfd f27, 104(SP)
  /* Write an all-zero 8-byte pattern to FZERO (read back later with lfd
   * as floating-point zero) and save r14-r23 starting at 144(SP). */
  160. #ifdef __64BIT__
  161. std r0, FZERO
  162. std r14, 144(SP)
  163. std r15, 152(SP)
  164. std r16, 160(SP)
  165. std r17, 168(SP)
  166. std r18, 176(SP)
  167. std r19, 184(SP)
  168. std r20, 192(SP)
  169. std r21, 200(SP)
  170. std r22, 208(SP)
  171. std r23, 216(SP)
  172. #else
  173. stw r0, 0 + FZERO
  174. stw r0, 4 + FZERO
  175. stw r14, 144(SP)
  176. stw r15, 148(SP)
  177. stw r16, 152(SP)
  178. stw r17, 156(SP)
  179. stw r18, 160(SP)
  180. stw r19, 164(SP)
  181. stw r20, 168(SP)
  182. stw r21, 172(SP)
  183. stw r22, 176(SP)
  184. stw r23, 180(SP)
  185. #endif
  /* Fetch the arguments that are read from the caller's frame. The
   * "+ STACKSIZE" skips over the frame allocated above. Which arguments
   * these are depends on OS, word size and precision. */
  186. #if defined(linux) || defined(__FreeBSD__)
  187. #ifndef __64BIT__
  188. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  189. lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  190. #else
  191. ld Y, FRAMESLOT(0) + STACKSIZE(SP)
  192. ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
  193. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  194. #endif
  195. #endif
  196. #if defined(_AIX) || defined(__APPLE__)
  197. #ifndef __64BIT__
  198. #ifdef DOUBLE
  199. lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
  200. lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
  201. lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
  202. lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  203. #else
  204. lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
  205. lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
  206. lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  207. #endif
  208. #else
  209. ld Y, FRAMESLOT(0) + STACKSIZE(SP)
  210. ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
  211. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  212. #endif
  213. #endif
  /* The scalar taken from f1 is kept both in the ALPHA stack slot and in
   * the alpha register. The memory copy is needed because alpha aliases a8
   * (both are f27) and is overwritten inside the unrolled loops. */
  214. stfd f1, ALPHA
  215. fmr alpha, f1
  /* Shift LDA, INCX and INCY left by BASE_SHIFT bits (presumably turning
   * element counts into byte offsets - BASE_SHIFT is defined elsewhere). */
  216. slwi LDA, LDA, BASE_SHIFT
  217. slwi INCX, INCX, BASE_SHIFT
  218. slwi INCY, INCY, BASE_SHIFT
  /* Byte offset used by the dcbt / dcbtst prefetch hints. */
  219. li PREA, PREFETCHSIZE_A * SIZE
  /* Nothing to do when M <= 0 or N <= 0: go straight to the epilogue. */
  220. cmpwi cr0, M, 0
  221. ble- LL(999)
  222. cmpwi cr0, N, 0
  223. ble- LL(999)
  /* Back each pointer up by one step (A by SIZE, X by INCX, Y by INCY).
   * This matches update-form addressing, where the address is advanced
   * before each access. */
  224. addi A, A, -SIZE
  225. sub X, X, INCX
  226. sub Y, Y, INCY
  /* Default accumulator is y itself; f0 = 0.0 for clearing BUFFER. */
  227. mr YY, Y
  228. lfd f0, FZERO
  /* INCY == SIZE: y can be updated in place, skip the BUFFER setup. */
  229. cmpi cr0, 0, INCY, SIZE
  230. beq LL(10)
  /* Otherwise accumulate into BUFFER and clear it first, eight elements
   * per iteration. The trip count is (M + 7) >> 3, so up to seven elements
   * past M are written: BUFFER must have room for M rounded up to a
   * multiple of eight. */
  231. addi YY, BUFFER, -SIZE
  232. addi Y1, BUFFER, -SIZE
  233. addi r0, M, 7
  234. srawi. r0, r0, 3
  235. mtspr CTR, r0
  236. .align 4
  237. LL(02):
  238. STFDU f0, 1 * SIZE(Y1)
  239. STFDU f0, 1 * SIZE(Y1)
  240. STFDU f0, 1 * SIZE(Y1)
  241. STFDU f0, 1 * SIZE(Y1)
  242. STFDU f0, 1 * SIZE(Y1)
  243. STFDU f0, 1 * SIZE(Y1)
  244. STFDU f0, 1 * SIZE(Y1)
  245. STFDU f0, 1 * SIZE(Y1)
  246. bdnz LL(02)
  247. .align 4
  /* Main pass: four columns of A per iteration, J = N >> 2 iterations.
   * Skipped entirely when N < 4. */
  248. LL(10):
  249. srawi. J, N, 2
  250. ble LL(30)
  251. .align 4
  /* Per column group: AO1..AO4 point at four consecutive columns (LDA
   * bytes apart) and A moves on to the column after them. alpha1..alpha4
   * become alpha times the next four x elements. */
  252. LL(21):
  253. mr AO1, A
  254. add AO2, A, LDA
  255. LFDUX alpha1, X, INCX
  256. LFDUX alpha2, X, INCX
  257. LFDUX alpha3, X, INCX
  258. LFDUX alpha4, X, INCX
  259. FMUL alpha1, alpha, alpha1
  260. add AO3, AO2, LDA
  261. FMUL alpha2, alpha, alpha2
  262. add AO4, AO3, LDA
  263. FMUL alpha3, alpha, alpha3
  264. add A, AO4, LDA
  265. FMUL alpha4, alpha, alpha4
  /* Y1 reads the accumulator and Y2 writes it; both restart at YY for
   * every column group. CTR = M >> 3 blocks of eight rows; go to the row
   * tail at LL(25) when there is no full block. */
  266. mr Y1, YY
  267. mr Y2, YY
  268. srawi. r0, M, 3
  269. mtspr CTR, r0
  270. ble LL(25)
  /* Preload eight y values and eight elements of the first column. The
   * load into a8 overwrites alpha (same register, f27). */
  271. LFDU y01, 1 * SIZE(Y1)
  272. LFDU a1, 1 * SIZE(AO1)
  273. LFDU y02, 1 * SIZE(Y1)
  274. LFDU a2, 1 * SIZE(AO1)
  275. LFDU y03, 1 * SIZE(Y1)
  276. LFDU a3, 1 * SIZE(AO1)
  277. LFDU y04, 1 * SIZE(Y1)
  278. LFDU a4, 1 * SIZE(AO1)
  279. LFDU y05, 1 * SIZE(Y1)
  280. LFDU a5, 1 * SIZE(AO1)
  281. LFDU y06, 1 * SIZE(Y1)
  282. LFDU a6, 1 * SIZE(AO1)
  283. LFDU y07, 1 * SIZE(Y1)
  284. LFDU a7, 1 * SIZE(AO1)
  285. LFDU y08, 1 * SIZE(Y1)
  286. LFDU a8, 1 * SIZE(AO1)
  /* Only one block of eight: skip the steady-state loop and drain. */
  287. bdz LL(23)
  288. .align 4
  /* Steady-state loop, one block of eight rows per trip. y09..y16
   * accumulate the current block while y01..y08 are refilled with the next
   * block's y values. a1..a8 are refilled right after each use, cycling
   * through columns 2, 3, 4 and finally column 1 of the next block. The
   * instruction order is part of the pipelining; do not reorder. */
  289. LL(22):
  290. #ifdef PPCG4
  291. dcbtst Y1, PREA
  292. #endif
  /* Column 1 contribution (rows 1-4), refill a1..a4 from column 2. */
  293. FMADD y09, alpha1, a1, y01
  294. LFDU a1, 1 * SIZE(AO2)
  295. FMADD y10, alpha1, a2, y02
  296. LFDU a2, 1 * SIZE(AO2)
  297. FMADD y11, alpha1, a3, y03
  298. LFDU a3, 1 * SIZE(AO2)
  299. FMADD y12, alpha1, a4, y04
  300. LFDU a4, 1 * SIZE(AO2)
  301. LFDU y01, 1 * SIZE(Y1)
  302. #ifdef PPCG4
  303. dcbt AO2, PREA
  304. #endif
  /* Column 1 contribution (rows 5-8), refill a5..a8 from column 2. */
  305. FMADD y13, alpha1, a5, y05
  306. LFDU a5, 1 * SIZE(AO2)
  307. FMADD y14, alpha1, a6, y06
  308. LFDU a6, 1 * SIZE(AO2)
  309. FMADD y15, alpha1, a7, y07
  310. LFDU a7, 1 * SIZE(AO2)
  311. FMADD y16, alpha1, a8, y08
  312. LFDU a8, 1 * SIZE(AO2)
  313. LFDU y02, 1 * SIZE(Y1)
  314. #if defined(PPCG4) && defined(DOUBLE)
  315. dcbt AO2, PREA
  316. #endif
  /* Column 2 contribution, refill from column 3. */
  317. FMADD y09, alpha2, a1, y09
  318. LFDU a1, 1 * SIZE(AO3)
  319. FMADD y10, alpha2, a2, y10
  320. LFDU a2, 1 * SIZE(AO3)
  321. FMADD y11, alpha2, a3, y11
  322. LFDU a3, 1 * SIZE(AO3)
  323. FMADD y12, alpha2, a4, y12
  324. LFDU a4, 1 * SIZE(AO3)
  325. LFDU y03, 1 * SIZE(Y1)
  326. #ifdef PPCG4
  327. dcbt AO3, PREA
  328. #endif
  329. FMADD y13, alpha2, a5, y13
  330. LFDU a5, 1 * SIZE(AO3)
  331. FMADD y14, alpha2, a6, y14
  332. LFDU a6, 1 * SIZE(AO3)
  333. FMADD y15, alpha2, a7, y15
  334. LFDU a7, 1 * SIZE(AO3)
  335. FMADD y16, alpha2, a8, y16
  336. LFDU a8, 1 * SIZE(AO3)
  337. LFDU y04, 1 * SIZE(Y1)
  338. #if defined(PPCG4) && defined(DOUBLE)
  339. dcbt AO3, PREA
  340. #endif
  /* Column 3 contribution, refill from column 4. */
  341. FMADD y09, alpha3, a1, y09
  342. LFDU a1, 1 * SIZE(AO4)
  343. FMADD y10, alpha3, a2, y10
  344. LFDU a2, 1 * SIZE(AO4)
  345. FMADD y11, alpha3, a3, y11
  346. LFDU a3, 1 * SIZE(AO4)
  347. FMADD y12, alpha3, a4, y12
  348. LFDU a4, 1 * SIZE(AO4)
  349. #if defined(PPCG4) && defined(DOUBLE)
  350. dcbtst Y1, PREA
  351. #endif
  352. LFDU y05, 1 * SIZE(Y1)
  353. #ifdef PPCG4
  354. dcbt AO4, PREA
  355. #endif
  356. FMADD y13, alpha3, a5, y13
  357. LFDU a5, 1 * SIZE(AO4)
  358. FMADD y14, alpha3, a6, y14
  359. LFDU a6, 1 * SIZE(AO4)
  360. FMADD y15, alpha3, a7, y15
  361. LFDU a7, 1 * SIZE(AO4)
  362. FMADD y16, alpha3, a8, y16
  363. LFDU a8, 1 * SIZE(AO4)
  364. LFDU y06, 1 * SIZE(Y1)
  365. #if defined(PPCG4) && defined(DOUBLE)
  366. dcbt AO4, PREA
  367. #endif
  /* Column 4 contribution, refill from column 1 of the NEXT block, then
   * store the finished sums through Y2. */
  368. FMADD y09, alpha4, a1, y09
  369. LFDU a1, 1 * SIZE(AO1)
  370. FMADD y10, alpha4, a2, y10
  371. LFDU a2, 1 * SIZE(AO1)
  372. FMADD y11, alpha4, a3, y11
  373. LFDU a3, 1 * SIZE(AO1)
  374. FMADD y12, alpha4, a4, y12
  375. LFDU a4, 1 * SIZE(AO1)
  376. LFDU y07, 1 * SIZE(Y1)
  377. #ifdef PPCG4
  378. dcbt AO1, PREA
  379. #endif
  380. STFDU y09, 1 * SIZE(Y2)
  381. STFDU y10, 1 * SIZE(Y2)
  382. STFDU y11, 1 * SIZE(Y2)
  383. STFDU y12, 1 * SIZE(Y2)
  384. FMADD y13, alpha4, a5, y13
  385. LFDU a5, 1 * SIZE(AO1)
  386. FMADD y14, alpha4, a6, y14
  387. LFDU a6, 1 * SIZE(AO1)
  388. FMADD y15, alpha4, a7, y15
  389. LFDU a7, 1 * SIZE(AO1)
  390. FMADD y16, alpha4, a8, y16
  391. LFDU a8, 1 * SIZE(AO1)
  392. LFDU y08, 1 * SIZE(Y1)
  393. #if defined(PPCG4) && defined(DOUBLE)
  394. dcbt AO1, PREA
  395. #endif
  396. STFDU y13, 1 * SIZE(Y2)
  397. STFDU y14, 1 * SIZE(Y2)
  398. STFDU y15, 1 * SIZE(Y2)
  399. STFDU y16, 1 * SIZE(Y2)
  400. bdnz LL(22)
  401. .align 4
  /* Drain: last full block of eight rows. y01..y08 and column 1 are
   * already loaded; accumulate in place and load nothing for a next block. */
  402. LL(23):
  403. FMADD y01, alpha1, a1, y01
  404. LFDU a1, 1 * SIZE(AO2)
  405. FMADD y02, alpha1, a2, y02
  406. LFDU a2, 1 * SIZE(AO2)
  407. FMADD y03, alpha1, a3, y03
  408. LFDU a3, 1 * SIZE(AO2)
  409. FMADD y04, alpha1, a4, y04
  410. LFDU a4, 1 * SIZE(AO2)
  411. FMADD y05, alpha1, a5, y05
  412. LFDU a5, 1 * SIZE(AO2)
  413. FMADD y06, alpha1, a6, y06
  414. LFDU a6, 1 * SIZE(AO2)
  415. FMADD y07, alpha1, a7, y07
  416. LFDU a7, 1 * SIZE(AO2)
  417. FMADD y08, alpha1, a8, y08
  418. LFDU a8, 1 * SIZE(AO2)
  419. FMADD y01, alpha2, a1, y01
  420. LFDU a1, 1 * SIZE(AO3)
  421. FMADD y02, alpha2, a2, y02
  422. LFDU a2, 1 * SIZE(AO3)
  423. FMADD y03, alpha2, a3, y03
  424. LFDU a3, 1 * SIZE(AO3)
  425. FMADD y04, alpha2, a4, y04
  426. LFDU a4, 1 * SIZE(AO3)
  427. FMADD y05, alpha2, a5, y05
  428. LFDU a5, 1 * SIZE(AO3)
  429. FMADD y06, alpha2, a6, y06
  430. LFDU a6, 1 * SIZE(AO3)
  431. FMADD y07, alpha2, a7, y07
  432. LFDU a7, 1 * SIZE(AO3)
  433. FMADD y08, alpha2, a8, y08
  434. LFDU a8, 1 * SIZE(AO3)
  435. FMADD y01, alpha3, a1, y01
  436. LFDU a1, 1 * SIZE(AO4)
  437. FMADD y02, alpha3, a2, y02
  438. LFDU a2, 1 * SIZE(AO4)
  439. FMADD y03, alpha3, a3, y03
  440. LFDU a3, 1 * SIZE(AO4)
  441. FMADD y04, alpha3, a4, y04
  442. LFDU a4, 1 * SIZE(AO4)
  443. FMADD y05, alpha3, a5, y05
  444. LFDU a5, 1 * SIZE(AO4)
  445. FMADD y06, alpha3, a6, y06
  446. LFDU a6, 1 * SIZE(AO4)
  447. FMADD y07, alpha3, a7, y07
  448. LFDU a7, 1 * SIZE(AO4)
  449. FMADD y08, alpha3, a8, y08
  450. LFDU a8, 1 * SIZE(AO4)
  451. FMADD y01, alpha4, a1, y01
  452. FMADD y02, alpha4, a2, y02
  453. FMADD y03, alpha4, a3, y03
  454. FMADD y04, alpha4, a4, y04
  455. FMADD y05, alpha4, a5, y05
  456. STFDU y01, 1 * SIZE(Y2)
  457. FMADD y06, alpha4, a6, y06
  458. STFDU y02, 1 * SIZE(Y2)
  459. FMADD y07, alpha4, a7, y07
  460. STFDU y03, 1 * SIZE(Y2)
  461. FMADD y08, alpha4, a8, y08
  462. STFDU y04, 1 * SIZE(Y2)
  463. STFDU y05, 1 * SIZE(Y2)
  464. STFDU y06, 1 * SIZE(Y2)
  465. STFDU y07, 1 * SIZE(Y2)
  466. STFDU y08, 1 * SIZE(Y2)
  467. .align 4
  /* Row tail for M & 7 leftover rows, handled as 4 + 2 + 1. The result
   * of andi. with these masks is never negative, so each "ble" below is
   * taken exactly when the tested bits are zero. */
  468. LL(25):
  469. andi. r0, M, 7
  470. ble LL(29)
  471. andi. r0, M, 4
  472. ble LL(27)
  /* Four rows; a1..a4 and a5..a8 alternate between columns. */
  473. LFDU a1, 1 * SIZE(AO1)
  474. LFDU y01, 1 * SIZE(Y1)
  475. LFDU a2, 1 * SIZE(AO1)
  476. LFDU y02, 1 * SIZE(Y1)
  477. LFDU a3, 1 * SIZE(AO1)
  478. LFDU y03, 1 * SIZE(Y1)
  479. LFDU a4, 1 * SIZE(AO1)
  480. LFDU y04, 1 * SIZE(Y1)
  481. FMADD y01, alpha1, a1, y01
  482. LFDU a5, 1 * SIZE(AO2)
  483. FMADD y02, alpha1, a2, y02
  484. LFDU a6, 1 * SIZE(AO2)
  485. FMADD y03, alpha1, a3, y03
  486. LFDU a7, 1 * SIZE(AO2)
  487. FMADD y04, alpha1, a4, y04
  488. LFDU a8, 1 * SIZE(AO2)
  489. FMADD y01, alpha2, a5, y01
  490. LFDU a1, 1 * SIZE(AO3)
  491. FMADD y02, alpha2, a6, y02
  492. LFDU a2, 1 * SIZE(AO3)
  493. FMADD y03, alpha2, a7, y03
  494. LFDU a3, 1 * SIZE(AO3)
  495. FMADD y04, alpha2, a8, y04
  496. LFDU a4, 1 * SIZE(AO3)
  497. FMADD y01, alpha3, a1, y01
  498. LFDU a5, 1 * SIZE(AO4)
  499. FMADD y02, alpha3, a2, y02
  500. LFDU a6, 1 * SIZE(AO4)
  501. FMADD y03, alpha3, a3, y03
  502. LFDU a7, 1 * SIZE(AO4)
  503. FMADD y04, alpha3, a4, y04
  504. LFDU a8, 1 * SIZE(AO4)
  505. FMADD y01, alpha4, a5, y01
  506. FMADD y02, alpha4, a6, y02
  507. FMADD y03, alpha4, a7, y03
  508. FMADD y04, alpha4, a8, y04
  509. STFDU y01, 1 * SIZE(Y2)
  510. STFDU y02, 1 * SIZE(Y2)
  511. STFDU y03, 1 * SIZE(Y2)
  512. STFDU y04, 1 * SIZE(Y2)
  513. .align 4
  /* Two rows: a1,a2 / a3,a4 / a5,a6 / a7,a8 come from columns 1..4. */
  514. LL(27):
  515. andi. r0, M, 2
  516. ble LL(28)
  517. LFDU a1, 1 * SIZE(AO1)
  518. LFDU y01, 1 * SIZE(Y1)
  519. LFDU a2, 1 * SIZE(AO1)
  520. LFDU y02, 1 * SIZE(Y1)
  521. LFDU a3, 1 * SIZE(AO2)
  522. LFDU a4, 1 * SIZE(AO2)
  523. FMADD y01, alpha1, a1, y01
  524. LFDU a5, 1 * SIZE(AO3)
  525. FMADD y02, alpha1, a2, y02
  526. LFDU a6, 1 * SIZE(AO3)
  527. FMADD y01, alpha2, a3, y01
  528. LFDU a7, 1 * SIZE(AO4)
  529. FMADD y02, alpha2, a4, y02
  530. LFDU a8, 1 * SIZE(AO4)
  531. FMADD y01, alpha3, a5, y01
  532. FMADD y02, alpha3, a6, y02
  533. FMADD y01, alpha4, a7, y01
  534. FMADD y02, alpha4, a8, y02
  535. STFDU y01, 1 * SIZE(Y2)
  536. STFDU y02, 1 * SIZE(Y2)
  537. .align 4
  /* One row: a1..a4 come from columns 1..4. */
  538. LL(28):
  539. andi. r0, M, 1
  540. ble LL(29)
  541. LFDU a1, 1 * SIZE(AO1)
  542. LFDU y01, 1 * SIZE(Y1)
  543. LFDU a2, 1 * SIZE(AO2)
  544. LFDU a3, 1 * SIZE(AO3)
  545. LFDU a4, 1 * SIZE(AO4)
  546. FMADD y01, alpha1, a1, y01
  547. FMADD y01, alpha2, a2, y01
  548. FMADD y01, alpha3, a3, y01
  549. FMADD y01, alpha4, a4, y01
  550. STFDU y01, 1 * SIZE(Y2)
  551. .align 4
  /* Next column group. alpha is reloaded from the stack because the a8
   * loads above overwrite f27. */
  552. LL(29):
  553. addi J, J, -1
  554. lfd alpha, ALPHA
  555. cmpi cr0, 0, J, 0
  556. bgt LL(21)
  557. .align 4
  /* Two leftover columns (taken when bit 1 of N is set). Same structure
   * as the four-column pass with only AO1 / AO2 and alpha1 / alpha2.
   * alpha is valid on entry: it is either still the value copied from f1
   * or was reloaded at LL(29). */
  558. LL(30):
  559. andi. J, N, 2
  560. ble LL(40)
  561. LFDUX alpha1, X, INCX
  562. LFDUX alpha2, X, INCX
  563. mr AO1, A
  564. add AO2, A, LDA
  565. add A, AO2, LDA
  566. FMUL alpha1, alpha, alpha1
  567. mr Y1, YY
  568. FMUL alpha2, alpha, alpha2
  569. mr Y2, YY
  /* CTR = M >> 3 blocks of eight rows; none -> row tail at LL(35). */
  570. srawi. r0, M, 3
  571. mtspr CTR, r0
  572. ble LL(35)
  /* Preload eight y values and eight elements of column 1 (the a8 load
   * overwrites alpha). */
  573. LFDU y01, 1 * SIZE(Y1)
  574. LFDU a1, 1 * SIZE(AO1)
  575. LFDU y02, 1 * SIZE(Y1)
  576. LFDU a2, 1 * SIZE(AO1)
  577. LFDU y03, 1 * SIZE(Y1)
  578. LFDU a3, 1 * SIZE(AO1)
  579. LFDU y04, 1 * SIZE(Y1)
  580. LFDU a4, 1 * SIZE(AO1)
  581. LFDU y05, 1 * SIZE(Y1)
  582. LFDU a5, 1 * SIZE(AO1)
  583. LFDU y06, 1 * SIZE(Y1)
  584. LFDU a6, 1 * SIZE(AO1)
  585. LFDU y07, 1 * SIZE(Y1)
  586. LFDU a7, 1 * SIZE(AO1)
  587. LFDU y08, 1 * SIZE(Y1)
  588. LFDU a8, 1 * SIZE(AO1)
  589. bdz LL(33)
  590. .align 4
  /* Steady-state loop: y09..y16 accumulate the current block, y01..y08
   * are refilled for the next one, and a1..a8 alternate between column 2
   * of this block and column 1 of the next block. */
  591. LL(32):
  592. #ifdef PPCG4
  593. dcbtst Y1, PREA
  594. #endif
  595. FMADD y09, alpha1, a1, y01
  596. LFDU a1, 1 * SIZE(AO2)
  597. FMADD y10, alpha1, a2, y02
  598. LFDU a2, 1 * SIZE(AO2)
  599. FMADD y11, alpha1, a3, y03
  600. LFDU a3, 1 * SIZE(AO2)
  601. FMADD y12, alpha1, a4, y04
  602. LFDU a4, 1 * SIZE(AO2)
  603. LFDU y01, 1 * SIZE(Y1)
  604. LFDU y02, 1 * SIZE(Y1)
  605. #ifdef PPCG4
  606. dcbt AO2, PREA
  607. #endif
  608. FMADD y13, alpha1, a5, y05
  609. LFDU a5, 1 * SIZE(AO2)
  610. FMADD y14, alpha1, a6, y06
  611. LFDU a6, 1 * SIZE(AO2)
  612. FMADD y15, alpha1, a7, y07
  613. LFDU a7, 1 * SIZE(AO2)
  614. FMADD y16, alpha1, a8, y08
  615. LFDU a8, 1 * SIZE(AO2)
  616. LFDU y03, 1 * SIZE(Y1)
  617. LFDU y04, 1 * SIZE(Y1)
  618. #if defined(PPCG4) && defined(DOUBLE)
  619. dcbt AO2, PREA
  620. #endif
  /* Column 2 contribution; refills come from column 1 of the next block. */
  621. FMADD y09, alpha2, a1, y09
  622. LFDU a1, 1 * SIZE(AO1)
  623. FMADD y10, alpha2, a2, y10
  624. LFDU a2, 1 * SIZE(AO1)
  625. FMADD y11, alpha2, a3, y11
  626. LFDU a3, 1 * SIZE(AO1)
  627. FMADD y12, alpha2, a4, y12
  628. LFDU a4, 1 * SIZE(AO1)
  629. #if defined(PPCG4) && defined(DOUBLE)
  630. dcbtst Y1, PREA
  631. #endif
  632. LFDU y05, 1 * SIZE(Y1)
  633. LFDU y06, 1 * SIZE(Y1)
  634. #ifdef PPCG4
  635. dcbt AO1, PREA
  636. #endif
  637. FMADD y13, alpha2, a5, y13
  638. LFDU a5, 1 * SIZE(AO1)
  639. FMADD y14, alpha2, a6, y14
  640. LFDU a6, 1 * SIZE(AO1)
  641. FMADD y15, alpha2, a7, y15
  642. LFDU a7, 1 * SIZE(AO1)
  643. FMADD y16, alpha2, a8, y16
  644. LFDU a8, 1 * SIZE(AO1)
  645. LFDU y07, 1 * SIZE(Y1)
  646. LFDU y08, 1 * SIZE(Y1)
  647. #if defined(PPCG4) && defined(DOUBLE)
  648. dcbt AO1, PREA
  649. #endif
  650. STFDU y09, 1 * SIZE(Y2)
  651. STFDU y10, 1 * SIZE(Y2)
  652. STFDU y11, 1 * SIZE(Y2)
  653. STFDU y12, 1 * SIZE(Y2)
  654. STFDU y13, 1 * SIZE(Y2)
  655. STFDU y14, 1 * SIZE(Y2)
  656. STFDU y15, 1 * SIZE(Y2)
  657. STFDU y16, 1 * SIZE(Y2)
  658. bdnz LL(32)
  659. .align 4
  /* Drain: last full block of eight rows, accumulated in place. */
  660. LL(33):
  661. FMADD y01, alpha1, a1, y01
  662. LFDU a1, 1 * SIZE(AO2)
  663. FMADD y02, alpha1, a2, y02
  664. LFDU a2, 1 * SIZE(AO2)
  665. FMADD y03, alpha1, a3, y03
  666. LFDU a3, 1 * SIZE(AO2)
  667. FMADD y04, alpha1, a4, y04
  668. LFDU a4, 1 * SIZE(AO2)
  669. FMADD y05, alpha1, a5, y05
  670. LFDU a5, 1 * SIZE(AO2)
  671. FMADD y06, alpha1, a6, y06
  672. LFDU a6, 1 * SIZE(AO2)
  673. FMADD y07, alpha1, a7, y07
  674. LFDU a7, 1 * SIZE(AO2)
  675. FMADD y08, alpha1, a8, y08
  676. LFDU a8, 1 * SIZE(AO2)
  677. FMADD y01, alpha2, a1, y01
  678. FMADD y02, alpha2, a2, y02
  679. FMADD y03, alpha2, a3, y03
  680. FMADD y04, alpha2, a4, y04
  681. FMADD y05, alpha2, a5, y05
  682. STFDU y01, 1 * SIZE(Y2)
  683. FMADD y06, alpha2, a6, y06
  684. STFDU y02, 1 * SIZE(Y2)
  685. FMADD y07, alpha2, a7, y07
  686. STFDU y03, 1 * SIZE(Y2)
  687. FMADD y08, alpha2, a8, y08
  688. STFDU y04, 1 * SIZE(Y2)
  689. STFDU y05, 1 * SIZE(Y2)
  690. STFDU y06, 1 * SIZE(Y2)
  691. STFDU y07, 1 * SIZE(Y2)
  692. STFDU y08, 1 * SIZE(Y2)
  693. .align 4
  /* Row tail (M & 7), handled as 4 + 2 + 1. The andi. results are never
   * negative, so "ble" branches exactly when the tested bits are zero. */
  694. LL(35):
  695. andi. r0, M, 7
  696. ble LL(40)
  697. andi. r0, M, 4
  698. ble LL(37)
  /* Four rows: a1..a4 from column 1, a5..a8 from column 2. */
  699. LFDU a1, 1 * SIZE(AO1)
  700. LFDU y01, 1 * SIZE(Y1)
  701. LFDU a2, 1 * SIZE(AO1)
  702. LFDU y02, 1 * SIZE(Y1)
  703. LFDU a3, 1 * SIZE(AO1)
  704. LFDU y03, 1 * SIZE(Y1)
  705. LFDU a4, 1 * SIZE(AO1)
  706. LFDU y04, 1 * SIZE(Y1)
  707. FMADD y01, alpha1, a1, y01
  708. LFDU a5, 1 * SIZE(AO2)
  709. FMADD y02, alpha1, a2, y02
  710. LFDU a6, 1 * SIZE(AO2)
  711. FMADD y03, alpha1, a3, y03
  712. LFDU a7, 1 * SIZE(AO2)
  713. FMADD y04, alpha1, a4, y04
  714. LFDU a8, 1 * SIZE(AO2)
  715. FMADD y01, alpha2, a5, y01
  716. FMADD y02, alpha2, a6, y02
  717. FMADD y03, alpha2, a7, y03
  718. FMADD y04, alpha2, a8, y04
  719. STFDU y01, 1 * SIZE(Y2)
  720. STFDU y02, 1 * SIZE(Y2)
  721. STFDU y03, 1 * SIZE(Y2)
  722. STFDU y04, 1 * SIZE(Y2)
  723. .align 4
  /* Two rows. */
  724. LL(37):
  725. andi. r0, M, 2
  726. ble LL(38)
  727. LFDU a1, 1 * SIZE(AO1)
  728. LFDU y01, 1 * SIZE(Y1)
  729. LFDU a2, 1 * SIZE(AO1)
  730. LFDU y02, 1 * SIZE(Y1)
  731. LFDU a3, 1 * SIZE(AO2)
  732. LFDU a4, 1 * SIZE(AO2)
  733. FMADD y01, alpha1, a1, y01
  734. FMADD y02, alpha1, a2, y02
  735. FMADD y01, alpha2, a3, y01
  736. FMADD y02, alpha2, a4, y02
  737. STFDU y01, 1 * SIZE(Y2)
  738. STFDU y02, 1 * SIZE(Y2)
  739. .align 4
  /* One row. */
  740. LL(38):
  741. andi. r0, M, 1
  742. ble LL(40)
  743. LFDU a1, 1 * SIZE(AO1)
  744. LFDU y01, 1 * SIZE(Y1)
  745. LFDU a2, 1 * SIZE(AO2)
  746. FMADD y01, alpha1, a1, y01
  747. FMADD y01, alpha2, a2, y01
  748. STFDU y01, 1 * SIZE(Y2)
  749. .align 4
  /* One leftover column (taken when bit 0 of N is set). alpha is reloaded
   * from the stack first because the two-column code may have overwritten
   * f27 through a8; the lfd sits between andi. and ble and does not alter
   * the condition register the branch tests. */
  750. LL(40):
  751. andi. J, N, 1
  752. lfd alpha, ALPHA
  753. ble LL(990)
  754. LFDUX alpha1, X, INCX
  755. mr AO1, A
  756. add A, A, LDA
  757. FMUL alpha1, alpha, alpha1
  758. mr Y1, YY
  759. mr Y2, YY
  /* CTR = M >> 3 blocks of eight rows; none -> row tail at LL(45). */
  760. srawi. r0, M, 3
  761. mtspr CTR, r0
  762. ble LL(45)
  /* Preload eight y values and eight column elements. */
  763. LFDU y01, 1 * SIZE(Y1)
  764. LFDU a1, 1 * SIZE(AO1)
  765. LFDU y02, 1 * SIZE(Y1)
  766. LFDU a2, 1 * SIZE(AO1)
  767. LFDU y03, 1 * SIZE(Y1)
  768. LFDU a3, 1 * SIZE(AO1)
  769. LFDU y04, 1 * SIZE(Y1)
  770. LFDU a4, 1 * SIZE(AO1)
  771. LFDU y05, 1 * SIZE(Y1)
  772. LFDU a5, 1 * SIZE(AO1)
  773. LFDU y06, 1 * SIZE(Y1)
  774. LFDU a6, 1 * SIZE(AO1)
  775. LFDU y07, 1 * SIZE(Y1)
  776. LFDU a7, 1 * SIZE(AO1)
  777. LFDU y08, 1 * SIZE(Y1)
  778. LFDU a8, 1 * SIZE(AO1)
  779. bdz LL(43)
  780. .align 4
  /* Steady-state loop: y09..y16 receive this block's results while
   * y01..y08 and a1..a8 are refilled with the next block's data. */
  781. LL(42):
  782. #ifdef PPCG4
  783. dcbtst Y1, PREA
  784. #endif
  785. FMADD y09, alpha1, a1, y01
  786. LFDU a1, 1 * SIZE(AO1)
  787. FMADD y10, alpha1, a2, y02
  788. LFDU a2, 1 * SIZE(AO1)
  789. FMADD y11, alpha1, a3, y03
  790. LFDU a3, 1 * SIZE(AO1)
  791. FMADD y12, alpha1, a4, y04
  792. LFDU a4, 1 * SIZE(AO1)
  793. LFDU y01, 1 * SIZE(Y1)
  794. LFDU y02, 1 * SIZE(Y1)
  795. LFDU y03, 1 * SIZE(Y1)
  796. LFDU y04, 1 * SIZE(Y1)
  797. #ifdef PPCG4
  798. dcbt AO1, PREA
  799. #endif
  800. FMADD y13, alpha1, a5, y05
  801. LFDU a5, 1 * SIZE(AO1)
  802. FMADD y14, alpha1, a6, y06
  803. LFDU a6, 1 * SIZE(AO1)
  804. FMADD y15, alpha1, a7, y07
  805. LFDU a7, 1 * SIZE(AO1)
  806. FMADD y16, alpha1, a8, y08
  807. LFDU a8, 1 * SIZE(AO1)
  808. #if defined(PPCG4) && defined(DOUBLE)
  809. dcbtst Y1, PREA
  810. #endif
  811. LFDU y05, 1 * SIZE(Y1)
  812. LFDU y06, 1 * SIZE(Y1)
  813. LFDU y07, 1 * SIZE(Y1)
  814. LFDU y08, 1 * SIZE(Y1)
  815. #if defined(PPCG4) && defined(DOUBLE)
  816. dcbt AO1, PREA
  817. #endif
  818. STFDU y09, 1 * SIZE(Y2)
  819. STFDU y10, 1 * SIZE(Y2)
  820. STFDU y11, 1 * SIZE(Y2)
  821. STFDU y12, 1 * SIZE(Y2)
  822. STFDU y13, 1 * SIZE(Y2)
  823. STFDU y14, 1 * SIZE(Y2)
  824. STFDU y15, 1 * SIZE(Y2)
  825. STFDU y16, 1 * SIZE(Y2)
  826. bdnz LL(42)
  827. .align 4
  /* Drain: last full block of eight rows, accumulated in place. */
  828. LL(43):
  829. FMADD y01, alpha1, a1, y01
  830. FMADD y02, alpha1, a2, y02
  831. FMADD y03, alpha1, a3, y03
  832. FMADD y04, alpha1, a4, y04
  833. FMADD y05, alpha1, a5, y05
  834. STFDU y01, 1 * SIZE(Y2)
  835. FMADD y06, alpha1, a6, y06
  836. STFDU y02, 1 * SIZE(Y2)
  837. FMADD y07, alpha1, a7, y07
  838. STFDU y03, 1 * SIZE(Y2)
  839. FMADD y08, alpha1, a8, y08
  840. STFDU y04, 1 * SIZE(Y2)
  841. STFDU y05, 1 * SIZE(Y2)
  842. STFDU y06, 1 * SIZE(Y2)
  843. STFDU y07, 1 * SIZE(Y2)
  844. STFDU y08, 1 * SIZE(Y2)
  845. .align 4
  /* Row tail (M & 7), handled as 4 + 2 + 1. The andi. results are never
   * negative, so "ble" branches exactly when the tested bits are zero. */
  846. LL(45):
  847. andi. r0, M, 7
  848. ble LL(990)
  849. andi. r0, M, 4
  850. ble LL(47)
  851. LFDU a1, 1 * SIZE(AO1)
  852. LFDU y01, 1 * SIZE(Y1)
  853. LFDU a2, 1 * SIZE(AO1)
  854. LFDU y02, 1 * SIZE(Y1)
  855. LFDU a3, 1 * SIZE(AO1)
  856. LFDU y03, 1 * SIZE(Y1)
  857. LFDU a4, 1 * SIZE(AO1)
  858. LFDU y04, 1 * SIZE(Y1)
  859. FMADD y01, alpha1, a1, y01
  860. FMADD y02, alpha1, a2, y02
  861. FMADD y03, alpha1, a3, y03
  862. FMADD y04, alpha1, a4, y04
  863. STFDU y01, 1 * SIZE(Y2)
  864. STFDU y02, 1 * SIZE(Y2)
  865. STFDU y03, 1 * SIZE(Y2)
  866. STFDU y04, 1 * SIZE(Y2)
  867. .align 4
  /* Two rows. */
  868. LL(47):
  869. andi. r0, M, 2
  870. ble LL(48)
  871. LFDU a1, 1 * SIZE(AO1)
  872. LFDU y01, 1 * SIZE(Y1)
  873. LFDU a2, 1 * SIZE(AO1)
  874. LFDU y02, 1 * SIZE(Y1)
  875. FMADD y01, alpha1, a1, y01
  876. FMADD y02, alpha1, a2, y02
  877. STFDU y01, 1 * SIZE(Y2)
  878. STFDU y02, 1 * SIZE(Y2)
  879. .align 4
  /* One row. */
  880. LL(48):
  881. andi. r0, M, 1
  882. ble LL(990)
  883. LFDU a1, 1 * SIZE(AO1)
  884. LFDU y01, 1 * SIZE(Y1)
  885. FMADD y01, alpha1, a1, y01
  886. STFDU y01, 1 * SIZE(Y2)
  887. .align 4
  /* All columns processed. With INCY == SIZE the sums were written
   * directly into y, so go to the epilogue. Otherwise BUFFER holds
   * alpha * A * x and is added into the strided y here:
   *   Y   walks y for the loads   (step INCY)
   *   Y1  walks y for the stores  (step INCY, starts at the same address)
   *   YY  walks BUFFER            (step SIZE)
   * f0-f15 are used as plain temporaries in this section. */
  888. LL(990):
  889. cmpi cr0, 0, INCY, SIZE
  890. beq LL(999)
  891. addi YY, BUFFER, -SIZE
  892. mr Y1, Y
  /* CTR = M >> 3 groups of eight elements; none -> tail at LL(995). */
  893. srawi. r0, M, 3
  894. mtspr CTR, r0
  895. ble LL(995)
  896. .align 4
  /* Eight elements per trip: load y, load BUFFER, add, store back to y. */
  897. LL(991):
  898. LFDUX f0, Y, INCY
  899. LFDUX f1, Y, INCY
  900. LFDUX f2, Y, INCY
  901. LFDUX f3, Y, INCY
  902. LFDUX f4, Y, INCY
  903. LFDUX f5, Y, INCY
  904. LFDUX f6, Y, INCY
  905. LFDUX f7, Y, INCY
  906. LFDU f8, 1 * SIZE(YY)
  907. LFDU f9, 1 * SIZE(YY)
  908. LFDU f10, 1 * SIZE(YY)
  909. LFDU f11, 1 * SIZE(YY)
  910. LFDU f12, 1 * SIZE(YY)
  911. LFDU f13, 1 * SIZE(YY)
  912. LFDU f14, 1 * SIZE(YY)
  913. LFDU f15, 1 * SIZE(YY)
  914. FADD f8, f8, f0
  915. FADD f9, f9, f1
  916. FADD f10, f10, f2
  917. FADD f11, f11, f3
  918. FADD f12, f12, f4
  919. FADD f13, f13, f5
  920. FADD f14, f14, f6
  921. FADD f15, f15, f7
  922. STFDUX f8, Y1, INCY
  923. STFDUX f9, Y1, INCY
  924. STFDUX f10, Y1, INCY
  925. STFDUX f11, Y1, INCY
  926. STFDUX f12, Y1, INCY
  927. STFDUX f13, Y1, INCY
  928. STFDUX f14, Y1, INCY
  929. STFDUX f15, Y1, INCY
  930. bdnz LL(991)
  931. .align 4
  /* Tail (M & 7) as 4 + 2 + 1 elements. J is only a scratch destination
   * for andi. here; "ble" is taken exactly when the tested bit is clear. */
  932. LL(995):
  933. andi. J, M, 4
  934. ble LL(996)
  935. LFDUX f0, Y, INCY
  936. LFDUX f1, Y, INCY
  937. LFDUX f2, Y, INCY
  938. LFDUX f3, Y, INCY
  939. LFDU f8, 1 * SIZE(YY)
  940. LFDU f9, 1 * SIZE(YY)
  941. LFDU f10, 1 * SIZE(YY)
  942. LFDU f11, 1 * SIZE(YY)
  943. FADD f8, f8, f0
  944. FADD f9, f9, f1
  945. FADD f10, f10, f2
  946. FADD f11, f11, f3
  947. STFDUX f8, Y1, INCY
  948. STFDUX f9, Y1, INCY
  949. STFDUX f10, Y1, INCY
  950. STFDUX f11, Y1, INCY
  951. .align 4
  952. LL(996):
  953. andi. J, M, 2
  954. ble LL(997)
  955. LFDUX f0, Y, INCY
  956. LFDUX f1, Y, INCY
  957. LFDU f8, 1 * SIZE(YY)
  958. LFDU f9, 1 * SIZE(YY)
  959. FADD f8, f8, f0
  960. FADD f9, f9, f1
  961. STFDUX f8, Y1, INCY
  962. STFDUX f9, Y1, INCY
  963. .align 4
  964. LL(997):
  965. andi. J, M, 1
  966. ble LL(999)
  967. LFDUX f0, Y, INCY
  968. LFDU f8, 1 * SIZE(YY)
  969. FADD f8, f8, f0
  970. STFDUX f8, Y1, INCY
  971. .align 4
  /* Common exit (also reached directly when M <= 0 or N <= 0). Sets the
   * return value in r3 to 0, restores f14-f27 and r14-r23 from the slots
   * written in the prologue, releases the frame and returns. */
  972. LL(999):
  973. li r3, 0
  974. lfd f14, 0(SP)
  975. lfd f15, 8(SP)
  976. lfd f16, 16(SP)
  977. lfd f17, 24(SP)
  978. lfd f18, 32(SP)
  979. lfd f19, 40(SP)
  980. lfd f20, 48(SP)
  981. lfd f21, 56(SP)
  982. lfd f22, 64(SP)
  983. lfd f23, 72(SP)
  984. lfd f24, 80(SP)
  985. lfd f25, 88(SP)
  986. lfd f26, 96(SP)
  987. lfd f27, 104(SP)
  /* Integer registers: 8-byte slots on 64-bit, 4-byte slots on 32-bit,
   * mirroring the stores in the prologue. */
  988. #ifdef __64BIT__
  989. ld r14, 144(SP)
  990. ld r15, 152(SP)
  991. ld r16, 160(SP)
  992. ld r17, 168(SP)
  993. ld r18, 176(SP)
  994. ld r19, 184(SP)
  995. ld r20, 192(SP)
  996. ld r21, 200(SP)
  997. ld r22, 208(SP)
  998. ld r23, 216(SP)
  999. #else
  1000. lwz r14, 144(SP)
  1001. lwz r15, 148(SP)
  1002. lwz r16, 152(SP)
  1003. lwz r17, 156(SP)
  1004. lwz r18, 160(SP)
  1005. lwz r19, 164(SP)
  1006. lwz r20, 168(SP)
  1007. lwz r21, 172(SP)
  1008. lwz r22, 176(SP)
  1009. lwz r23, 180(SP)
  1010. #endif
  1011. addi SP, SP, STACKSIZE
  1012. blr
  1013. EPILOGUE
  /* Closes the "#ifndef NEEDPARAM" that guards the routine. */
  1014. #endif