You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_t_ppc440.S 20 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Argument-to-register map. One set of names (M, N, A, LDA, X, INCX, Y, */
  /* INCY) is chosen per OS / word-size / precision combination. Names whose */
  /* value is not passed in a register for a given variant are filled from */
  /* the caller's stack in the prologue. */
  40. #ifdef linux
  41. #ifndef __64BIT__
  /* linux, 32-bit: INCY is loaded from the stack in the prologue. */
  42. #define M r3
  43. #define N r4
  44. #define A r6
  45. #define LDA r7
  46. #define X r8
  47. #define INCX r9
  48. #define Y r10
  49. #define INCY r5
  50. #else
  /* linux, 64-bit: Y and INCY are loaded from the stack in the prologue. */
  51. #define M r3
  52. #define N r4
  53. #define A r7
  54. #define LDA r8
  55. #define X r9
  56. #define INCX r10
  57. #define Y r5
  58. #define INCY r6
  59. #endif
  60. #endif
  61. #if defined(_AIX) || defined(__APPLE__)
  62. #if !defined(__64BIT__) && defined(DOUBLE)
  /* AIX/Apple, 32-bit DOUBLE: INCX, Y and INCY are loaded from the stack. */
  63. #define M r3
  64. #define N r4
  65. #define A r8
  66. #define LDA r9
  67. #define X r10
  68. #define INCX r5
  69. #define Y r6
  70. #define INCY r7
  71. #else
  /* AIX/Apple, all other cases: Y and INCY are loaded from the stack. */
  72. #define M r3
  73. #define N r4
  74. #define A r7
  75. #define LDA r8
  76. #define X r9
  77. #define INCX r10
  78. #define Y r5
  79. #define INCY r6
  80. #endif
  81. #endif
  /* Working integer registers. r14-r22 are saved in the prologue and */
  /* restored at LL(999). */
  82. #define BUFFER r11 /* scratch buffer pointer, loaded from the caller's stack */
  83. #define XP r12 /* base of the x data actually read by the compute loops */
  84. #define AO1 r14 /* column pointers into A (up to four columns at a time) */
  85. #define AO2 r15
  86. #define AO3 r16
  87. #define AO4 r17
  88. #define J r18 /* column-group counter */
  89. #define YY r19 /* store pointer into y (Y itself is the load pointer) */
  90. #define PREA r20 /* byte offset used with dcbt */
  91. #define PREC r21 /* byte offset used with dcbtst */
  92. #define X1 r22 /* running pointer into x (or into the buffer copy) */
  /* Prefetch distances, in elements. */
  /* NOTE(review): these are only defined here for PPCG4 and POWER6, yet */
  /* PREFETCHSIZE_A / PREFETCHSIZE_C are used unconditionally when PREA and */
  /* PREC are initialised; any other target must get them from elsewhere */
  /* (e.g. common.h) - confirm. */
  93. #if defined(PPCG4)
  94. #define PREFETCHSIZE_A 42
  95. #define PREFETCHSIZE_C 7
  96. #endif
  97. #if defined(POWER6)
  98. #define PREFETCHSIZE_A 42
  99. #define PREFETCHSIZE_C 7
  100. #endif
  /* Accumulators for the column dot products. */
  101. #define y01 f0
  102. #define y02 f1
  103. #define y03 f2
  104. #define y04 f3
  105. #define y05 f4
  106. #define y06 f5
  107. #define y07 f6
  108. #define y08 f7
  /* Elements of A. a7/a8 are f14/f15, which the prologue saves. */
  109. #define a1 f8
  110. #define a2 f9
  111. #define a3 f10
  112. #define a4 f11
  113. #define a5 f12
  114. #define a6 f13
  115. #define a7 f14
  116. #define a8 f15
  /* Elements of x. Only b1-b4 are referenced by the code in this file. */
  117. #define b1 f16
  118. #define b2 f17
  119. #define b3 f18
  120. #define b4 f19
  121. #define b5 f20
  122. #define b6 f21
  123. #define b7 f22
  124. #define b8 f23
  /* alpha shares f23 with b8; b8 is never used, so there is no conflict. */
  125. #define alpha f23
  /* Everything from here to the final #endif is skipped when NEEDPARAM is */
  /* defined. */
  126. #ifndef NEEDPARAM
  /* Size in bytes of the stack frame allocated in the prologue. */
  127. #ifndef __64BIT__
  128. #define STACKSIZE 224
  129. #else
  130. #define STACKSIZE 288
  131. #endif
  /* Frame slots: 0..72(SP) hold saved f14-f23, 160(SP) and up hold saved */
  /* r14-r22; the two slots below hold a floating-point zero and alpha. */
  132. #define FZERO 144(SP)
  133. #define ALPHA 152(SP)
  /* Routine entry. Allocates the frame, saves f14-f23 and r14-r22, writes */
  /* an all-zero 8-byte constant to FZERO, spills the incoming f1 to ALPHA, */
  /* and fetches the arguments that were passed on the caller's stack. */
  /* PROLOGUE and PROFCODE are macros not defined in this file. */
  134. PROLOGUE
  135. PROFCODE
  136. addi SP, SP, -STACKSIZE
  137. li r0, 0 /* r0 = 0, stored below to build the FZERO constant */
  138. stfd f14, 0(SP)
  139. stfd f15, 8(SP)
  140. stfd f16, 16(SP)
  141. stfd f17, 24(SP)
  142. stfd f18, 32(SP)
  143. stfd f19, 40(SP)
  144. stfd f20, 48(SP)
  145. stfd f21, 56(SP)
  146. stfd f22, 64(SP)
  147. stfd f23, 72(SP)
  148. #ifdef __64BIT__
  149. std r0, FZERO
  150. stfd f1, ALPHA /* f1 is about to be reused as accumulator y02 */
  151. std r14, 160(SP)
  152. std r15, 168(SP)
  153. std r16, 176(SP)
  154. std r17, 184(SP)
  155. std r18, 192(SP)
  156. std r19, 200(SP)
  157. std r20, 208(SP)
  158. std r21, 216(SP)
  159. std r22, 224(SP)
  160. #else
  /* 32-bit: the 8-byte zero is written as two words. */
  161. stw r0, 0 + FZERO
  162. stw r0, 4 + FZERO
  163. stfd f1, ALPHA /* f1 is about to be reused as accumulator y02 */
  164. stw r14, 160(SP)
  165. stw r15, 164(SP)
  166. stw r16, 168(SP)
  167. stw r17, 172(SP)
  168. stw r18, 176(SP)
  169. stw r19, 180(SP)
  170. stw r20, 184(SP)
  171. stw r21, 188(SP)
  172. stw r22, 192(SP)
  173. #endif
  /* Stack-passed arguments. Offsets are relative to the caller's SP, hence */
  /* the "+ STACKSIZE". */
  174. #ifdef linux
  175. #ifndef __64BIT__
  176. lwz INCY, 8 + STACKSIZE(SP)
  177. lwz BUFFER, 12 + STACKSIZE(SP)
  178. #else
  179. ld Y, 112 + STACKSIZE(SP)
  180. ld INCY, 120 + STACKSIZE(SP)
  181. ld BUFFER, 128 + STACKSIZE(SP)
  182. #endif
  183. #endif
  184. #if defined(_AIX) || defined(__APPLE__)
  185. #ifndef __64BIT__
  186. #ifdef DOUBLE
  187. lwz INCX, 56 + STACKSIZE(SP)
  188. lwz Y, 60 + STACKSIZE(SP)
  189. lwz INCY, 64 + STACKSIZE(SP)
  190. lwz BUFFER, 68 + STACKSIZE(SP)
  191. #else
  192. lwz Y, 56 + STACKSIZE(SP)
  193. lwz INCY, 60 + STACKSIZE(SP)
  194. lwz BUFFER, 64 + STACKSIZE(SP)
  195. #endif
  196. #else
  197. ld Y, 112 + STACKSIZE(SP)
  198. ld INCY, 120 + STACKSIZE(SP)
  199. ld BUFFER, 128 + STACKSIZE(SP)
  200. #endif
  201. #endif
  /* Common setup. Scales LDA/INCX/INCY by 2^BASE_SHIFT (presumably turning */
  /* element counts into byte strides - confirm against common.h), biases */
  /* A, X and Y back by one step so the first "1 * SIZE" / indexed access */
  /* below lands on the first element, and leaves early when M <= 0 or */
  /* N <= 0. */
  202. slwi LDA, LDA, BASE_SHIFT
  203. slwi INCX, INCX, BASE_SHIFT
  204. slwi INCY, INCY, BASE_SHIFT
  205. addi A, A, -SIZE
  206. sub X, X, INCX
  207. sub Y, Y, INCY
  208. li PREA, PREFETCHSIZE_A * SIZE
  209. li PREC, PREFETCHSIZE_C * SIZE
  210. cmpi cr0, 0, M, 0
  211. ble LL(999)
  212. cmpi cr0, 0, N, 0
  213. ble LL(999)
  /* If the scaled INCX equals SIZE, x is read in place (XP = biased X). */
  /* Otherwise x is first copied to BUFFER with stride SIZE and XP points */
  /* one element before BUFFER. */
  214. mr XP, X
  215. cmpi cr0, 0, INCX, SIZE
  216. beq LL(10)
  217. addi XP, BUFFER, -SIZE
  218. addi X1, BUFFER, -SIZE
  219. srawi. r0, M, 3 /* CTR = M / 8 passes of the 8-element copy */
  220. mtspr CTR, r0
  221. ble LL(CopyRemain)
  222. .align 4
  /* Copy x into the buffer: eight elements per pass, read with stride INCX */
  /* and written with stride SIZE through X1. */
  223. LL(CopyKernel):
  224. LFDUX f0, X, INCX
  225. LFDUX f1, X, INCX
  226. LFDUX f2, X, INCX
  227. LFDUX f3, X, INCX
  228. LFDUX f4, X, INCX
  229. LFDUX f5, X, INCX
  230. LFDUX f6, X, INCX
  231. LFDUX f7, X, INCX
  232. STFDU f0, 1 * SIZE(X1)
  233. STFDU f1, 1 * SIZE(X1)
  234. STFDU f2, 1 * SIZE(X1)
  235. STFDU f3, 1 * SIZE(X1)
  236. STFDU f4, 1 * SIZE(X1)
  237. STFDU f5, 1 * SIZE(X1)
  238. STFDU f6, 1 * SIZE(X1)
  239. STFDU f7, 1 * SIZE(X1)
  240. bdnz LL(CopyKernel)
  241. .align 4
  /* Copy the remaining M & 7 elements one at a time. */
  242. LL(CopyRemain):
  243. andi. r0, M, 7
  244. mtspr CTR, r0
  245. ble LL(10) /* nothing left to copy */
  246. .align 4
  247. LL(CopySub):
  248. LFDUX f0, X, INCX
  249. STFDU f0, 1 * SIZE(X1)
  250. bdnz LL(CopySub)
  251. .align 4
  /* Start of the column loops. YY becomes the store pointer into y while Y */
  /* stays the load pointer. J = N / 4 groups of four columns; if there are */
  /* none, continue at LL(30). */
  252. LL(10):
  253. mr YY, Y
  254. srawi. J, N, 2
  255. ble LL(30)
  256. .align 4
  /* Per-group setup: AO1..AO4 address four consecutive columns (LDA apart), */
  /* A is advanced past them, X1 restarts at the top of x, and the eight */
  /* accumulators are cleared from FZERO. */
  257. LL(21):
  258. mr AO1, A
  259. add AO2, A, LDA
  260. add AO3, AO2, LDA
  261. add AO4, AO3, LDA
  262. add A, AO4, LDA
  263. mr X1, XP
  264. lfd y01, FZERO
  265. fmr y02, y01
  266. fmr y03, y01
  267. fmr y04, y01
  268. fmr y05, y01
  269. fmr y06, y01
  270. fmr y07, y01
  271. fmr y08, y01
  272. dcbtst Y, PREC /* cache hint for the y elements updated at LL(28) */
  273. srawi. r0, M, 3 /* CTR = M / 8 */
  274. mtspr CTR, r0
  275. ble LL(24) /* fewer than 8 rows: only the remainder code runs */
  /* Preload for the pipelined loop: a1-a4 and a5-a8 take the first two */
  /* elements of each column, b1-b4 the first four elements of x. */
  276. LFDU a1, 1 * SIZE(AO1)
  277. LFDU a2, 1 * SIZE(AO2)
  278. LFDU a3, 1 * SIZE(AO3)
  279. LFDU a4, 1 * SIZE(AO4)
  280. LFDU b1, 1 * SIZE(X1)
  281. LFDU b2, 1 * SIZE(X1)
  282. LFDU a5, 1 * SIZE(AO1)
  283. LFDU a6, 1 * SIZE(AO2)
  284. LFDU a7, 1 * SIZE(AO3)
  285. LFDU a8, 1 * SIZE(AO4)
  286. LFDU b3, 1 * SIZE(X1)
  287. LFDU b4, 1 * SIZE(X1)
  288. bdz LL(23) /* a single 8-row pass: go straight to the drain code */
  289. .align 4
  /* Four-column main loop. Each pass issues 32 FMADDs (8 elements of x */
  /* against 4 columns). Every operand is reloaded right after its last */
  /* use, so on exit a1-a8 / b1-b4 already hold data for the next 8 rows. */
  /* y01-y04 collect the products formed with b1/b3, y05-y08 those formed */
  /* with b2/b4. The dcbt hints are only assembled for PPCG4; the second */
  /* half's hints additionally require DOUBLE. */
  290. LL(22):
  291. #ifdef PPCG4
  292. dcbt X1, PREA
  293. #endif
  294. FMADD y01, a1, b1, y01
  295. LFDU a1, 1 * SIZE(AO1)
  296. FMADD y02, a2, b1, y02
  297. LFDU a2, 1 * SIZE(AO2)
  298. FMADD y03, a3, b1, y03
  299. LFDU a3, 1 * SIZE(AO3)
  300. FMADD y04, a4, b1, y04
  301. LFDU a4, 1 * SIZE(AO4)
  302. LFDU b1, 1 * SIZE(X1)
  303. #ifdef PPCG4
  304. dcbt AO1, PREA
  305. #endif
  306. FMADD y05, a5, b2, y05
  307. LFDU a5, 1 * SIZE(AO1)
  308. FMADD y06, a6, b2, y06
  309. LFDU a6, 1 * SIZE(AO2)
  310. FMADD y07, a7, b2, y07
  311. LFDU a7, 1 * SIZE(AO3)
  312. FMADD y08, a8, b2, y08
  313. LFDU a8, 1 * SIZE(AO4)
  314. LFDU b2, 1 * SIZE(X1)
  315. #ifdef PPCG4
  316. dcbt AO2, PREA
  317. #endif
  318. FMADD y01, a1, b3, y01
  319. LFDU a1, 1 * SIZE(AO1)
  320. FMADD y02, a2, b3, y02
  321. LFDU a2, 1 * SIZE(AO2)
  322. FMADD y03, a3, b3, y03
  323. LFDU a3, 1 * SIZE(AO3)
  324. FMADD y04, a4, b3, y04
  325. LFDU a4, 1 * SIZE(AO4)
  326. LFDU b3, 1 * SIZE(X1)
  327. #ifdef PPCG4
  328. dcbt AO3, PREA
  329. #endif
  330. FMADD y05, a5, b4, y05
  331. LFDU a5, 1 * SIZE(AO1)
  332. FMADD y06, a6, b4, y06
  333. LFDU a6, 1 * SIZE(AO2)
  334. FMADD y07, a7, b4, y07
  335. LFDU a7, 1 * SIZE(AO3)
  336. FMADD y08, a8, b4, y08
  337. LFDU a8, 1 * SIZE(AO4)
  338. #ifdef PPCG4
  339. dcbt AO4, PREA
  340. #endif
  341. LFDU b4, 1 * SIZE(X1)
  /* Second half of the pass: the same four-step pattern for the next four */
  /* elements of x. */
  342. #if defined(PPCG4) && defined(DOUBLE)
  343. dcbt X1, PREA
  344. #endif
  345. FMADD y01, a1, b1, y01
  346. LFDU a1, 1 * SIZE(AO1)
  347. FMADD y02, a2, b1, y02
  348. LFDU a2, 1 * SIZE(AO2)
  349. FMADD y03, a3, b1, y03
  350. LFDU a3, 1 * SIZE(AO3)
  351. FMADD y04, a4, b1, y04
  352. LFDU a4, 1 * SIZE(AO4)
  353. LFDU b1, 1 * SIZE(X1)
  354. #if defined(PPCG4) && defined(DOUBLE)
  355. dcbt AO1, PREA
  356. #endif
  357. FMADD y05, a5, b2, y05
  358. LFDU a5, 1 * SIZE(AO1)
  359. FMADD y06, a6, b2, y06
  360. LFDU a6, 1 * SIZE(AO2)
  361. FMADD y07, a7, b2, y07
  362. LFDU a7, 1 * SIZE(AO3)
  363. FMADD y08, a8, b2, y08
  364. LFDU a8, 1 * SIZE(AO4)
  365. LFDU b2, 1 * SIZE(X1)
  366. #if defined(PPCG4) && defined(DOUBLE)
  367. dcbt AO2, PREA
  368. #endif
  369. FMADD y01, a1, b3, y01
  370. LFDU a1, 1 * SIZE(AO1)
  371. FMADD y02, a2, b3, y02
  372. LFDU a2, 1 * SIZE(AO2)
  373. FMADD y03, a3, b3, y03
  374. LFDU a3, 1 * SIZE(AO3)
  375. FMADD y04, a4, b3, y04
  376. LFDU a4, 1 * SIZE(AO4)
  377. LFDU b3, 1 * SIZE(X1)
  378. #if defined(PPCG4) && defined(DOUBLE)
  379. dcbt AO3, PREA
  380. #endif
  381. FMADD y05, a5, b4, y05
  382. LFDU a5, 1 * SIZE(AO1)
  383. FMADD y06, a6, b4, y06
  384. LFDU a6, 1 * SIZE(AO2)
  385. FMADD y07, a7, b4, y07
  386. LFDU a7, 1 * SIZE(AO3)
  387. FMADD y08, a8, b4, y08
  388. LFDU a8, 1 * SIZE(AO4)
  389. LFDU b4, 1 * SIZE(X1)
  390. #if defined(PPCG4) && defined(DOUBLE)
  391. dcbt AO4, PREA
  392. #endif
  393. bdnz LL(22)
  394. .align 4
  /* Four-column drain: finishes the last group of 8 rows. It issues the */
  /* same 32 FMADDs as one LL(22) pass but only loads the operands those */
  /* FMADDs still need (six more elements per column, four more of x), so */
  /* nothing beyond the 8-row group is read. */
  395. LL(23):
  396. FMADD y01, a1, b1, y01
  397. LFDU a1, 1 * SIZE(AO1)
  398. FMADD y02, a2, b1, y02
  399. LFDU a2, 1 * SIZE(AO2)
  400. FMADD y03, a3, b1, y03
  401. LFDU a3, 1 * SIZE(AO3)
  402. FMADD y04, a4, b1, y04
  403. LFDU a4, 1 * SIZE(AO4)
  404. LFDU b1, 1 * SIZE(X1)
  405. FMADD y05, a5, b2, y05
  406. LFDU a5, 1 * SIZE(AO1)
  407. FMADD y06, a6, b2, y06
  408. LFDU a6, 1 * SIZE(AO2)
  409. FMADD y07, a7, b2, y07
  410. LFDU a7, 1 * SIZE(AO3)
  411. FMADD y08, a8, b2, y08
  412. LFDU a8, 1 * SIZE(AO4)
  413. LFDU b2, 1 * SIZE(X1)
  414. FMADD y01, a1, b3, y01
  415. LFDU a1, 1 * SIZE(AO1)
  416. FMADD y02, a2, b3, y02
  417. LFDU a2, 1 * SIZE(AO2)
  418. FMADD y03, a3, b3, y03
  419. LFDU a3, 1 * SIZE(AO3)
  420. FMADD y04, a4, b3, y04
  421. LFDU a4, 1 * SIZE(AO4)
  422. LFDU b3, 1 * SIZE(X1)
  423. FMADD y05, a5, b4, y05
  424. LFDU a5, 1 * SIZE(AO1)
  425. FMADD y06, a6, b4, y06
  426. LFDU a6, 1 * SIZE(AO2)
  427. FMADD y07, a7, b4, y07
  428. LFDU a7, 1 * SIZE(AO3)
  429. FMADD y08, a8, b4, y08
  430. LFDU a8, 1 * SIZE(AO4)
  431. LFDU b4, 1 * SIZE(X1)
  /* From here on no further x elements are loaded. */
  432. FMADD y01, a1, b1, y01
  433. LFDU a1, 1 * SIZE(AO1)
  434. FMADD y02, a2, b1, y02
  435. LFDU a2, 1 * SIZE(AO2)
  436. FMADD y03, a3, b1, y03
  437. LFDU a3, 1 * SIZE(AO3)
  438. FMADD y04, a4, b1, y04
  439. LFDU a4, 1 * SIZE(AO4)
  440. FMADD y05, a5, b2, y05
  441. LFDU a5, 1 * SIZE(AO1)
  442. FMADD y06, a6, b2, y06
  443. LFDU a6, 1 * SIZE(AO2)
  444. FMADD y07, a7, b2, y07
  445. LFDU a7, 1 * SIZE(AO3)
  446. FMADD y08, a8, b2, y08
  447. LFDU a8, 1 * SIZE(AO4)
  /* Last two rows: operands are already in registers. */
  448. FMADD y01, a1, b3, y01
  449. FMADD y02, a2, b3, y02
  450. FMADD y03, a3, b3, y03
  451. FMADD y04, a4, b3, y04
  452. FMADD y05, a5, b4, y05
  453. FMADD y06, a6, b4, y06
  454. FMADD y07, a7, b4, y07
  455. FMADD y08, a8, b4, y08
  456. .align 4
  /* Four-column remainder: handles the M & 7 rows left after the 8-row */
  /* passes by testing bits 4, 2 and 1 of M in turn. */
  457. LL(24):
  458. andi. r0, M, 7
  459. ble LL(28) /* no leftover rows */
  460. andi. r0, M, 4
  461. ble LL(26)
  /* Four rows. */
  462. LFDU a1, 1 * SIZE(AO1)
  463. LFDU a2, 1 * SIZE(AO2)
  464. LFDU b1, 1 * SIZE(X1)
  465. LFDU a3, 1 * SIZE(AO3)
  466. LFDU a4, 1 * SIZE(AO4)
  467. LFDU b2, 1 * SIZE(X1)
  468. FMADD y01, a1, b1, y01
  469. LFDU a5, 1 * SIZE(AO1)
  470. FMADD y02, a2, b1, y02
  471. LFDU a6, 1 * SIZE(AO2)
  472. FMADD y03, a3, b1, y03
  473. LFDU a7, 1 * SIZE(AO3)
  474. FMADD y04, a4, b1, y04
  475. LFDU a8, 1 * SIZE(AO4)
  476. LFDU b3, 1 * SIZE(X1)
  477. FMADD y05, a5, b2, y05
  478. LFDU a1, 1 * SIZE(AO1)
  479. FMADD y06, a6, b2, y06
  480. LFDU a2, 1 * SIZE(AO2)
  481. FMADD y07, a7, b2, y07
  482. LFDU a3, 1 * SIZE(AO3)
  483. FMADD y08, a8, b2, y08
  484. LFDU a4, 1 * SIZE(AO4)
  485. LFDU b4, 1 * SIZE(X1)
  486. FMADD y01, a1, b3, y01
  487. LFDU a5, 1 * SIZE(AO1)
  488. FMADD y02, a2, b3, y02
  489. LFDU a6, 1 * SIZE(AO2)
  490. FMADD y03, a3, b3, y03
  491. LFDU a7, 1 * SIZE(AO3)
  492. FMADD y04, a4, b3, y04
  493. LFDU a8, 1 * SIZE(AO4)
  494. FMADD y05, a5, b4, y05
  495. FMADD y06, a6, b4, y06
  496. FMADD y07, a7, b4, y07
  497. FMADD y08, a8, b4, y08
  498. .align 4
  /* Two rows. */
  499. LL(26):
  500. andi. r0, M, 2
  501. ble LL(27)
  502. LFDU b1, 1 * SIZE(X1)
  503. LFDU a1, 1 * SIZE(AO1)
  504. LFDU a2, 1 * SIZE(AO2)
  505. LFDU a3, 1 * SIZE(AO3)
  506. LFDU a4, 1 * SIZE(AO4)
  507. LFDU b2, 1 * SIZE(X1)
  508. FMADD y01, a1, b1, y01
  509. LFDU a5, 1 * SIZE(AO1)
  510. FMADD y02, a2, b1, y02
  511. LFDU a6, 1 * SIZE(AO2)
  512. FMADD y03, a3, b1, y03
  513. LFDU a7, 1 * SIZE(AO3)
  514. FMADD y04, a4, b1, y04
  515. LFDU a8, 1 * SIZE(AO4)
  516. FMADD y05, a5, b2, y05
  517. FMADD y06, a6, b2, y06
  518. FMADD y07, a7, b2, y07
  519. FMADD y08, a8, b2, y08
  520. .align 4
  /* One row. */
  521. LL(27):
  522. andi. r0, M, 1
  523. ble LL(28)
  524. LFDU a1, 1 * SIZE(AO1)
  525. LFDU b1, 1 * SIZE(X1)
  526. LFDU a2, 1 * SIZE(AO2)
  527. LFDU a3, 1 * SIZE(AO3)
  528. LFDU a4, 1 * SIZE(AO4)
  529. FMADD y01, a1, b1, y01
  530. FMADD y02, a2, b1, y02
  531. FMADD y03, a3, b1, y03
  532. FMADD y04, a4, b1, y04
  533. .align 4
  /* Four-column write-back: folds y05-y08 into y01-y04, then updates four */
  /* consecutive y elements as y = alpha * sum + y. Loads go through Y and */
  /* stores through YY, both stepping by INCY. f0-f3 are y01-y04. */
  534. LL(28):
  535. lfd alpha, ALPHA /* reload alpha from the slot saved in the prologue */
  536. LFDUX a1, Y, INCY
  537. LFDUX a2, Y, INCY
  538. LFDUX a3, Y, INCY
  539. LFDUX a4, Y, INCY
  540. FADD y01, y05, y01
  541. FADD y02, y06, y02
  542. FADD y03, y07, y03
  543. FADD y04, y08, y04
  544. FMADD a1, alpha, f0, a1
  545. FMADD a2, alpha, f1, a2
  546. FMADD a3, alpha, f2, a3
  547. FMADD a4, alpha, f3, a4
  548. STFDUX a1, YY, INCY
  549. addi J, J, -1 /* one four-column group done */
  550. STFDUX a2, YY, INCY
  551. cmpi cr0, 0, J, 0
  552. STFDUX a3, YY, INCY
  553. STFDUX a4, YY, INCY
  554. bgt LL(21) /* more four-column groups remain */
  555. .align 4
  /* Two-column case, taken when bit 1 of N is set. AO1/AO2 address the two */
  /* columns, A is advanced past them, and four accumulators are cleared. */
  556. LL(30):
  557. andi. J, N, 2
  558. ble LL(40)
  559. mr AO1, A
  560. add AO2, A, LDA
  561. add A, AO2, LDA
  562. mr X1, XP
  563. lfd y01, FZERO
  564. fmr y02, y01
  565. fmr y03, y01
  566. fmr y04, y01
  567. srawi. r0, M, 3 /* CTR = M / 8 */
  568. mtspr CTR, r0
  569. ble LL(34) /* fewer than 8 rows: only the remainder code runs */
  /* Preload: two elements per column (a1/a2, a5/a6), four elements of x. */
  570. LFDU a1, 1 * SIZE(AO1)
  571. LFDU a2, 1 * SIZE(AO2)
  572. LFDU b1, 1 * SIZE(X1)
  573. LFDU b2, 1 * SIZE(X1)
  574. LFDU a5, 1 * SIZE(AO1)
  575. LFDU a6, 1 * SIZE(AO2)
  576. LFDU b3, 1 * SIZE(X1)
  577. LFDU b4, 1 * SIZE(X1)
  578. bdz LL(33) /* a single 8-row pass: go straight to the drain code */
  579. .align 4
  /* Two-column main loop: 16 FMADDs per pass (8 elements of x against 2 */
  /* columns), each operand reloaded right after use. y01/y02 collect the */
  /* products formed with b1/b3, y03/y04 those formed with b2/b4. dcbt */
  /* hints are only assembled for PPCG4 (second half also needs DOUBLE). */
  580. LL(32):
  581. #ifdef PPCG4
  582. dcbt X1, PREA
  583. #endif
  584. FMADD y01, a1, b1, y01
  585. LFDU a1, 1 * SIZE(AO1)
  586. FMADD y02, a2, b1, y02
  587. LFDU a2, 1 * SIZE(AO2)
  588. LFDU b1, 1 * SIZE(X1)
  589. #ifdef PPCG4
  590. dcbt AO1, PREA
  591. #endif
  592. FMADD y03, a5, b2, y03
  593. LFDU a5, 1 * SIZE(AO1)
  594. FMADD y04, a6, b2, y04
  595. LFDU a6, 1 * SIZE(AO2)
  596. LFDU b2, 1 * SIZE(X1)
  597. FMADD y01, a1, b3, y01
  598. LFDU a1, 1 * SIZE(AO1)
  599. FMADD y02, a2, b3, y02
  600. LFDU a2, 1 * SIZE(AO2)
  601. LFDU b3, 1 * SIZE(X1)
  602. #ifdef PPCG4
  603. dcbt AO2, PREA
  604. #endif
  605. FMADD y03, a5, b4, y03
  606. LFDU a5, 1 * SIZE(AO1)
  607. FMADD y04, a6, b4, y04
  608. LFDU a6, 1 * SIZE(AO2)
  609. LFDU b4, 1 * SIZE(X1)
  /* Second half of the pass. */
  610. FMADD y01, a1, b1, y01
  611. LFDU a1, 1 * SIZE(AO1)
  612. FMADD y02, a2, b1, y02
  613. LFDU a2, 1 * SIZE(AO2)
  614. #if defined(PPCG4) && defined(DOUBLE)
  615. dcbt X1, PREA
  616. #endif
  617. LFDU b1, 1 * SIZE(X1)
  618. #if defined(PPCG4) && defined(DOUBLE)
  619. dcbt AO1, PREA
  620. #endif
  621. FMADD y03, a5, b2, y03
  622. LFDU a5, 1 * SIZE(AO1)
  623. FMADD y04, a6, b2, y04
  624. LFDU a6, 1 * SIZE(AO2)
  625. LFDU b2, 1 * SIZE(X1)
  626. FMADD y01, a1, b3, y01
  627. LFDU a1, 1 * SIZE(AO1)
  628. FMADD y02, a2, b3, y02
  629. LFDU a2, 1 * SIZE(AO2)
  630. LFDU b3, 1 * SIZE(X1)
  631. #if defined(PPCG4) && defined(DOUBLE)
  632. dcbt AO2, PREA
  633. #endif
  634. FMADD y03, a5, b4, y03
  635. LFDU a5, 1 * SIZE(AO1)
  636. FMADD y04, a6, b4, y04
  637. LFDU a6, 1 * SIZE(AO2)
  638. LFDU b4, 1 * SIZE(X1)
  639. bdnz LL(32)
  640. .align 4
  /* Two-column drain: completes the last 8-row group with the same 16 */
  /* FMADDs as one LL(32) pass, loading only the operands still needed. */
  641. LL(33):
  642. FMADD y01, a1, b1, y01
  643. LFDU a1, 1 * SIZE(AO1)
  644. FMADD y02, a2, b1, y02
  645. LFDU a2, 1 * SIZE(AO2)
  646. LFDU b1, 1 * SIZE(X1)
  647. FMADD y03, a5, b2, y03
  648. LFDU a5, 1 * SIZE(AO1)
  649. FMADD y04, a6, b2, y04
  650. LFDU a6, 1 * SIZE(AO2)
  651. LFDU b2, 1 * SIZE(X1)
  652. FMADD y01, a1, b3, y01
  653. LFDU a1, 1 * SIZE(AO1)
  654. FMADD y02, a2, b3, y02
  655. LFDU a2, 1 * SIZE(AO2)
  656. LFDU b3, 1 * SIZE(X1)
  657. FMADD y03, a5, b4, y03
  658. LFDU a5, 1 * SIZE(AO1)
  659. FMADD y04, a6, b4, y04
  660. LFDU a6, 1 * SIZE(AO2)
  661. LFDU b4, 1 * SIZE(X1)
  /* From here on no further x elements are loaded. */
  662. FMADD y01, a1, b1, y01
  663. LFDU a1, 1 * SIZE(AO1)
  664. FMADD y02, a2, b1, y02
  665. LFDU a2, 1 * SIZE(AO2)
  666. FMADD y03, a5, b2, y03
  667. LFDU a5, 1 * SIZE(AO1)
  668. FMADD y04, a6, b2, y04
  669. LFDU a6, 1 * SIZE(AO2)
  670. FMADD y01, a1, b3, y01
  671. FMADD y02, a2, b3, y02
  672. FMADD y03, a5, b4, y03
  673. FMADD y04, a6, b4, y04
  674. .align 4
  /* Two-column remainder: handles the M & 7 leftover rows by testing bits */
  /* 4, 2 and 1 of M in turn. */
  675. LL(34):
  676. andi. r0, M, 7
  677. ble LL(38) /* no leftover rows */
  678. andi. r0, M, 4
  679. ble LL(36)
  /* Four rows. */
  680. LFDU a1, 1 * SIZE(AO1)
  681. LFDU a2, 1 * SIZE(AO2)
  682. LFDU b1, 1 * SIZE(X1)
  683. LFDU b2, 1 * SIZE(X1)
  684. FMADD y01, a1, b1, y01
  685. LFDU a5, 1 * SIZE(AO1)
  686. FMADD y02, a2, b1, y02
  687. LFDU a6, 1 * SIZE(AO2)
  688. LFDU b3, 1 * SIZE(X1)
  689. FMADD y03, a5, b2, y03
  690. LFDU a1, 1 * SIZE(AO1)
  691. FMADD y04, a6, b2, y04
  692. LFDU a2, 1 * SIZE(AO2)
  693. LFDU b4, 1 * SIZE(X1)
  694. FMADD y01, a1, b3, y01
  695. LFDU a5, 1 * SIZE(AO1)
  696. FMADD y02, a2, b3, y02
  697. LFDU a6, 1 * SIZE(AO2)
  698. FMADD y03, a5, b4, y03
  699. FMADD y04, a6, b4, y04
  700. .align 4
  /* Two rows; a3/a4 hold the second element of each column here. */
  701. LL(36):
  702. andi. r0, M, 2
  703. ble LL(37)
  704. LFDU b1, 1 * SIZE(X1)
  705. LFDU a1, 1 * SIZE(AO1)
  706. LFDU a2, 1 * SIZE(AO2)
  707. LFDU b2, 1 * SIZE(X1)
  708. LFDU a3, 1 * SIZE(AO1)
  709. LFDU a4, 1 * SIZE(AO2)
  710. FMADD y01, a1, b1, y01
  711. FMADD y02, a2, b1, y02
  712. FMADD y03, a3, b2, y03
  713. FMADD y04, a4, b2, y04
  714. .align 4
  /* One row. */
  715. LL(37):
  716. andi. r0, M, 1
  717. ble LL(38)
  718. LFDU a1, 1 * SIZE(AO1)
  719. LFDU b1, 1 * SIZE(X1)
  720. LFDU a2, 1 * SIZE(AO2)
  721. FMADD y01, a1, b1, y01
  722. FMADD y02, a2, b1, y02
  723. .align 4
  /* Two-column write-back: folds y03/y04 into y01/y02, then updates two y */
  /* elements as y = alpha * sum + y (load via Y, store via YY). */
  /* f0/f1 are y01/y02. */
  724. LL(38):
  725. lfd alpha, ALPHA
  726. LFDUX a1, Y, INCY
  727. LFDUX a2, Y, INCY
  728. FADD y01, y03, y01
  729. FADD y02, y04, y02
  730. FMADD a1, alpha, f0, a1
  731. FMADD a2, alpha, f1, a2
  732. STFDUX a1, YY, INCY
  733. STFDUX a2, YY, INCY
  734. .align 4
  /* Single-column case, taken when bit 0 of N is set. AO1 addresses the */
  /* last column; y01 and y02 are two partial sums for it. */
  735. LL(40):
  736. andi. J, N, 1
  737. ble LL(999)
  738. mr AO1, A
  739. add A, A, LDA
  740. mr X1, XP
  741. lfd y01, FZERO
  742. fmr y02, y01
  743. srawi. r0, M, 3 /* CTR = M / 8 */
  744. mtspr CTR, r0
  745. ble LL(44) /* fewer than 8 rows: only the remainder code runs */
  /* Preload four elements of the column and four elements of x. */
  746. LFDU a1, 1 * SIZE(AO1)
  747. LFDU a2, 1 * SIZE(AO1)
  748. LFDU a3, 1 * SIZE(AO1)
  749. LFDU a4, 1 * SIZE(AO1)
  750. LFDU b1, 1 * SIZE(X1)
  751. LFDU b2, 1 * SIZE(X1)
  752. LFDU b3, 1 * SIZE(X1)
  753. LFDU b4, 1 * SIZE(X1)
  754. bdz LL(43) /* a single 8-row pass: go straight to the drain code */
  755. .align 4
  /* Single-column main loop: 8 FMADDs per pass, alternating between the */
  /* two partial sums y01 (a1*b1, a3*b3) and y02 (a2*b2, a4*b4). Each */
  /* operand pair is reloaded right after use. dcbt hints are only */
  /* assembled for PPCG4 (second half also needs DOUBLE). */
  756. LL(42):
  757. FMADD y01, a1, b1, y01
  758. LFDU a1, 1 * SIZE(AO1)
  759. LFDU b1, 1 * SIZE(X1)
  760. #ifdef PPCG4
  761. dcbt X1, PREA
  762. #endif
  763. FMADD y02, a2, b2, y02
  764. LFDU a2, 1 * SIZE(AO1)
  765. LFDU b2, 1 * SIZE(X1)
  766. #ifdef PPCG4
  767. dcbt AO1, PREA
  768. #endif
  769. FMADD y01, a3, b3, y01
  770. LFDU a3, 1 * SIZE(AO1)
  771. LFDU b3, 1 * SIZE(X1)
  772. FMADD y02, a4, b4, y02
  773. LFDU a4, 1 * SIZE(AO1)
  774. LFDU b4, 1 * SIZE(X1)
  /* Second half of the pass. */
  775. FMADD y01, a1, b1, y01
  776. LFDU a1, 1 * SIZE(AO1)
  777. LFDU b1, 1 * SIZE(X1)
  778. FMADD y02, a2, b2, y02
  779. LFDU a2, 1 * SIZE(AO1)
  780. LFDU b2, 1 * SIZE(X1)
  781. #if defined(PPCG4) && defined(DOUBLE)
  782. dcbt AO1, PREA
  783. #endif
  784. FMADD y01, a3, b3, y01
  785. LFDU a3, 1 * SIZE(AO1)
  786. LFDU b3, 1 * SIZE(X1)
  787. #if defined(PPCG4) && defined(DOUBLE)
  788. dcbt X1, PREA
  789. #endif
  790. FMADD y02, a4, b4, y02
  791. LFDU a4, 1 * SIZE(AO1)
  792. LFDU b4, 1 * SIZE(X1)
  793. bdnz LL(42)
  794. .align 4
  /* Single-column drain: completes the last 8-row group. The first four */
  /* FMADDs reload their operands for rows 5-8 of the group; the last four */
  /* consume them without loading anything further. */
  795. LL(43):
  796. FMADD y01, a1, b1, y01
  797. LFDU a1, 1 * SIZE(AO1)
  798. LFDU b1, 1 * SIZE(X1)
  799. FMADD y02, a2, b2, y02
  800. LFDU a2, 1 * SIZE(AO1)
  801. LFDU b2, 1 * SIZE(X1)
  802. FMADD y01, a3, b3, y01
  803. LFDU a3, 1 * SIZE(AO1)
  804. LFDU b3, 1 * SIZE(X1)
  805. FMADD y02, a4, b4, y02
  806. LFDU a4, 1 * SIZE(AO1)
  807. LFDU b4, 1 * SIZE(X1)
  808. FMADD y01, a1, b1, y01
  809. FMADD y02, a2, b2, y02
  810. FMADD y01, a3, b3, y01
  811. FMADD y02, a4, b4, y02
  812. .align 4
  /* Single-column remainder: handles the M & 7 leftover rows by testing */
  /* bits 4, 2 and 1 of M in turn. */
  813. LL(44):
  814. andi. r0, M, 7
  815. ble LL(48) /* no leftover rows */
  816. andi. r0, M, 4
  817. ble LL(46)
  /* Four rows. */
  818. LFDU a1, 1 * SIZE(AO1)
  819. LFDU b1, 1 * SIZE(X1)
  820. LFDU a2, 1 * SIZE(AO1)
  821. LFDU b2, 1 * SIZE(X1)
  822. FMADD y01, a1, b1, y01
  823. LFDU a3, 1 * SIZE(AO1)
  824. LFDU b3, 1 * SIZE(X1)
  825. FMADD y02, a2, b2, y02
  826. LFDU a4, 1 * SIZE(AO1)
  827. LFDU b4, 1 * SIZE(X1)
  828. FMADD y01, a3, b3, y01
  829. FMADD y02, a4, b4, y02
  830. .align 4
  /* Two rows. */
  831. LL(46):
  832. andi. r0, M, 2
  833. ble LL(47)
  834. LFDU b1, 1 * SIZE(X1)
  835. LFDU a1, 1 * SIZE(AO1)
  836. LFDU b2, 1 * SIZE(X1)
  837. LFDU a2, 1 * SIZE(AO1)
  838. FMADD y01, a1, b1, y01
  839. FMADD y02, a2, b2, y02
  840. .align 4
  /* One row. */
  841. LL(47):
  842. andi. r0, M, 1
  843. ble LL(48)
  844. LFDU a1, 1 * SIZE(AO1)
  845. LFDU b1, 1 * SIZE(X1)
  846. FMADD y01, a1, b1, y01
  847. .align 4
  /* Single-column write-back: adds the two partial sums, then updates one */
  /* y element as y = alpha * sum + y (load via Y, store via YY). */
  /* f0 is y01. */
  848. LL(48):
  849. lfd alpha, ALPHA
  850. LFDUX a1, Y, INCY
  851. FADD y01, y02, y01
  852. FMADD a1, alpha, f0, a1
  853. STFDUX a1, YY, INCY
  854. .align 4
  /* Common exit, also reached directly when M <= 0 or N <= 0. Sets r3 to */
  /* 0, restores f14-f23 and r14-r22 from the slots written in the */
  /* prologue, releases the frame and returns. */
  855. LL(999):
  856. li r3, 0
  857. lfd f14, 0(SP)
  858. lfd f15, 8(SP)
  859. lfd f16, 16(SP)
  860. lfd f17, 24(SP)
  861. lfd f18, 32(SP)
  862. lfd f19, 40(SP)
  863. lfd f20, 48(SP)
  864. lfd f21, 56(SP)
  865. lfd f22, 64(SP)
  866. lfd f23, 72(SP)
  867. #ifdef __64BIT__
  868. ld r14, 160(SP)
  869. ld r15, 168(SP)
  870. ld r16, 176(SP)
  871. ld r17, 184(SP)
  872. ld r18, 192(SP)
  873. ld r19, 200(SP)
  874. ld r20, 208(SP)
  875. ld r21, 216(SP)
  876. ld r22, 224(SP)
  877. #else
  878. lwz r14, 160(SP)
  879. lwz r15, 164(SP)
  880. lwz r16, 168(SP)
  881. lwz r17, 172(SP)
  882. lwz r18, 176(SP)
  883. lwz r19, 180(SP)
  884. lwz r20, 184(SP)
  885. lwz r21, 188(SP)
  886. lwz r22, 192(SP)
  887. #endif
  888. addi SP, SP, STACKSIZE
  889. blr
  890. EPILOGUE
  /* Closes the "#ifndef NEEDPARAM" opened before the STACKSIZE definition. */
  891. #endif