You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_t_ppc440.S 20 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before including common.h; presumably it selects */
  /* the assembler-facing parts of that header (LL, SIZE, BASE_SHIFT, */
  /* LFDU, FMADD, FRAMESLOT, PROLOGUE, ... are not defined in this file). */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Argument registers for linux / FreeBSD. In the 32-bit layout INCY has */
  /* no incoming register: r5 is only the home into which the prologue */
  /* loads it from the stack (BUFFER is loaded from the stack as well). */
  /* In the 64-bit layout Y, INCY and BUFFER are all loaded from the stack. */
  40. #if defined(linux) || defined(__FreeBSD__)
  41. #ifndef __64BIT__
  42. #define M r3
  43. #define N r4
  44. #define A r6
  45. #define LDA r7
  46. #define X r8
  47. #define INCX r9
  48. #define Y r10
  49. #define INCY r5
  50. #else
  51. #define M r3
  52. #define N r4
  53. #define A r7
  54. #define LDA r8
  55. #define X r9
  56. #define INCX r10
  57. #define Y r5
  58. #define INCY r6
  59. #endif
  60. #endif
  /* Argument registers for AIX / Apple. The 32-bit DOUBLE build uses a */
  /* shifted layout (A starts at r8) and the prologue loads INCX, Y, INCY */
  /* and BUFFER from the stack; every other build uses the layout in the */
  /* #else branch and loads Y, INCY and BUFFER from the stack. */
  61. #if defined(_AIX) || defined(__APPLE__)
  62. #if !defined(__64BIT__) && defined(DOUBLE)
  63. #define M r3
  64. #define N r4
  65. #define A r8
  66. #define LDA r9
  67. #define X r10
  68. #define INCX r5
  69. #define Y r6
  70. #define INCY r7
  71. #else
  72. #define M r3
  73. #define N r4
  74. #define A r7
  75. #define LDA r8
  76. #define X r9
  77. #define INCX r10
  78. #define Y r5
  79. #define INCY r6
  80. #endif
  81. #endif
  /* Working integer registers. */
  /* BUFFER: scratch area that receives a contiguous copy of x when x is */
  /*         not unit-stride. */
  /* XP:     base of the x data actually used by the compute loops. */
  /* AO1-4:  cursors into up to four consecutive columns of A. */
  /* J:      column-group counter.  YY: store cursor for y (Y is the load */
  /*         cursor).  PREA / PREC: byte offsets used by the cache hints. */
  /* X1:     running cursor over x (and over BUFFER during the copy). */
  /* r14-r22 are saved in the prologue and restored at LL(999). */
  82. #define BUFFER r11
  83. #define XP r12
  84. #define AO1 r14
  85. #define AO2 r15
  86. #define AO3 r16
  87. #define AO4 r17
  88. #define J r18
  89. #define YY r19
  90. #define PREA r20
  91. #define PREC r21
  92. #define X1 r22
  /* Prefetch distances, in elements (multiplied by SIZE where they are */
  /* used). The three supported targets use identical values. */
  /* NOTE(review): no fallback is defined here for any other target, so */
  /* PREFETCHSIZE_A / PREFETCHSIZE_C would be undefined unless the build */
  /* guarantees one of these macros - confirm. */
  93. #if defined(PPC440)
  94. #define PREFETCHSIZE_A 42
  95. #define PREFETCHSIZE_C 7
  96. #endif
  97. #if defined(PPCG4)
  98. #define PREFETCHSIZE_A 42
  99. #define PREFETCHSIZE_C 7
  100. #endif
  101. #if defined(POWER6)
  102. #define PREFETCHSIZE_A 42
  103. #define PREFETCHSIZE_C 7
  104. #endif
  /* y01-y08 (f0-f7): partial-sum accumulators. */
  105. #define y01 f0
  106. #define y02 f1
  107. #define y03 f2
  108. #define y04 f3
  109. #define y05 f4
  110. #define y06 f5
  111. #define y07 f6
  112. #define y08 f7
  /* a1-a8 (f8-f15): elements of A; a1-a4 are also reused to hold the */
  /* loaded y values during the final update of each column group. */
  113. #define a1 f8
  114. #define a2 f9
  115. #define a3 f10
  116. #define a4 f11
  117. #define a5 f12
  118. #define a6 f13
  119. #define a7 f14
  120. #define a8 f15
  /* b1-b8 (f16-f23): elements of x. Only b1-b4 are referenced by the */
  /* code in this file. */
  121. #define b1 f16
  122. #define b2 f17
  123. #define b3 f18
  124. #define b4 f19
  125. #define b5 f20
  126. #define b6 f21
  127. #define b7 f22
  128. #define b8 f23
  /* alpha deliberately shares f23 with b8; this is safe only because b8 */
  /* is never used while alpha is live. */
  129. #define alpha f23
  /* Function prologue: build the stack frame, save the non-volatile */
  /* registers this kernel uses, fetch the stack-passed arguments, turn */
  /* the strides into byte offsets and bail out early for empty problems. */
  /* The #ifndef NEEDPARAM opened here is closed by the last #endif of the */
  /* file. */
  130. #ifndef NEEDPARAM
  131. #ifndef __64BIT__
  132. #define STACKSIZE 224
  133. #else
  134. #define STACKSIZE 288
  135. #endif
  /* Two 8-byte scratch slots in the frame: FZERO holds an all-zero-bits */
  /* value used to clear the accumulators, ALPHA holds the spilled f1. */
  136. #define FZERO 144(SP)
  137. #define ALPHA 152(SP)
  138. PROLOGUE
  139. PROFCODE
  /* Allocate the frame; r0 = 0 is the source for FZERO below. */
  140. addi SP, SP, -STACKSIZE
  141. li r0, 0
  /* Save f14-f23 (restored at LL(999)). */
  142. stfd f14, 0(SP)
  143. stfd f15, 8(SP)
  144. stfd f16, 16(SP)
  145. stfd f17, 24(SP)
  146. stfd f18, 32(SP)
  147. stfd f19, 40(SP)
  148. stfd f20, 48(SP)
  149. stfd f21, 56(SP)
  150. stfd f22, 64(SP)
  151. stfd f23, 72(SP)
  /* Write the zero constant, spill f1 (presumably the incoming alpha */
  /* scalar) before f1 is reused as accumulator y02, and save r14-r22. */
  /* The 32-bit build needs two word stores to zero the 8-byte slot. */
  152. #ifdef __64BIT__
  153. std r0, FZERO
  154. stfd f1, ALPHA
  155. std r14, 160(SP)
  156. std r15, 168(SP)
  157. std r16, 176(SP)
  158. std r17, 184(SP)
  159. std r18, 192(SP)
  160. std r19, 200(SP)
  161. std r20, 208(SP)
  162. std r21, 216(SP)
  163. std r22, 224(SP)
  164. #else
  165. stw r0, 0 + FZERO
  166. stw r0, 4 + FZERO
  167. stfd f1, ALPHA
  168. stw r14, 160(SP)
  169. stw r15, 164(SP)
  170. stw r16, 168(SP)
  171. stw r17, 172(SP)
  172. stw r18, 176(SP)
  173. stw r19, 180(SP)
  174. stw r20, 184(SP)
  175. stw r21, 188(SP)
  176. stw r22, 192(SP)
  177. #endif
  /* Load the arguments that did not arrive in registers. FRAMESLOT is */
  /* defined outside this file (presumably the offset of the n-th */
  /* stack-passed argument); STACKSIZE is added because SP has already */
  /* been moved down by that amount. */
  178. #if defined(linux) || defined(__FreeBSD__)
  179. #ifndef __64BIT__
  180. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  181. lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  182. #else
  183. ld Y, FRAMESLOT(0) + STACKSIZE(SP)
  184. ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
  185. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  186. #endif
  187. #endif
  188. #if defined(_AIX) || defined(__APPLE__)
  189. #ifndef __64BIT__
  190. #ifdef DOUBLE
  191. lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
  192. lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
  193. lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
  194. lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  195. #else
  196. lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
  197. lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
  198. lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  199. #endif
  200. #else
  201. ld Y, FRAMESLOT(0) + STACKSIZE(SP)
  202. ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
  203. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  204. #endif
  205. #endif
  /* Scale LDA / INCX / INCY by 2^BASE_SHIFT (presumably element counts */
  /* to byte offsets, with SIZE == 1 << BASE_SHIFT - TODO confirm). */
  /* NOTE(review): slwi is a 32-bit shift, also in the 64-bit build. */
  206. slwi LDA, LDA, BASE_SHIFT
  207. slwi INCX, INCX, BASE_SHIFT
  208. slwi INCY, INCY, BASE_SHIFT
  /* Pre-bias every pointer by one step: all later accesses go through */
  /* the update-form macros (LFDU / LFDUX / STFDU / STFDUX), which appear */
  /* to advance the pointer before touching memory. */
  209. addi A, A, -SIZE
  210. sub X, X, INCX
  211. sub Y, Y, INCY
  /* Byte offsets used by the dcbt / dcbtst cache hints. */
  212. li PREA, PREFETCHSIZE_A * SIZE
  213. li PREC, PREFETCHSIZE_C * SIZE
  /* Nothing to do when M <= 0 or N <= 0: jump straight to the epilogue. */
  214. cmpi cr0, 0, M, 0
  215. ble LL(999)
  216. cmpi cr0, 0, N, 0
  217. ble LL(999)
  /* Make x contiguous. If the byte stride INCX already equals SIZE the */
  /* (pre-biased) X pointer is used directly as XP and the copy is */
  /* skipped; otherwise M elements are gathered from X with stride INCX */
  /* into BUFFER and XP is pointed at BUFFER (biased by -SIZE, matching */
  /* the update-form accesses used by the compute loops). */
  218. mr XP, X
  219. cmpi cr0, 0, INCX, SIZE
  220. beq LL(10)
  221. addi XP, BUFFER, -SIZE
  222. addi X1, BUFFER, -SIZE
  /* CTR = M / 8 unrolled copy iterations; skip the kernel if that is 0. */
  223. srawi. r0, M, 3
  224. mtspr CTR, r0
  225. ble LL(CopyRemain)
  226. .align 4
  /* Eight strided loads followed by eight contiguous stores. */
  227. LL(CopyKernel):
  228. LFDUX f0, X, INCX
  229. LFDUX f1, X, INCX
  230. LFDUX f2, X, INCX
  231. LFDUX f3, X, INCX
  232. LFDUX f4, X, INCX
  233. LFDUX f5, X, INCX
  234. LFDUX f6, X, INCX
  235. LFDUX f7, X, INCX
  236. STFDU f0, 1 * SIZE(X1)
  237. STFDU f1, 1 * SIZE(X1)
  238. STFDU f2, 1 * SIZE(X1)
  239. STFDU f3, 1 * SIZE(X1)
  240. STFDU f4, 1 * SIZE(X1)
  241. STFDU f5, 1 * SIZE(X1)
  242. STFDU f6, 1 * SIZE(X1)
  243. STFDU f7, 1 * SIZE(X1)
  244. bdnz LL(CopyKernel)
  245. .align 4
  /* Copy the remaining M mod 8 elements one at a time. The result of */
  /* andi. with mask 7 is never negative, so ble is taken only when it */
  /* is zero. */
  246. LL(CopyRemain):
  247. andi. r0, M, 7
  248. mtspr CTR, r0
  249. ble LL(10)
  250. .align 4
  251. LL(CopySub):
  252. LFDUX f0, X, INCX
  253. STFDU f0, 1 * SIZE(X1)
  254. bdnz LL(CopySub)
  255. .align 4
  /* Start of the compute phase. YY becomes the store cursor for y while */
  /* Y remains the load cursor. J = N / 4 is the number of four-column */
  /* groups; with none, go to the two-column path at LL(30). */
  256. LL(10):
  257. mr YY, Y
  258. srawi. J, N, 2
  259. ble LL(30)
  260. .align 4
  /* Per-group setup: AO1-AO4 address four consecutive columns (LDA bytes */
  /* apart) and A is advanced past them for the next group. */
  261. LL(21):
  262. mr AO1, A
  263. add AO2, A, LDA
  264. add AO3, AO2, LDA
  265. add AO4, AO3, LDA
  266. add A, AO4, LDA
  /* Rewind the x cursor and clear all eight accumulators from FZERO. */
  267. mr X1, XP
  268. lfd y01, FZERO
  269. fmr y02, y01
  270. fmr y03, y01
  271. fmr y04, y01
  272. fmr y05, y01
  273. fmr y06, y01
  274. fmr y07, y01
  275. fmr y08, y01
  /* Cache hint for the y elements that will be updated at LL(28). */
  276. dcbtst Y, PREC
  /* CTR = M / 8 blocks of eight elements; with none, go to the */
  /* remainder code at LL(24). */
  277. srawi. r0, M, 3
  278. mtspr CTR, r0
  279. ble LL(24)
  /* Prime the software pipeline: a1-a4 get element 0 and a5-a8 element 1 */
  /* of the four columns, b1-b4 get the first four elements of x. */
  280. LFDU a1, 1 * SIZE(AO1)
  281. LFDU a2, 1 * SIZE(AO2)
  282. LFDU a3, 1 * SIZE(AO3)
  283. LFDU a4, 1 * SIZE(AO4)
  284. LFDU b1, 1 * SIZE(X1)
  285. LFDU b2, 1 * SIZE(X1)
  286. LFDU a5, 1 * SIZE(AO1)
  287. LFDU a6, 1 * SIZE(AO2)
  288. LFDU a7, 1 * SIZE(AO3)
  289. LFDU a8, 1 * SIZE(AO4)
  290. LFDU b3, 1 * SIZE(X1)
  291. LFDU b4, 1 * SIZE(X1)
  /* bdz decrements CTR: if this was the only block of eight, skip the */
  /* steady-state loop and drain the pipeline at LL(23). */
  292. bdz LL(23)
  293. .align 4
  /* Steady-state loop for four columns: each iteration consumes eight */
  /* elements of every column. It is software-pipelined: each FMADD is */
  /* immediately followed by a reload of its A operand with the element */
  /* two steps ahead, and each x register is reloaded four steps ahead */
  /* once its group of four FMADDs has been issued. Even-numbered steps */
  /* use a1-a4 and accumulate into y01-y04, odd-numbered steps use a5-a8 */
  /* and accumulate into y05-y08. The statement order is part of the */
  /* algorithm - do not reorder. */
  /* Cache hints: the first four steps prefetch under PPCG4 only, the */
  /* last four under PPCG4 && DOUBLE only. */
  294. LL(22):
  295. #ifdef PPCG4
  296. dcbt X1, PREA
  297. #endif
  /* Step 0: a1-a4 * b1. */
  298. FMADD y01, a1, b1, y01
  299. LFDU a1, 1 * SIZE(AO1)
  300. FMADD y02, a2, b1, y02
  301. LFDU a2, 1 * SIZE(AO2)
  302. FMADD y03, a3, b1, y03
  303. LFDU a3, 1 * SIZE(AO3)
  304. FMADD y04, a4, b1, y04
  305. LFDU a4, 1 * SIZE(AO4)
  306. LFDU b1, 1 * SIZE(X1)
  307. #ifdef PPCG4
  308. dcbt AO1, PREA
  309. #endif
  /* Step 1: a5-a8 * b2. */
  310. FMADD y05, a5, b2, y05
  311. LFDU a5, 1 * SIZE(AO1)
  312. FMADD y06, a6, b2, y06
  313. LFDU a6, 1 * SIZE(AO2)
  314. FMADD y07, a7, b2, y07
  315. LFDU a7, 1 * SIZE(AO3)
  316. FMADD y08, a8, b2, y08
  317. LFDU a8, 1 * SIZE(AO4)
  318. LFDU b2, 1 * SIZE(X1)
  319. #ifdef PPCG4
  320. dcbt AO2, PREA
  321. #endif
  /* Step 2: a1-a4 * b3. */
  322. FMADD y01, a1, b3, y01
  323. LFDU a1, 1 * SIZE(AO1)
  324. FMADD y02, a2, b3, y02
  325. LFDU a2, 1 * SIZE(AO2)
  326. FMADD y03, a3, b3, y03
  327. LFDU a3, 1 * SIZE(AO3)
  328. FMADD y04, a4, b3, y04
  329. LFDU a4, 1 * SIZE(AO4)
  330. LFDU b3, 1 * SIZE(X1)
  331. #ifdef PPCG4
  332. dcbt AO3, PREA
  333. #endif
  /* Step 3: a5-a8 * b4. */
  334. FMADD y05, a5, b4, y05
  335. LFDU a5, 1 * SIZE(AO1)
  336. FMADD y06, a6, b4, y06
  337. LFDU a6, 1 * SIZE(AO2)
  338. FMADD y07, a7, b4, y07
  339. LFDU a7, 1 * SIZE(AO3)
  340. FMADD y08, a8, b4, y08
  341. LFDU a8, 1 * SIZE(AO4)
  342. #ifdef PPCG4
  343. dcbt AO4, PREA
  344. #endif
  345. LFDU b4, 1 * SIZE(X1)
  346. #if defined(PPCG4) && defined(DOUBLE)
  347. dcbt X1, PREA
  348. #endif
  /* Step 4: a1-a4 * b1 (b1 was reloaded after step 0). */
  349. FMADD y01, a1, b1, y01
  350. LFDU a1, 1 * SIZE(AO1)
  351. FMADD y02, a2, b1, y02
  352. LFDU a2, 1 * SIZE(AO2)
  353. FMADD y03, a3, b1, y03
  354. LFDU a3, 1 * SIZE(AO3)
  355. FMADD y04, a4, b1, y04
  356. LFDU a4, 1 * SIZE(AO4)
  357. LFDU b1, 1 * SIZE(X1)
  358. #if defined(PPCG4) && defined(DOUBLE)
  359. dcbt AO1, PREA
  360. #endif
  /* Step 5: a5-a8 * b2. */
  361. FMADD y05, a5, b2, y05
  362. LFDU a5, 1 * SIZE(AO1)
  363. FMADD y06, a6, b2, y06
  364. LFDU a6, 1 * SIZE(AO2)
  365. FMADD y07, a7, b2, y07
  366. LFDU a7, 1 * SIZE(AO3)
  367. FMADD y08, a8, b2, y08
  368. LFDU a8, 1 * SIZE(AO4)
  369. LFDU b2, 1 * SIZE(X1)
  370. #if defined(PPCG4) && defined(DOUBLE)
  371. dcbt AO2, PREA
  372. #endif
  /* Step 6: a1-a4 * b3. */
  373. FMADD y01, a1, b3, y01
  374. LFDU a1, 1 * SIZE(AO1)
  375. FMADD y02, a2, b3, y02
  376. LFDU a2, 1 * SIZE(AO2)
  377. FMADD y03, a3, b3, y03
  378. LFDU a3, 1 * SIZE(AO3)
  379. FMADD y04, a4, b3, y04
  380. LFDU a4, 1 * SIZE(AO4)
  381. LFDU b3, 1 * SIZE(X1)
  382. #if defined(PPCG4) && defined(DOUBLE)
  383. dcbt AO3, PREA
  384. #endif
  /* Step 7: a5-a8 * b4. */
  385. FMADD y05, a5, b4, y05
  386. LFDU a5, 1 * SIZE(AO1)
  387. FMADD y06, a6, b4, y06
  388. LFDU a6, 1 * SIZE(AO2)
  389. FMADD y07, a7, b4, y07
  390. LFDU a7, 1 * SIZE(AO3)
  391. FMADD y08, a8, b4, y08
  392. LFDU a8, 1 * SIZE(AO4)
  393. LFDU b4, 1 * SIZE(X1)
  394. #if defined(PPCG4) && defined(DOUBLE)
  395. dcbt AO4, PREA
  396. #endif
  /* Loop while CTR (decremented here) is non-zero. */
  397. bdnz LL(22)
  398. .align 4
  /* Pipeline drain for the last block of eight elements (four columns). */
  /* Same eight FMADD steps as the steady-state loop, but the look-ahead */
  /* loads taper off so nothing is read past the block: x is reloaded */
  /* only after steps 0-3, A only after steps 0-5, and steps 6-7 are pure */
  /* arithmetic. */
  399. LL(23):
  /* Steps 0-3: FMADD plus full look-ahead loads of A and x. */
  400. FMADD y01, a1, b1, y01
  401. LFDU a1, 1 * SIZE(AO1)
  402. FMADD y02, a2, b1, y02
  403. LFDU a2, 1 * SIZE(AO2)
  404. FMADD y03, a3, b1, y03
  405. LFDU a3, 1 * SIZE(AO3)
  406. FMADD y04, a4, b1, y04
  407. LFDU a4, 1 * SIZE(AO4)
  408. LFDU b1, 1 * SIZE(X1)
  409. FMADD y05, a5, b2, y05
  410. LFDU a5, 1 * SIZE(AO1)
  411. FMADD y06, a6, b2, y06
  412. LFDU a6, 1 * SIZE(AO2)
  413. FMADD y07, a7, b2, y07
  414. LFDU a7, 1 * SIZE(AO3)
  415. FMADD y08, a8, b2, y08
  416. LFDU a8, 1 * SIZE(AO4)
  417. LFDU b2, 1 * SIZE(X1)
  418. FMADD y01, a1, b3, y01
  419. LFDU a1, 1 * SIZE(AO1)
  420. FMADD y02, a2, b3, y02
  421. LFDU a2, 1 * SIZE(AO2)
  422. FMADD y03, a3, b3, y03
  423. LFDU a3, 1 * SIZE(AO3)
  424. FMADD y04, a4, b3, y04
  425. LFDU a4, 1 * SIZE(AO4)
  426. LFDU b3, 1 * SIZE(X1)
  427. FMADD y05, a5, b4, y05
  428. LFDU a5, 1 * SIZE(AO1)
  429. FMADD y06, a6, b4, y06
  430. LFDU a6, 1 * SIZE(AO2)
  431. FMADD y07, a7, b4, y07
  432. LFDU a7, 1 * SIZE(AO3)
  433. FMADD y08, a8, b4, y08
  434. LFDU a8, 1 * SIZE(AO4)
  435. LFDU b4, 1 * SIZE(X1)
  /* Steps 4-5: A is still loaded ahead, x is not (all eight x values of */
  /* this block are already in b1-b4 or consumed). */
  436. FMADD y01, a1, b1, y01
  437. LFDU a1, 1 * SIZE(AO1)
  438. FMADD y02, a2, b1, y02
  439. LFDU a2, 1 * SIZE(AO2)
  440. FMADD y03, a3, b1, y03
  441. LFDU a3, 1 * SIZE(AO3)
  442. FMADD y04, a4, b1, y04
  443. LFDU a4, 1 * SIZE(AO4)
  444. FMADD y05, a5, b2, y05
  445. LFDU a5, 1 * SIZE(AO1)
  446. FMADD y06, a6, b2, y06
  447. LFDU a6, 1 * SIZE(AO2)
  448. FMADD y07, a7, b2, y07
  449. LFDU a7, 1 * SIZE(AO3)
  450. FMADD y08, a8, b2, y08
  451. LFDU a8, 1 * SIZE(AO4)
  /* Steps 6-7: no more loads. */
  452. FMADD y01, a1, b3, y01
  453. FMADD y02, a2, b3, y02
  454. FMADD y03, a3, b3, y03
  455. FMADD y04, a4, b3, y04
  456. FMADD y05, a5, b4, y05
  457. FMADD y06, a6, b4, y06
  458. FMADD y07, a7, b4, y07
  459. FMADD y08, a8, b4, y08
  460. .align 4
  /* Remainder handling for the four-column group: the leftover M mod 8 */
  /* elements are processed as an optional 4, then 2, then 1, selected by */
  /* the corresponding bits of M. Every andi. result is non-negative, so */
  /* each ble below is taken only when the tested bits are zero. */
  461. LL(24):
  462. andi. r0, M, 7
  463. ble LL(28)
  /* Four leftover elements: elements 0 and 2 use a1-a4 (into y01-y04), */
  /* elements 1 and 3 use a5-a8 (into y05-y08). */
  464. andi. r0, M, 4
  465. ble LL(26)
  466. LFDU a1, 1 * SIZE(AO1)
  467. LFDU a2, 1 * SIZE(AO2)
  468. LFDU b1, 1 * SIZE(X1)
  469. LFDU a3, 1 * SIZE(AO3)
  470. LFDU a4, 1 * SIZE(AO4)
  471. LFDU b2, 1 * SIZE(X1)
  472. FMADD y01, a1, b1, y01
  473. LFDU a5, 1 * SIZE(AO1)
  474. FMADD y02, a2, b1, y02
  475. LFDU a6, 1 * SIZE(AO2)
  476. FMADD y03, a3, b1, y03
  477. LFDU a7, 1 * SIZE(AO3)
  478. FMADD y04, a4, b1, y04
  479. LFDU a8, 1 * SIZE(AO4)
  480. LFDU b3, 1 * SIZE(X1)
  481. FMADD y05, a5, b2, y05
  482. LFDU a1, 1 * SIZE(AO1)
  483. FMADD y06, a6, b2, y06
  484. LFDU a2, 1 * SIZE(AO2)
  485. FMADD y07, a7, b2, y07
  486. LFDU a3, 1 * SIZE(AO3)
  487. FMADD y08, a8, b2, y08
  488. LFDU a4, 1 * SIZE(AO4)
  489. LFDU b4, 1 * SIZE(X1)
  490. FMADD y01, a1, b3, y01
  491. LFDU a5, 1 * SIZE(AO1)
  492. FMADD y02, a2, b3, y02
  493. LFDU a6, 1 * SIZE(AO2)
  494. FMADD y03, a3, b3, y03
  495. LFDU a7, 1 * SIZE(AO3)
  496. FMADD y04, a4, b3, y04
  497. LFDU a8, 1 * SIZE(AO4)
  498. FMADD y05, a5, b4, y05
  499. FMADD y06, a6, b4, y06
  500. FMADD y07, a7, b4, y07
  501. FMADD y08, a8, b4, y08
  502. .align 4
  /* Two leftover elements. */
  503. LL(26):
  504. andi. r0, M, 2
  505. ble LL(27)
  506. LFDU b1, 1 * SIZE(X1)
  507. LFDU a1, 1 * SIZE(AO1)
  508. LFDU a2, 1 * SIZE(AO2)
  509. LFDU a3, 1 * SIZE(AO3)
  510. LFDU a4, 1 * SIZE(AO4)
  511. LFDU b2, 1 * SIZE(X1)
  512. FMADD y01, a1, b1, y01
  513. LFDU a5, 1 * SIZE(AO1)
  514. FMADD y02, a2, b1, y02
  515. LFDU a6, 1 * SIZE(AO2)
  516. FMADD y03, a3, b1, y03
  517. LFDU a7, 1 * SIZE(AO3)
  518. FMADD y04, a4, b1, y04
  519. LFDU a8, 1 * SIZE(AO4)
  520. FMADD y05, a5, b2, y05
  521. FMADD y06, a6, b2, y06
  522. FMADD y07, a7, b2, y07
  523. FMADD y08, a8, b2, y08
  524. .align 4
  /* One leftover element. */
  525. LL(27):
  526. andi. r0, M, 1
  527. ble LL(28)
  528. LFDU a1, 1 * SIZE(AO1)
  529. LFDU b1, 1 * SIZE(X1)
  530. LFDU a2, 1 * SIZE(AO2)
  531. LFDU a3, 1 * SIZE(AO3)
  532. LFDU a4, 1 * SIZE(AO4)
  533. FMADD y01, a1, b1, y01
  534. FMADD y02, a2, b1, y02
  535. FMADD y03, a3, b1, y03
  536. FMADD y04, a4, b1, y04
  537. .align 4
  /* Finish the four columns: reload the spilled scalar into alpha (f23), */
  /* read four y elements through the Y cursor into a1-a4, fold the odd */
  /* partial sums y05-y08 into y01-y04, then compute */
  /* y[k] = alpha * sum[k] + y[k] (assuming FMADD follows the PowerPC */
  /* fmadd operand order; f0-f3 are y01-y04) and write the results back */
  /* through the YY cursor. */
  538. LL(28):
  539. lfd alpha, ALPHA
  540. LFDUX a1, Y, INCY
  541. LFDUX a2, Y, INCY
  542. LFDUX a3, Y, INCY
  543. LFDUX a4, Y, INCY
  544. FADD y01, y05, y01
  545. FADD y02, y06, y02
  546. FADD y03, y07, y03
  547. FADD y04, y08, y04
  548. FMADD a1, alpha, f0, a1
  549. FMADD a2, alpha, f1, a2
  550. FMADD a3, alpha, f2, a3
  551. FMADD a4, alpha, f3, a4
  /* The J decrement and compare are interleaved with the stores; the */
  /* stores do not alter cr0, so bgt still sees the result of the cmpi. */
  552. STFDUX a1, YY, INCY
  553. addi J, J, -1
  554. STFDUX a2, YY, INCY
  555. cmpi cr0, 0, J, 0
  556. STFDUX a3, YY, INCY
  557. STFDUX a4, YY, INCY
  /* Next group of four columns while J > 0. */
  558. bgt LL(21)
  559. .align 4
  /* Two-column path, taken when bit 1 of N is set (otherwise continue at */
  /* LL(40)). AO1 / AO2 address the two columns and A is advanced past */
  /* them. y01 / y02 collect the even-numbered elements of columns 1 / 2, */
  /* y03 / y04 the odd-numbered ones; they are combined at LL(38). */
  560. LL(30):
  561. andi. J, N, 2
  562. ble LL(40)
  563. mr AO1, A
  564. add AO2, A, LDA
  565. add A, AO2, LDA
  /* Rewind x and clear the four accumulators. */
  566. mr X1, XP
  567. lfd y01, FZERO
  568. fmr y02, y01
  569. fmr y03, y01
  570. fmr y04, y01
  /* CTR = M / 8 blocks; with none, go to the remainder code at LL(34). */
  571. srawi. r0, M, 3
  572. mtspr CTR, r0
  573. ble LL(34)
  /* Prime the pipeline: a1 / a2 = element 0, a5 / a6 = element 1 of the */
  /* two columns, b1-b4 = first four elements of x. */
  574. LFDU a1, 1 * SIZE(AO1)
  575. LFDU a2, 1 * SIZE(AO2)
  576. LFDU b1, 1 * SIZE(X1)
  577. LFDU b2, 1 * SIZE(X1)
  578. LFDU a5, 1 * SIZE(AO1)
  579. LFDU a6, 1 * SIZE(AO2)
  580. LFDU b3, 1 * SIZE(X1)
  581. LFDU b4, 1 * SIZE(X1)
  /* Only one block of eight: skip the loop and drain at LL(33). */
  582. bdz LL(33)
  583. .align 4
  /* Steady-state loop, eight elements of both columns per iteration. */
  /* Each FMADD is followed by a reload of its A operand two elements */
  /* ahead; each x register is reloaded four elements ahead. Statement */
  /* order is part of the schedule - do not reorder. Cache hints are */
  /* emitted under PPCG4 (first half) and PPCG4 && DOUBLE (second half). */
  584. LL(32):
  585. #ifdef PPCG4
  586. dcbt X1, PREA
  587. #endif
  588. FMADD y01, a1, b1, y01
  589. LFDU a1, 1 * SIZE(AO1)
  590. FMADD y02, a2, b1, y02
  591. LFDU a2, 1 * SIZE(AO2)
  592. LFDU b1, 1 * SIZE(X1)
  593. #ifdef PPCG4
  594. dcbt AO1, PREA
  595. #endif
  596. FMADD y03, a5, b2, y03
  597. LFDU a5, 1 * SIZE(AO1)
  598. FMADD y04, a6, b2, y04
  599. LFDU a6, 1 * SIZE(AO2)
  600. LFDU b2, 1 * SIZE(X1)
  601. FMADD y01, a1, b3, y01
  602. LFDU a1, 1 * SIZE(AO1)
  603. FMADD y02, a2, b3, y02
  604. LFDU a2, 1 * SIZE(AO2)
  605. LFDU b3, 1 * SIZE(X1)
  606. #ifdef PPCG4
  607. dcbt AO2, PREA
  608. #endif
  609. FMADD y03, a5, b4, y03
  610. LFDU a5, 1 * SIZE(AO1)
  611. FMADD y04, a6, b4, y04
  612. LFDU a6, 1 * SIZE(AO2)
  613. LFDU b4, 1 * SIZE(X1)
  /* Second half of the block (elements 4-7). */
  614. FMADD y01, a1, b1, y01
  615. LFDU a1, 1 * SIZE(AO1)
  616. FMADD y02, a2, b1, y02
  617. LFDU a2, 1 * SIZE(AO2)
  618. #if defined(PPCG4) && defined(DOUBLE)
  619. dcbt X1, PREA
  620. #endif
  621. LFDU b1, 1 * SIZE(X1)
  622. #if defined(PPCG4) && defined(DOUBLE)
  623. dcbt AO1, PREA
  624. #endif
  625. FMADD y03, a5, b2, y03
  626. LFDU a5, 1 * SIZE(AO1)
  627. FMADD y04, a6, b2, y04
  628. LFDU a6, 1 * SIZE(AO2)
  629. LFDU b2, 1 * SIZE(X1)
  630. FMADD y01, a1, b3, y01
  631. LFDU a1, 1 * SIZE(AO1)
  632. FMADD y02, a2, b3, y02
  633. LFDU a2, 1 * SIZE(AO2)
  634. LFDU b3, 1 * SIZE(X1)
  635. #if defined(PPCG4) && defined(DOUBLE)
  636. dcbt AO2, PREA
  637. #endif
  638. FMADD y03, a5, b4, y03
  639. LFDU a5, 1 * SIZE(AO1)
  640. FMADD y04, a6, b4, y04
  641. LFDU a6, 1 * SIZE(AO2)
  642. LFDU b4, 1 * SIZE(X1)
  643. bdnz LL(32)
  644. .align 4
  /* Pipeline drain for the last block of eight elements (two columns): */
  /* the first four steps still load ahead (A and x), the next two load */
  /* only A, and the final two are pure arithmetic, so nothing is read */
  /* beyond the block. */
  645. LL(33):
  646. FMADD y01, a1, b1, y01
  647. LFDU a1, 1 * SIZE(AO1)
  648. FMADD y02, a2, b1, y02
  649. LFDU a2, 1 * SIZE(AO2)
  650. LFDU b1, 1 * SIZE(X1)
  651. FMADD y03, a5, b2, y03
  652. LFDU a5, 1 * SIZE(AO1)
  653. FMADD y04, a6, b2, y04
  654. LFDU a6, 1 * SIZE(AO2)
  655. LFDU b2, 1 * SIZE(X1)
  656. FMADD y01, a1, b3, y01
  657. LFDU a1, 1 * SIZE(AO1)
  658. FMADD y02, a2, b3, y02
  659. LFDU a2, 1 * SIZE(AO2)
  660. LFDU b3, 1 * SIZE(X1)
  661. FMADD y03, a5, b4, y03
  662. LFDU a5, 1 * SIZE(AO1)
  663. FMADD y04, a6, b4, y04
  664. LFDU a6, 1 * SIZE(AO2)
  665. LFDU b4, 1 * SIZE(X1)
  /* From here on x is no longer loaded. */
  666. FMADD y01, a1, b1, y01
  667. LFDU a1, 1 * SIZE(AO1)
  668. FMADD y02, a2, b1, y02
  669. LFDU a2, 1 * SIZE(AO2)
  670. FMADD y03, a5, b2, y03
  671. LFDU a5, 1 * SIZE(AO1)
  672. FMADD y04, a6, b2, y04
  673. LFDU a6, 1 * SIZE(AO2)
  /* Last two steps: no loads at all. */
  674. FMADD y01, a1, b3, y01
  675. FMADD y02, a2, b3, y02
  676. FMADD y03, a5, b4, y03
  677. FMADD y04, a6, b4, y04
  678. .align 4
  /* Remainder handling (M mod 8) for the two columns, as 4 + 2 + 1. The */
  /* andi. results are non-negative, so each ble is taken only on zero. */
  679. LL(34):
  680. andi. r0, M, 7
  681. ble LL(38)
  /* Four leftover elements. */
  682. andi. r0, M, 4
  683. ble LL(36)
  684. LFDU a1, 1 * SIZE(AO1)
  685. LFDU a2, 1 * SIZE(AO2)
  686. LFDU b1, 1 * SIZE(X1)
  687. LFDU b2, 1 * SIZE(X1)
  688. FMADD y01, a1, b1, y01
  689. LFDU a5, 1 * SIZE(AO1)
  690. FMADD y02, a2, b1, y02
  691. LFDU a6, 1 * SIZE(AO2)
  692. LFDU b3, 1 * SIZE(X1)
  693. FMADD y03, a5, b2, y03
  694. LFDU a1, 1 * SIZE(AO1)
  695. FMADD y04, a6, b2, y04
  696. LFDU a2, 1 * SIZE(AO2)
  697. LFDU b4, 1 * SIZE(X1)
  698. FMADD y01, a1, b3, y01
  699. LFDU a5, 1 * SIZE(AO1)
  700. FMADD y02, a2, b3, y02
  701. LFDU a6, 1 * SIZE(AO2)
  702. FMADD y03, a5, b4, y03
  703. FMADD y04, a6, b4, y04
  704. .align 4
  /* Two leftover elements; here a3 / a4 hold the second element of */
  /* columns 1 / 2 (both are loaded through AO1 / AO2). */
  705. LL(36):
  706. andi. r0, M, 2
  707. ble LL(37)
  708. LFDU b1, 1 * SIZE(X1)
  709. LFDU a1, 1 * SIZE(AO1)
  710. LFDU a2, 1 * SIZE(AO2)
  711. LFDU b2, 1 * SIZE(X1)
  712. LFDU a3, 1 * SIZE(AO1)
  713. LFDU a4, 1 * SIZE(AO2)
  714. FMADD y01, a1, b1, y01
  715. FMADD y02, a2, b1, y02
  716. FMADD y03, a3, b2, y03
  717. FMADD y04, a4, b2, y04
  718. .align 4
  /* One leftover element. */
  719. LL(37):
  720. andi. r0, M, 1
  721. ble LL(38)
  722. LFDU a1, 1 * SIZE(AO1)
  723. LFDU b1, 1 * SIZE(X1)
  724. LFDU a2, 1 * SIZE(AO2)
  725. FMADD y01, a1, b1, y01
  726. FMADD y02, a2, b1, y02
  727. .align 4
  /* Finish the two columns: reload the spilled scalar into alpha, read */
  /* two y elements through Y, fold y03 / y04 into y01 / y02, compute */
  /* y[k] = alpha * sum[k] + y[k] (assuming the PowerPC fmadd operand */
  /* order; f0 / f1 are y01 / y02) and store through YY. */
  728. LL(38):
  729. lfd alpha, ALPHA
  730. LFDUX a1, Y, INCY
  731. LFDUX a2, Y, INCY
  732. FADD y01, y03, y01
  733. FADD y02, y04, y02
  734. FMADD a1, alpha, f0, a1
  735. FMADD a2, alpha, f1, a2
  736. STFDUX a1, YY, INCY
  737. STFDUX a2, YY, INCY
  738. .align 4
  /* Single-column path, taken when bit 0 of N is set (otherwise go to */
  /* the epilogue). Two accumulators are used for the one column: y01 and */
  /* y02 take alternating elements and are added together at LL(48). */
  739. LL(40):
  740. andi. J, N, 1
  741. ble LL(999)
  /* A is still advanced by LDA, although nothing reads A after this. */
  742. mr AO1, A
  743. add A, A, LDA
  /* Rewind x and clear the two accumulators. */
  744. mr X1, XP
  745. lfd y01, FZERO
  746. fmr y02, y01
  /* CTR = M / 8 blocks; with none, go to the remainder code at LL(44). */
  747. srawi. r0, M, 3
  748. mtspr CTR, r0
  749. ble LL(44)
  /* Prime the pipeline with the first four elements of the column */
  /* (a1-a4) and of x (b1-b4). */
  750. LFDU a1, 1 * SIZE(AO1)
  751. LFDU a2, 1 * SIZE(AO1)
  752. LFDU a3, 1 * SIZE(AO1)
  753. LFDU a4, 1 * SIZE(AO1)
  754. LFDU b1, 1 * SIZE(X1)
  755. LFDU b2, 1 * SIZE(X1)
  756. LFDU b3, 1 * SIZE(X1)
  757. LFDU b4, 1 * SIZE(X1)
  /* Only one block of eight: skip the loop and drain at LL(43). */
  758. bdz LL(43)
  759. .align 4
  /* Steady-state loop, eight elements per iteration. Each FMADD is */
  /* followed by reloads of both of its operands four elements ahead. */
  /* Cache hints are emitted under PPCG4 (first half) and */
  /* PPCG4 && DOUBLE (second half). */
  760. LL(42):
  761. FMADD y01, a1, b1, y01
  762. LFDU a1, 1 * SIZE(AO1)
  763. LFDU b1, 1 * SIZE(X1)
  764. #ifdef PPCG4
  765. dcbt X1, PREA
  766. #endif
  767. FMADD y02, a2, b2, y02
  768. LFDU a2, 1 * SIZE(AO1)
  769. LFDU b2, 1 * SIZE(X1)
  770. #ifdef PPCG4
  771. dcbt AO1, PREA
  772. #endif
  773. FMADD y01, a3, b3, y01
  774. LFDU a3, 1 * SIZE(AO1)
  775. LFDU b3, 1 * SIZE(X1)
  776. FMADD y02, a4, b4, y02
  777. LFDU a4, 1 * SIZE(AO1)
  778. LFDU b4, 1 * SIZE(X1)
  779. FMADD y01, a1, b1, y01
  780. LFDU a1, 1 * SIZE(AO1)
  781. LFDU b1, 1 * SIZE(X1)
  782. FMADD y02, a2, b2, y02
  783. LFDU a2, 1 * SIZE(AO1)
  784. LFDU b2, 1 * SIZE(X1)
  785. #if defined(PPCG4) && defined(DOUBLE)
  786. dcbt AO1, PREA
  787. #endif
  788. FMADD y01, a3, b3, y01
  789. LFDU a3, 1 * SIZE(AO1)
  790. LFDU b3, 1 * SIZE(X1)
  791. #if defined(PPCG4) && defined(DOUBLE)
  792. dcbt X1, PREA
  793. #endif
  794. FMADD y02, a4, b4, y02
  795. LFDU a4, 1 * SIZE(AO1)
  796. LFDU b4, 1 * SIZE(X1)
  797. bdnz LL(42)
  798. .align 4
  /* Pipeline drain for the last block of eight: four steps that still */
  /* load ahead, then four steps of pure arithmetic. */
  799. LL(43):
  800. FMADD y01, a1, b1, y01
  801. LFDU a1, 1 * SIZE(AO1)
  802. LFDU b1, 1 * SIZE(X1)
  803. FMADD y02, a2, b2, y02
  804. LFDU a2, 1 * SIZE(AO1)
  805. LFDU b2, 1 * SIZE(X1)
  806. FMADD y01, a3, b3, y01
  807. LFDU a3, 1 * SIZE(AO1)
  808. LFDU b3, 1 * SIZE(X1)
  809. FMADD y02, a4, b4, y02
  810. LFDU a4, 1 * SIZE(AO1)
  811. LFDU b4, 1 * SIZE(X1)
  812. FMADD y01, a1, b1, y01
  813. FMADD y02, a2, b2, y02
  814. FMADD y01, a3, b3, y01
  815. FMADD y02, a4, b4, y02
  816. .align 4
  /* Remainder handling (M mod 8) as 4 + 2 + 1. The andi. results are */
  /* non-negative, so each ble is taken only on zero. */
  817. LL(44):
  818. andi. r0, M, 7
  819. ble LL(48)
  /* Four leftover elements. */
  820. andi. r0, M, 4
  821. ble LL(46)
  822. LFDU a1, 1 * SIZE(AO1)
  823. LFDU b1, 1 * SIZE(X1)
  824. LFDU a2, 1 * SIZE(AO1)
  825. LFDU b2, 1 * SIZE(X1)
  826. FMADD y01, a1, b1, y01
  827. LFDU a3, 1 * SIZE(AO1)
  828. LFDU b3, 1 * SIZE(X1)
  829. FMADD y02, a2, b2, y02
  830. LFDU a4, 1 * SIZE(AO1)
  831. LFDU b4, 1 * SIZE(X1)
  832. FMADD y01, a3, b3, y01
  833. FMADD y02, a4, b4, y02
  834. .align 4
  /* Two leftover elements. */
  835. LL(46):
  836. andi. r0, M, 2
  837. ble LL(47)
  838. LFDU b1, 1 * SIZE(X1)
  839. LFDU a1, 1 * SIZE(AO1)
  840. LFDU b2, 1 * SIZE(X1)
  841. LFDU a2, 1 * SIZE(AO1)
  842. FMADD y01, a1, b1, y01
  843. FMADD y02, a2, b2, y02
  844. .align 4
  /* One leftover element. */
  845. LL(47):
  846. andi. r0, M, 1
  847. ble LL(48)
  848. LFDU a1, 1 * SIZE(AO1)
  849. LFDU b1, 1 * SIZE(X1)
  850. FMADD y01, a1, b1, y01
  851. .align 4
  /* Finish the column: reload the spilled scalar into alpha, read one y */
  /* element through Y, add the two partial sums, compute */
  /* y = alpha * sum + y (assuming the PowerPC fmadd operand order; f0 is */
  /* y01) and store through YY. */
  852. LL(48):
  853. lfd alpha, ALPHA
  854. LFDUX a1, Y, INCY
  855. FADD y01, y02, y01
  856. FMADD a1, alpha, f0, a1
  857. STFDUX a1, YY, INCY
  858. .align 4
  /* Common exit, also reached directly from the early-out checks when */
  /* M <= 0 or N <= 0. Puts 0 in r3 (presumably the return value), */
  /* restores f14-f23 and r14-r22 from the slots written by the prologue, */
  /* releases the frame and returns. */
  859. LL(999):
  860. li r3, 0
  861. lfd f14, 0(SP)
  862. lfd f15, 8(SP)
  863. lfd f16, 16(SP)
  864. lfd f17, 24(SP)
  865. lfd f18, 32(SP)
  866. lfd f19, 40(SP)
  867. lfd f20, 48(SP)
  868. lfd f21, 56(SP)
  869. lfd f22, 64(SP)
  870. lfd f23, 72(SP)
  /* Integer registers: 8-byte slots in the 64-bit build, 4-byte slots */
  /* otherwise; offsets mirror the saves in the prologue. */
  871. #ifdef __64BIT__
  872. ld r14, 160(SP)
  873. ld r15, 168(SP)
  874. ld r16, 176(SP)
  875. ld r17, 184(SP)
  876. ld r18, 192(SP)
  877. ld r19, 200(SP)
  878. ld r20, 208(SP)
  879. ld r21, 216(SP)
  880. ld r22, 224(SP)
  881. #else
  882. lwz r14, 160(SP)
  883. lwz r15, 164(SP)
  884. lwz r16, 168(SP)
  885. lwz r17, 172(SP)
  886. lwz r18, 176(SP)
  887. lwz r19, 180(SP)
  888. lwz r20, 184(SP)
  889. lwz r21, 188(SP)
  890. lwz r22, 192(SP)
  891. #endif
  /* Pop the frame allocated at function entry and return. */
  892. addi SP, SP, STACKSIZE
  893. blr
  894. EPILOGUE
  /* Closes the #ifndef NEEDPARAM that wraps the function body. */
  895. #endif