You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zaxpy.S 15 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Register aliases used when building for Linux or FreeBSD.
   * Roles, as used by the code later in this file:
   *   N        element count (tested against 0, then split into N >> 3
   *            and N & 7 for the unrolled and remainder loops)
   *   X, Y     base registers for the loads from X and the loads/stores on Y
   *   INCX/Y   strides; shifted left by ZBASE_SHIFT in the prologue
   *   INCXM1   INCX - SIZE   (computed in the prologue)
   *   INCYM1   INCY - SIZE   (computed in the prologue)
   *   PREA     offset operand given to the prefetch instructions
   *   YY       copy of Y used as the store pointer in the strided path
   */
  40. #if defined(linux) || defined(__FreeBSD__)
  41. #ifndef __64BIT__
  /* __64BIT__ not defined: the prologue loads none of these from memory. */
  42. #define N r3
  43. #define X r6
  44. #define INCX r7
  45. #define Y r8
  46. #define INCY r9
  47. #define INCXM1 r4
  48. #define INCYM1 r5
  49. #define PREA r10
  50. #define YY r11
  51. #else
  /* __64BIT__ defined: the prologue loads INCY from the stack frame with ld. */
  52. #define N r3
  53. #define X r8
  54. #define INCX r9
  55. #define Y r10
  56. #define INCY r4
  57. #define INCXM1 r5
  58. #define INCYM1 r6
  59. #define PREA r7
  60. #define YY r11
  61. #endif
  62. #endif
  /*
   * Register aliases used when building for AIX or Apple targets.
   * The alias names have the same roles as in the rest of this file
   * (N count, X/Y base registers, INCX/INCY strides, INCXM1/INCYM1 =
   * stride - SIZE, PREA prefetch offset, YY store pointer for Y).
   */
  63. #if defined(_AIX) || defined(__APPLE__)
  64. #if !defined(__64BIT__) && defined(DOUBLE)
  /* Non-64-bit DOUBLE build: the prologue loads INCX, Y and INCY from
     the stack frame with lwz. */
  65. #define N r3
  66. #define X r10
  67. #define INCX r4
  68. #define Y r5
  69. #define INCY r6
  70. #define INCXM1 r7
  71. #define INCYM1 r8
  72. #define PREA r9
  73. #define YY r11
  74. #else
  /* All other AIX/Apple builds: only INCY is loaded from the stack frame
     (ld when __64BIT__ is defined, lwz otherwise). */
  75. #define N r3
  76. #define X r8
  77. #define INCX r9
  78. #define Y r10
  79. #define INCY r4
  80. #define INCXM1 r5
  81. #define INCYM1 r6
  82. #define PREA r7
  83. #define YY r11
  84. #endif
  85. #endif
  /* The two alpha values are kept in f24/f25 for the whole routine; the
     prologue copies them there from f1/f2. */
  86. #define ALPHA_R f24
  87. #define ALPHA_I f25
  /*
   * ADD1/ADD2 select the second multiply-add applied to each pair of
   * accumulators: ADD1 is always used with ALPHA_I and the first
   * accumulator of a pair, ADD2 with ALPHA_R and the second one.
   * Without CONJ: ADD1 = FNMSUB, ADD2 = FMADD; with CONJ they are swapped.
   * FMADD/FNMSUB are defined outside this file. Assuming they follow the
   * PowerPC fmadd/fnmsub operand convention, this presumably gives
   * y += alpha * x without CONJ and y += alpha * conj(x) with CONJ --
   * TODO confirm against common.h.
   */
  88. #ifndef CONJ
  89. #define ADD1 FNMSUB
  90. #define ADD2 FMADD
  91. #else
  92. #define ADD1 FMADD
  93. #define ADD2 FNMSUB
  94. #endif
  /*
   * Routine entry: save registers, fetch arguments, scale the strides and
   * choose between the unit-stride path and the strided path at LL(100).
   * The body is compiled only when NEEDPARAM is not defined.
   * PROLOGUE, PROFCODE, FRAMESLOT, SIZE, ZBASE_SHIFT and LL() are defined
   * outside this file (presumably via common.h).
   */
  95. #ifndef NEEDPARAM
  /* 96 bytes = twelve 8-byte save slots, one for each of f14-f25. */
  96. #define STACKSIZE 96
  97. PROLOGUE
  98. PROFCODE
  /* Allocate the save area and spill f14-f25; they are reloaded at LL(999). */
  99. addi SP, SP, -STACKSIZE
  100. li r0, 0
  101. stfd f14, 0(SP)
  102. stfd f15, 8(SP)
  103. stfd f16, 16(SP)
  104. stfd f17, 24(SP)
  105. stfd f18, 32(SP)
  106. stfd f19, 40(SP)
  107. stfd f20, 48(SP)
  108. stfd f21, 56(SP)
  109. stfd f22, 64(SP)
  110. stfd f23, 72(SP)
  111. stfd f24, 80(SP)
  112. stfd f25, 88(SP)
  /* Fetch the arguments that are read from memory rather than registers.
     STACKSIZE is added to the offset because SP was just lowered by that
     amount; the slots are presumably in the caller's frame -- FRAMESLOT is
     not defined in this file. */
  113. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  114. ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
  115. #endif
  116. #if defined(_AIX) || defined(__APPLE__)
  117. #ifdef __64BIT__
  118. ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
  119. #else
  120. #ifdef DOUBLE
  121. lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
  122. lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
  123. lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
  124. #else
  125. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  126. #endif
  127. #endif
  128. #endif
  /* Copy f1/f2 into ALPHA_R/ALPHA_I (f24/f25); f1 and f2 are reused below
     as load targets for X values. */
  129. fmr ALPHA_R, f1
  130. fmr ALPHA_I, f2
  /* Scale both strides by 2^ZBASE_SHIFT (presumably elements -> bytes).
     NOTE(review): slwi is the 32-bit (word) shift form; confirm this is
     intended for 64-bit builds with negative increments. */
  131. slwi INCX, INCX, ZBASE_SHIFT
  132. slwi INCY, INCY, ZBASE_SHIFT
  /* INC?M1 = stride - SIZE; used by the strided path's indexed accesses. */
  133. subi INCXM1, INCX, SIZE
  134. subi INCYM1, INCY, SIZE
  /* Prefetch offset: half of L1_PREFETCHSIZE when L1_DUALFETCH is defined. */
  135. #ifdef L1_DUALFETCH
  136. li PREA, (L1_PREFETCHSIZE) / 2
  137. #else
  138. li PREA, (L1_PREFETCHSIZE)
  139. #endif
  /* Nothing to do when N <= 0. */
  140. cmpwi cr0, N, 0
  141. ble- LL(999)
  /* Fall through to the unit-stride code only when both scaled strides
     equal 2 * SIZE; otherwise take the strided path at LL(100). */
  142. cmpwi cr0, INCX, 2 * SIZE
  143. bne- cr0, LL(100)
  144. cmpwi cr0, INCY, 2 * SIZE
  145. bne- cr0, LL(100)
  /*
   * Unit-stride path (both scaled strides equal 2 * SIZE).
   * CTR = N >> 3: each trip through the unrolled code handles 8 elements,
   * i.e. 16 SIZE-sized values of X and of Y. The remaining N & 7 elements
   * are handled one at a time at LL(60).
   * Per element the code applies FMADD twice and then ADD1/ADD2 once each;
   * LFD, STFD, FMADD and ADD1/ADD2's targets are macros defined elsewhere.
   */
  146. srawi. r0, N, 3
  147. mtspr CTR, r0
  /* No full group of 8: go straight to the remainder loop. */
  148. beq- cr0, LL(50)
  149. .align 4
  /* Pipeline fill: preload values 0..7 of X (f0-f7) and of Y (f8-f15). */
  150. LFD f0, 0 * SIZE(X)
  151. LFD f1, 1 * SIZE(X)
  152. LFD f2, 2 * SIZE(X)
  153. LFD f3, 3 * SIZE(X)
  154. LFD f8, 0 * SIZE(Y)
  155. LFD f9, 1 * SIZE(Y)
  156. LFD f10, 2 * SIZE(Y)
  157. LFD f11, 3 * SIZE(Y)
  158. LFD f4, 4 * SIZE(X)
  159. LFD f5, 5 * SIZE(X)
  160. LFD f6, 6 * SIZE(X)
  161. LFD f7, 7 * SIZE(X)
  162. LFD f12, 4 * SIZE(Y)
  163. LFD f13, 5 * SIZE(Y)
  164. LFD f14, 6 * SIZE(Y)
  165. LFD f15, 7 * SIZE(Y)
  /* Decrement CTR; with only one group, skip the steady-state loop and
     finish it in the drain code at LL(20). */
  166. bdz LL(20)
  167. .align 4
  /* Steady-state loop: each quarter computes four results from already
     loaded inputs, reloads those input registers from 8 values further on,
     then stores the four results. */
  168. LL(10):
  /* Results for values 0..3. */
  169. FMADD f16, ALPHA_R, f0, f8
  170. FMADD f17, ALPHA_I, f0, f9
  171. FMADD f18, ALPHA_R, f2, f10
  172. FMADD f19, ALPHA_I, f2, f11
  173. ADD1 f16, ALPHA_I, f1, f16
  174. ADD2 f17, ALPHA_R, f1, f17
  175. ADD1 f18, ALPHA_I, f3, f18
  176. ADD2 f19, ALPHA_R, f3, f19
  177. LFD f0, 8 * SIZE(X)
  178. LFD f1, 9 * SIZE(X)
  179. LFD f2, 10 * SIZE(X)
  180. LFD f3, 11 * SIZE(X)
  181. LFD f8, 8 * SIZE(Y)
  182. LFD f9, 9 * SIZE(Y)
  183. LFD f10, 10 * SIZE(Y)
  184. LFD f11, 11 * SIZE(Y)
  185. STFD f16, 0 * SIZE(Y)
  186. STFD f17, 1 * SIZE(Y)
  187. STFD f18, 2 * SIZE(Y)
  188. STFD f19, 3 * SIZE(Y)
  /* Results for values 4..7. */
  189. FMADD f20, ALPHA_R, f4, f12
  190. FMADD f21, ALPHA_I, f4, f13
  191. FMADD f22, ALPHA_R, f6, f14
  192. FMADD f23, ALPHA_I, f6, f15
  193. ADD1 f20, ALPHA_I, f5, f20
  194. ADD2 f21, ALPHA_R, f5, f21
  195. ADD1 f22, ALPHA_I, f7, f22
  196. ADD2 f23, ALPHA_R, f7, f23
  197. LFD f4, 12 * SIZE(X)
  198. LFD f5, 13 * SIZE(X)
  199. LFD f6, 14 * SIZE(X)
  200. LFD f7, 15 * SIZE(X)
  201. LFD f12, 12 * SIZE(Y)
  202. LFD f13, 13 * SIZE(Y)
  203. LFD f14, 14 * SIZE(Y)
  204. LFD f15, 15 * SIZE(Y)
  205. STFD f20, 4 * SIZE(Y)
  206. STFD f21, 5 * SIZE(Y)
  207. STFD f22, 6 * SIZE(Y)
  208. STFD f23, 7 * SIZE(Y)
  /* Results for values 8..11; the loads now reach into the next group
     (16..19), which exists because CTR was non-zero on entry. */
  209. FMADD f16, ALPHA_R, f0, f8
  210. FMADD f17, ALPHA_I, f0, f9
  211. FMADD f18, ALPHA_R, f2, f10
  212. FMADD f19, ALPHA_I, f2, f11
  213. ADD1 f16, ALPHA_I, f1, f16
  214. ADD2 f17, ALPHA_R, f1, f17
  215. ADD1 f18, ALPHA_I, f3, f18
  216. ADD2 f19, ALPHA_R, f3, f19
  217. LFD f0, 16 * SIZE(X)
  218. LFD f1, 17 * SIZE(X)
  219. LFD f2, 18 * SIZE(X)
  220. LFD f3, 19 * SIZE(X)
  221. LFD f8, 16 * SIZE(Y)
  222. LFD f9, 17 * SIZE(Y)
  223. LFD f10, 18 * SIZE(Y)
  224. LFD f11, 19 * SIZE(Y)
  225. STFD f16, 8 * SIZE(Y)
  226. STFD f17, 9 * SIZE(Y)
  227. STFD f18, 10 * SIZE(Y)
  228. STFD f19, 11 * SIZE(Y)
  /* Results for values 12..15; loads fetch 20..23 of the next group. */
  229. FMADD f20, ALPHA_R, f4, f12
  230. FMADD f21, ALPHA_I, f4, f13
  231. FMADD f22, ALPHA_R, f6, f14
  232. FMADD f23, ALPHA_I, f6, f15
  233. ADD1 f20, ALPHA_I, f5, f20
  234. ADD2 f21, ALPHA_R, f5, f21
  235. ADD1 f22, ALPHA_I, f7, f22
  236. ADD2 f23, ALPHA_R, f7, f23
  237. LFD f4, 20 * SIZE(X)
  238. LFD f5, 21 * SIZE(X)
  239. LFD f6, 22 * SIZE(X)
  240. LFD f7, 23 * SIZE(X)
  241. LFD f12, 20 * SIZE(Y)
  242. LFD f13, 21 * SIZE(Y)
  243. LFD f14, 22 * SIZE(Y)
  244. LFD f15, 23 * SIZE(Y)
  245. STFD f20, 12 * SIZE(Y)
  246. STFD f21, 13 * SIZE(Y)
  247. STFD f22, 14 * SIZE(Y)
  248. STFD f23, 15 * SIZE(Y)
  /* Prefetch hints at offset PREA. Non-POWER6 builds issue them before the
     pointers advance; POWER6 builds issue them after (below). */
  249. #ifndef POWER6
  250. dcbtst Y, PREA
  251. #ifdef L1_DUALFETCH
  252. dcbt X, PREA
  253. #endif
  254. #endif
  /* Advance both pointers past the 16 values just processed; the values
     preloaded at 16..23 are now at offsets 0..7. */
  255. addi X, X, 16 * SIZE
  256. addi Y, Y, 16 * SIZE
  257. #ifdef POWER6
  258. dcbtst Y, PREA
  259. L1_PREFETCH X, PREA
  260. #endif
  261. bdnz LL(10)
  262. .align 4
  /* Pipeline drain: process the last group of 16 values. Inputs 0..7 are
     already in registers; only 8..15 are loaded, so nothing past the group
     is read. */
  263. LL(20):
  264. FMADD f16, ALPHA_R, f0, f8
  265. FMADD f17, ALPHA_I, f0, f9
  266. FMADD f18, ALPHA_R, f2, f10
  267. FMADD f19, ALPHA_I, f2, f11
  268. ADD1 f16, ALPHA_I, f1, f16
  269. ADD2 f17, ALPHA_R, f1, f17
  270. ADD1 f18, ALPHA_I, f3, f18
  271. ADD2 f19, ALPHA_R, f3, f19
  272. LFD f0, 8 * SIZE(X)
  273. LFD f1, 9 * SIZE(X)
  274. LFD f2, 10 * SIZE(X)
  275. LFD f3, 11 * SIZE(X)
  276. LFD f8, 8 * SIZE(Y)
  277. LFD f9, 9 * SIZE(Y)
  278. LFD f10, 10 * SIZE(Y)
  279. LFD f11, 11 * SIZE(Y)
  280. FMADD f20, ALPHA_R, f4, f12
  281. FMADD f21, ALPHA_I, f4, f13
  282. FMADD f22, ALPHA_R, f6, f14
  283. FMADD f23, ALPHA_I, f6, f15
  284. ADD1 f20, ALPHA_I, f5, f20
  285. ADD2 f21, ALPHA_R, f5, f21
  286. ADD1 f22, ALPHA_I, f7, f22
  287. ADD2 f23, ALPHA_R, f7, f23
  288. LFD f4, 12 * SIZE(X)
  289. LFD f5, 13 * SIZE(X)
  290. LFD f6, 14 * SIZE(X)
  291. LFD f7, 15 * SIZE(X)
  292. LFD f12, 12 * SIZE(Y)
  293. LFD f13, 13 * SIZE(Y)
  294. LFD f14, 14 * SIZE(Y)
  295. LFD f15, 15 * SIZE(Y)
  296. STFD f16, 0 * SIZE(Y)
  297. STFD f17, 1 * SIZE(Y)
  298. STFD f18, 2 * SIZE(Y)
  299. STFD f19, 3 * SIZE(Y)
  300. FMADD f16, ALPHA_R, f0, f8
  301. FMADD f17, ALPHA_I, f0, f9
  302. FMADD f18, ALPHA_R, f2, f10
  303. FMADD f19, ALPHA_I, f2, f11
  304. ADD1 f16, ALPHA_I, f1, f16
  305. ADD2 f17, ALPHA_R, f1, f17
  306. ADD1 f18, ALPHA_I, f3, f18
  307. ADD2 f19, ALPHA_R, f3, f19
  308. STFD f20, 4 * SIZE(Y)
  309. STFD f21, 5 * SIZE(Y)
  310. STFD f22, 6 * SIZE(Y)
  311. STFD f23, 7 * SIZE(Y)
  312. FMADD f20, ALPHA_R, f4, f12
  313. FMADD f21, ALPHA_I, f4, f13
  314. FMADD f22, ALPHA_R, f6, f14
  315. FMADD f23, ALPHA_I, f6, f15
  316. ADD1 f20, ALPHA_I, f5, f20
  317. ADD2 f21, ALPHA_R, f5, f21
  318. ADD1 f22, ALPHA_I, f7, f22
  319. ADD2 f23, ALPHA_R, f7, f23
  320. STFD f16, 8 * SIZE(Y)
  321. STFD f17, 9 * SIZE(Y)
  322. STFD f18, 10 * SIZE(Y)
  323. STFD f19, 11 * SIZE(Y)
  324. STFD f20, 12 * SIZE(Y)
  325. STFD f21, 13 * SIZE(Y)
  326. STFD f22, 14 * SIZE(Y)
  327. STFD f23, 15 * SIZE(Y)
  328. addi X, X, 16 * SIZE
  329. addi Y, Y, 16 * SIZE
  330. .align 4
  /* Remainder: CTR = N & 7 elements left; exit if there are none. */
  331. LL(50):
  332. andi. r0, N, 7
  333. mtspr CTR, r0
  334. beq LL(999)
  335. .align 4
  /* One element (two values of X and of Y) per trip, no pipelining. */
  336. LL(60):
  337. LFD f0, 0 * SIZE(X)
  338. LFD f1, 1 * SIZE(X)
  339. LFD f8, 0 * SIZE(Y)
  340. LFD f9, 1 * SIZE(Y)
  341. FMADD f16, ALPHA_R, f0, f8
  342. FMADD f17, ALPHA_I, f0, f9
  343. ADD1 f16, ALPHA_I, f1, f16
  344. ADD2 f17, ALPHA_R, f1, f17
  345. STFD f16, 0 * SIZE(Y)
  346. STFD f17, 1 * SIZE(Y)
  347. addi X, X, 2 * SIZE
  348. addi Y, Y, 2 * SIZE
  349. bdnz LL(60)
  /* Skip over the strided path to the common exit. */
  350. b LL(999)
  /*
   * Strided path, taken when either scaled stride differs from 2 * SIZE.
   * Addressing scheme (assuming LFDX/STFDX are the indexed forms and
   * LFDUX/STFDUX the indexed-with-update forms; the macros are defined
   * outside this file):
   *   - X and Y are first moved back by INCXM1 / INCYM1 (= stride - SIZE).
   *   - "LFDX fa, X, INCXM1" then reads the first value of the next element.
   *   - "LFDUX fb, X, INCX" advances X by one stride and reads the second
   *     value, which sits SIZE after the first.
   * YY is a copy of the biased Y pointer used only for stores, because Y
   * itself runs ahead with the preloads.
   */
  351. .align 4
  352. LL(100):
  353. sub X, X, INCXM1
  354. sub Y, Y, INCYM1
  355. mr YY, Y
  /* CTR = N >> 3 groups of 8 elements; none -> remainder loop at LL(150). */
  356. srawi. r0, N, 3
  357. mtspr CTR, r0
  358. beq- LL(150)
  359. .align 4
  /* Pipeline fill: preload the first 4 elements of X (f0-f7) and of Y
     (f8-f15). */
  360. LFDX f0, X, INCXM1
  361. LFDUX f1, X, INCX
  362. LFDX f2, X, INCXM1
  363. LFDUX f3, X, INCX
  364. LFDX f8, Y, INCYM1
  365. LFDUX f9, Y, INCY
  366. LFDX f10, Y, INCYM1
  367. LFDUX f11, Y, INCY
  368. LFDX f4, X, INCXM1
  369. LFDUX f5, X, INCX
  370. LFDX f6, X, INCXM1
  371. LFDUX f7, X, INCX
  372. LFDX f12, Y, INCYM1
  373. LFDUX f13, Y, INCY
  374. LFDX f14, Y, INCYM1
  375. LFDUX f15, Y, INCY
  /* Decrement CTR; a single group is finished entirely in the drain code. */
  376. bdz LL(120)
  377. .align 4
  /* Steady-state loop: 8 elements per trip. Each quarter computes two
     elements, reloads their input registers from two elements further on,
     and stores earlier results through YY. */
  378. LL(110):
  /* Compute elements 0-1, then load elements 4-5. */
  379. FMADD f16, ALPHA_R, f0, f8
  380. FMADD f17, ALPHA_I, f0, f9
  381. FMADD f18, ALPHA_R, f2, f10
  382. FMADD f19, ALPHA_I, f2, f11
  383. ADD1 f16, ALPHA_I, f1, f16
  384. ADD2 f17, ALPHA_R, f1, f17
  385. ADD1 f18, ALPHA_I, f3, f18
  386. ADD2 f19, ALPHA_R, f3, f19
  387. LFDX f0, X, INCXM1
  388. LFDUX f1, X, INCX
  389. LFDX f2, X, INCXM1
  390. LFDUX f3, X, INCX
  391. LFDX f8, Y, INCYM1
  392. LFDUX f9, Y, INCY
  393. LFDX f10, Y, INCYM1
  394. LFDUX f11, Y, INCY
  /* Compute elements 2-3, then load elements 6-7. */
  395. FMADD f20, ALPHA_R, f4, f12
  396. FMADD f21, ALPHA_I, f4, f13
  397. FMADD f22, ALPHA_R, f6, f14
  398. FMADD f23, ALPHA_I, f6, f15
  399. ADD1 f20, ALPHA_I, f5, f20
  400. ADD2 f21, ALPHA_R, f5, f21
  401. ADD1 f22, ALPHA_I, f7, f22
  402. ADD2 f23, ALPHA_R, f7, f23
  403. LFDX f4, X, INCXM1
  404. LFDUX f5, X, INCX
  405. LFDX f6, X, INCXM1
  406. LFDUX f7, X, INCX
  407. LFDX f12, Y, INCYM1
  408. LFDUX f13, Y, INCY
  409. LFDX f14, Y, INCYM1
  410. LFDUX f15, Y, INCY
  /* Store elements 0-1. */
  411. STFDX f16, YY, INCYM1
  412. STFDUX f17, YY, INCY
  413. STFDX f18, YY, INCYM1
  414. STFDUX f19, YY, INCY
  /* Compute elements 4-5, then preload elements 0-1 of the next group. */
  415. FMADD f16, ALPHA_R, f0, f8
  416. FMADD f17, ALPHA_I, f0, f9
  417. FMADD f18, ALPHA_R, f2, f10
  418. FMADD f19, ALPHA_I, f2, f11
  419. ADD1 f16, ALPHA_I, f1, f16
  420. ADD2 f17, ALPHA_R, f1, f17
  421. ADD1 f18, ALPHA_I, f3, f18
  422. ADD2 f19, ALPHA_R, f3, f19
  423. LFDX f0, X, INCXM1
  424. LFDUX f1, X, INCX
  425. LFDX f2, X, INCXM1
  426. LFDUX f3, X, INCX
  427. LFDX f8, Y, INCYM1
  428. LFDUX f9, Y, INCY
  429. LFDX f10, Y, INCYM1
  430. LFDUX f11, Y, INCY
  /* Store elements 2-3. */
  431. STFDX f20, YY, INCYM1
  432. STFDUX f21, YY, INCY
  433. STFDX f22, YY, INCYM1
  434. STFDUX f23, YY, INCY
  /* Compute elements 6-7, then preload elements 2-3 of the next group. */
  435. FMADD f20, ALPHA_R, f4, f12
  436. FMADD f21, ALPHA_I, f4, f13
  437. FMADD f22, ALPHA_R, f6, f14
  438. FMADD f23, ALPHA_I, f6, f15
  439. ADD1 f20, ALPHA_I, f5, f20
  440. ADD2 f21, ALPHA_R, f5, f21
  441. ADD1 f22, ALPHA_I, f7, f22
  442. ADD2 f23, ALPHA_R, f7, f23
  443. LFDX f4, X, INCXM1
  444. LFDUX f5, X, INCX
  445. LFDX f6, X, INCXM1
  446. LFDUX f7, X, INCX
  447. LFDX f12, Y, INCYM1
  448. LFDUX f13, Y, INCY
  449. LFDX f14, Y, INCYM1
  450. LFDUX f15, Y, INCY
  /* Store elements 4-7. */
  451. STFDX f16, YY, INCYM1
  452. STFDUX f17, YY, INCY
  453. STFDX f18, YY, INCYM1
  454. STFDUX f19, YY, INCY
  455. STFDX f20, YY, INCYM1
  456. STFDUX f21, YY, INCY
  457. STFDX f22, YY, INCYM1
  458. STFDUX f23, YY, INCY
  459. bdnz LL(110)
  460. .align 4
  /* Pipeline drain: last group of 8 elements. Elements 0-3 are already in
     registers; only elements 4-7 are loaded, so nothing past the group is
     read. */
  461. LL(120):
  462. FMADD f16, ALPHA_R, f0, f8
  463. FMADD f17, ALPHA_I, f0, f9
  464. FMADD f18, ALPHA_R, f2, f10
  465. FMADD f19, ALPHA_I, f2, f11
  466. ADD1 f16, ALPHA_I, f1, f16
  467. ADD2 f17, ALPHA_R, f1, f17
  468. ADD1 f18, ALPHA_I, f3, f18
  469. ADD2 f19, ALPHA_R, f3, f19
  470. LFDX f0, X, INCXM1
  471. LFDUX f1, X, INCX
  472. LFDX f2, X, INCXM1
  473. LFDUX f3, X, INCX
  474. LFDX f8, Y, INCYM1
  475. LFDUX f9, Y, INCY
  476. LFDX f10, Y, INCYM1
  477. LFDUX f11, Y, INCY
  478. FMADD f20, ALPHA_R, f4, f12
  479. FMADD f21, ALPHA_I, f4, f13
  480. FMADD f22, ALPHA_R, f6, f14
  481. FMADD f23, ALPHA_I, f6, f15
  482. ADD1 f20, ALPHA_I, f5, f20
  483. ADD2 f21, ALPHA_R, f5, f21
  484. ADD1 f22, ALPHA_I, f7, f22
  485. ADD2 f23, ALPHA_R, f7, f23
  486. LFDX f4, X, INCXM1
  487. LFDUX f5, X, INCX
  488. LFDX f6, X, INCXM1
  489. LFDUX f7, X, INCX
  490. LFDX f12, Y, INCYM1
  491. LFDUX f13, Y, INCY
  492. LFDX f14, Y, INCYM1
  493. LFDUX f15, Y, INCY
  494. STFDX f16, YY, INCYM1
  495. STFDUX f17, YY, INCY
  496. STFDX f18, YY, INCYM1
  497. STFDUX f19, YY, INCY
  498. FMADD f16, ALPHA_R, f0, f8
  499. FMADD f17, ALPHA_I, f0, f9
  500. FMADD f18, ALPHA_R, f2, f10
  501. FMADD f19, ALPHA_I, f2, f11
  502. ADD1 f16, ALPHA_I, f1, f16
  503. ADD2 f17, ALPHA_R, f1, f17
  504. ADD1 f18, ALPHA_I, f3, f18
  505. ADD2 f19, ALPHA_R, f3, f19
  506. STFDX f20, YY, INCYM1
  507. STFDUX f21, YY, INCY
  508. STFDX f22, YY, INCYM1
  509. STFDUX f23, YY, INCY
  510. FMADD f20, ALPHA_R, f4, f12
  511. FMADD f21, ALPHA_I, f4, f13
  512. FMADD f22, ALPHA_R, f6, f14
  513. FMADD f23, ALPHA_I, f6, f15
  514. ADD1 f20, ALPHA_I, f5, f20
  515. ADD2 f21, ALPHA_R, f5, f21
  516. ADD1 f22, ALPHA_I, f7, f22
  517. ADD2 f23, ALPHA_R, f7, f23
  518. STFDX f16, YY, INCYM1
  519. STFDUX f17, YY, INCY
  520. STFDX f18, YY, INCYM1
  521. STFDUX f19, YY, INCY
  522. STFDX f20, YY, INCYM1
  523. STFDUX f21, YY, INCY
  524. STFDX f22, YY, INCYM1
  525. STFDUX f23, YY, INCY
  526. .align 4
  /* Remainder: CTR = N & 7 elements left; exit if there are none. */
  527. LL(150):
  528. andi. r0, N, 7
  529. mtspr CTR, r0
  530. beq LL(999)
  531. .align 4
  /* One element per trip: load via X/Y, store via YY. Falls through to
     LL(999) when CTR reaches zero. */
  532. LL(160):
  533. LFDX f0, X, INCXM1
  534. LFDUX f1, X, INCX
  535. LFDX f8, Y, INCYM1
  536. LFDUX f9, Y, INCY
  537. FMADD f16, ALPHA_R, f0, f8
  538. FMADD f17, ALPHA_I, f0, f9
  539. ADD1 f16, ALPHA_I, f1, f16
  540. ADD2 f17, ALPHA_R, f1, f17
  541. STFDX f16, YY, INCYM1
  542. STFDUX f17, YY, INCY
  543. bdnz LL(160)
  /*
   * Common exit, reached from the N <= 0 check, from both remainder tests
   * and from the end of both remainder loops.
   * Reloads f14-f25 from the twelve 8-byte slots written at entry, releases
   * the STACKSIZE-byte save area and returns. No result register is set in
   * this section.
   */
  544. .align 4
  545. LL(999):
  546. lfd f14, 0(SP)
  547. lfd f15, 8(SP)
  548. lfd f16, 16(SP)
  549. lfd f17, 24(SP)
  550. lfd f18, 32(SP)
  551. lfd f19, 40(SP)
  552. lfd f20, 48(SP)
  553. lfd f21, 56(SP)
  554. lfd f22, 64(SP)
  555. lfd f23, 72(SP)
  556. lfd f24, 80(SP)
  557. lfd f25, 88(SP)
  558. addi SP, SP, STACKSIZE
  559. blr
  560. EPILOGUE
  /* Closes the #ifndef NEEDPARAM that guards the routine body. */
  561. #endif