You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

axpy.S 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register aliases for the routine that follows.                         */
  /*   N          element count (tested against 0, split into N>>4 and N&15) */
  /*   X, Y       base pointers of the two vectors                          */
  /*   INCX, INCY strides; shifted left by BASE_SHIFT at routine entry      */
  /*   PREA       prefetch offset, loaded from L1_PREFETCHSIZE              */
  /*   YY         copy of Y used as the store pointer on the strided path   */
  /*   ALPHA      scalar multiplier, copied from f1 at routine entry        */
  /* The general-purpose register behind each name is chosen by the         */
  /* platform macros below (linux, _AIX, __APPLE__, __64BIT__, DOUBLE).     */
  /* NOTE(review): the differing assignments presumably mirror where each   */
  /* ABI passes the arguments - confirm against the calling convention.     */
  /* This file provides no mapping for any other platform.                  */
  40. #ifdef linux
  /* linux, __64BIT__ not defined */
  41. #ifndef __64BIT__
  42. #define N r3
  43. #define X r6
  44. #define INCX r7
  45. #define Y r8
  46. #define INCY r9
  47. #define PREA r4
  48. #define YY r5
  /* linux, __64BIT__ defined: X/INCX/Y/INCY each move up by one register */
  49. #else
  50. #define N r3
  51. #define X r7
  52. #define INCX r8
  53. #define Y r9
  54. #define INCY r10
  55. #define PREA r4
  56. #define YY r5
  57. #endif
  58. #endif
  59. #if defined(_AIX) || defined(__APPLE__)
  /* AIX / Apple without __64BIT__ and with DOUBLE: INCY is only a scratch   */
  /* name here (r4); the routine reloads its value from the stack at entry. */
  60. #if !defined(__64BIT__) && defined(DOUBLE)
  61. #define N r3
  62. #define X r8
  63. #define INCX r9
  64. #define Y r10
  65. #define INCY r4
  66. #define PREA r5
  67. #define YY r6
  /* Every other AIX / Apple configuration: same mapping as 64-bit linux */
  68. #else
  69. #define N r3
  70. #define X r7
  71. #define INCX r8
  72. #define Y r9
  73. #define INCY r10
  74. #define PREA r4
  75. #define YY r5
  76. #endif
  77. #endif
  /* ALPHA lives in f24, one of the FPRs the routine saves and restores. */
  78. #define ALPHA f24
  /* Routine entry: build the frame, save the FPRs used as scratch, scale   */
  /* the strides and pick the unit-stride or strided code path.             */
  /* The whole routine is skipped when NEEDPARAM is defined; the matching   */
  /* #endif is the last line of the file.                                   */
  79. #ifndef NEEDPARAM
  /* Frame size in bytes; f14-f24 (11 doubles) occupy offsets 0..80. */
  80. #define STACKSIZE 96
  /* PROLOGUE and PROFCODE are macros defined outside this file. */
  81. PROLOGUE
  82. PROFCODE
  83. addi SP, SP, -STACKSIZE
  /* NOTE(review): every later visible use of r0 (srawi. / andi.) writes it */
  /* before reading it, so this zeroing looks unused - confirm before       */
  /* removing.                                                              */
  84. li r0, 0
  /* Save f14-f24; they are restored at LL(999). In the code below f14/f15  */
  /* receive Y operands, f16-f23 receive results and f24 holds ALPHA.       */
  85. stfd f14, 0(SP)
  86. stfd f15, 8(SP)
  87. stfd f16, 16(SP)
  88. stfd f17, 24(SP)
  89. stfd f18, 32(SP)
  90. stfd f19, 40(SP)
  91. stfd f20, 48(SP)
  92. stfd f21, 56(SP)
  93. stfd f22, 64(SP)
  94. stfd f23, 72(SP)
  95. stfd f24, 80(SP)
  /* On this configuration INCY is fetched from memory instead of arriving  */
  /* in a register; the STACKSIZE term compensates for the SP adjustment    */
  /* made above. FRAMESLOT is defined outside this file.                    */
  96. #if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE)
  97. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  98. #endif
  /* Keep the scalar in ALPHA (f24): f1 is reused as a load target below. */
  99. fmr ALPHA, f1
  /* Scale both strides by 2^BASE_SHIFT (presumably elements -> bytes, as   */
  /* they are compared against SIZE below).                                 */
  /* NOTE(review): word-sized slwi/cmpwi/srawi. are used even when          */
  /* __64BIT__ is defined - presumably common.h remaps them or the          */
  /* arguments are 32-bit; confirm.                                         */
  100. slwi INCX, INCX, BASE_SHIFT
  101. slwi INCY, INCY, BASE_SHIFT
  /* PREA = offset added to X / Y by the prefetch instructions in the       */
  /* unit-stride main loop; halved when L1_DUALFETCH is defined.            */
  102. #ifdef L1_DUALFETCH
  103. li PREA, (L1_PREFETCHSIZE) / 2
  104. #else
  105. li PREA, (L1_PREFETCHSIZE)
  106. #endif
  /* N <= 0: nothing to compute, go straight to the register restore. */
  107. cmpwi cr0, N, 0
  108. ble- LL(999)
  /* Take the strided path at LL(100) unless both scaled strides equal SIZE; */
  /* otherwise fall through to the unit-stride code.                        */
  109. cmpwi cr0, INCX, SIZE
  110. bne- cr0, LL(100)
  111. cmpwi cr0, INCY, SIZE
  112. bne- cr0, LL(100)
  /* Unit-stride path: CTR = N >> 4, the number of 16-element groups.       */
  /* srawi. also sets cr0, so a zero group count skips to the remainder     */
  /* loop at LL(50).                                                        */
  113. srawi. r0, N, 4
  114. mtspr CTR, r0
  115. beq- cr0, LL(50)
  116. .align 4
  /* Preload elements 0..7 of the first group: X into f0-f7, Y into f8-f15. */
  /* LFD is a macro defined outside this file (presumably a load matching   */
  /* the element precision).                                                */
  117. LFD f0, 0 * SIZE(X)
  118. LFD f1, 1 * SIZE(X)
  119. LFD f2, 2 * SIZE(X)
  120. LFD f3, 3 * SIZE(X)
  121. LFD f8, 0 * SIZE(Y)
  122. LFD f9, 1 * SIZE(Y)
  123. LFD f10, 2 * SIZE(Y)
  124. LFD f11, 3 * SIZE(Y)
  125. LFD f4, 4 * SIZE(X)
  126. LFD f5, 5 * SIZE(X)
  127. LFD f6, 6 * SIZE(X)
  128. LFD f7, 7 * SIZE(X)
  129. LFD f12, 4 * SIZE(Y)
  130. LFD f13, 5 * SIZE(Y)
  131. LFD f14, 6 * SIZE(Y)
  132. LFD f15, 7 * SIZE(Y)
  /* Decrement CTR; with exactly one group go directly to LL(20), which     */
  /* finishes a group without loading past element 15.                      */
  133. bdz LL(20)
  134. .align 4
  /* Unit-stride main loop: one 16-element group per iteration.             */
  /* On entry f0-f7 / f8-f15 hold X / Y elements 0..7 of the current group. */
  /* Each stage computes four results, loads the operands needed two stages */
  /* later and stores the four results. The last two load stages fetch      */
  /* elements 16..23, i.e. elements 0..7 of the next group.                 */
  /* FMADD/LFD/STFD are macros defined outside this file; FMADD d, ALPHA,   */
  /* x, y presumably yields d = ALPHA * x + y.                              */
  135. LL(10):
  /* results 0..3 */
  136. FMADD f16, ALPHA, f0, f8
  137. FMADD f17, ALPHA, f1, f9
  138. FMADD f18, ALPHA, f2, f10
  139. FMADD f19, ALPHA, f3, f11
  /* load X / Y 8..11 */
  140. LFD f0, 8 * SIZE(X)
  141. LFD f1, 9 * SIZE(X)
  142. LFD f2, 10 * SIZE(X)
  143. LFD f3, 11 * SIZE(X)
  144. LFD f8, 8 * SIZE(Y)
  145. LFD f9, 9 * SIZE(Y)
  146. LFD f10, 10 * SIZE(Y)
  147. LFD f11, 11 * SIZE(Y)
  /* store Y 0..3 */
  148. STFD f16, 0 * SIZE(Y)
  149. STFD f17, 1 * SIZE(Y)
  150. STFD f18, 2 * SIZE(Y)
  151. STFD f19, 3 * SIZE(Y)
  /* results 4..7 */
  152. FMADD f20, ALPHA, f4, f12
  153. FMADD f21, ALPHA, f5, f13
  154. FMADD f22, ALPHA, f6, f14
  155. FMADD f23, ALPHA, f7, f15
  /* load X / Y 12..15 */
  156. LFD f4, 12 * SIZE(X)
  157. LFD f5, 13 * SIZE(X)
  158. LFD f6, 14 * SIZE(X)
  159. LFD f7, 15 * SIZE(X)
  160. LFD f12, 12 * SIZE(Y)
  161. LFD f13, 13 * SIZE(Y)
  162. LFD f14, 14 * SIZE(Y)
  163. LFD f15, 15 * SIZE(Y)
  /* store Y 4..7 */
  164. STFD f20, 4 * SIZE(Y)
  165. STFD f21, 5 * SIZE(Y)
  166. STFD f22, 6 * SIZE(Y)
  167. STFD f23, 7 * SIZE(Y)
  /* results 8..11 */
  168. FMADD f16, ALPHA, f0, f8
  169. FMADD f17, ALPHA, f1, f9
  170. FMADD f18, ALPHA, f2, f10
  171. FMADD f19, ALPHA, f3, f11
  /* load X / Y 16..19 (elements 0..3 of the next group) */
  172. LFD f0, 16 * SIZE(X)
  173. LFD f1, 17 * SIZE(X)
  174. LFD f2, 18 * SIZE(X)
  175. LFD f3, 19 * SIZE(X)
  176. LFD f8, 16 * SIZE(Y)
  177. LFD f9, 17 * SIZE(Y)
  178. LFD f10, 18 * SIZE(Y)
  179. LFD f11, 19 * SIZE(Y)
  /* store Y 8..11 */
  180. STFD f16, 8 * SIZE(Y)
  181. STFD f17, 9 * SIZE(Y)
  182. STFD f18, 10 * SIZE(Y)
  183. STFD f19, 11 * SIZE(Y)
  /* results 12..15 */
  184. FMADD f20, ALPHA, f4, f12
  185. FMADD f21, ALPHA, f5, f13
  186. FMADD f22, ALPHA, f6, f14
  187. FMADD f23, ALPHA, f7, f15
  /* load X / Y 20..23 (elements 4..7 of the next group) */
  188. LFD f4, 20 * SIZE(X)
  189. LFD f5, 21 * SIZE(X)
  190. LFD f6, 22 * SIZE(X)
  191. LFD f7, 23 * SIZE(X)
  192. LFD f12, 20 * SIZE(Y)
  193. LFD f13, 21 * SIZE(Y)
  194. LFD f14, 22 * SIZE(Y)
  195. LFD f15, 23 * SIZE(Y)
  /* store Y 12..15 */
  196. STFD f20, 12 * SIZE(Y)
  197. STFD f21, 13 * SIZE(Y)
  198. STFD f22, 14 * SIZE(Y)
  199. STFD f23, 15 * SIZE(Y)
  /* Non-POWER6 builds prefetch before the pointers advance: Y + PREA is    */
  /* always touched for store, X + PREA only when L1_DUALFETCH is defined.  */
  200. #ifndef POWER6
  201. dcbtst Y, PREA
  202. #ifdef L1_DUALFETCH
  203. dcbt X, PREA
  204. #endif
  205. #endif
  /* step both pointers to the next group */
  206. addi X, X, 16 * SIZE
  207. addi Y, Y, 16 * SIZE
  /* POWER6 builds prefetch after the advance; L1_PREFETCH is a macro       */
  /* defined outside this file.                                             */
  208. #ifdef POWER6
  209. dcbtst Y, PREA
  210. L1_PREFETCH X, PREA
  211. #endif
  /* Decrement CTR and loop while groups other than the last remain; the    */
  /* last group falls through to LL(20).                                    */
  212. bdnz LL(10)
  213. .align 4
  /* Unit-stride final group: same computation as one LL(10) iteration, but */
  /* no element beyond index 15 of this group is loaded.                    */
  /* On entry f0-f7 / f8-f15 hold X / Y elements 0..7 of the group.         */
  /* FMADD/LFD/STFD are macros defined outside this file; FMADD d, ALPHA,   */
  /* x, y presumably yields d = ALPHA * x + y.                              */
  214. LL(20):
  /* results 0..3 */
  215. FMADD f16, ALPHA, f0, f8
  216. FMADD f17, ALPHA, f1, f9
  217. FMADD f18, ALPHA, f2, f10
  218. FMADD f19, ALPHA, f3, f11
  /* load X / Y 8..11 */
  219. LFD f0, 8 * SIZE(X)
  220. LFD f1, 9 * SIZE(X)
  221. LFD f2, 10 * SIZE(X)
  222. LFD f3, 11 * SIZE(X)
  223. LFD f8, 8 * SIZE(Y)
  224. LFD f9, 9 * SIZE(Y)
  225. LFD f10, 10 * SIZE(Y)
  226. LFD f11, 11 * SIZE(Y)
  /* results 4..7 */
  227. FMADD f20, ALPHA, f4, f12
  228. FMADD f21, ALPHA, f5, f13
  229. FMADD f22, ALPHA, f6, f14
  230. FMADD f23, ALPHA, f7, f15
  /* load X / Y 12..15 */
  231. LFD f4, 12 * SIZE(X)
  232. LFD f5, 13 * SIZE(X)
  233. LFD f6, 14 * SIZE(X)
  234. LFD f7, 15 * SIZE(X)
  235. LFD f12, 12 * SIZE(Y)
  236. LFD f13, 13 * SIZE(Y)
  237. LFD f14, 14 * SIZE(Y)
  238. LFD f15, 15 * SIZE(Y)
  /* store Y 0..3 */
  239. STFD f16, 0 * SIZE(Y)
  240. STFD f17, 1 * SIZE(Y)
  241. STFD f18, 2 * SIZE(Y)
  242. STFD f19, 3 * SIZE(Y)
  /* results 8..11 */
  243. FMADD f16, ALPHA, f0, f8
  244. FMADD f17, ALPHA, f1, f9
  245. FMADD f18, ALPHA, f2, f10
  246. FMADD f19, ALPHA, f3, f11
  /* store Y 4..7 */
  247. STFD f20, 4 * SIZE(Y)
  248. STFD f21, 5 * SIZE(Y)
  249. STFD f22, 6 * SIZE(Y)
  250. STFD f23, 7 * SIZE(Y)
  /* results 12..15 */
  251. FMADD f20, ALPHA, f4, f12
  252. FMADD f21, ALPHA, f5, f13
  253. FMADD f22, ALPHA, f6, f14
  254. FMADD f23, ALPHA, f7, f15
  /* store Y 8..15 */
  255. STFD f16, 8 * SIZE(Y)
  256. STFD f17, 9 * SIZE(Y)
  257. STFD f18, 10 * SIZE(Y)
  258. STFD f19, 11 * SIZE(Y)
  259. STFD f20, 12 * SIZE(Y)
  260. STFD f21, 13 * SIZE(Y)
  261. STFD f22, 14 * SIZE(Y)
  262. STFD f23, 15 * SIZE(Y)
  /* Leave X / Y pointing just past the group for the remainder code. */
  263. addi X, X, 16 * SIZE
  264. addi Y, Y, 16 * SIZE
  265. .align 4
  /* Unit-stride remainder: CTR = N & 15, the elements left over after the  */
  /* 16-element groups. andi. sets cr0, so a zero remainder exits directly. */
  266. LL(50):
  267. andi. r0, N, 15
  268. mtspr CTR, r0
  269. beq LL(999)
  270. .align 4
  /* One element per iteration: load X[0] and Y[0], combine them with ALPHA */
  /* (FMADD is a macro defined outside this file; presumably                */
  /* f16 = ALPHA * f0 + f8), store back to Y[0], advance both pointers by   */
  /* one element.                                                           */
  271. LL(60):
  272. LFD f0, 0 * SIZE(X)
  273. LFD f8, 0 * SIZE(Y)
  274. FMADD f16, ALPHA, f0, f8
  275. STFD f16, 0 * SIZE(Y)
  276. addi X, X, 1 * SIZE
  277. addi Y, Y, 1 * SIZE
  278. bdnz LL(60)
  /* Jump over the strided code to the common register restore. */
  279. b LL(999)
  280. .align 4
  /* Strided path, taken when either scaled stride differs from SIZE.       */
  /* X and Y are moved back by one stride, and YY starts as a copy of the   */
  /* adjusted Y. LFDUX/STFDUX are macros defined outside this file;         */
  /* presumably they are update-form indexed accesses that add the stride   */
  /* to the pointer before accessing memory, so the first access lands on   */
  /* element 0 - confirm.                                                   */
  281. LL(100):
  282. sub X, X, INCX
  283. sub Y, Y, INCY
  /* Y is used for loads only from here on; YY is the store pointer. */
  284. mr YY, Y
  /* CTR = N >> 4 groups of 16; a zero count skips to the remainder code. */
  285. srawi. r0, N, 4
  286. mtspr CTR, r0
  287. beq- LL(150)
  288. .align 4
  /* Preload the first eight elements: X into f0-f7, Y into f8-f15. */
  289. LFDUX f0, X, INCX
  290. LFDUX f1, X, INCX
  291. LFDUX f2, X, INCX
  292. LFDUX f3, X, INCX
  293. LFDUX f8, Y, INCY
  294. LFDUX f9, Y, INCY
  295. LFDUX f10, Y, INCY
  296. LFDUX f11, Y, INCY
  297. LFDUX f4, X, INCX
  298. LFDUX f5, X, INCX
  299. LFDUX f6, X, INCX
  300. LFDUX f7, X, INCX
  301. LFDUX f12, Y, INCY
  302. LFDUX f13, Y, INCY
  303. LFDUX f14, Y, INCY
  304. LFDUX f15, Y, INCY
  /* Decrement CTR; with exactly one group go directly to LL(120). */
  305. bdz LL(120)
  306. .align 4
  /* Strided main loop: one 16-element group per iteration.                 */
  /* On entry f0-f7 / f8-f15 hold X / Y elements 0..7 of the current group. */
  /* X and Y serve as load pointers and YY as the store pointer into the    */
  /* same vector as Y. The loop issues 24 Y loads before its 16 stores, so  */
  /* the loads run ahead of the stores.                                     */
  /* FMADD/LFDUX/STFDUX are macros defined outside this file; FMADD d,      */
  /* ALPHA, x, y presumably yields d = ALPHA * x + y, and the UX forms      */
  /* presumably advance their pointer by the stride on every access.        */
  307. LL(110):
  /* results 0..3 */
  308. FMADD f16, ALPHA, f0, f8
  309. FMADD f17, ALPHA, f1, f9
  310. FMADD f18, ALPHA, f2, f10
  311. FMADD f19, ALPHA, f3, f11
  /* load X / Y 8..11 */
  312. LFDUX f0, X, INCX
  313. LFDUX f1, X, INCX
  314. LFDUX f2, X, INCX
  315. LFDUX f3, X, INCX
  316. LFDUX f8, Y, INCY
  317. LFDUX f9, Y, INCY
  318. LFDUX f10, Y, INCY
  319. LFDUX f11, Y, INCY
  /* results 4..7 */
  320. FMADD f20, ALPHA, f4, f12
  321. FMADD f21, ALPHA, f5, f13
  322. FMADD f22, ALPHA, f6, f14
  323. FMADD f23, ALPHA, f7, f15
  /* load X / Y 12..15 */
  324. LFDUX f4, X, INCX
  325. LFDUX f5, X, INCX
  326. LFDUX f6, X, INCX
  327. LFDUX f7, X, INCX
  328. LFDUX f12, Y, INCY
  329. LFDUX f13, Y, INCY
  330. LFDUX f14, Y, INCY
  331. LFDUX f15, Y, INCY
  /* store results 0..3 */
  332. STFDUX f16, YY, INCY
  333. STFDUX f17, YY, INCY
  334. STFDUX f18, YY, INCY
  335. STFDUX f19, YY, INCY
  /* results 8..11 */
  336. FMADD f16, ALPHA, f0, f8
  337. FMADD f17, ALPHA, f1, f9
  338. FMADD f18, ALPHA, f2, f10
  339. FMADD f19, ALPHA, f3, f11
  /* load elements 0..3 of the next group */
  340. LFDUX f0, X, INCX
  341. LFDUX f1, X, INCX
  342. LFDUX f2, X, INCX
  343. LFDUX f3, X, INCX
  344. LFDUX f8, Y, INCY
  345. LFDUX f9, Y, INCY
  346. LFDUX f10, Y, INCY
  347. LFDUX f11, Y, INCY
  /* store results 4..7 */
  348. STFDUX f20, YY, INCY
  349. STFDUX f21, YY, INCY
  350. STFDUX f22, YY, INCY
  351. STFDUX f23, YY, INCY
  /* results 12..15 */
  352. FMADD f20, ALPHA, f4, f12
  353. FMADD f21, ALPHA, f5, f13
  354. FMADD f22, ALPHA, f6, f14
  355. FMADD f23, ALPHA, f7, f15
  /* load elements 4..7 of the next group */
  356. LFDUX f4, X, INCX
  357. LFDUX f5, X, INCX
  358. LFDUX f6, X, INCX
  359. LFDUX f7, X, INCX
  360. LFDUX f12, Y, INCY
  361. LFDUX f13, Y, INCY
  362. LFDUX f14, Y, INCY
  363. LFDUX f15, Y, INCY
  /* store results 8..15 */
  364. STFDUX f16, YY, INCY
  365. STFDUX f17, YY, INCY
  366. STFDUX f18, YY, INCY
  367. STFDUX f19, YY, INCY
  368. STFDUX f20, YY, INCY
  369. STFDUX f21, YY, INCY
  370. STFDUX f22, YY, INCY
  371. STFDUX f23, YY, INCY
  /* Decrement CTR and loop while groups other than the last remain; the    */
  /* last group falls through to LL(120).                                   */
  372. bdnz LL(110)
  373. .align 4
  /* Strided final group: same computation as one LL(110) iteration, but    */
  /* only elements 8..15 of this group are loaded - nothing beyond it.      */
  /* On entry f0-f7 / f8-f15 hold X / Y elements 0..7 of the group.         */
  /* FMADD/LFDUX/STFDUX are macros defined outside this file; FMADD d,      */
  /* ALPHA, x, y presumably yields d = ALPHA * x + y.                       */
  374. LL(120):
  /* results 0..3 */
  375. FMADD f16, ALPHA, f0, f8
  376. FMADD f17, ALPHA, f1, f9
  377. FMADD f18, ALPHA, f2, f10
  378. FMADD f19, ALPHA, f3, f11
  /* load X / Y 8..11 */
  379. LFDUX f0, X, INCX
  380. LFDUX f1, X, INCX
  381. LFDUX f2, X, INCX
  382. LFDUX f3, X, INCX
  383. LFDUX f8, Y, INCY
  384. LFDUX f9, Y, INCY
  385. LFDUX f10, Y, INCY
  386. LFDUX f11, Y, INCY
  /* results 4..7 */
  387. FMADD f20, ALPHA, f4, f12
  388. FMADD f21, ALPHA, f5, f13
  389. FMADD f22, ALPHA, f6, f14
  390. FMADD f23, ALPHA, f7, f15
  /* load X / Y 12..15 */
  391. LFDUX f4, X, INCX
  392. LFDUX f5, X, INCX
  393. LFDUX f6, X, INCX
  394. LFDUX f7, X, INCX
  395. LFDUX f12, Y, INCY
  396. LFDUX f13, Y, INCY
  397. LFDUX f14, Y, INCY
  398. LFDUX f15, Y, INCY
  /* store results 0..3 */
  399. STFDUX f16, YY, INCY
  400. STFDUX f17, YY, INCY
  401. STFDUX f18, YY, INCY
  402. STFDUX f19, YY, INCY
  /* results 8..11 */
  403. FMADD f16, ALPHA, f0, f8
  404. FMADD f17, ALPHA, f1, f9
  405. FMADD f18, ALPHA, f2, f10
  406. FMADD f19, ALPHA, f3, f11
  /* store results 4..7 */
  407. STFDUX f20, YY, INCY
  408. STFDUX f21, YY, INCY
  409. STFDUX f22, YY, INCY
  410. STFDUX f23, YY, INCY
  /* results 12..15 */
  411. FMADD f20, ALPHA, f4, f12
  412. FMADD f21, ALPHA, f5, f13
  413. FMADD f22, ALPHA, f6, f14
  414. FMADD f23, ALPHA, f7, f15
  /* store results 8..15; execution then falls through to LL(150) */
  415. STFDUX f16, YY, INCY
  416. STFDUX f17, YY, INCY
  417. STFDUX f18, YY, INCY
  418. STFDUX f19, YY, INCY
  419. STFDUX f20, YY, INCY
  420. STFDUX f21, YY, INCY
  421. STFDUX f22, YY, INCY
  422. STFDUX f23, YY, INCY
  423. .align 4
  /* Strided remainder: CTR = N & 15, the elements left over after the      */
  /* 16-element groups. andi. sets cr0, so a zero remainder exits directly. */
  424. LL(150):
  425. andi. r0, N, 15
  426. mtspr CTR, r0
  427. beq LL(999)
  428. .align 4
  /* One element per iteration: load through X and Y, combine with ALPHA    */
  /* and store through YY. LFDUX/STFDUX/FMADD are macros defined outside    */
  /* this file; presumably the UX forms advance their pointer by the stride */
  /* and f16 = ALPHA * f0 + f8.                                             */
  /* When CTR reaches zero execution falls through to LL(999).              */
  429. LL(160):
  430. LFDUX f0, X, INCX
  431. LFDUX f8, Y, INCY
  432. FMADD f16, ALPHA, f0, f8
  433. STFDUX f16, YY, INCY
  434. bdnz LL(160)
  435. .align 4
  /* Common exit, reached from every path including the N <= 0 early-out:   */
  /* reload f14-f24 from the slots written at entry, release the STACKSIZE  */
  /* bytes of frame and return.                                             */
  436. LL(999):
  437. lfd f14, 0(SP)
  438. lfd f15, 8(SP)
  439. lfd f16, 16(SP)
  440. lfd f17, 24(SP)
  441. lfd f18, 32(SP)
  442. lfd f19, 40(SP)
  443. lfd f20, 48(SP)
  444. lfd f21, 56(SP)
  445. lfd f22, 64(SP)
  446. lfd f23, 72(SP)
  447. lfd f24, 80(SP)
  448. addi SP, SP, STACKSIZE
  449. blr
  /* EPILOGUE is a macro defined outside this file. */
  450. EPILOGUE
  /* closes #ifndef NEEDPARAM */
  451. #endif