You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zaxpy_hummer.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build switch, shared header, and the register/operation aliases used by the kernel below. */
  /* ASSEMBLER is defined before common.h is included; presumably it selects the assembler-side definitions there - TODO confirm. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer registers holding the element count, the two vector pointers and their strides. */
  /* NOTE(review): which caller argument arrives in which register is set by the calling convention, which is not visible in this file - confirm. */
  40. #define N r3
  41. #define X r6
  42. #define INCX r7
  43. #define Y r8
  44. #define INCY r9
  /* YY is a second pointer into Y: loads advance Y, stores advance YY. */
  45. #define YY r4
  /* INCX2 / INCY2 receive INCX + INCX and INCY + INCY and are the strides actually used by every load and store. */
  46. #define INCX2 r5
  47. #define INCY2 r10
  /* X1 is set to X + SIZE in the LL(100) path. */
  48. #define X1 r11
  /* Y1 and YY1 reuse the INCX and INCY registers; they are only written in the LL(100) path, after INCX2 / INCY2 have been computed. */
  49. #define Y1 INCX
  50. #define YY1 INCY
  /* Floating-point registers. ALPHA occupies f1, which is why the A-series skips f1 (A2 is f8). */
  51. #define ALPHA f1
  /* A1-A8: values loaded through X / X1. A3 is f2, which is read by "fsmfp ALPHA, f2" before any load overwrites it. */
  52. #define A1 f0
  53. #define A2 f8
  54. #define A3 f2
  55. #define A4 f3
  56. #define A5 f4
  57. #define A6 f5
  58. #define A7 f6
  59. #define A8 f7
  /* A9 is never loaded from memory; it is only used under the name ALPHA_I. */
  60. #define A9 f25
  /* B1-B8: values loaded through Y / Y1. */
  61. #define B1 f9
  62. #define B2 f10
  63. #define B3 f11
  64. #define B4 f12
  65. #define B5 f13
  66. #define B6 f14
  67. #define B7 f15
  68. #define B8 f16
  /* C1-C8: results that are stored back through YY / YY1. */
  69. #define C1 f17
  70. #define C2 f18
  71. #define C3 f19
  72. #define C4 f20
  73. #define C5 f21
  74. #define C6 f22
  75. #define C7 f23
  76. #define C8 f24
  /* Names for the two alpha operands used by the LL(100) path; ALPHA_I (f25) is filled there by "fsmtp ALPHA_I, ALPHA_R". */
  77. #define ALPHA_R ALPHA
  78. #define ALPHA_I A9
  /* ADD1 / ADD2 are the second-step operations of the LL(100) path; defining CONJ swaps the two. */
  /* FMADD and FNMSUB themselves are defined outside this file (presumably in common.h). */
  79. #ifndef CONJ
  80. #define ADD1 FNMSUB
  81. #define ADD2 FMADD
  82. #else
  83. #define ADD1 FMADD
  84. #define ADD2 FNMSUB
  85. #endif
  /* FXMADD1 / FXMADD2 are the first- and second-step operations of the paired path; defining CONJ selects a different mnemonic pair. */
  86. #ifndef CONJ
  87. #define FXMADD1 fxcpmadd
  88. #define FXMADD2 fxcxnpma
  89. #else
  90. #define FXMADD1 fxcpnsma
  91. #define FXMADD2 fxcxma
  92. #endif
  /*
   * Kernel body. The file name suggests ZAXPY (y := alpha * x + y on complex
   * vectors) - NOTE(review): confirm against the caller.
   *
   * Layout:
   *   - save f14-f25, scale the strides, leave early when N <= 0;
   *   - if X and Y are both multiples of 2 * SIZE: paired path
   *     (LL(12) .. LL(17)), 8 elements per loop pass, then tails of 4, 2, 1;
   *   - otherwise: LL(100) .. LL(117), 4 elements per loop pass using
   *     separate X/X1 and Y/Y1 streams, then tails of 2 and 1;
   *   - LL(999): restore f14-f25 and return.
   */
  93. PROLOGUE
  94. PROFCODE
  /* Save f14-f25 below SP, one slot every 16 bytes (r10 = -16 is the step). r10 is reused as INCY2 afterwards. */
  95. li r10, -16
  96. stfpdux f14, SP, r10
  97. stfpdux f15, SP, r10
  98. stfpdux f16, SP, r10
  99. stfpdux f17, SP, r10
  100. stfpdux f18, SP, r10
  101. stfpdux f19, SP, r10
  102. stfpdux f20, SP, r10
  103. stfpdux f21, SP, r10
  104. stfpdux f22, SP, r10
  105. stfpdux f23, SP, r10
  106. stfpdux f24, SP, r10
  107. stfpdux f25, SP, r10
  /* Merge f2 into ALPHA (f1). Presumably this places the second alpha component in the other half of the register pair - TODO confirm against the double-FPU ISA. Must run before f2 is reused as A3. */
  108. fsmfp ALPHA, f2
  /* Scale both strides by BASE_SHIFT (defined outside this file), then double them into INCX2 / INCY2. */
  109. slwi INCX, INCX, BASE_SHIFT
  110. slwi INCY, INCY, BASE_SHIFT
  111. add INCX2, INCX, INCX
  112. add INCY2, INCY, INCY
  /* Nothing to do for N <= 0; LL(999) still restores the saved registers. */
  113. cmpwi cr0, N, 0
  114. ble LL(999)
  /* Take the LL(100) path unless both X and Y have all bits of the mask 2 * SIZE - 1 clear. */
  115. andi. r0, X, 2 * SIZE - 1
  116. bne LL(100)
  117. andi. r0, Y, 2 * SIZE - 1
  118. bne LL(100)
  /* Paired path. Step the pointers back by one stride because every access below is an update-form load/store that applies the stride itself. */
  119. sub X, X, INCX2
  120. sub Y, Y, INCY2
  /* YY trails Y: it is the store pointer, Y the load pointer. */
  121. mr YY, Y
  /* CTR = N >> 3 loop passes; go straight to the tail when that is zero. */
  122. srawi. r0, N, 3
  123. mtspr CTR, r0
  124. beq- LL(15)
  /* Preload the first batch of 8 X values (A1-A8) and 8 Y values (B1-B8). */
  125. LFPDUX A1, X, INCX2
  126. LFPDUX B1, Y, INCY2
  127. LFPDUX A2, X, INCX2
  128. LFPDUX B2, Y, INCY2
  129. LFPDUX A3, X, INCX2
  130. LFPDUX B3, Y, INCY2
  131. LFPDUX A4, X, INCX2
  132. LFPDUX B4, Y, INCY2
  133. LFPDUX A5, X, INCX2
  134. LFPDUX B5, Y, INCY2
  135. LFPDUX A6, X, INCX2
  136. LFPDUX B6, Y, INCY2
  137. LFPDUX A7, X, INCX2
  138. LFPDUX B7, Y, INCY2
  139. LFPDUX A8, X, INCX2
  140. LFPDUX B8, Y, INCY2
  /* Only one batch in total: skip the steady-state loop and drain at LL(13). */
  141. bdz LL(13)
  142. .align 4
  /* Steady-state loop: compute the current batch while loading the next one. */
  143. LL(12):
  /* Step 1: Ck = FXMADD1(ALPHA, Ak, Bk). Each Bk is reloaded right after it has been consumed. */
  144. FXMADD1 C1, ALPHA, A1, B1
  145. LFPDUX B1, Y, INCY2
  146. FXMADD1 C2, ALPHA, A2, B2
  147. LFPDUX B2, Y, INCY2
  148. FXMADD1 C3, ALPHA, A3, B3
  149. LFPDUX B3, Y, INCY2
  150. FXMADD1 C4, ALPHA, A4, B4
  151. LFPDUX B4, Y, INCY2
  152. FXMADD1 C5, ALPHA, A5, B5
  153. LFPDUX B5, Y, INCY2
  154. FXMADD1 C6, ALPHA, A6, B6
  155. LFPDUX B6, Y, INCY2
  156. FXMADD1 C7, ALPHA, A7, B7
  157. LFPDUX B7, Y, INCY2
  158. FXMADD1 C8, ALPHA, A8, B8
  159. LFPDUX B8, Y, INCY2
  /* Step 2: Ck = FXMADD2(ALPHA, Ak, Ck). Each Ak is reloaded only after this second use. */
  160. FXMADD2 C1, ALPHA, A1, C1
  161. LFPDUX A1, X, INCX2
  162. FXMADD2 C2, ALPHA, A2, C2
  163. LFPDUX A2, X, INCX2
  164. FXMADD2 C3, ALPHA, A3, C3
  165. LFPDUX A3, X, INCX2
  166. FXMADD2 C4, ALPHA, A4, C4
  167. LFPDUX A4, X, INCX2
  168. FXMADD2 C5, ALPHA, A5, C5
  169. LFPDUX A5, X, INCX2
  170. FXMADD2 C6, ALPHA, A6, C6
  171. LFPDUX A6, X, INCX2
  172. FXMADD2 C7, ALPHA, A7, C7
  173. LFPDUX A7, X, INCX2
  174. FXMADD2 C8, ALPHA, A8, C8
  175. LFPDUX A8, X, INCX2
  /* Write the 8 results back through the trailing store pointer YY. */
  176. STFPDUX C1, YY, INCY2
  177. STFPDUX C2, YY, INCY2
  178. STFPDUX C3, YY, INCY2
  179. STFPDUX C4, YY, INCY2
  180. STFPDUX C5, YY, INCY2
  181. STFPDUX C6, YY, INCY2
  182. STFPDUX C7, YY, INCY2
  183. STFPDUX C8, YY, INCY2
  184. bdnz LL(12)
  185. .align 4
  /* Drain: finish the last preloaded batch without issuing any further loads. */
  186. LL(13):
  187. FXMADD1 C1, ALPHA, A1, B1
  188. FXMADD1 C2, ALPHA, A2, B2
  189. FXMADD1 C3, ALPHA, A3, B3
  190. FXMADD1 C4, ALPHA, A4, B4
  191. FXMADD1 C5, ALPHA, A5, B5
  192. FXMADD1 C6, ALPHA, A6, B6
  193. FXMADD1 C7, ALPHA, A7, B7
  194. FXMADD1 C8, ALPHA, A8, B8
  195. FXMADD2 C1, ALPHA, A1, C1
  196. FXMADD2 C2, ALPHA, A2, C2
  197. FXMADD2 C3, ALPHA, A3, C3
  198. FXMADD2 C4, ALPHA, A4, C4
  199. FXMADD2 C5, ALPHA, A5, C5
  200. FXMADD2 C6, ALPHA, A6, C6
  /* The first stores are interleaved with the last two step-2 operations. */
  201. STFPDUX C1, YY, INCY2
  202. FXMADD2 C7, ALPHA, A7, C7
  203. STFPDUX C2, YY, INCY2
  204. FXMADD2 C8, ALPHA, A8, C8
  205. STFPDUX C3, YY, INCY2
  206. STFPDUX C4, YY, INCY2
  207. STFPDUX C5, YY, INCY2
  208. STFPDUX C6, YY, INCY2
  209. STFPDUX C7, YY, INCY2
  210. STFPDUX C8, YY, INCY2
  211. .align 4
  /* Paired-path tail: the remaining N & 7 elements, handled as optional groups of 4, 2 and 1. */
  212. LL(15):
  213. andi. r0, N, 7
  214. beq LL(999)
  /* Group of 4, if bit 2 of N is set. */
  215. andi. r0, N, 4
  216. beq LL(16)
  217. LFPDUX A1, X, INCX2
  218. LFPDUX B1, Y, INCY2
  219. LFPDUX A2, X, INCX2
  220. LFPDUX B2, Y, INCY2
  221. LFPDUX A3, X, INCX2
  222. LFPDUX B3, Y, INCY2
  223. LFPDUX A4, X, INCX2
  224. LFPDUX B4, Y, INCY2
  225. FXMADD1 C1, ALPHA, A1, B1
  226. FXMADD1 C2, ALPHA, A2, B2
  227. FXMADD1 C3, ALPHA, A3, B3
  228. FXMADD1 C4, ALPHA, A4, B4
  229. FXMADD2 C1, ALPHA, A1, C1
  230. FXMADD2 C2, ALPHA, A2, C2
  231. FXMADD2 C3, ALPHA, A3, C3
  232. FXMADD2 C4, ALPHA, A4, C4
  233. STFPDUX C1, YY, INCY2
  234. STFPDUX C2, YY, INCY2
  235. STFPDUX C3, YY, INCY2
  236. STFPDUX C4, YY, INCY2
  237. .align 4
  /* Group of 2, if bit 1 of N is set. */
  238. LL(16):
  239. andi. r0, N, 2
  240. beq LL(17)
  241. LFPDUX A1, X, INCX2
  242. LFPDUX B1, Y, INCY2
  243. LFPDUX A2, X, INCX2
  244. LFPDUX B2, Y, INCY2
  245. FXMADD1 C1, ALPHA, A1, B1
  246. FXMADD1 C2, ALPHA, A2, B2
  247. FXMADD2 C1, ALPHA, A1, C1
  248. FXMADD2 C2, ALPHA, A2, C2
  249. STFPDUX C1, YY, INCY2
  250. STFPDUX C2, YY, INCY2
  251. .align 4
  /* Final single element, if bit 0 of N is set. */
  252. LL(17):
  253. andi. r0, N, 1
  254. beq LL(999)
  255. LFPDUX A1, X, INCX2
  256. LFPDUX B1, Y, INCY2
  257. FXMADD1 C1, ALPHA, A1, B1
  258. FXMADD2 C1, ALPHA, A1, C1
  259. STFPDUX C1, YY, INCY2
  /* Jump over the LL(100) path to the common exit. */
  260. b LL(999)
  261. .align 4
  /* Path taken when X or Y failed the 2 * SIZE mask test: every value is loaded and stored individually. */
  262. LL(100):
  /* Fill ALPHA_I (f25) from ALPHA_R (f1). Presumably this copies the half of f1 that fsmfp filled from f2 - TODO confirm against the double-FPU ISA. */
  263. fsmtp ALPHA_I, ALPHA_R
  /* Same one-stride step back as in the paired path, for the update-form accesses. */
  264. sub X, X, INCX2
  265. sub Y, Y, INCY2
  /* Second stream per vector, offset by SIZE from the first (presumably the second component of each element - confirm). */
  /* Y1 and YY1 overwrite the INCX / INCY registers, which are no longer needed once INCX2 / INCY2 exist. */
  266. addi X1, X, SIZE
  267. addi Y1, Y, SIZE
  /* YY / YY1 are the store pointers that trail the load pointers Y / Y1. */
  268. mr YY, Y
  269. mr YY1, Y1
  /* CTR = N >> 2 loop passes; go straight to the tail when that is zero. */
  270. srawi. r0, N, 2
  271. mtspr CTR, r0
  272. beq- LL(115)
  /* Preload the first batch of 4 elements: odd-numbered A/B registers come from X / Y, even-numbered ones from X1 / Y1. */
  273. LFDUX A1, X, INCX2
  274. LFDUX A2, X1, INCX2
  275. LFDUX B1, Y, INCY2
  276. LFDUX B2, Y1, INCY2
  277. LFDUX A3, X, INCX2
  278. LFDUX A4, X1, INCX2
  279. LFDUX B3, Y, INCY2
  280. LFDUX B4, Y1, INCY2
  281. LFDUX A5, X, INCX2
  282. LFDUX A6, X1, INCX2
  283. LFDUX B5, Y, INCY2
  284. LFDUX B6, Y1, INCY2
  285. LFDUX A7, X, INCX2
  286. LFDUX A8, X1, INCX2
  287. LFDUX B7, Y, INCY2
  288. LFDUX B8, Y1, INCY2
  /* Only one batch in total: skip the steady-state loop and drain at LL(113). */
  289. bdz LL(113)
  290. .align 4
  /* Steady-state loop: compute the current 4 elements while loading the next 4. */
  291. LL(112):
  /* Step 1: FMADD of the X-stream value with ALPHA_R onto the Y value, and with ALPHA_I onto the Y1 value. */
  /* The Y value and the X value are reloaded as soon as their last use in this step has been issued. */
  292. FMADD C1, ALPHA_R, A1, B1
  293. LFDUX B1, Y, INCY2
  294. FMADD C2, ALPHA_I, A1, B2
  295. LFDUX A1, X, INCX2
  296. FMADD C3, ALPHA_R, A3, B3
  297. LFDUX B3, Y, INCY2
  298. FMADD C4, ALPHA_I, A3, B4
  299. LFDUX A3, X, INCX2
  300. FMADD C5, ALPHA_R, A5, B5
  301. LFDUX B5, Y, INCY2
  302. FMADD C6, ALPHA_I, A5, B6
  303. LFDUX A5, X, INCX2
  304. FMADD C7, ALPHA_R, A7, B7
  305. LFDUX B7, Y, INCY2
  306. FMADD C8, ALPHA_I, A7, B8
  307. LFDUX A7, X, INCX2
  /* Step 2: fold in the X1-stream value with ADD1 (using ALPHA_I) and ADD2 (using ALPHA_R), reloading the Y1 / X1 values behind each use. */
  308. ADD1 C1, ALPHA_I, A2, C1
  309. LFDUX B2, Y1, INCY2
  310. ADD2 C2, ALPHA_R, A2, C2
  311. LFDUX A2, X1, INCX2
  312. ADD1 C3, ALPHA_I, A4, C3
  313. LFDUX B4, Y1, INCY2
  314. ADD2 C4, ALPHA_R, A4, C4
  315. LFDUX A4, X1, INCX2
  316. ADD1 C5, ALPHA_I, A6, C5
  317. LFDUX B6, Y1, INCY2
  318. ADD2 C6, ALPHA_R, A6, C6
  319. LFDUX A6, X1, INCX2
  320. ADD1 C7, ALPHA_I, A8, C7
  321. LFDUX B8, Y1, INCY2
  322. ADD2 C8, ALPHA_R, A8, C8
  323. LFDUX A8, X1, INCX2
  /* Store the results, alternating between the YY and YY1 streams. */
  324. STFDUX C1, YY, INCY2
  325. STFDUX C2, YY1, INCY2
  326. STFDUX C3, YY, INCY2
  327. STFDUX C4, YY1, INCY2
  328. STFDUX C5, YY, INCY2
  329. STFDUX C6, YY1, INCY2
  330. STFDUX C7, YY, INCY2
  331. STFDUX C8, YY1, INCY2
  332. bdnz LL(112)
  333. .align 4
  /* Drain: finish the last preloaded batch without issuing any further loads. */
  334. LL(113):
  335. FMADD C1, ALPHA_R, A1, B1
  336. FMADD C2, ALPHA_I, A1, B2
  337. FMADD C3, ALPHA_R, A3, B3
  338. FMADD C4, ALPHA_I, A3, B4
  339. FMADD C5, ALPHA_R, A5, B5
  340. FMADD C6, ALPHA_I, A5, B6
  341. FMADD C7, ALPHA_R, A7, B7
  342. FMADD C8, ALPHA_I, A7, B8
  343. ADD1 C1, ALPHA_I, A2, C1
  344. ADD2 C2, ALPHA_R, A2, C2
  345. ADD1 C3, ALPHA_I, A4, C3
  346. ADD2 C4, ALPHA_R, A4, C4
  347. ADD1 C5, ALPHA_I, A6, C5
  348. ADD2 C6, ALPHA_R, A6, C6
  /* The first stores are interleaved with the last two step-2 operations. */
  349. STFDUX C1, YY, INCY2
  350. ADD1 C7, ALPHA_I, A8, C7
  351. STFDUX C2, YY1, INCY2
  352. ADD2 C8, ALPHA_R, A8, C8
  353. STFDUX C3, YY, INCY2
  354. STFDUX C4, YY1, INCY2
  355. STFDUX C5, YY, INCY2
  356. STFDUX C6, YY1, INCY2
  357. STFDUX C7, YY, INCY2
  358. STFDUX C8, YY1, INCY2
  359. .align 4
  /* Tail of this path: the remaining N & 3 elements, handled as an optional group of 2 and an optional single element. */
  360. LL(115):
  361. andi. r0, N, 3
  362. beq LL(999)
  /* Group of 2, if bit 1 of N is set; loads of the second element are interleaved with step 1 of the first. */
  363. andi. r0, N, 2
  364. beq LL(117)
  365. LFDUX A1, X, INCX2
  366. LFDUX A2, X1, INCX2
  367. LFDUX B1, Y, INCY2
  368. LFDUX B2, Y1, INCY2
  369. LFDUX A3, X, INCX2
  370. FMADD C1, ALPHA_R, A1, B1
  371. LFDUX A4, X1, INCX2
  372. FMADD C2, ALPHA_I, A1, B2
  373. LFDUX B3, Y, INCY2
  374. FMADD C3, ALPHA_R, A3, B3
  375. LFDUX B4, Y1, INCY2
  376. FMADD C4, ALPHA_I, A3, B4
  377. ADD1 C1, ALPHA_I, A2, C1
  378. ADD2 C2, ALPHA_R, A2, C2
  379. STFDUX C1, YY, INCY2
  380. ADD1 C3, ALPHA_I, A4, C3
  381. STFDUX C2, YY1, INCY2
  382. ADD2 C4, ALPHA_R, A4, C4
  383. STFDUX C3, YY, INCY2
  384. STFDUX C4, YY1, INCY2
  385. .align 4
  /* Final single element, if bit 0 of N is set; falls through to the exit afterwards. */
  386. LL(117):
  387. andi. r0, N, 1
  388. beq LL(999)
  389. LFDUX A1, X, INCX2
  390. LFDUX A2, X1, INCX2
  391. LFDUX B1, Y, INCY2
  392. LFDUX B2, Y1, INCY2
  393. FMADD C1, ALPHA_R, A1, B1
  394. FMADD C2, ALPHA_I, A1, B2
  395. ADD1 C1, ALPHA_I, A2, C1
  396. ADD2 C2, ALPHA_R, A2, C2
  397. STFDUX C1, YY, INCY2
  398. STFDUX C2, YY1, INCY2
  399. .align 4
  /* Common exit: reload f25 .. f14 in the reverse of the order they were saved, stepping SP by +16 each time. */
  400. LL(999):
  401. li r10, 16
  /* Step SP back by one slot first so that the first update-form load (+16) lands on the slot where f25 was saved. */
  402. subi SP, SP, 16
  403. lfpdux f25, SP, r10
  404. lfpdux f24, SP, r10
  405. lfpdux f23, SP, r10
  406. lfpdux f22, SP, r10
  407. lfpdux f21, SP, r10
  408. lfpdux f20, SP, r10
  409. lfpdux f19, SP, r10
  410. lfpdux f18, SP, r10
  411. lfpdux f17, SP, r10
  412. lfpdux f16, SP, r10
  413. lfpdux f15, SP, r10
  414. lfpdux f14, SP, r10
  /* After the last reload SP points at the f14 slot; one more +16 returns it to its entry value. */
  415. addi SP, SP, 16
  416. blr
  417. EPILOGUE