You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zsum.S 8.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build switches and register aliases used by the routine below. */
  38. #define ASSEMBLER  /* defined before including common.h; presumably selects its assembler-side definitions -- confirm */
  39. #include "common.h"  /* presumably supplies the macros not defined in this file (PROLOGUE, LL, LFD, FADD, SIZE, ZBASE_SHIFT, ...) -- confirm */
  40. #define N r3  /* element count; with F_INTERFACE it first holds the address of the count */
  41. #define X r4  /* address the values are loaded from; advanced as the loops consume data */
  42. #define INCX r5  /* stride; shifted left by ZBASE_SHIFT before use */
  43. #define INCXM1 r9  /* INCX - SIZE, computed after INCX has been shifted */
  44. #define PREA r8  /* holds L1_PREFETCHSIZE, passed to L1_PREFETCH */
  45. #define FZERO f0  /* loaded with 0.0 on entry; f0 then doubles as the first accumulator */
  46. #define STACKSIZE 160  /* frame size: 18 saved FPRs at 0..136(SP) plus a scratch word at 144(SP) */
  /*
   * Strided summation kernel (the page header names this file zsum.S).
   *
   * Inputs, via the register #defines: N = element count, X = base address,
   * INCX = stride. When F_INTERFACE is defined, N and INCX arrive as addresses
   * and are dereferenced first. INCX is then shifted left by ZBASE_SHIFT
   * (presumably turning an element stride into a byte stride -- confirm
   * against common.h).
   *
   * Each element contributes two consecutive SIZE-sized values (presumably the
   * real and imaginary parts of a complex number -- confirm). All 2 * N values
   * are added together with no absolute value taken, and the total is left in
   * f1. If N <= 0 or the scaled INCX <= 0, nothing is loaded and the result is
   * the sum of the zero-initialised accumulators.
   *
   * Structure:
   *   - scaled INCX == 2 * SIZE: displacement-addressed loads, 8 elements
   *     (16 values) per iteration, software-pipelined in LL(10) with the
   *     drain in LL(20); leftover N & 7 elements in LL(60).
   *   - any other positive stride: the same pipeline built from LFDX/LFDUX
   *     loads in LL(110)/LL(120); leftover elements in LL(160).
   *   - LL(999): reduce the eight partial sums f0-f7, restore f14-f31, return.
   *
   * NOTE(review): loads and adds are deliberately interleaved inside the
   * loops (presumably hand-scheduled); keep the instruction order intact.
   */
  47. PROLOGUE
  48. PROFCODE
  49. addi SP, SP, -STACKSIZE  /* lower SP by STACKSIZE to get the save/scratch area */
  50. li r0, 0  /* r0 = 0; stored to 144(SP) below to build FZERO */
  51. stfd f14, 0(SP)  /* save f14-f31 at 0..136(SP); they are reloaded at LL(999) */
  52. stfd f15, 8(SP)
  53. stfd f16, 16(SP)
  54. stfd f17, 24(SP)
  55. stfd f18, 32(SP)
  56. stfd f19, 40(SP)
  57. stfd f20, 48(SP)
  58. stfd f21, 56(SP)
  59. stfd f22, 64(SP)
  60. stfd f23, 72(SP)
  61. stfd f24, 80(SP)
  62. stfd f25, 88(SP)
  63. stfd f26, 96(SP)
  64. stfd f27, 104(SP)
  65. stfd f28, 112(SP)
  66. stfd f29, 120(SP)
  67. stfd f30, 128(SP)
  68. stfd f31, 136(SP)
  69. stw r0, 144(SP)  /* write an all-zero word to the scratch slot ... */
  70. lfs FZERO,144(SP)  /* ... and load it back as a float: FZERO (f0) = 0.0 */
  71. #ifdef F_INTERFACE
  72. LDINT N, 0(N)  /* N held an address: replace it with the value stored there */
  73. LDINT INCX, 0(INCX)  /* same for INCX */
  74. #endif
  75. slwi INCX, INCX, ZBASE_SHIFT  /* INCX <<= ZBASE_SHIFT (presumably element stride -> byte stride) */
  76. subi INCXM1, INCX, SIZE  /* INCXM1 = INCX - SIZE, used by the indexed-load path */
  77. fmr f1, FZERO  /* zero accumulators f1-f7 (f0 already is FZERO) */
  78. fmr f2, FZERO
  79. fmr f3, FZERO
  80. fmr f4, FZERO
  81. fmr f5, FZERO
  82. fmr f6, FZERO
  83. fmr f7, FZERO
  84. li PREA, L1_PREFETCHSIZE  /* offset operand handed to L1_PREFETCH in LL(10) */
  85. cmpwi cr0, N, 0
  86. ble- LL(999)  /* N <= 0: nothing to add, go straight to the reduction */
  87. cmpwi cr0, INCX, 0
  88. ble- LL(999)  /* scaled INCX <= 0: likewise skip all loads */
  89. cmpwi cr0, INCX, 2 * SIZE
  90. bne- cr0, LL(100)  /* stride is not 2 * SIZE: take the indexed-load path */
  91. srawi. r0, N, 3  /* r0 = N >> 3 = number of full 8-element groups; also sets cr0 */
  92. mtspr CTR, r0  /* CTR = group count */
  93. beq- cr0, LL(50)  /* no full group: only the remainder loop runs */
  94. .align 4
  95. LFD f8, 0 * SIZE(X)  /* preload the first group: 16 values into f8-f15 and f24-f31 */
  96. LFD f9, 1 * SIZE(X)
  97. LFD f10, 2 * SIZE(X)
  98. LFD f11, 3 * SIZE(X)
  99. LFD f12, 4 * SIZE(X)
  100. LFD f13, 5 * SIZE(X)
  101. LFD f14, 6 * SIZE(X)
  102. LFD f15, 7 * SIZE(X)
  103. LFD f24, 8 * SIZE(X)
  104. LFD f25, 9 * SIZE(X)
  105. LFD f26, 10 * SIZE(X)
  106. LFD f27, 11 * SIZE(X)
  107. LFD f28, 12 * SIZE(X)
  108. LFD f29, 13 * SIZE(X)
  109. LFD f30, 14 * SIZE(X)
  110. LFD f31, 15 * SIZE(X)
  111. fmr f16, f8  /* stage values 0-7 in f16-f23; the adds always read f16-f23 */
  112. fmr f17, f9
  113. fmr f18, f10
  114. fmr f19, f11
  115. fmr f20, f12
  116. fmr f21, f13
  117. fmr f22, f14
  118. fmr f23, f15
  119. bdz LL(20)  /* --CTR; if that was the only group, skip the loop and drain */
  120. .align 4
  121. LL(10):  /* steady state: add the 16 values of the current group while loading the next group */
  122. FADD f0, f0, f16  /* add staged values 0-3, then restage f16-f19 from f24-f27 (values 8-11) */
  123. fmr f16, f24
  124. FADD f1, f1, f17
  125. fmr f17, f25
  126. FADD f2, f2, f18
  127. fmr f18, f26
  128. FADD f3, f3, f19
  129. fmr f19, f27
  130. LFD f8, 16 * SIZE(X)  /* next group, values 0-3 */
  131. LFD f9, 17 * SIZE(X)
  132. LFD f10, 18 * SIZE(X)
  133. LFD f11, 19 * SIZE(X)
  134. FADD f4, f4, f20  /* add staged values 4-7, then restage f20-f23 from f28-f31 (values 12-15) */
  135. fmr f20, f28
  136. FADD f5, f5, f21
  137. fmr f21, f29
  138. FADD f6, f6, f22
  139. fmr f22, f30
  140. FADD f7, f7, f23
  141. fmr f23, f31
  142. LFD f12, 20 * SIZE(X)  /* next group, values 4-7 */
  143. LFD f13, 21 * SIZE(X)
  144. LFD f14, 22 * SIZE(X)
  145. LFD f15, 23 * SIZE(X)
  146. FADD f0, f0, f16  /* add values 8-11, then restage f16-f19 with the next group's values 0-3 */
  147. fmr f16, f8
  148. FADD f1, f1, f17
  149. fmr f17, f9
  150. FADD f2, f2, f18
  151. fmr f18, f10
  152. FADD f3, f3, f19
  153. fmr f19, f11
  154. LFD f24, 24 * SIZE(X)  /* next group, values 8-11 */
  155. LFD f25, 25 * SIZE(X)
  156. LFD f26, 26 * SIZE(X)
  157. LFD f27, 27 * SIZE(X)
  158. FADD f4, f4, f20  /* add values 12-15, then restage f20-f23 with the next group's values 4-7 */
  159. fmr f20, f12
  160. FADD f5, f5, f21
  161. fmr f21, f13
  162. FADD f6, f6, f22
  163. fmr f22, f14
  164. FADD f7, f7, f23
  165. fmr f23, f15
  166. LFD f28, 28 * SIZE(X)  /* next group, values 12-15 */
  167. LFD f29, 29 * SIZE(X)
  168. LFD f30, 30 * SIZE(X)
  169. LFD f31, 31 * SIZE(X)
  170. #ifndef POWER6
  171. L1_PREFETCH X, PREA  /* non-POWER6 builds: prefetch is issued before X is advanced */
  172. #endif
  173. addi X, X, 16 * SIZE  /* X now points at the group that was just loaded */
  174. #ifdef POWER6
  175. L1_PREFETCH X, PREA  /* POWER6 builds: prefetch is issued after X is advanced */
  176. #endif
  177. bdnz LL(10)  /* --CTR; loop while groups remain */
  178. .align 4
  179. LL(20):  /* drain: the last group is already in registers, so only adds remain */
  180. FADD f0, f0, f16  /* add values 0-7 from f16-f23, restaging f16-f23 from f24-f31 as we go */
  181. fmr f16, f24
  182. FADD f1, f1, f17
  183. fmr f17, f25
  184. FADD f2, f2, f18
  185. fmr f18, f26
  186. FADD f3, f3, f19
  187. fmr f19, f27
  188. FADD f4, f4, f20
  189. fmr f20, f28
  190. FADD f5, f5, f21
  191. fmr f21, f29
  192. FADD f6, f6, f22
  193. fmr f22, f30
  194. FADD f7, f7, f23
  195. fmr f23, f31
  196. FADD f0, f0, f16  /* add values 8-15 of the last group */
  197. FADD f1, f1, f17
  198. FADD f2, f2, f18
  199. FADD f3, f3, f19
  200. FADD f4, f4, f20
  201. FADD f5, f5, f21
  202. FADD f6, f6, f22
  203. FADD f7, f7, f23
  204. addi X, X, 16 * SIZE  /* step X past the last group so LL(60) starts at the first leftover element */
  205. .align 4
  206. LL(50):  /* contiguous remainder */
  207. andi. r0, N, 7  /* r0 = N & 7 = leftover elements; sets cr0 */
  208. mtspr CTR, r0
  209. beq LL(999)  /* none left: go reduce */
  210. .align 4
  211. LL(60):  /* one element (two values) per iteration */
  212. LFD f8, 0 * SIZE(X)
  213. LFD f9, 1 * SIZE(X)
  214. addi X, X, 2 * SIZE
  215. FADD f0, f0, f8
  216. FADD f1, f1, f9
  217. bdnz LL(60)
  218. b LL(999)  /* skip over the strided path */
  219. .align 4
  220. LL(100):  /* strided path (scaled INCX != 2 * SIZE) */
  221. sub X, X, INCXM1  /* bias X so that X + INCXM1 is an element's first value and X + INCX its second */
  222. srawi. r0, N, 3  /* r0 = N >> 3 = number of full 8-element groups; sets cr0 */
  223. mtspr CTR, r0
  224. beq- LL(150)  /* no full group: only the remainder loop runs */
  225. LFDX f8, X, INCXM1  /* preload 8 elements: LFDX reads at X + INCXM1, LFDUX reads at X + INCX and updates X (assuming the usual lfdx/lfdux semantics) */
  226. LFDUX f9, X, INCX
  227. LFDX f10, X, INCXM1
  228. LFDUX f11, X, INCX
  229. LFDX f12, X, INCXM1
  230. LFDUX f13, X, INCX
  231. LFDX f14, X, INCXM1
  232. LFDUX f15, X, INCX
  233. LFDX f24, X, INCXM1
  234. LFDUX f25, X, INCX
  235. LFDX f26, X, INCXM1
  236. LFDUX f27, X, INCX
  237. LFDX f28, X, INCXM1
  238. LFDUX f29, X, INCX
  239. LFDX f30, X, INCXM1
  240. LFDUX f31, X, INCX
  241. fmr f16, f8  /* stage values 0-7 in f16-f23 for the adds */
  242. fmr f17, f9
  243. fmr f18, f10
  244. fmr f19, f11
  245. fmr f20, f12
  246. fmr f21, f13
  247. fmr f22, f14
  248. fmr f23, f15
  249. bdz LL(120)  /* --CTR; if that was the only group, skip the loop and drain */
  250. .align 4
  251. LL(110):  /* steady state, same pipeline as LL(10) but with indexed loads and no prefetch */
  252. FADD f0, f0, f16  /* add staged values 0-3, restage f16-f19 from f24-f27 */
  253. fmr f16, f24
  254. FADD f1, f1, f17
  255. fmr f17, f25
  256. FADD f2, f2, f18
  257. fmr f18, f26
  258. FADD f3, f3, f19
  259. fmr f19, f27
  260. LFDX f8, X, INCXM1  /* next group, elements 0-1 */
  261. LFDUX f9, X, INCX
  262. LFDX f10, X, INCXM1
  263. LFDUX f11, X, INCX
  264. FADD f4, f4, f20  /* add staged values 4-7, restage f20-f23 from f28-f31 */
  265. fmr f20, f28
  266. FADD f5, f5, f21
  267. fmr f21, f29
  268. FADD f6, f6, f22
  269. fmr f22, f30
  270. FADD f7, f7, f23
  271. fmr f23, f31
  272. LFDX f12, X, INCXM1  /* next group, elements 2-3 */
  273. LFDUX f13, X, INCX
  274. LFDX f14, X, INCXM1
  275. LFDUX f15, X, INCX
  276. FADD f0, f0, f16  /* add values 8-11, restage f16-f19 with the next group's values 0-3 */
  277. fmr f16, f8
  278. FADD f1, f1, f17
  279. fmr f17, f9
  280. FADD f2, f2, f18
  281. fmr f18, f10
  282. FADD f3, f3, f19
  283. fmr f19, f11
  284. LFDX f24, X, INCXM1  /* next group, elements 4-5 */
  285. LFDUX f25, X, INCX
  286. LFDX f26, X, INCXM1
  287. LFDUX f27, X, INCX
  288. FADD f4, f4, f20  /* add values 12-15, restage f20-f23 with the next group's values 4-7 */
  289. fmr f20, f12
  290. FADD f5, f5, f21
  291. fmr f21, f13
  292. FADD f6, f6, f22
  293. fmr f22, f14
  294. FADD f7, f7, f23
  295. fmr f23, f15
  296. LFDX f28, X, INCXM1  /* next group, elements 6-7 */
  297. LFDUX f29, X, INCX
  298. LFDX f30, X, INCXM1
  299. LFDUX f31, X, INCX
  300. bdnz LL(110)  /* --CTR; loop while groups remain */
  301. .align 4
  302. LL(120):  /* drain the last group; X needs no fix-up here because the LFDUX loads already advanced it */
  303. FADD f0, f0, f16  /* add values 0-7 from f16-f23, restaging f16-f23 from f24-f31 as we go */
  304. fmr f16, f24
  305. FADD f1, f1, f17
  306. fmr f17, f25
  307. FADD f2, f2, f18
  308. fmr f18, f26
  309. FADD f3, f3, f19
  310. fmr f19, f27
  311. FADD f4, f4, f20
  312. fmr f20, f28
  313. FADD f5, f5, f21
  314. fmr f21, f29
  315. FADD f6, f6, f22
  316. fmr f22, f30
  317. FADD f7, f7, f23
  318. fmr f23, f31
  319. FADD f0, f0, f16  /* add values 8-15 of the last group */
  320. FADD f1, f1, f17
  321. FADD f2, f2, f18
  322. FADD f3, f3, f19
  323. FADD f4, f4, f20
  324. FADD f5, f5, f21
  325. FADD f6, f6, f22
  326. FADD f7, f7, f23
  327. .align 4
  328. LL(150):  /* strided remainder */
  329. andi. r0, N, 7  /* r0 = N & 7 = leftover elements; sets cr0 */
  330. mtspr CTR, r0
  331. beq LL(999)  /* none left: go reduce */
  332. .align 4
  333. LL(160):  /* one element (two values) per iteration; falls through into LL(999) when done */
  334. LFDX f8, X, INCXM1
  335. LFDUX f9, X, INCX
  336. FADD f0, f0, f8
  337. FADD f1, f1, f9
  338. bdnz LL(160)
  339. .align 4
  340. LL(999):  /* common exit: combine the eight partial sums pairwise */
  341. FADD f0, f0, f1
  342. FADD f2, f2, f3
  343. FADD f4, f4, f5
  344. FADD f6, f6, f7
  345. FADD f0, f0, f2
  346. FADD f4, f4, f6
  347. FADD f1, f0, f4  /* final total is written to f1 */
  348. lfd f14, 0(SP)  /* restore f14-f31 from the slots written on entry */
  349. lfd f15, 8(SP)
  350. lfd f16, 16(SP)
  351. lfd f17, 24(SP)
  352. lfd f18, 32(SP)
  353. lfd f19, 40(SP)
  354. lfd f20, 48(SP)
  355. lfd f21, 56(SP)
  356. lfd f22, 64(SP)
  357. lfd f23, 72(SP)
  358. lfd f24, 80(SP)
  359. lfd f25, 88(SP)
  360. lfd f26, 96(SP)
  361. lfd f27, 104(SP)
  362. lfd f28, 112(SP)
  363. lfd f29, 120(SP)
  364. lfd f30, 128(SP)
  365. lfd f31, 136(SP)
  366. addi SP, SP, STACKSIZE  /* undo the SP adjustment made on entry */
  367. blr
  368. EPILOGUE