You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

asum.S 8.5 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448
  /*********************************************************************/
  /* Copyright 2009, 2010 The University of Texas at Austin. */
  /* All rights reserved. */
  /* */
  /* Redistribution and use in source and binary forms, with or */
  /* without modification, are permitted provided that the following */
  /* conditions are met: */
  /* */
  /* 1. Redistributions of source code must retain the above */
  /* copyright notice, this list of conditions and the following */
  /* disclaimer. */
  /* */
  /* 2. Redistributions in binary form must reproduce the above */
  /* copyright notice, this list of conditions and the following */
  /* disclaimer in the documentation and/or other materials */
  /* provided with the distribution. */
  /* */
  /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  /* POSSIBILITY OF SUCH DAMAGE. */
  /* */
  /* The views and conclusions contained in the software and */
  /* documentation are those of the authors and should not be */
  /* interpreted as representing official policies, either expressed */
  /* or implied, of The University of Texas at Austin. */
  /*********************************************************************/
  #define ASSEMBLER
  #include "common.h"

  /*
   * Register and frame aliases used by the kernel below.
   * The listing numbers that preceded each directive have been removed so
   * the preprocessor can recognise them; names and values are unchanged.
   */
  #define N         r3    /* element count; replaced by the word it points to when F_INTERFACE is defined */
  #define X         r4    /* address of the element being read */
  #define INCX      r5    /* stride; shifted left by BASE_SHIFT before it is used as an index */
  #define PREA      r8    /* holds L1_PREFETCHSIZE, passed to L1_PREFETCH */
  #define FZERO     f0    /* loaded with 0.0; f0 then doubles as the first accumulator */
  #define STACKSIZE 160   /* frame size: f14-f31 saved at 0..136(SP), scratch word at 144(SP) */
  /*
   * ASUM kernel body: adds up fabs() of N elements of X, stepping INCX
   * elements at a time, and leaves the total in f1 before returning
   * (f1 is the floating-point return register in the PowerPC ELF ABIs).
   *
   *   N    - element count (read through the pointer when F_INTERFACE is set)
   *   X    - address of the first element
   *   INCX - stride in elements (read through the pointer when F_INTERFACE
   *          is set); turned into an index step by shifting with BASE_SHIFT
   *
   * When N <= 0, or the scaled INCX is <= 0, no element is read and the
   * result is 0.0.
   *
   * Eight partial sums live in f0-f7 and are combined at LL(999).  The main
   * loops consume 16 elements per trip and are software pipelined: loads and
   * fabs for the next group are interleaved with the adds of the current
   * group, so the instruction order below is deliberate and must be kept.
   *
   * f14-f31 are used as temporaries and are therefore saved on entry and
   * restored before returning.
   *
   * PROLOGUE, PROFCODE, EPILOGUE, LL(), LFD, LFDUX, FADD, LDINT, SIZE,
   * BASE_SHIFT, L1_PREFETCH and L1_PREFETCHSIZE are not defined in this
   * file; presumably they are supplied by common.h.
   *
   * The listing numbers that preceded every line have been removed so the
   * block can be assembled; the instruction sequence itself is unchanged.
   */
        PROLOGUE
        PROFCODE

  /* ---- Frame setup: save f14-f31 and build the constant 0.0 ---- */
        addi    SP, SP, -STACKSIZE
        li      r0, 0

        stfd    f14, 0(SP)
        stfd    f15, 8(SP)
        stfd    f16, 16(SP)
        stfd    f17, 24(SP)
        stfd    f18, 32(SP)
        stfd    f19, 40(SP)
        stfd    f20, 48(SP)
        stfd    f21, 56(SP)
        stfd    f22, 64(SP)
        stfd    f23, 72(SP)
        stfd    f24, 80(SP)
        stfd    f25, 88(SP)
        stfd    f26, 96(SP)
        stfd    f27, 104(SP)
        stfd    f28, 112(SP)
        stfd    f29, 120(SP)
        stfd    f30, 128(SP)
        stfd    f31, 136(SP)

  /* Store an all-zero word and load it back as a float: FZERO = 0.0 */
        stw     r0, 144(SP)
        lfs     FZERO, 144(SP)

  #ifdef F_INTERFACE
  /* With F_INTERFACE the arguments arrive as pointers; fetch the values. */
        LDINT   N, 0(N)
        LDINT   INCX, 0(INCX)
  #endif

  /* Scale the stride by BASE_SHIFT so it can be compared against SIZE
     and used as the index register of the LFDUX loads. */
        slwi    INCX, INCX, BASE_SHIFT

  /* Clear the remaining seven accumulators (f0 already holds 0.0). */
        fmr     f1, FZERO
        fmr     f2, FZERO
        fmr     f3, FZERO
        fmr     f4, FZERO
        fmr     f5, FZERO
        fmr     f6, FZERO
        fmr     f7, FZERO

        li      PREA, L1_PREFETCHSIZE

  /* Nothing to sum for N <= 0 or a non-positive stride. */
        cmpwi   cr0, N, 0
        ble-    LL(999)

        cmpwi   cr0, INCX, 0
        ble-    LL(999)

  /* A scaled stride other than SIZE takes the strided path at LL(100). */
        cmpwi   cr0, INCX, SIZE
        bne-    cr0, LL(100)

  /* ---- Contiguous path: CTR = N / 16 groups of 16 elements ---- */
        srawi.  r0, N, 4
        mtspr   CTR, r0
        beq-    cr0, LL(50)
        .align 4

  /* Preload the first group of 16 elements. */
        LFD     f8, 0 * SIZE(X)
        LFD     f9, 1 * SIZE(X)
        LFD     f10, 2 * SIZE(X)
        LFD     f11, 3 * SIZE(X)
        LFD     f12, 4 * SIZE(X)
        LFD     f13, 5 * SIZE(X)
        LFD     f14, 6 * SIZE(X)
        LFD     f15, 7 * SIZE(X)

        LFD     f24, 8 * SIZE(X)
        LFD     f25, 9 * SIZE(X)
        LFD     f26, 10 * SIZE(X)
        LFD     f27, 11 * SIZE(X)
        LFD     f28, 12 * SIZE(X)
        LFD     f29, 13 * SIZE(X)
        LFD     f30, 14 * SIZE(X)
        LFD     f31, 15 * SIZE(X)

  /* |x| of the first eight; the second eight are handled inside the loop. */
        fabs    f16, f8
        fabs    f17, f9
        fabs    f18, f10
        fabs    f19, f11

        fabs    f20, f12
        fabs    f21, f13
        fabs    f22, f14
        fabs    f23, f15

  /* Only one group: skip the steady-state loop and drain at LL(20). */
        bdz     LL(20)
        .align 4

  /* Steady state: add the pending |x| values while loading and taking
     fabs of the group that follows (offsets 16..31 from the current X). */
  LL(10):
        FADD    f0, f0, f16
        fabs    f16, f24
        FADD    f1, f1, f17
        fabs    f17, f25

        FADD    f2, f2, f18
        fabs    f18, f26
        FADD    f3, f3, f19
        fabs    f19, f27

        LFD     f8, 16 * SIZE(X)
        LFD     f9, 17 * SIZE(X)
        LFD     f10, 18 * SIZE(X)
        LFD     f11, 19 * SIZE(X)

        FADD    f4, f4, f20
        fabs    f20, f28
        FADD    f5, f5, f21
        fabs    f21, f29

        FADD    f6, f6, f22
        fabs    f22, f30
        FADD    f7, f7, f23
        fabs    f23, f31

        LFD     f12, 20 * SIZE(X)
        LFD     f13, 21 * SIZE(X)
        LFD     f14, 22 * SIZE(X)
        LFD     f15, 23 * SIZE(X)

        FADD    f0, f0, f16
        fabs    f16, f8
        FADD    f1, f1, f17
        fabs    f17, f9

        FADD    f2, f2, f18
        fabs    f18, f10
        FADD    f3, f3, f19
        fabs    f19, f11

        LFD     f24, 24 * SIZE(X)
        LFD     f25, 25 * SIZE(X)
        LFD     f26, 26 * SIZE(X)
        LFD     f27, 27 * SIZE(X)

        FADD    f4, f4, f20
        fabs    f20, f12
        FADD    f5, f5, f21
        fabs    f21, f13

        FADD    f6, f6, f22
        fabs    f22, f14
        FADD    f7, f7, f23
        fabs    f23, f15

        LFD     f28, 28 * SIZE(X)
        LFD     f29, 29 * SIZE(X)
        LFD     f30, 30 * SIZE(X)
        LFD     f31, 31 * SIZE(X)

  /* The prefetch is issued before the pointer bump, except on POWER6
     where it is issued after it. */
  #ifndef POWER6
        L1_PREFETCH     X, PREA
  #endif
        addi    X, X, 16 * SIZE
  #ifdef POWER6
        L1_PREFETCH     X, PREA
  #endif

        bdnz    LL(10)
        .align 4

  /* Drain: add the last group, whose 16 elements are already loaded. */
  LL(20):
        FADD    f0, f0, f16
        fabs    f16, f24
        FADD    f1, f1, f17
        fabs    f17, f25

        FADD    f2, f2, f18
        fabs    f18, f26
        FADD    f3, f3, f19
        fabs    f19, f27

        FADD    f4, f4, f20
        fabs    f20, f28
        FADD    f5, f5, f21
        fabs    f21, f29

        FADD    f6, f6, f22
        fabs    f22, f30
        FADD    f7, f7, f23
        fabs    f23, f31

        FADD    f0, f0, f16
        FADD    f1, f1, f17
        FADD    f2, f2, f18
        FADD    f3, f3, f19

        FADD    f4, f4, f20
        FADD    f5, f5, f21
        FADD    f6, f6, f22
        FADD    f7, f7, f23

  /* Step past the final group so X addresses the leftover elements. */
        addi    X, X, 16 * SIZE
        .align 4

  /* Contiguous tail: the remaining N mod 16 elements, one per trip. */
  LL(50):
        andi.   r0, N, 15
        mtspr   CTR, r0
        beq     LL(999)
        .align 4

  LL(60):
        LFD     f8, 0 * SIZE(X)
        addi    X, X, 1 * SIZE

        fabs    f8, f8
        FADD    f0, f0, f8

        bdnz    LL(60)
        b       LL(999)
        .align 4

  /* ---- Strided path ----
     X is moved back by one stride first; each LFDUX X, INCX is then
     expected to advance X by INCX before loading (update-form indexed
     load -- LFDUX is a macro defined elsewhere, so confirm in common.h). */
  LL(100):
        sub     X, X, INCX

        srawi.  r0, N, 4
        mtspr   CTR, r0
        beq-    LL(150)

  /* Preload the first group of 16 elements. */
        LFDUX   f8, X, INCX
        LFDUX   f9, X, INCX
        LFDUX   f10, X, INCX
        LFDUX   f11, X, INCX
        LFDUX   f12, X, INCX
        LFDUX   f13, X, INCX
        LFDUX   f14, X, INCX
        LFDUX   f15, X, INCX

        LFDUX   f24, X, INCX
        LFDUX   f25, X, INCX
        LFDUX   f26, X, INCX
        LFDUX   f27, X, INCX
        LFDUX   f28, X, INCX
        LFDUX   f29, X, INCX
        LFDUX   f30, X, INCX
        LFDUX   f31, X, INCX

        fabs    f16, f8
        fabs    f17, f9
        fabs    f18, f10
        fabs    f19, f11

        fabs    f20, f12
        fabs    f21, f13
        fabs    f22, f14
        fabs    f23, f15

  /* Only one group: skip the steady-state loop and drain at LL(120). */
        bdz     LL(120)
        .align 4

  /* Steady state, same schedule as LL(10) but using LFDUX loads and
     without the prefetch. */
  LL(110):
        FADD    f0, f0, f16
        fabs    f16, f24
        FADD    f1, f1, f17
        fabs    f17, f25

        FADD    f2, f2, f18
        fabs    f18, f26
        FADD    f3, f3, f19
        fabs    f19, f27

        LFDUX   f8, X, INCX
        LFDUX   f9, X, INCX
        LFDUX   f10, X, INCX
        LFDUX   f11, X, INCX

        FADD    f4, f4, f20
        fabs    f20, f28
        FADD    f5, f5, f21
        fabs    f21, f29

        FADD    f6, f6, f22
        fabs    f22, f30
        FADD    f7, f7, f23
        fabs    f23, f31

        LFDUX   f12, X, INCX
        LFDUX   f13, X, INCX
        LFDUX   f14, X, INCX
        LFDUX   f15, X, INCX

        FADD    f0, f0, f16
        fabs    f16, f8
        FADD    f1, f1, f17
        fabs    f17, f9

        FADD    f2, f2, f18
        fabs    f18, f10
        FADD    f3, f3, f19
        fabs    f19, f11

        LFDUX   f24, X, INCX
        LFDUX   f25, X, INCX
        LFDUX   f26, X, INCX
        LFDUX   f27, X, INCX

        FADD    f4, f4, f20
        fabs    f20, f12
        FADD    f5, f5, f21
        fabs    f21, f13

        FADD    f6, f6, f22
        fabs    f22, f14
        FADD    f7, f7, f23
        fabs    f23, f15

        LFDUX   f28, X, INCX
        LFDUX   f29, X, INCX
        LFDUX   f30, X, INCX
        LFDUX   f31, X, INCX

        bdnz    LL(110)
        .align 4

  /* Drain: add the last group, whose 16 elements are already loaded. */
  LL(120):
        FADD    f0, f0, f16
        fabs    f16, f24
        FADD    f1, f1, f17
        fabs    f17, f25

        FADD    f2, f2, f18
        fabs    f18, f26
        FADD    f3, f3, f19
        fabs    f19, f27

        FADD    f4, f4, f20
        fabs    f20, f28
        FADD    f5, f5, f21
        fabs    f21, f29

        FADD    f6, f6, f22
        fabs    f22, f30
        FADD    f7, f7, f23
        fabs    f23, f31

        FADD    f0, f0, f16
        FADD    f1, f1, f17
        FADD    f2, f2, f18
        FADD    f3, f3, f19

        FADD    f4, f4, f20
        FADD    f5, f5, f21
        FADD    f6, f6, f22
        FADD    f7, f7, f23
        .align 4

  /* Strided tail: the remaining N mod 16 elements, one per trip. */
  LL(150):
        andi.   r0, N, 15
        mtspr   CTR, r0
        beq     LL(999)
        .align 4

  LL(160):
        LFDUX   f8, X, INCX
        fabs    f8, f8
        FADD    f0, f0, f8
        bdnz    LL(160)
        .align 4

  /* ---- Reduction and return: fold f0-f7 pairwise into f1 ---- */
  LL(999):
        FADD    f0, f0, f1
        FADD    f2, f2, f3
        FADD    f4, f4, f5
        FADD    f6, f6, f7

        FADD    f0, f0, f2
        FADD    f4, f4, f6
        FADD    f1, f0, f4

  /* Restore the saved FPRs and release the frame. */
        lfd     f14, 0(SP)
        lfd     f15, 8(SP)
        lfd     f16, 16(SP)
        lfd     f17, 24(SP)
        lfd     f18, 32(SP)
        lfd     f19, 40(SP)
        lfd     f20, 48(SP)
        lfd     f21, 56(SP)
        lfd     f22, 64(SP)
        lfd     f23, 72(SP)
        lfd     f24, 80(SP)
        lfd     f25, 88(SP)
        lfd     f26, 96(SP)
        lfd     f27, 104(SP)
        lfd     f28, 112(SP)
        lfd     f29, 120(SP)
        lfd     f30, 128(SP)
        lfd     f31, 136(SP)

        addi    SP, SP, STACKSIZE
        blr

        EPILOGUE