You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
  /* ASSEMBLER is defined before common.h is pulled in; presumably it
     selects the assembler-side definitions in that header (TODO confirm). */
#define ASSEMBLER
#include "common.h"

/* Incoming arguments. */
#define N	r3	/* element count */
#define X	r4	/* address of the first element */
#define INCX	r5	/* distance between consecutive elements */

/* Working registers. */
#define PREA	r8	/* offset operand passed to L1_PREFETCH */
#define FZERO	f0	/* set to 0.0 on entry; doubles as the first partial sum */

/* Frame size in bytes: f14-f31 are saved at 0..143, and the word at 144
   is scratch used to build the floating-point zero. */
#define STACKSIZE 160
  /*
   * Adds up N elements of X, taking one element every INCX, and leaves
   * the total in f1 (the floating-point return register in the PowerPC
   * ABIs).
   *
   * In:    N (r3)     element count
   *        X (r4)     address of the first element
   *        INCX (r5)  stride between elements
   *        When F_INTERFACE is defined, N and INCX arrive as addresses and
   *        are replaced by the integers they point to.
   * Out:   f1 = sum.  The result is 0.0 when N <= 0 or when the scaled
   *        stride is <= 0 (both tested with 32-bit signed compares).
   * Uses:  r0, PREA (r8), CTR, cr0, f0-f13.  f14-f31 are stored to the
   *        frame on entry and reloaded before returning.
   *
   * Element width and the exact load/add opcodes come from the SIZE,
   * BASE_SHIFT, LFD, LFDUX and FADD macros in common.h.
   *
   * Method: eight independent partial sums live in f0-f7.  The main loops
   * consume 16 elements per iteration and are software pipelined: the
   * loads for the next 16 elements are issued while the previous 16 are
   * being added, with f16-f23 holding the values about to be added.
   * Leftover elements (N mod 16) are added one at a time into f0.  The
   * partial sums are combined pairwise at LL(999), so rounding can differ
   * from a strict left-to-right summation.
   */
	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	li	r0, 0

	/* Save the floating-point registers this routine overwrites. */
	stfd	f14,   0(SP)
	stfd	f15,   8(SP)
	stfd	f16,  16(SP)
	stfd	f17,  24(SP)
	stfd	f18,  32(SP)
	stfd	f19,  40(SP)
	stfd	f20,  48(SP)
	stfd	f21,  56(SP)
	stfd	f22,  64(SP)
	stfd	f23,  72(SP)
	stfd	f24,  80(SP)
	stfd	f25,  88(SP)
	stfd	f26,  96(SP)
	stfd	f27, 104(SP)
	stfd	f28, 112(SP)
	stfd	f29, 120(SP)
	stfd	f30, 128(SP)
	stfd	f31, 136(SP)

	/* Build 0.0 by storing an integer zero and reloading it as a float. */
	stw	r0,    144(SP)
	lfs	FZERO, 144(SP)

#ifdef F_INTERFACE
	LDINT	N,    0(N)
	LDINT	INCX, 0(INCX)
#endif

	/* Scale the stride by 2^BASE_SHIFT; it is compared against SIZE
	   below, so from here on INCX is treated as a byte distance. */
	slwi	INCX, INCX, BASE_SHIFT

	/* Clear the remaining partial sums (f0 is FZERO itself). */
	fmr	f1, FZERO
	fmr	f2, FZERO
	fmr	f3, FZERO
	fmr	f4, FZERO
	fmr	f5, FZERO
	fmr	f6, FZERO
	fmr	f7, FZERO

	li	PREA, L1_PREFETCHSIZE

	cmpwi	cr0, N, 0
	ble-	LL(999)			/* no elements: return 0.0 */

	cmpwi	cr0, INCX, 0
	ble-	LL(999)			/* non-positive stride: return 0.0 */

	cmpwi	cr0, INCX, SIZE
	bne-	cr0, LL(100)		/* stride is not one element: strided path */

	/* ---------------- stride == SIZE: adjacent elements ---------------- */

	srawi.	r0, N, 4		/* r0 = N / 16 (N > 0 here) */
	mtspr	CTR, r0
	beq-	cr0, LL(50)		/* fewer than 16 elements: tail only */
	.align 4

	/* Pipeline fill: load the first group of 16. */
	LFD	f8,   0 * SIZE(X)
	LFD	f9,   1 * SIZE(X)
	LFD	f10,  2 * SIZE(X)
	LFD	f11,  3 * SIZE(X)
	LFD	f12,  4 * SIZE(X)
	LFD	f13,  5 * SIZE(X)
	LFD	f14,  6 * SIZE(X)
	LFD	f15,  7 * SIZE(X)

	LFD	f24,  8 * SIZE(X)
	LFD	f25,  9 * SIZE(X)
	LFD	f26, 10 * SIZE(X)
	LFD	f27, 11 * SIZE(X)
	LFD	f28, 12 * SIZE(X)
	LFD	f29, 13 * SIZE(X)
	LFD	f30, 14 * SIZE(X)
	LFD	f31, 15 * SIZE(X)

	/* Stage elements 0-7 for the first round of adds. */
	fmr	f16, f8
	fmr	f17, f9
	fmr	f18, f10
	fmr	f19, f11

	fmr	f20, f12
	fmr	f21, f13
	fmr	f22, f14
	fmr	f23, f15
	bdz	LL(20)			/* exactly one group: go straight to the drain */
	.align 4

	/* Steady state: add the group already in registers while loading the
	   next one from offsets 16..31; X is advanced at the bottom. */
LL(10):
	FADD	f0, f0, f16
	fmr	f16, f24
	FADD	f1, f1, f17
	fmr	f17, f25

	FADD	f2, f2, f18
	fmr	f18, f26
	FADD	f3, f3, f19
	fmr	f19, f27

	LFD	f8,  16 * SIZE(X)
	LFD	f9,  17 * SIZE(X)
	LFD	f10, 18 * SIZE(X)
	LFD	f11, 19 * SIZE(X)

	FADD	f4, f4, f20
	fmr	f20, f28
	FADD	f5, f5, f21
	fmr	f21, f29

	FADD	f6, f6, f22
	fmr	f22, f30
	FADD	f7, f7, f23
	fmr	f23, f31

	LFD	f12, 20 * SIZE(X)
	LFD	f13, 21 * SIZE(X)
	LFD	f14, 22 * SIZE(X)
	LFD	f15, 23 * SIZE(X)

	FADD	f0, f0, f16
	fmr	f16, f8
	FADD	f1, f1, f17
	fmr	f17, f9

	FADD	f2, f2, f18
	fmr	f18, f10
	FADD	f3, f3, f19
	fmr	f19, f11

	LFD	f24, 24 * SIZE(X)
	LFD	f25, 25 * SIZE(X)
	LFD	f26, 26 * SIZE(X)
	LFD	f27, 27 * SIZE(X)

	FADD	f4, f4, f20
	fmr	f20, f12
	FADD	f5, f5, f21
	fmr	f21, f13

	FADD	f6, f6, f22
	fmr	f22, f14
	FADD	f7, f7, f23
	fmr	f23, f15

	LFD	f28, 28 * SIZE(X)
	LFD	f29, 29 * SIZE(X)
	LFD	f30, 30 * SIZE(X)
	LFD	f31, 31 * SIZE(X)

	/* The prefetch is issued before the pointer bump, except on POWER6
	   where it is issued after it. */
#ifndef POWER6
	L1_PREFETCH	X, PREA
#endif
	addi	X, X, 16 * SIZE
#ifdef POWER6
	L1_PREFETCH	X, PREA
#endif

	bdnz	LL(10)
	.align 4

	/* Drain: add the last loaded group; nothing further is fetched. */
LL(20):
	FADD	f0, f0, f16
	fmr	f16, f24
	FADD	f1, f1, f17
	fmr	f17, f25

	FADD	f2, f2, f18
	fmr	f18, f26
	FADD	f3, f3, f19
	fmr	f19, f27

	FADD	f4, f4, f20
	fmr	f20, f28
	FADD	f5, f5, f21
	fmr	f21, f29

	FADD	f6, f6, f22
	fmr	f22, f30
	FADD	f7, f7, f23
	fmr	f23, f31

	FADD	f0, f0, f16
	FADD	f1, f1, f17
	FADD	f2, f2, f18
	FADD	f3, f3, f19

	FADD	f4, f4, f20
	FADD	f5, f5, f21
	FADD	f6, f6, f22
	FADD	f7, f7, f23
	addi	X, X, 16 * SIZE		/* step past the group just consumed */
	.align 4

	/* Tail for the adjacent case: the remaining N mod 16 elements. */
LL(50):
	andi.	r0, N, 15
	mtspr	CTR, r0
	beq	LL(999)
	.align 4

LL(60):
	LFD	f8, 0 * SIZE(X)
	addi	X, X, 1 * SIZE

	FADD	f0, f0, f8

	bdnz	LL(60)
	b	LL(999)
	.align 4

	/* ---------------- stride != SIZE: strided elements ---------------- */

	/* X is moved back by one stride first.  LFDUX is expected to expand to
	   an update-form indexed load, which adds INCX to X and then loads
	   from the new address, so the first load reads the original X. */
LL(100):
	sub	X, X, INCX

	srawi.	r0, N, 4		/* r0 = N / 16 */
	mtspr	CTR, r0
	beq-	LL(150)			/* fewer than 16 elements: tail only */

	/* Pipeline fill: load the first group of 16. */
	LFDUX	f8,  X, INCX
	LFDUX	f9,  X, INCX
	LFDUX	f10, X, INCX
	LFDUX	f11, X, INCX
	LFDUX	f12, X, INCX
	LFDUX	f13, X, INCX
	LFDUX	f14, X, INCX
	LFDUX	f15, X, INCX

	LFDUX	f24, X, INCX
	LFDUX	f25, X, INCX
	LFDUX	f26, X, INCX
	LFDUX	f27, X, INCX
	LFDUX	f28, X, INCX
	LFDUX	f29, X, INCX
	LFDUX	f30, X, INCX
	LFDUX	f31, X, INCX

	fmr	f16, f8
	fmr	f17, f9
	fmr	f18, f10
	fmr	f19, f11

	fmr	f20, f12
	fmr	f21, f13
	fmr	f22, f14
	fmr	f23, f15
	bdz	LL(120)			/* exactly one group: go straight to the drain */
	.align 4

	/* Steady state, same schedule as LL(10) but with update-form loads. */
LL(110):
	FADD	f0, f0, f16
	fmr	f16, f24
	FADD	f1, f1, f17
	fmr	f17, f25

	FADD	f2, f2, f18
	fmr	f18, f26
	FADD	f3, f3, f19
	fmr	f19, f27

	LFDUX	f8,  X, INCX
	LFDUX	f9,  X, INCX
	LFDUX	f10, X, INCX
	LFDUX	f11, X, INCX

	FADD	f4, f4, f20
	fmr	f20, f28
	FADD	f5, f5, f21
	fmr	f21, f29

	FADD	f6, f6, f22
	fmr	f22, f30
	FADD	f7, f7, f23
	fmr	f23, f31

	LFDUX	f12, X, INCX
	LFDUX	f13, X, INCX
	LFDUX	f14, X, INCX
	LFDUX	f15, X, INCX

	FADD	f0, f0, f16
	fmr	f16, f8
	FADD	f1, f1, f17
	fmr	f17, f9

	FADD	f2, f2, f18
	fmr	f18, f10
	FADD	f3, f3, f19
	fmr	f19, f11

	LFDUX	f24, X, INCX
	LFDUX	f25, X, INCX
	LFDUX	f26, X, INCX
	LFDUX	f27, X, INCX

	FADD	f4, f4, f20
	fmr	f20, f12
	FADD	f5, f5, f21
	fmr	f21, f13

	FADD	f6, f6, f22
	fmr	f22, f14
	FADD	f7, f7, f23
	fmr	f23, f15

	LFDUX	f28, X, INCX
	LFDUX	f29, X, INCX
	LFDUX	f30, X, INCX
	LFDUX	f31, X, INCX

	bdnz	LL(110)
	.align 4

	/* Drain: add the last loaded group. */
LL(120):
	FADD	f0, f0, f16
	fmr	f16, f24
	FADD	f1, f1, f17
	fmr	f17, f25

	FADD	f2, f2, f18
	fmr	f18, f26
	FADD	f3, f3, f19
	fmr	f19, f27

	FADD	f4, f4, f20
	fmr	f20, f28
	FADD	f5, f5, f21
	fmr	f21, f29

	FADD	f6, f6, f22
	fmr	f22, f30
	FADD	f7, f7, f23
	fmr	f23, f31

	FADD	f0, f0, f16
	FADD	f1, f1, f17
	FADD	f2, f2, f18
	FADD	f3, f3, f19

	FADD	f4, f4, f20
	FADD	f5, f5, f21
	FADD	f6, f6, f22
	FADD	f7, f7, f23
	.align 4

	/* Tail for the strided case: the remaining N mod 16 elements. */
LL(150):
	andi.	r0, N, 15
	mtspr	CTR, r0
	beq	LL(999)
	.align 4

LL(160):
	LFDUX	f8, X, INCX
	FADD	f0, f0, f8
	bdnz	LL(160)
	.align 4

	/* Combine the eight partial sums pairwise; the total ends up in f1. */
LL(999):
	FADD	f0, f0, f1
	FADD	f2, f2, f3
	FADD	f4, f4, f5
	FADD	f6, f6, f7

	FADD	f0, f0, f2
	FADD	f4, f4, f6

	FADD	f1, f0, f4

	/* Reload the saved floating-point registers and release the frame. */
	lfd	f14,   0(SP)
	lfd	f15,   8(SP)
	lfd	f16,  16(SP)
	lfd	f17,  24(SP)
	lfd	f18,  32(SP)
	lfd	f19,  40(SP)
	lfd	f20,  48(SP)
	lfd	f21,  56(SP)
	lfd	f22,  64(SP)
	lfd	f23,  72(SP)
	lfd	f24,  80(SP)
	lfd	f25,  88(SP)
	lfd	f26,  96(SP)
	lfd	f27, 104(SP)
	lfd	f28, 112(SP)
	lfd	f29, 120(SP)
	lfd	f30, 128(SP)
	lfd	f31, 136(SP)

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE