You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

zasum.S 8.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Preprocessor setup: register aliases and frame size used by the routine below. */
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define N r3 /* element count; with F_INTERFACE it first holds an address that is dereferenced */
  41. #define X r4 /* running pointer into the input data */
  42. #define INCX r5 /* element stride; shifted left by ZBASE_SHIFT before use as a byte offset */
  43. #define INCXM1 r9 /* INCX (after scaling) minus SIZE; index used by the strided loads */
  44. #define PREA r8 /* holds L1_PREFETCHSIZE, the second operand of L1_PREFETCH */
  45. #define FZERO f0 /* loaded with 0.0 at entry; f0 is also the first accumulator */
  46. #define STACKSIZE 160 /* 18 saved FPRs at 0..136(SP) plus a scratch word at 144(SP) */
  /*
   * Sum of absolute values over a vector whose elements are pairs of
   * adjacent SIZE-byte floating-point values (the file name suggests the
   * BLAS complex "asum" kernel -- TODO confirm against the build's naming).
   *
   * Inputs (register aliases are #defined at the top of the file):
   *   N    (r3)  number of elements
   *   X    (r4)  address of the first value
   *   INCX (r5)  element stride; scaled by ZBASE_SHIFT into a byte stride
   *   When F_INTERFACE is defined, N and INCX are loaded through LDINT from
   *   the addresses passed in those registers.
   *
   * Result: left in f1 = sum of fabs() of both values of each of the N
   *   elements; 0.0 when N <= 0 or the scaled INCX <= 0.
   *   (f1 is presumably the FP return register of the target ABI -- confirm.)
   *
   * Register use inside the loops:
   *   f0-f7    eight independent partial sums
   *   f8-f15   first bank of eight loaded values
   *   f24-f31  second bank of eight loaded values
   *   f16-f23  fabs() results waiting to be accumulated
   *   r0       scratch (zero source, loop counts); CTR is the loop counter
   * f14-f31 are saved on entry and restored before return.
   *
   * NOTE(review): N and INCX are tested/shifted with word instructions
   * (cmpwi, slwi, srawi); confirm this is intended for builds that pass
   * 64-bit integers.
   */
  47. PROLOGUE
  48. PROFCODE
  /* Open a STACKSIZE-byte frame and save f14-f31 at 0..136(SP). */
  49. addi SP, SP, -STACKSIZE
  50. li r0, 0 /* integer zero, stored below to build a floating-point 0.0 */
  51. stfd f14, 0(SP)
  52. stfd f15, 8(SP)
  53. stfd f16, 16(SP)
  54. stfd f17, 24(SP)
  55. stfd f18, 32(SP)
  56. stfd f19, 40(SP)
  57. stfd f20, 48(SP)
  58. stfd f21, 56(SP)
  59. stfd f22, 64(SP)
  60. stfd f23, 72(SP)
  61. stfd f24, 80(SP)
  62. stfd f25, 88(SP)
  63. stfd f26, 96(SP)
  64. stfd f27, 104(SP)
  65. stfd f28, 112(SP)
  66. stfd f29, 120(SP)
  67. stfd f30, 128(SP)
  68. stfd f31, 136(SP)
  /* Write an all-zero word to the scratch slot and read it back as a float: FZERO (f0) = 0.0. */
  69. stw r0, 144(SP)
  70. lfs FZERO,144(SP)
  /* With F_INTERFACE, N and INCX hold addresses; replace them with the values stored there. */
  71. #ifdef F_INTERFACE
  72. LDINT N, 0(N)
  73. LDINT INCX, 0(INCX)
  74. #endif
  75. slwi INCX, INCX, ZBASE_SHIFT /* INCX becomes a byte stride (compared with 2 * SIZE below) */
  76. subi INCXM1, INCX, SIZE /* INCXM1 = byte stride - SIZE, used by the strided path */
  /* Clear the remaining accumulators f1-f7 (f0 is already 0.0). */
  77. fmr f1, FZERO
  78. fmr f2, FZERO
  79. fmr f3, FZERO
  80. fmr f4, FZERO
  81. fmr f5, FZERO
  82. fmr f6, FZERO
  83. fmr f7, FZERO
  84. li PREA, L1_PREFETCHSIZE
  /* Early exits: with all accumulators zero, LL(999) yields 0.0. */
  85. cmpwi cr0, N, 0
  86. ble- LL(999) /* N <= 0 */
  87. cmpwi cr0, INCX, 0
  88. ble- LL(999) /* scaled INCX <= 0 */
  /* Contiguous data (byte stride == 2 * SIZE) uses the path below; anything else goes to LL(100). */
  89. cmpwi cr0, INCX, 2 * SIZE
  90. bne- cr0, LL(100)
  /* ---- Unit-stride path: blocks of 8 elements (16 values) ---- */
  91. srawi. r0, N, 3 /* r0 = N / 8 = number of full blocks (N > 0 here) */
  92. mtspr CTR, r0
  93. beq- cr0, LL(50) /* no full block: only the remainder loop runs */
  94. .align 4
  /* Pipeline fill: load the first 16 values into both banks. */
  95. LFD f8, 0 * SIZE(X)
  96. LFD f9, 1 * SIZE(X)
  97. LFD f10, 2 * SIZE(X)
  98. LFD f11, 3 * SIZE(X)
  99. LFD f12, 4 * SIZE(X)
  100. LFD f13, 5 * SIZE(X)
  101. LFD f14, 6 * SIZE(X)
  102. LFD f15, 7 * SIZE(X)
  103. LFD f24, 8 * SIZE(X)
  104. LFD f25, 9 * SIZE(X)
  105. LFD f26, 10 * SIZE(X)
  106. LFD f27, 11 * SIZE(X)
  107. LFD f28, 12 * SIZE(X)
  108. LFD f29, 13 * SIZE(X)
  109. LFD f30, 14 * SIZE(X)
  110. LFD f31, 15 * SIZE(X)
  /* Take fabs of the first bank so the loop can start by accumulating f16-f23. */
  111. fabs f16, f8
  112. fabs f17, f9
  113. fabs f18, f10
  114. fabs f19, f11
  115. fabs f20, f12
  116. fabs f21, f13
  117. fabs f22, f14
  118. fabs f23, f15
  119. bdz LL(20) /* CTR was 1: the block just loaded is the only one, go drain it */
  120. .align 4
  /*
   * Steady state: each pass adds the 16 fabs values of the block already
   * loaded while loading the next block from offsets 16..31 * SIZE, then
   * advances X by one block. Statement order is hand-scheduled; each
   * register is re-loaded only after its previous value has been consumed.
   */
  121. LL(10):
  /* Accumulate fabs(first bank), start fabs of the second bank. */
  122. FADD f0, f0, f16
  123. fabs f16, f24
  124. FADD f1, f1, f17
  125. fabs f17, f25
  126. FADD f2, f2, f18
  127. fabs f18, f26
  128. FADD f3, f3, f19
  129. fabs f19, f27
  /* f8-f11 are free now: refill them from the next block. */
  130. LFD f8, 16 * SIZE(X)
  131. LFD f9, 17 * SIZE(X)
  132. LFD f10, 18 * SIZE(X)
  133. LFD f11, 19 * SIZE(X)
  134. FADD f4, f4, f20
  135. fabs f20, f28
  136. FADD f5, f5, f21
  137. fabs f21, f29
  138. FADD f6, f6, f22
  139. fabs f22, f30
  140. FADD f7, f7, f23
  141. fabs f23, f31
  142. LFD f12, 20 * SIZE(X)
  143. LFD f13, 21 * SIZE(X)
  144. LFD f14, 22 * SIZE(X)
  145. LFD f15, 23 * SIZE(X)
  /* Accumulate fabs(second bank), start fabs of the freshly loaded first bank. */
  146. FADD f0, f0, f16
  147. fabs f16, f8
  148. FADD f1, f1, f17
  149. fabs f17, f9
  150. FADD f2, f2, f18
  151. fabs f18, f10
  152. FADD f3, f3, f19
  153. fabs f19, f11
  /* f24-f27 are free now: refill them from the next block. */
  154. LFD f24, 24 * SIZE(X)
  155. LFD f25, 25 * SIZE(X)
  156. LFD f26, 26 * SIZE(X)
  157. LFD f27, 27 * SIZE(X)
  158. FADD f4, f4, f20
  159. fabs f20, f12
  160. FADD f5, f5, f21
  161. fabs f21, f13
  162. FADD f6, f6, f22
  163. fabs f22, f14
  164. FADD f7, f7, f23
  165. fabs f23, f15
  166. LFD f28, 28 * SIZE(X)
  167. LFD f29, 29 * SIZE(X)
  168. LFD f30, 30 * SIZE(X)
  169. LFD f31, 31 * SIZE(X)
  /* The prefetch is issued before the pointer advance, except on POWER6 where it is issued after. */
  170. #ifndef POWER6
  171. L1_PREFETCH X, PREA
  172. #endif
  173. addi X, X, 16 * SIZE /* step X past the block just consumed */
  174. #ifdef POWER6
  175. L1_PREFETCH X, PREA
  176. #endif
  177. bdnz LL(10)
  178. .align 4
  /* Drain: add the last loaded block (both banks) without issuing further loads. */
  179. LL(20):
  180. FADD f0, f0, f16
  181. fabs f16, f24
  182. FADD f1, f1, f17
  183. fabs f17, f25
  184. FADD f2, f2, f18
  185. fabs f18, f26
  186. FADD f3, f3, f19
  187. fabs f19, f27
  188. FADD f4, f4, f20
  189. fabs f20, f28
  190. FADD f5, f5, f21
  191. fabs f21, f29
  192. FADD f6, f6, f22
  193. fabs f22, f30
  194. FADD f7, f7, f23
  195. fabs f23, f31
  196. FADD f0, f0, f16
  197. FADD f1, f1, f17
  198. FADD f2, f2, f18
  199. FADD f3, f3, f19
  200. FADD f4, f4, f20
  201. FADD f5, f5, f21
  202. FADD f6, f6, f22
  203. FADD f7, f7, f23
  204. addi X, X, 16 * SIZE /* X now points at the first value not yet summed */
  205. .align 4
  /* Unit-stride remainder: N & 7 leftover elements, one element (two values) per pass. */
  206. LL(50):
  207. andi. r0, N, 7
  208. mtspr CTR, r0
  209. beq LL(999) /* N was a multiple of 8: nothing left */
  210. .align 4
  211. LL(60):
  212. LFD f8, 0 * SIZE(X)
  213. LFD f9, 1 * SIZE(X)
  214. addi X, X, 2 * SIZE
  215. fabs f8, f8
  216. fabs f9, f9
  217. FADD f0, f0, f8
  218. FADD f1, f1, f9
  219. bdnz LL(60)
  220. b LL(999)
  221. .align 4
  /*
   * ---- Strided path (byte stride != 2 * SIZE) ----
   * X is first moved back by INCXM1 = INCX - SIZE. From then on each
   * LFDX/LFDUX pair (presumably indexed load and indexed load-with-update
   * macros from common.h -- confirm) reads one element:
   *   LFDX  reads at X + INCXM1  -> first value of the element
   *   LFDUX reads at X + INCX    -> second value, and leaves X at that address
   */
  222. LL(100):
  223. sub X, X, INCXM1
  224. srawi. r0, N, 3 /* r0 = number of full 8-element blocks */
  225. mtspr CTR, r0
  226. beq- LL(150) /* no full block: only the remainder loop runs */
  /* Pipeline fill: load the first 8 elements (16 values) into both banks. */
  227. LFDX f8, X, INCXM1
  228. LFDUX f9, X, INCX
  229. LFDX f10, X, INCXM1
  230. LFDUX f11, X, INCX
  231. LFDX f12, X, INCXM1
  232. LFDUX f13, X, INCX
  233. LFDX f14, X, INCXM1
  234. LFDUX f15, X, INCX
  235. LFDX f24, X, INCXM1
  236. LFDUX f25, X, INCX
  237. LFDX f26, X, INCXM1
  238. LFDUX f27, X, INCX
  239. LFDX f28, X, INCXM1
  240. LFDUX f29, X, INCX
  241. LFDX f30, X, INCXM1
  242. LFDUX f31, X, INCX
  243. fabs f16, f8
  244. fabs f17, f9
  245. fabs f18, f10
  246. fabs f19, f11
  247. fabs f20, f12
  248. fabs f21, f13
  249. fabs f22, f14
  250. fabs f23, f15
  251. bdz LL(120) /* CTR was 1: go drain the block just loaded */
  252. .align 4
  /* Steady state, same schedule as LL(10); X is advanced by the LFDUX loads themselves. */
  253. LL(110):
  254. FADD f0, f0, f16
  255. fabs f16, f24
  256. FADD f1, f1, f17
  257. fabs f17, f25
  258. FADD f2, f2, f18
  259. fabs f18, f26
  260. FADD f3, f3, f19
  261. fabs f19, f27
  262. LFDX f8, X, INCXM1
  263. LFDUX f9, X, INCX
  264. LFDX f10, X, INCXM1
  265. LFDUX f11, X, INCX
  266. FADD f4, f4, f20
  267. fabs f20, f28
  268. FADD f5, f5, f21
  269. fabs f21, f29
  270. FADD f6, f6, f22
  271. fabs f22, f30
  272. FADD f7, f7, f23
  273. fabs f23, f31
  274. LFDX f12, X, INCXM1
  275. LFDUX f13, X, INCX
  276. LFDX f14, X, INCXM1
  277. LFDUX f15, X, INCX
  278. FADD f0, f0, f16
  279. fabs f16, f8
  280. FADD f1, f1, f17
  281. fabs f17, f9
  282. FADD f2, f2, f18
  283. fabs f18, f10
  284. FADD f3, f3, f19
  285. fabs f19, f11
  286. LFDX f24, X, INCXM1
  287. LFDUX f25, X, INCX
  288. LFDX f26, X, INCXM1
  289. LFDUX f27, X, INCX
  290. FADD f4, f4, f20
  291. fabs f20, f12
  292. FADD f5, f5, f21
  293. fabs f21, f13
  294. FADD f6, f6, f22
  295. fabs f22, f14
  296. FADD f7, f7, f23
  297. fabs f23, f15
  298. LFDX f28, X, INCXM1
  299. LFDUX f29, X, INCX
  300. LFDX f30, X, INCXM1
  301. LFDUX f31, X, INCX
  302. bdnz LL(110)
  303. .align 4
  /* Drain the last loaded block; X needs no adjustment here. */
  304. LL(120):
  305. FADD f0, f0, f16
  306. fabs f16, f24
  307. FADD f1, f1, f17
  308. fabs f17, f25
  309. FADD f2, f2, f18
  310. fabs f18, f26
  311. FADD f3, f3, f19
  312. fabs f19, f27
  313. FADD f4, f4, f20
  314. fabs f20, f28
  315. FADD f5, f5, f21
  316. fabs f21, f29
  317. FADD f6, f6, f22
  318. fabs f22, f30
  319. FADD f7, f7, f23
  320. fabs f23, f31
  321. FADD f0, f0, f16
  322. FADD f1, f1, f17
  323. FADD f2, f2, f18
  324. FADD f3, f3, f19
  325. FADD f4, f4, f20
  326. FADD f5, f5, f21
  327. FADD f6, f6, f22
  328. FADD f7, f7, f23
  329. .align 4
  /* Strided remainder: N & 7 leftover elements, one element per pass. */
  330. LL(150):
  331. andi. r0, N, 7
  332. mtspr CTR, r0
  333. beq LL(999)
  334. .align 4
  335. LL(160):
  336. LFDX f8, X, INCXM1
  337. LFDUX f9, X, INCX
  338. fabs f8, f8
  339. fabs f9, f9
  340. FADD f0, f0, f8
  341. FADD f1, f1, f9
  342. bdnz LL(160) /* falls through into LL(999) when done */
  343. .align 4
  /* Common exit: fold the eight partial sums pairwise; the total ends up in f1. */
  344. LL(999):
  345. FADD f0, f0, f1
  346. FADD f2, f2, f3
  347. FADD f4, f4, f5
  348. FADD f6, f6, f7
  349. FADD f0, f0, f2
  350. FADD f4, f4, f6
  351. FADD f1, f0, f4 /* f1 = total */
  /* Restore f14-f31 from the slots written at entry and release the frame. */
  352. lfd f14, 0(SP)
  353. lfd f15, 8(SP)
  354. lfd f16, 16(SP)
  355. lfd f17, 24(SP)
  356. lfd f18, 32(SP)
  357. lfd f19, 40(SP)
  358. lfd f20, 48(SP)
  359. lfd f21, 56(SP)
  360. lfd f22, 64(SP)
  361. lfd f23, 72(SP)
  362. lfd f24, 80(SP)
  363. lfd f25, 88(SP)
  364. lfd f26, 96(SP)
  365. lfd f27, 104(SP)
  366. lfd f28, 112(SP)
  367. lfd f29, 120(SP)
  368. lfd f30, 128(SP)
  369. lfd f31, 136(SP)
  370. addi SP, SP, STACKSIZE
  371. blr
  372. EPILOGUE