You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

zamax.S 9.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is included; presumably it makes
     the header expose its assembler-side macros (PROLOGUE, LL, LFD, SIZE,
     ...) - confirm against common.h. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer register aliases. N is the element count, X the data pointer
     and INCX the stride; INCX is scaled by ZBASE_SHIFT at entry and is used
     as an address increment from then on. */
  40. #define N r3
  41. #define X r4
  42. #define INCX r5
  /* PREA holds L1_PREFETCHSIZE and is passed to L1_PREFETCH. */
  43. #define PREA r8
  /* INCXM1 holds the scaled INCX minus SIZE (strided path only). */
  44. #define INCXM1 r9
  /* FZERO aliases f1: it is loaded with 0.0 at entry, and f1 is also the
     register the final result is left in. */
  45. #define FZERO f1
  /* Frame size in bytes: 0..143 hold the saved f14-f31, offset 144 is a
     scratch word used to build 0.0. */
  46. #define STACKSIZE 160
  /*
   * Maximum-magnitude reduction kernel (the file is named zamax.S).
   *
   * Walks N pairs of SIZE-wide values starting at X; consecutive pairs are
   * INCX apart once INCX has been scaled by ZBASE_SHIFT. For each pair it
   * forms |first| + |second| and keeps the largest such sum. The result is
   * left in f1 (presumably the FP return register of the target ABI).
   *
   * If N or the scaled INCX is zero or negative, the routine returns with
   * f1 = 0.0 without touching X.
   *
   * Register roles in the unrolled loops:
   *   f0-f3    four independent running maxima
   *   f4-f7    pair sums for the first half of a group
   *   f20-f23  pair sums for the second half of a group
   *   f8-f15   absolute values waiting to be summed
   *   f16-f19  accumulator minus candidate, used as the fsel selector
   *   f24-f31  raw values loaded ahead of use
   *
   * The maximum is taken with an fsub/fsel pair: the accumulator is kept
   * when (accumulator - candidate) is non-negative, otherwise the candidate
   * replaces it.
   * NOTE(review): behaviour with NaN inputs follows fsel semantics - confirm
   * against the Power ISA if NaN propagation matters to callers.
   */
  47. PROLOGUE
  48. PROFCODE
  /* Open a STACKSIZE-byte frame. r0 = 0 is stored further down to
     materialise 0.0. */
  49. addi SP, SP, -STACKSIZE
  50. li r0, 0
  /* Save f14-f31; all of them are overwritten below and reloaded at
     LL(9999). */
  51. stfd f14, 0(SP)
  52. stfd f15, 8(SP)
  53. stfd f16, 16(SP)
  54. stfd f17, 24(SP)
  55. stfd f18, 32(SP)
  56. stfd f19, 40(SP)
  57. stfd f20, 48(SP)
  58. stfd f21, 56(SP)
  59. stfd f22, 64(SP)
  60. stfd f23, 72(SP)
  61. stfd f24, 80(SP)
  62. stfd f25, 88(SP)
  63. stfd f26, 96(SP)
  64. stfd f27, 104(SP)
  65. stfd f28, 112(SP)
  66. stfd f29, 120(SP)
  67. stfd f30, 128(SP)
  68. stfd f31, 136(SP)
  /* Store a zero word and load it back as a single-precision value, so
     FZERO (f1) = 0.0. This is the value returned on the early-exit paths. */
  69. stw r0, 144(SP)
  70. lfs FZERO,144(SP)
  /* With F_INTERFACE the N and INCX registers hold addresses; replace them
     with the integers they point to. */
  71. #ifdef F_INTERFACE
  72. LDINT N, 0(N)
  73. LDINT INCX, 0(INCX)
  74. #endif
  /* Scale the stride by ZBASE_SHIFT (presumably elements to bytes - confirm
     in common.h) and precompute stride - SIZE for the strided path. */
  75. slwi INCX, INCX, ZBASE_SHIFT
  76. subi INCXM1, INCX, SIZE
  77. li PREA, L1_PREFETCHSIZE
  /* Early exit when N is zero or negative. */
  78. cmpwi cr0, N, 0
  79. ble- LL(9999)
  /* Early exit when the scaled stride is zero or negative. */
  80. cmpwi cr0, INCX, 0
  81. ble- LL(9999)
  /* Seed: the first pair's |a| + |b| becomes the initial value of all four
     running maxima, and N is reduced by the one pair consumed. */
  82. LFD f1, 0 * SIZE(X)
  83. LFD f2, 1 * SIZE(X)
  84. add X, X, INCX
  85. fabs f1, f1
  86. fabs f2, f2
  87. fadd f1, f1, f2
  88. fmr f0, f1
  89. fmr f2, f1
  90. fmr f3, f1
  91. subi N, N, 1
  /* A stride of exactly 2 * SIZE means the pairs are contiguous; anything
     else takes the strided path at LL(100). */
  92. cmpwi cr0, INCX, 2 * SIZE
  93. bne- cr0, LL(100)
  /* Contiguous path. CTR = N / 8 groups of eight pairs; with no full group,
     go straight to the remainder loop. */
  94. srawi. r0, N, 3
  95. mtspr CTR, r0
  96. beq- cr0, LL(50)
  97. .align 4
  /* Pipeline fill: load values 0..7, take their absolute values into
     f8-f15, then load values 8..15 into f24-f31. */
  98. LFD f24, 0 * SIZE(X)
  99. LFD f25, 1 * SIZE(X)
  100. LFD f26, 2 * SIZE(X)
  101. LFD f27, 3 * SIZE(X)
  102. LFD f28, 4 * SIZE(X)
  103. LFD f29, 5 * SIZE(X)
  104. LFD f30, 6 * SIZE(X)
  105. LFD f31, 7 * SIZE(X)
  106. fabs f8, f24
  107. fabs f9, f25
  108. fabs f10, f26
  109. fabs f11, f27
  110. fabs f12, f28
  111. fabs f13, f29
  112. fabs f14, f30
  113. fabs f15, f31
  114. LFD f24, 8 * SIZE(X)
  115. LFD f25, 9 * SIZE(X)
  116. LFD f26, 10 * SIZE(X)
  117. LFD f27, 11 * SIZE(X)
  118. LFD f28, 12 * SIZE(X)
  119. LFD f29, 13 * SIZE(X)
  120. LFD f30, 14 * SIZE(X)
  121. LFD f31, 15 * SIZE(X)
  /* Count the group just loaded; if it was the only one, skip the steady
     state and drain it at LL(20). */
  122. bdz LL(20)
  123. .align 4
  /* Steady state: each iteration reduces the 16 values already in registers
     (eight pairs) while loading the next 16 from offsets 16..31. */
  124. LL(10):
  /* Pair sums for values 0..7 of the current group. */
  125. fadd f4, f8, f9
  126. fadd f5, f10, f11
  127. fadd f6, f12, f13
  128. fadd f7, f14, f15
  /* Absolute values of 8..15, interleaved with loads of the next group. */
  129. fabs f8, f24
  130. fabs f9, f25
  131. fabs f10, f26
  132. fabs f11, f27
  133. LFD f24, 16 * SIZE(X)
  134. LFD f25, 17 * SIZE(X)
  135. LFD f26, 18 * SIZE(X)
  136. LFD f27, 19 * SIZE(X)
  137. fabs f12, f28
  138. fabs f13, f29
  139. fabs f14, f30
  140. fabs f15, f31
  141. LFD f28, 20 * SIZE(X)
  142. LFD f29, 21 * SIZE(X)
  143. LFD f30, 22 * SIZE(X)
  144. LFD f31, 23 * SIZE(X)
  /* Selectors for the first four pair sums. */
  145. fsub f16, f0, f4
  146. fsub f17, f1, f5
  147. fsub f18, f2, f6
  148. fsub f19, f3, f7
  /* Pair sums for values 8..15 of the current group. */
  149. fadd f20, f8, f9
  150. fadd f21, f10, f11
  151. fadd f22, f12, f13
  152. fadd f23, f14, f15
  /* Absolute values of the next group's 0..3 (loaded from offsets 16..19). */
  153. fabs f8, f24
  154. fabs f9, f25
  155. fabs f10, f26
  156. fabs f11, f27
  157. LFD f24, 24 * SIZE(X)
  158. LFD f25, 25 * SIZE(X)
  159. LFD f26, 26 * SIZE(X)
  160. LFD f27, 27 * SIZE(X)
  /* Fold the first four pair sums into the running maxima. */
  161. fsel f0, f16, f0, f4
  162. fsel f1, f17, f1, f5
  163. fsel f2, f18, f2, f6
  164. fsel f3, f19, f3, f7
  /* Absolute values of the next group's 4..7 (loaded from offsets 20..23). */
  165. fabs f12, f28
  166. fabs f13, f29
  167. fabs f14, f30
  168. fabs f15, f31
  169. LFD f28, 28 * SIZE(X)
  170. LFD f29, 29 * SIZE(X)
  171. LFD f30, 30 * SIZE(X)
  172. LFD f31, 31 * SIZE(X)
  /* Fold the second four pair sums into the running maxima. */
  173. fsub f16, f0, f20
  174. fsub f17, f1, f21
  175. fsub f18, f2, f22
  176. fsub f19, f3, f23
  177. fsel f0, f16, f0, f20
  178. fsel f1, f17, f1, f21
  179. fsel f2, f18, f2, f22
  180. fsel f3, f19, f3, f23
  /* L1_PREFETCH is issued before the pointer bump, except on POWER6 where
     it is issued after it. The macro presumably prefetches at X + PREA -
     confirm in common.h. */
  181. #ifndef POWER6
  182. L1_PREFETCH X, PREA
  183. #endif
  184. addi X, X, 16 * SIZE
  185. #ifdef POWER6
  186. L1_PREFETCH X, PREA
  187. #endif
  188. bdnz LL(10)
  189. .align 4
  /* Drain: reduce the last 16 values already in registers without issuing
     further loads, then step X past them. */
  190. LL(20):
  191. fadd f4, f8, f9
  192. fadd f5, f10, f11
  193. fadd f6, f12, f13
  194. fadd f7, f14, f15
  195. fabs f8, f24
  196. fabs f9, f25
  197. fabs f10, f26
  198. fabs f11, f27
  199. fabs f12, f28
  200. fabs f13, f29
  201. fabs f14, f30
  202. fabs f15, f31
  203. fsub f16, f0, f4
  204. fsub f17, f1, f5
  205. fsub f18, f2, f6
  206. fsub f19, f3, f7
  207. fadd f20, f8, f9
  208. fadd f21, f10, f11
  209. fadd f22, f12, f13
  210. fadd f23, f14, f15
  211. fsel f0, f16, f0, f4
  212. fsel f1, f17, f1, f5
  213. fsel f2, f18, f2, f6
  214. fsel f3, f19, f3, f7
  215. fsub f16, f0, f20
  216. fsub f17, f1, f21
  217. fsub f18, f2, f22
  218. fsub f19, f3, f23
  219. fsel f0, f16, f0, f20
  220. fsel f1, f17, f1, f21
  221. fsel f2, f18, f2, f22
  222. fsel f3, f19, f3, f23
  223. addi X, X, 16 * SIZE
  224. .align 4
  /* Contiguous remainder: the low three bits of N give the pairs left over;
     none left means go to the final merge. */
  225. LL(50):
  226. andi. r0, N, 7
  227. mtspr CTR, r0
  228. beq LL(999)
  229. .align 4
  /* One pair per iteration, folded into f1 only; f0, f2 and f3 are merged
     with it at LL(999). */
  230. LL(60):
  231. LFD f8, 0 * SIZE(X)
  232. LFD f9, 1 * SIZE(X)
  233. addi X, X, 2 * SIZE
  234. fabs f8, f8
  235. fabs f9, f9
  236. fadd f8, f8, f9
  237. fsub f16, f1, f8
  238. fsel f1, f16, f1, f8
  239. bdnz LL(60)
  240. b LL(999)
  241. .align 4
  /* Strided path. X is biased back by INCXM1 so that X + INCXM1 addresses
     the first value of the next pair and X + INCX the second. This assumes
     LFDX is an indexed load and LFDUX an indexed load that also writes the
     address back to X - confirm in common.h. */
  242. LL(100):
  243. sub X, X, INCXM1
  /* CTR = N / 8 groups of eight pairs; with no full group, go to the
     strided remainder loop. */
  244. srawi. r0, N, 3
  245. mtspr CTR, r0
  246. beq- LL(150)
  /* Pipeline fill: load four pairs, take absolute values into f8-f15, then
     load four more pairs into f24-f31. */
  247. LFDX f24, X, INCXM1
  248. LFDUX f25, X, INCX
  249. LFDX f26, X, INCXM1
  250. LFDUX f27, X, INCX
  251. LFDX f28, X, INCXM1
  252. LFDUX f29, X, INCX
  253. LFDX f30, X, INCXM1
  254. LFDUX f31, X, INCX
  255. fabs f8, f24
  256. fabs f9, f25
  257. fabs f10, f26
  258. fabs f11, f27
  259. fabs f12, f28
  260. fabs f13, f29
  261. fabs f14, f30
  262. fabs f15, f31
  263. LFDX f24, X, INCXM1
  264. LFDUX f25, X, INCX
  265. LFDX f26, X, INCXM1
  266. LFDUX f27, X, INCX
  267. LFDX f28, X, INCXM1
  268. LFDUX f29, X, INCX
  269. LFDX f30, X, INCXM1
  270. LFDUX f31, X, INCX
  /* Count the group just loaded; if it was the only one, drain it at
     LL(120). */
  271. bdz LL(120)
  272. .align 4
  /* Steady state, same schedule as LL(10): reduce the eight pairs held in
     registers while loading the next eight. There is no separate pointer
     bump or prefetch here; X changes only through the LFDUX loads. */
  273. LL(110):
  274. fadd f4, f8, f9
  275. fadd f5, f10, f11
  276. fadd f6, f12, f13
  277. fadd f7, f14, f15
  278. fabs f8, f24
  279. fabs f9, f25
  280. fabs f10, f26
  281. fabs f11, f27
  282. LFDX f24, X, INCXM1
  283. LFDUX f25, X, INCX
  284. LFDX f26, X, INCXM1
  285. LFDUX f27, X, INCX
  286. fabs f12, f28
  287. fabs f13, f29
  288. fabs f14, f30
  289. fabs f15, f31
  290. LFDX f28, X, INCXM1
  291. LFDUX f29, X, INCX
  292. LFDX f30, X, INCXM1
  293. LFDUX f31, X, INCX
  294. fsub f16, f0, f4
  295. fsub f17, f1, f5
  296. fsub f18, f2, f6
  297. fsub f19, f3, f7
  298. fadd f20, f8, f9
  299. fadd f21, f10, f11
  300. fadd f22, f12, f13
  301. fadd f23, f14, f15
  302. fabs f8, f24
  303. fabs f9, f25
  304. fabs f10, f26
  305. fabs f11, f27
  306. LFDX f24, X, INCXM1
  307. LFDUX f25, X, INCX
  308. LFDX f26, X, INCXM1
  309. LFDUX f27, X, INCX
  310. fsel f0, f16, f0, f4
  311. fsel f1, f17, f1, f5
  312. fsel f2, f18, f2, f6
  313. fsel f3, f19, f3, f7
  314. fabs f12, f28
  315. fabs f13, f29
  316. fabs f14, f30
  317. fabs f15, f31
  318. LFDX f28, X, INCXM1
  319. LFDUX f29, X, INCX
  320. LFDX f30, X, INCXM1
  321. LFDUX f31, X, INCX
  322. fsub f16, f0, f20
  323. fsub f17, f1, f21
  324. fsub f18, f2, f22
  325. fsub f19, f3, f23
  326. fsel f0, f16, f0, f20
  327. fsel f1, f17, f1, f21
  328. fsel f2, f18, f2, f22
  329. fsel f3, f19, f3, f23
  330. bdnz LL(110)
  331. .align 4
  /* Drain: reduce the last eight pairs already in registers; no loads. */
  332. LL(120):
  333. fadd f4, f8, f9
  334. fadd f5, f10, f11
  335. fadd f6, f12, f13
  336. fadd f7, f14, f15
  337. fabs f8, f24
  338. fabs f9, f25
  339. fabs f10, f26
  340. fabs f11, f27
  341. fabs f12, f28
  342. fabs f13, f29
  343. fabs f14, f30
  344. fabs f15, f31
  345. fsub f16, f0, f4
  346. fsub f17, f1, f5
  347. fsub f18, f2, f6
  348. fsub f19, f3, f7
  349. fadd f20, f8, f9
  350. fadd f21, f10, f11
  351. fadd f22, f12, f13
  352. fadd f23, f14, f15
  353. fsel f0, f16, f0, f4
  354. fsel f1, f17, f1, f5
  355. fsel f2, f18, f2, f6
  356. fsel f3, f19, f3, f7
  357. fsub f16, f0, f20
  358. fsub f17, f1, f21
  359. fsub f18, f2, f22
  360. fsub f19, f3, f23
  361. fsel f0, f16, f0, f20
  362. fsel f1, f17, f1, f21
  363. fsel f2, f18, f2, f22
  364. fsel f3, f19, f3, f23
  365. .align 4
  /* Strided remainder: the low three bits of N give the pairs left over. */
  366. LL(150):
  367. andi. r0, N, 7
  368. mtspr CTR, r0
  369. beq LL(999)
  370. .align 4
  /* One pair per iteration, folded into f1 only; falls through to LL(999)
     when CTR runs out. */
  371. LL(160):
  372. LFDX f8, X, INCXM1
  373. LFDUX f9, X, INCX
  374. fabs f8, f8
  375. fabs f9, f9
  376. fadd f8, f8, f9
  377. fsub f16, f1, f8
  378. fsel f1, f16, f1, f8
  379. bdnz LL(160)
  380. .align 4
  /* Final merge: f0 = max(f0, f1), f2 = max(f2, f3), then f1 = max(f0, f2). */
  381. LL(999):
  382. fsub f8, f0, f1
  383. fsub f9, f2, f3
  384. fsel f0, f8, f0, f1
  385. fsel f2, f9, f2, f3
  386. fsub f8, f0, f2
  387. fsel f1, f8, f0, f2
  388. .align 4
  /* Common exit, also the target of the early exits (f1 still 0.0 there):
     reload f14-f31, release the frame and return. */
  389. LL(9999):
  390. lfd f14, 0(SP)
  391. lfd f15, 8(SP)
  392. lfd f16, 16(SP)
  393. lfd f17, 24(SP)
  394. lfd f18, 32(SP)
  395. lfd f19, 40(SP)
  396. lfd f20, 48(SP)
  397. lfd f21, 56(SP)
  398. lfd f22, 64(SP)
  399. lfd f23, 72(SP)
  400. lfd f24, 80(SP)
  401. lfd f25, 88(SP)
  402. lfd f26, 96(SP)
  403. lfd f27, 104(SP)
  404. lfd f28, 112(SP)
  405. lfd f29, 120(SP)
  406. lfd f30, 128(SP)
  407. lfd f31, 136(SP)
  408. addi SP, SP, STACKSIZE
  409. blr
  410. EPILOGUE