You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zamax_cell.S 9.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Argument registers as the routine below uses them: N is the element */
  /* count (r3), X the address being read (r4), INCX the stride (r5).    */
  40. #define N r3
  41. #define X r4
  42. #define INCX r5
  /* PREA holds the byte offset passed to dcbt in the unit-stride loop. */
  43. #define PREA r8
  /* INCXM1 is set to INCX - SIZE and used as the index of the LFDX loads. */
  44. #define INCXM1 r9
  /* FZERO aliases f1; it is loaded with 0.0 on entry, so f1 is already  */
  /* 0.0 when the routine takes its early exit.                          */
  45. #define FZERO f1
  /* Bytes subtracted from SP on entry: 18 doubles (f14-f31) at offsets  */
  /* 0..136 plus the scratch word at offset 144.                         */
  46. #define STACKSIZE 160
  /*
   * Scans N two-value elements starting at X and leaves in f1 the largest
   * |first value| + |second value| found (presumably |re| + |im| of complex
   * numbers -- confirm against common.h and the caller).
   *
   * In:   N    (r3)  element count; with F_INTERFACE, the address of it
   *       X    (r4)  address of the first element
   *       INCX (r5)  stride, scaled below by ZBASE_SHIFT; with F_INTERFACE,
   *                  the address of it
   * Out:  f1         result; 0.0 when N <= 0 or the scaled INCX <= 0
   * Uses: r0, r3-r5, r8, r9, CTR, cr0 and f0-f31.  f14-f31 are saved on the
   *       stack at entry and reloaded at LL(9999).
   *
   * Four running maxima are kept in f0-f3 and merged at LL(999).  Each
   * "max" is an fsub/fsel pair: fsel keeps the old value when
   * (old - new) >= 0 and takes the new value otherwise.
   *
   * PROLOGUE, PROFCODE, EPILOGUE, LL(), LFD, LFDX, LFDUX, LDINT, SIZE and
   * ZBASE_SHIFT are not defined in this file (presumably in common.h).
   * The comments below assume LFD/LFDX/LFDUX behave like lfd/lfdx/lfdux.
   */
  47. PROLOGUE
  48. PROFCODE
  /* Reserve STACKSIZE bytes and save f14-f31 at offsets 0..136. */
  49. addi SP, SP, -STACKSIZE
  50. li r0, 0
  51. stfd f14, 0(SP)
  52. stfd f15, 8(SP)
  53. stfd f16, 16(SP)
  54. stfd f17, 24(SP)
  55. stfd f18, 32(SP)
  56. stfd f19, 40(SP)
  57. stfd f20, 48(SP)
  58. stfd f21, 56(SP)
  59. stfd f22, 64(SP)
  60. stfd f23, 72(SP)
  61. stfd f24, 80(SP)
  62. stfd f25, 88(SP)
  63. stfd f26, 96(SP)
  64. stfd f27, 104(SP)
  65. stfd f28, 112(SP)
  66. stfd f29, 120(SP)
  67. stfd f30, 128(SP)
  68. stfd f31, 136(SP)
  /* Build 0.0 in FZERO: store an all-zero word, reload it as a float. */
  69. stw r0, 144(SP)
  70. lfs FZERO,144(SP)
  /* With F_INTERFACE, r3 and r5 hold addresses; fetch the values. */
  71. #ifdef F_INTERFACE
  72. LDINT N, 0(N)
  73. LDINT INCX, 0(INCX)
  74. #endif
  /* INCX <<= ZBASE_SHIFT (presumably turns the stride into bytes). */
  75. slwi INCX, INCX, ZBASE_SHIFT
  /* INCXM1 = INCX - SIZE; used as the LFDX index on the strided path. */
  76. subi INCXM1, INCX, SIZE
  /* Offset used by the dcbt prefetch hint in LL(10). */
  77. li PREA, 10 * 16 * SIZE
  /* Early exit with f1 == 0.0 when N <= 0 or the scaled INCX <= 0. */
  78. cmpwi cr0, N, 0
  79. ble- LL(9999)
  80. cmpwi cr0, INCX, 0
  81. ble- LL(9999)
  /* First element: f1 = |v0| + |v1|; step X to the second element. */
  82. LFD f1, 0 * SIZE(X)
  83. LFD f2, 1 * SIZE(X)
  84. add X, X, INCX
  85. fabs f1, f1
  86. fabs f2, f2
  87. fadd f1, f1, f2
  /* Seed all four running maxima (f0-f3) with the first element. */
  88. fmr f0, f1
  89. fmr f2, f1
  90. fmr f3, f1
  /* One element is consumed; N now counts the remaining ones. */
  91. subi N, N, 1
  /* A stride of exactly 2 * SIZE takes the contiguous path below; */
  /* anything else goes to the strided path at LL(100).            */
  92. cmpwi cr0, INCX, 2 * SIZE
  93. bne- cr0, LL(100)
  /* CTR = N >> 3 groups of 8 elements; none -> remainder at LL(50). */
  94. srawi. r0, N, 3
  95. mtspr CTR, r0
  96. beq- cr0, LL(50)
  97. .align 4
  /* Pipeline fill: load elements 0-3 of the first group (8 values) */
  /* and put their absolute values in f8-f15.                       */
  98. LFD f24, 0 * SIZE(X)
  99. LFD f25, 1 * SIZE(X)
  100. fabs f8, f24
  101. LFD f26, 2 * SIZE(X)
  102. fabs f9, f25
  103. LFD f27, 3 * SIZE(X)
  104. fabs f10, f26
  105. LFD f28, 4 * SIZE(X)
  106. fabs f11, f27
  107. LFD f29, 5 * SIZE(X)
  108. fabs f12, f28
  109. LFD f30, 6 * SIZE(X)
  110. fabs f13, f29
  111. LFD f31, 7 * SIZE(X)
  112. fabs f14, f30
  113. nop
  114. fabs f15, f31
  /* CTR--; if that was the only group, skip the loop and drain. */
  115. bdz LL(20)
  116. .align 4
  /* Contiguous main loop: each pass folds 8 elements into f0-f3 and */
  /* leaves the first 4 elements of the next group ready in f8-f15.  */
  117. LL(10):
  /* f4-f7 = sums for elements 0-3 of this group. */
  118. fadd f4, f8, f9
  /* Prefetch hint for the line at X + PREA. */
  119. dcbt X, PREA
  120. fadd f5, f10, f11
  121. nop
  122. fadd f6, f12, f13
  /* Load elements 4-7 of this group (values 8..15) and take fabs. */
  123. LFD f24, 8 * SIZE(X)
  124. fadd f7, f14, f15
  125. LFD f25, 9 * SIZE(X)
  126. fabs f8, f24
  127. LFD f26, 10 * SIZE(X)
  128. fabs f9, f25
  129. LFD f27, 11 * SIZE(X)
  130. fabs f10, f26
  131. fabs f11, f27
  /* f16-f19 = running max - new sum; consumed by the fsel block below. */
  132. fsub f16, f0, f4
  133. fsub f17, f1, f5
  134. fsub f18, f2, f6
  135. LFD f28, 12 * SIZE(X)
  136. fsub f19, f3, f7
  137. LFD f29, 13 * SIZE(X)
  138. fabs f12, f28
  139. LFD f30, 14 * SIZE(X)
  140. fabs f13, f29
  141. LFD f31, 15 * SIZE(X)
  142. fabs f14, f30
  143. fabs f15, f31
  /* f0-f3 = max(f0-f3, f4-f7). */
  144. fsel f0, f16, f0, f4
  145. fsel f1, f17, f1, f5
  146. fsel f2, f18, f2, f6
  147. fsel f3, f19, f3, f7
  /* f20-f23 = sums for elements 4-7 of this group. */
  148. fadd f20, f8, f9
  149. fadd f21, f10, f11
  150. fadd f22, f12, f13
  /* Load elements 0-3 of the next group (values 16..23) and take fabs. */
  151. LFD f24, 16 * SIZE(X)
  152. fadd f23, f14, f15
  153. LFD f25, 17 * SIZE(X)
  154. fabs f8, f24
  155. LFD f26, 18 * SIZE(X)
  156. fabs f9, f25
  157. LFD f27, 19 * SIZE(X)
  158. fabs f10, f26
  159. fabs f11, f27
  160. fsub f16, f0, f20
  161. fsub f17, f1, f21
  162. fsub f18, f2, f22
  163. LFD f28, 20 * SIZE(X)
  164. fsub f19, f3, f23
  165. LFD f29, 21 * SIZE(X)
  166. fabs f12, f28
  167. LFD f30, 22 * SIZE(X)
  168. fabs f13, f29
  169. LFD f31, 23 * SIZE(X)
  170. fabs f14, f30
  /* Advance X by 16 values (8 elements). */
  171. addi X, X, 16 * SIZE
  172. fabs f15, f31
  /* f0-f3 = max(f0-f3, f20-f23). */
  173. fsel f0, f16, f0, f20
  174. fsel f1, f17, f1, f21
  175. fsel f2, f18, f2, f22
  176. fsel f3, f19, f3, f23
  177. bdnz LL(10)
  178. .align 4
  /* Pipeline drain for the last group: same work as one loop pass, */
  /* but nothing is loaded beyond value 15 of this group.           */
  179. LL(20):
  180. fadd f4, f8, f9
  181. fadd f5, f10, f11
  182. fadd f6, f12, f13
  183. LFD f24, 8 * SIZE(X)
  184. fadd f7, f14, f15
  185. LFD f25, 9 * SIZE(X)
  186. fabs f8, f24
  187. LFD f26, 10 * SIZE(X)
  188. fabs f9, f25
  189. LFD f27, 11 * SIZE(X)
  190. fabs f10, f26
  191. fabs f11, f27
  192. fsub f16, f0, f4
  193. fsub f17, f1, f5
  194. fsub f18, f2, f6
  195. LFD f28, 12 * SIZE(X)
  196. fsub f19, f3, f7
  197. LFD f29, 13 * SIZE(X)
  198. fabs f12, f28
  199. LFD f30, 14 * SIZE(X)
  200. fabs f13, f29
  201. LFD f31, 15 * SIZE(X)
  202. fabs f14, f30
  203. fabs f15, f31
  /* Fold elements 0-3 of the last group. */
  204. fsel f0, f16, f0, f4
  205. fsel f1, f17, f1, f5
  206. fsel f2, f18, f2, f6
  207. fsel f3, f19, f3, f7
  208. fadd f20, f8, f9
  209. fadd f21, f10, f11
  210. fadd f22, f12, f13
  211. fadd f23, f14, f15
  212. fsub f16, f0, f20
  213. fsub f17, f1, f21
  214. fsub f18, f2, f22
  215. fsub f19, f3, f23
  /* Fold elements 4-7 of the last group. */
  216. fsel f0, f16, f0, f20
  217. fsel f1, f17, f1, f21
  218. fsel f2, f18, f2, f22
  219. fsel f3, f19, f3, f23
  /* Step X past the last full group. */
  220. addi X, X, 16 * SIZE
  221. .align 4
  /* Contiguous remainder: CTR = N & 7 elements; none -> LL(999). */
  222. LL(50):
  223. andi. r0, N, 7
  224. mtspr CTR, r0
  225. beq LL(999)
  226. .align 4
  /* One element per pass, folded into f1 only: f1 = max(f1, sum). */
  227. LL(60):
  228. LFD f8, 0 * SIZE(X)
  229. LFD f9, 1 * SIZE(X)
  230. addi X, X, 2 * SIZE
  231. fabs f8, f8
  232. fabs f9, f9
  233. fadd f8, f8, f9
  234. fsub f16, f1, f8
  235. fsel f1, f16, f1, f8
  236. bdnz LL(60)
  237. b LL(999)
  238. .align 4
  /* Strided path.  X is moved back by INCXM1 so that each             */
  /* "LFDX X,INCXM1 / LFDUX X,INCX" pair reads the first value of the   */
  /* next element at X + INCXM1, then advances X by INCX and reads the  */
  /* second value there.                                                */
  239. LL(100):
  240. sub X, X, INCXM1
  /* CTR = N >> 3 groups of 8 elements; none -> remainder at LL(150). */
  241. srawi. r0, N, 3
  242. mtspr CTR, r0
  243. beq- LL(150)
  /* Pipeline fill: load elements 0-3 of the first group ... */
  244. LFDX f24, X, INCXM1
  245. LFDUX f25, X, INCX
  246. LFDX f26, X, INCXM1
  247. LFDUX f27, X, INCX
  248. LFDX f28, X, INCXM1
  249. LFDUX f29, X, INCX
  250. LFDX f30, X, INCXM1
  251. LFDUX f31, X, INCX
  /* ... take their absolute values into f8-f15 ... */
  252. fabs f8, f24
  253. fabs f9, f25
  254. fabs f10, f26
  255. fabs f11, f27
  256. fabs f12, f28
  257. fabs f13, f29
  258. fabs f14, f30
  259. fabs f15, f31
  /* ... and load elements 4-7 of the first group into f24-f31. */
  260. LFDX f24, X, INCXM1
  261. LFDUX f25, X, INCX
  262. LFDX f26, X, INCXM1
  263. LFDUX f27, X, INCX
  264. LFDX f28, X, INCXM1
  265. LFDUX f29, X, INCX
  266. LFDX f30, X, INCXM1
  267. LFDUX f31, X, INCX
  /* CTR--; if that was the only group, skip the loop and drain. */
  268. bdz LL(120)
  269. .align 4
  /* Strided main loop: each pass folds the 8 elements already loaded */
  /* into f0-f3 and loads the 8 elements of the next group.           */
  270. LL(110):
  /* f4-f7 = sums for elements 0-3 of this group. */
  271. fadd f4, f8, f9
  272. fadd f5, f10, f11
  273. fadd f6, f12, f13
  274. fadd f7, f14, f15
  /* fabs of elements 4-7 of this group, interleaved with the loads */
  /* of elements 0-3 of the next group.                             */
  275. fabs f8, f24
  276. fabs f9, f25
  277. fabs f10, f26
  278. fabs f11, f27
  279. LFDX f24, X, INCXM1
  280. LFDUX f25, X, INCX
  281. LFDX f26, X, INCXM1
  282. LFDUX f27, X, INCX
  283. fabs f12, f28
  284. fabs f13, f29
  285. fabs f14, f30
  286. fabs f15, f31
  287. LFDX f28, X, INCXM1
  288. LFDUX f29, X, INCX
  289. LFDX f30, X, INCXM1
  290. LFDUX f31, X, INCX
  291. fsub f16, f0, f4
  292. fsub f17, f1, f5
  293. fsub f18, f2, f6
  294. fsub f19, f3, f7
  /* f20-f23 = sums for elements 4-7 of this group. */
  295. fadd f20, f8, f9
  296. fadd f21, f10, f11
  297. fadd f22, f12, f13
  298. fadd f23, f14, f15
  /* fabs of elements 0-3 of the next group, interleaved with the */
  /* loads of its elements 4-7.                                   */
  299. fabs f8, f24
  300. fabs f9, f25
  301. fabs f10, f26
  302. fabs f11, f27
  303. LFDX f24, X, INCXM1
  304. LFDUX f25, X, INCX
  305. LFDX f26, X, INCXM1
  306. LFDUX f27, X, INCX
  /* f0-f3 = max(f0-f3, f4-f7). */
  307. fsel f0, f16, f0, f4
  308. fsel f1, f17, f1, f5
  309. fsel f2, f18, f2, f6
  310. fsel f3, f19, f3, f7
  311. fabs f12, f28
  312. fabs f13, f29
  313. fabs f14, f30
  314. fabs f15, f31
  315. LFDX f28, X, INCXM1
  316. LFDUX f29, X, INCX
  317. LFDX f30, X, INCXM1
  318. LFDUX f31, X, INCX
  319. fsub f16, f0, f20
  320. fsub f17, f1, f21
  321. fsub f18, f2, f22
  322. fsub f19, f3, f23
  /* f0-f3 = max(f0-f3, f20-f23). */
  323. fsel f0, f16, f0, f20
  324. fsel f1, f17, f1, f21
  325. fsel f2, f18, f2, f22
  326. fsel f3, f19, f3, f23
  327. bdnz LL(110)
  328. .align 4
  /* Pipeline drain: fold the 8 elements already held in f8-f15 and */
  /* f24-f31; no further loads.                                     */
  329. LL(120):
  330. fadd f4, f8, f9
  331. fadd f5, f10, f11
  332. fadd f6, f12, f13
  333. fadd f7, f14, f15
  334. fabs f8, f24
  335. fabs f9, f25
  336. fabs f10, f26
  337. fabs f11, f27
  338. fabs f12, f28
  339. fabs f13, f29
  340. fabs f14, f30
  341. fabs f15, f31
  342. fsub f16, f0, f4
  343. fsub f17, f1, f5
  344. fsub f18, f2, f6
  345. fsub f19, f3, f7
  346. fadd f20, f8, f9
  347. fadd f21, f10, f11
  348. fadd f22, f12, f13
  349. fadd f23, f14, f15
  350. fsel f0, f16, f0, f4
  351. fsel f1, f17, f1, f5
  352. fsel f2, f18, f2, f6
  353. fsel f3, f19, f3, f7
  354. fsub f16, f0, f20
  355. fsub f17, f1, f21
  356. fsub f18, f2, f22
  357. fsub f19, f3, f23
  358. fsel f0, f16, f0, f20
  359. fsel f1, f17, f1, f21
  360. fsel f2, f18, f2, f22
  361. fsel f3, f19, f3, f23
  362. .align 4
  /* Strided remainder: CTR = N & 7 elements; none -> LL(999). */
  363. LL(150):
  364. andi. r0, N, 7
  365. mtspr CTR, r0
  366. beq LL(999)
  367. .align 4
  /* One element per pass, folded into f1 only; falls through to */
  /* LL(999) when CTR reaches zero.                              */
  368. LL(160):
  369. LFDX f8, X, INCXM1
  370. LFDUX f9, X, INCX
  371. fabs f8, f8
  372. fabs f9, f9
  373. fadd f8, f8, f9
  374. fsub f16, f1, f8
  375. fsel f1, f16, f1, f8
  376. bdnz LL(160)
  377. .align 4
  /* Merge the four running maxima:                           */
  /* f0 = max(f0, f1), f2 = max(f2, f3), f1 = max(f0, f2).    */
  378. LL(999):
  379. fsub f8, f0, f1
  380. fsub f9, f2, f3
  381. fsel f0, f8, f0, f1
  382. fsel f2, f9, f2, f3
  383. fsub f8, f0, f2
  384. fsel f1, f8, f0, f2
  385. .align 4
  /* Common exit (also the early-exit target): reload f14-f31, */
  /* release the stack area and return with the result in f1.  */
  386. LL(9999):
  387. lfd f14, 0(SP)
  388. lfd f15, 8(SP)
  389. lfd f16, 16(SP)
  390. lfd f17, 24(SP)
  391. lfd f18, 32(SP)
  392. lfd f19, 40(SP)
  393. lfd f20, 48(SP)
  394. lfd f21, 56(SP)
  395. lfd f22, 64(SP)
  396. lfd f23, 72(SP)
  397. lfd f24, 80(SP)
  398. lfd f25, 88(SP)
  399. lfd f26, 96(SP)
  400. lfd f27, 104(SP)
  401. lfd f28, 112(SP)
  402. lfd f29, 120(SP)
  403. lfd f30, 128(SP)
  404. lfd f31, 136(SP)
  405. addi SP, SP, STACKSIZE
  406. blr
  407. EPILOGUE