You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

iamax_ppc440.S 9.5 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before pulling in common.h; presumably it tells */
  /* that header it is being included from assembly -- confirm in common.h. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register roles used by the routine below. */
  /* RET: integer result (r3). r3 also carries the first argument on entry: */
  /* the routine copies it (or, under F_INTERFACE, loads through it) into N */
  /* before clearing RET. */
  40. #define RET r3
  /* X: pointer that walks the vector during the first (max-finding) pass. */
  41. #define X r4
  /* INCX: stride. Shifted left by BASE_SHIFT in the routine and then used */
  /* as the index operand of every LFDUX. */
  42. #define INCX r5
  /* N: working element count for the first pass. */
  43. #define N r6
  /* NN: copy of N taken before the first pass, used as the count for the */
  /* second (index-finding) pass. */
  44. #define NN r7
  /* XX: copy of the pre-biased X pointer, walked by the second pass. */
  45. #define XX r8
  /* PRE: offset operand for the dcbt prefetch hints (only assembled when */
  /* PPCG4 is defined). */
  46. #define PRE r9
  /* FZERO: f1, loaded with a zero word in the prologue. */
  47. #define FZERO f1
  /* STACKSIZE: bytes reserved below SP. The routine uses offsets 0..143 */
  /* for f14-f31 (18 x 8 bytes) and the word at offset 144 as scratch. */
  48. #define STACKSIZE 160
  /*
   * Index of the element with the largest absolute value (the file name,
   * iamax_ppc440.S, suggests the BLAS i?amax kernel for PPC440).
   *
   * Inputs (register aliases are defined above):
   *   r3   element count, by value; under F_INTERFACE a pointer to it
   *   X    base address of the vector
   *   INCX stride in elements, by value; under F_INTERFACE a pointer to it
   *
   * Result in RET (r3):
   *   0 when N <= 0 or the scaled INCX <= 0; otherwise the 1-based position
   *   of the first element whose absolute value compares equal to the
   *   maximum. If no element compares equal, the count of scanned elements
   *   is returned (RET is bumped once per element in pass 2).
   *
   * Structure:
   *   pass 1, LL(110)..LL(999):   reduce |x[i]| to one maximum left in f1
   *   pass 2, LL(1000)..LL(1160): rescan from the start, counting in RET,
   *                               and stop at the first |x[i]| equal to f1
   *
   * PROLOGUE, PROFCODE, EPILOGUE, LL(), LFDUX, LDINT, BASE_SHIFT and SIZE
   * are not defined in this file (presumably they come from common.h).
   */
  49. PROLOGUE
  50. PROFCODE
  /* Reserve the frame and save f14-f31, which this routine overwrites. */
  51. addi SP, SP, -STACKSIZE
  52. li r0, 0
  53. stfd f14, 0(SP)
  54. stfd f15, 8(SP)
  55. stfd f16, 16(SP)
  56. stfd f17, 24(SP)
  57. stfd f18, 32(SP)
  58. stfd f19, 40(SP)
  59. stfd f20, 48(SP)
  60. stfd f21, 56(SP)
  61. stfd f22, 64(SP)
  62. stfd f23, 72(SP)
  63. stfd f24, 80(SP)
  64. stfd f25, 88(SP)
  65. stfd f26, 96(SP)
  66. stfd f27, 104(SP)
  67. stfd f28, 112(SP)
  68. stfd f29, 120(SP)
  69. stfd f30, 128(SP)
  70. stfd f31, 136(SP)
  /* Store a zero word and load it back as a single-precision value, */
  /* i.e. FZERO (f1) = 0.0. Nothing reads f1 again before it is */
  /* overwritten by the first element load below. */
  71. stw r0, 144(SP)
  72. lfs FZERO,144(SP)
  /* With F_INTERFACE the count and stride are fetched through the */
  /* addresses in r3 and INCX; otherwise the count is taken from r3 */
  /* directly. LDINT is presumably an integer-load macro -- confirm. */
  73. #ifdef F_INTERFACE
  74. LDINT N, 0(r3)
  75. LDINT INCX, 0(INCX)
  76. #else
  77. mr N, r3
  78. #endif
  /* r3 has been consumed, so it can now hold the result; default is 0. */
  79. li RET, 0
  /* Scale the stride by 2^BASE_SHIFT (presumably elements -> bytes). */
  80. slwi INCX, INCX, BASE_SHIFT
  /* Back X up by one stride: every access below goes through LFDUX with */
  /* INCX as the index, which presumably is an update-form load that */
  /* advances the pointer before loading -- confirm against common.h. */
  81. sub X, X, INCX
  82. li PRE, 3 * 16 * SIZE
  /* Keep the original count and biased pointer for the second pass. */
  83. mr NN, N
  84. mr XX, X
  /* Early exits: return 0 when N <= 0 or the scaled stride is <= 0. */
  85. cmpwi cr0, N, 0
  86. ble- LL(9999)
  87. cmpwi cr0, INCX, 0
  88. ble- LL(9999)
  /* Pass 1 setup: load the first element and seed all eight running */
  /* maxima f0-f7 with its absolute value. f1 is written last because the */
  /* other seven fabs instructions read the raw value from it. */
  89. LFDUX f1, X, INCX
  90. fabs f0, f1
  91. fabs f2, f1
  92. fabs f3, f1
  93. fabs f4, f1
  94. fabs f5, f1
  /* One element has been consumed. */
  95. subi N, N, 1
  96. fabs f6, f1
  /* r0 = N / 16 = number of full 16-element groups left. The record form */
  /* sets cr0, which the beq- below tests; the fabs and mtspr in between */
  /* do not write cr0. */
  97. srawi. r0, N, 4
  98. fabs f7, f1
  99. mtspr CTR, r0
  100. fabs f1, f1
  /* No full group: go straight to the one-at-a-time tail. */
  101. beq- LL(150)
  /* Pipeline fill: load 8 raw values, then take their absolute values */
  /* into f8-f15 while loading the next 8 raw values into f24-f31. */
  102. LFDUX f24, X, INCX
  103. LFDUX f25, X, INCX
  104. LFDUX f26, X, INCX
  105. LFDUX f27, X, INCX
  106. LFDUX f28, X, INCX
  107. LFDUX f29, X, INCX
  108. LFDUX f30, X, INCX
  109. LFDUX f31, X, INCX
  110. fabs f8, f24
  111. LFDUX f24, X, INCX
  112. fabs f9, f25
  113. LFDUX f25, X, INCX
  114. fabs f10, f26
  115. LFDUX f26, X, INCX
  116. fabs f11, f27
  117. LFDUX f27, X, INCX
  118. fabs f12, f28
  119. LFDUX f28, X, INCX
  120. fabs f13, f29
  121. LFDUX f29, X, INCX
  122. fabs f14, f30
  123. LFDUX f30, X, INCX
  124. fabs f15, f31
  125. LFDUX f31, X, INCX
  /* Decrement CTR; if that was the only group, its 16 values are already */
  /* loaded, so skip the loop and drain them at LL(120). */
  126. bdz LL(120)
  127. .align 4
  /* Main loop of pass 1: 16 elements per iteration, in two halves of 8. */
  /* On entry to each half f8-f15 hold absolute values and f24-f31 hold */
  /* the next 8 raw values. Each half computes running - candidate with */
  /* fsub, then fsel keeps the running maximum when the difference is */
  /* >= 0 and takes the candidate otherwise. The fabs/LFDUX interleaved */
  /* with the fsel instructions refill f8-f15 and f24-f31. */
  128. LL(110):
  129. fsub f16, f0, f8
  130. #ifdef PPCG4
  131. dcbt X, PRE
  132. #endif
  133. fsub f17, f1, f9
  134. fsub f18, f2, f10
  135. fsub f19, f3, f11
  136. fsub f20, f4, f12
  137. fsub f21, f5, f13
  138. fsub f22, f6, f14
  139. fsub f23, f7, f15
  140. fsel f0, f16, f0, f8
  141. fabs f8, f24
  142. LFDUX f24, X, INCX
  143. fsel f1, f17, f1, f9
  144. fabs f9, f25
  145. LFDUX f25, X, INCX
  146. fsel f2, f18, f2, f10
  147. fabs f10, f26
  148. LFDUX f26, X, INCX
  149. fsel f3, f19, f3, f11
  150. fabs f11, f27
  151. LFDUX f27, X, INCX
  152. fsel f4, f20, f4, f12
  153. #if defined(PPCG4) && defined(DOUBLE)
  154. dcbt X, PRE
  155. #endif
  156. fabs f12, f28
  157. LFDUX f28, X, INCX
  158. fsel f5, f21, f5, f13
  159. fabs f13, f29
  160. LFDUX f29, X, INCX
  161. fsel f6, f22, f6, f14
  162. fabs f14, f30
  163. LFDUX f30, X, INCX
  164. fsel f7, f23, f7, f15
  165. fabs f15, f31
  166. LFDUX f31, X, INCX
  /* Second half of the iteration: same pattern on the next 8 values. */
  167. fsub f16, f0, f8
  168. #ifdef PPCG4
  169. dcbt X, PRE
  170. #endif
  171. fsub f17, f1, f9
  172. fsub f18, f2, f10
  173. fsub f19, f3, f11
  174. fsub f20, f4, f12
  175. fsub f21, f5, f13
  176. fsub f22, f6, f14
  177. fsub f23, f7, f15
  178. fsel f0, f16, f0, f8
  179. fabs f8, f24
  180. LFDUX f24, X, INCX
  181. fsel f1, f17, f1, f9
  182. fabs f9, f25
  183. LFDUX f25, X, INCX
  184. fsel f2, f18, f2, f10
  185. fabs f10, f26
  186. LFDUX f26, X, INCX
  187. fsel f3, f19, f3, f11
  188. fabs f11, f27
  189. LFDUX f27, X, INCX
  190. fsel f4, f20, f4, f12
  191. #if defined(PPCG4) && defined(DOUBLE)
  192. dcbt X, PRE
  193. #endif
  194. fabs f12, f28
  195. LFDUX f28, X, INCX
  196. fsel f5, f21, f5, f13
  197. fabs f13, f29
  198. LFDUX f29, X, INCX
  199. fsel f6, f22, f6, f14
  200. fabs f14, f30
  201. LFDUX f30, X, INCX
  202. fsel f7, f23, f7, f15
  203. fabs f15, f31
  204. LFDUX f31, X, INCX
  205. bdnz LL(110)
  206. .align 4
  /* Pipeline drain: fold in the final 16 values that are already in */
  /* registers (8 as absolute values in f8-f15, 8 raw in f24-f31). No */
  /* further loads are issued here. */
  207. LL(120):
  208. fsub f16, f0, f8
  209. fsub f17, f1, f9
  210. fsub f18, f2, f10
  211. fsub f19, f3, f11
  212. fsub f20, f4, f12
  213. fsub f21, f5, f13
  214. fsub f22, f6, f14
  215. fsub f23, f7, f15
  216. fsel f0, f16, f0, f8
  217. fabs f8, f24
  218. fsel f1, f17, f1, f9
  219. fabs f9, f25
  220. fsel f2, f18, f2, f10
  221. fabs f10, f26
  222. fsel f3, f19, f3, f11
  223. fabs f11, f27
  224. fsel f4, f20, f4, f12
  225. fabs f12, f28
  226. fsel f5, f21, f5, f13
  227. fabs f13, f29
  228. fsel f6, f22, f6, f14
  229. fabs f14, f30
  230. fsel f7, f23, f7, f15
  231. fabs f15, f31
  232. fsub f16, f0, f8
  233. fsub f17, f1, f9
  234. fsub f18, f2, f10
  235. fsub f19, f3, f11
  236. fsub f20, f4, f12
  237. fsub f21, f5, f13
  238. fsub f22, f6, f14
  239. fsub f23, f7, f15
  240. fsel f0, f16, f0, f8
  241. fsel f1, f17, f1, f9
  242. fsel f2, f18, f2, f10
  243. fsel f3, f19, f3, f11
  244. fsel f4, f20, f4, f12
  245. fsel f5, f21, f5, f13
  246. fsel f6, f22, f6, f14
  247. fsel f7, f23, f7, f15
  248. .align 4
  /* Tail of pass 1: the remaining N mod 16 elements, one per iteration, */
  /* folded into f1 only. andi. sets cr0 for the beq; mtspr leaves it */
  /* untouched. */
  249. LL(150):
  250. andi. r0, N, 15
  251. mtspr CTR, r0
  252. beq LL(999)
  253. .align 4
  254. LL(160):
  255. LFDUX f8, X, INCX
  256. fabs f8, f8
  257. fsub f16, f1, f8
  258. fsel f1, f16, f1, f8
  259. bdnz LL(160)
  260. .align 4
  /* Combine the eight partial maxima pairwise: (f0,f1) (f2,f3) (f4,f5) */
  /* (f6,f7) -> f0,f2,f4,f6 -> f0,f4 -> f1. After this block f1 holds the */
  /* overall maximum absolute value. */
  261. LL(999):
  262. fsub f8, f0, f1
  263. fsub f9, f2, f3
  264. fsub f10, f4, f5
  265. fsub f11, f6, f7
  266. fsel f0, f8, f0, f1
  267. fsel f2, f9, f2, f3
  268. fsel f4, f10, f4, f5
  269. fsel f6, f11, f6, f7
  270. fsub f8, f0, f2
  271. fsub f9, f4, f6
  272. fsel f0, f8, f0, f2
  273. fsel f4, f9, f4, f6
  274. fsub f8, f0, f4
  275. fsel f1, f8, f0, f4
  276. .align 4
  /* Pass 2: walk the vector again from the start (XX, NN) in groups of */
  /* 8, looking for the first element whose absolute value equals f1. */
  277. LL(1000):
  /* r0 = NN / 8 groups; cr0 from srawi. feeds the beq- below. */
  278. srawi. r0, NN, 3
  279. mtspr CTR, r0
  280. beq- LL(1150)
  /* Preload the first group of 8 raw values. */
  281. LFDUX f24, XX, INCX
  282. LFDUX f25, XX, INCX
  283. LFDUX f26, XX, INCX
  284. LFDUX f27, XX, INCX
  285. LFDUX f28, XX, INCX
  286. LFDUX f29, XX, INCX
  287. LFDUX f30, XX, INCX
  288. LFDUX f31, XX, INCX
  /* Only one group: skip the loop and test it at LL(1120). */
  289. bdz LL(1120)
  290. .align 4
  /* Loop body: take absolute values of the current group into f8-f15 */
  /* while loading the next group, then test the current group in order. */
  291. LL(1110):
  292. fabs f8, f24
  293. LFDUX f24, XX, INCX
  294. fabs f9, f25
  295. LFDUX f25, XX, INCX
  296. fabs f10, f26
  297. LFDUX f26, XX, INCX
  298. fabs f11, f27
  299. LFDUX f27, XX, INCX
  300. #ifdef PPCG4
  301. dcbt XX, PRE
  302. #endif
  303. fabs f12, f28
  304. LFDUX f28, XX, INCX
  305. fabs f13, f29
  306. LFDUX f29, XX, INCX
  307. fabs f14, f30
  308. LFDUX f30, XX, INCX
  309. fabs f15, f31
  310. LFDUX f31, XX, INCX
  311. #if defined(PPCG4) && defined(DOUBLE)
  312. dcbt XX, PRE
  313. #endif
  /* RET is incremented before each compare, so on a hit it already holds */
  /* the 1-based position of the matching element. */
  314. addi RET, RET, 1
  315. fcmpu cr0, f1, f8
  316. beq cr0, LL(9999)
  317. addi RET, RET, 1
  318. fcmpu cr0, f1, f9
  319. beq cr0, LL(9999)
  320. addi RET, RET, 1
  321. fcmpu cr0, f1, f10
  322. beq cr0, LL(9999)
  323. addi RET, RET, 1
  324. fcmpu cr0, f1, f11
  325. beq cr0, LL(9999)
  326. addi RET, RET, 1
  327. fcmpu cr0, f1, f12
  328. beq cr0, LL(9999)
  329. addi RET, RET, 1
  330. fcmpu cr0, f1, f13
  331. beq cr0, LL(9999)
  332. addi RET, RET, 1
  333. fcmpu cr0, f1, f14
  334. beq cr0, LL(9999)
  335. addi RET, RET, 1
  336. fcmpu cr0, f1, f15
  337. beq cr0, LL(9999)
  338. bdnz LL(1110)
  339. .align 4
  /* Last full group: its raw values are already in f24-f31, so only the */
  /* fabs and the compares remain; no further loads are issued. */
  340. LL(1120):
  341. fabs f8, f24
  342. fabs f9, f25
  343. fabs f10, f26
  344. fabs f11, f27
  345. fabs f12, f28
  346. fabs f13, f29
  347. fabs f14, f30
  348. fabs f15, f31
  349. addi RET, RET, 1
  350. fcmpu cr0, f1, f8
  351. beq cr0, LL(9999)
  352. addi RET, RET, 1
  353. fcmpu cr0, f1, f9
  354. beq cr0, LL(9999)
  355. addi RET, RET, 1
  356. fcmpu cr0, f1, f10
  357. beq cr0, LL(9999)
  358. addi RET, RET, 1
  359. fcmpu cr0, f1, f11
  360. beq cr0, LL(9999)
  361. addi RET, RET, 1
  362. fcmpu cr0, f1, f12
  363. beq cr0, LL(9999)
  364. addi RET, RET, 1
  365. fcmpu cr0, f1, f13
  366. beq cr0, LL(9999)
  367. addi RET, RET, 1
  368. fcmpu cr0, f1, f14
  369. beq cr0, LL(9999)
  370. addi RET, RET, 1
  371. fcmpu cr0, f1, f15
  372. beq cr0, LL(9999)
  373. .align 4
  /* Tail of pass 2: the remaining NN mod 8 elements, one per iteration. */
  374. LL(1150):
  375. andi. r0, NN, 7
  376. mtspr CTR, r0
  377. beq LL(9999)
  378. .align 4
  379. LL(1160):
  380. LFDUX f8, XX, INCX
  381. fabs f8, f8
  382. addi RET, RET, 1
  383. fcmpu cr0, f1, f8
  384. beq cr0, LL(9999)
  385. bdnz LL(1160)
  386. .align 4
  /* Common exit, also used by the early-return paths: restore f14-f31, */
  /* release the frame and return with the result in RET (r3). */
  387. LL(9999):
  388. lfd f14, 0(SP)
  389. lfd f15, 8(SP)
  390. lfd f16, 16(SP)
  391. lfd f17, 24(SP)
  392. lfd f18, 32(SP)
  393. lfd f19, 40(SP)
  394. lfd f20, 48(SP)
  395. lfd f21, 56(SP)
  396. lfd f22, 64(SP)
  397. lfd f23, 72(SP)
  398. lfd f24, 80(SP)
  399. lfd f25, 88(SP)
  400. lfd f26, 96(SP)
  401. lfd f27, 104(SP)
  402. lfd f28, 112(SP)
  403. lfd f29, 120(SP)
  404. lfd f30, 128(SP)
  405. lfd f31, 136(SP)
  406. addi SP, SP, STACKSIZE
  407. blr
  408. EPILOGUE