You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

asum_hummer.S 8.3 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer registers read on entry. When F_INTERFACE is defined the      */
  /* routine first replaces N and INCX with the values they point to.      */
  40. #define N r3
  41. #define X r4
  42. #define INCX r5
  /* INCX2 is computed by the routine as INCX + INCX (after INCX has been  */
  /* shifted left by BASE_SHIFT). X2 is a second read pointer that is only */
  /* used on the non-unit-stride path.                                     */
  43. #define INCX2 r6
  44. #define X2 r7
  /* C1..C4: four independent running accumulators. They are zeroed on     */
  /* entry and folded together at LL(999); the final value is left in C1.  */
  /* Note that C1 is f1 and C2 is f0 (not in numeric order).               */
  45. #define C1 f1
  46. #define C2 f0
  47. #define C3 f2
  48. #define C4 f3
  /* A1..A8: destinations of the loads from X / X2.                        */
  49. #define A1 f4
  50. #define A2 f5
  51. #define A3 f6
  52. #define A4 f7
  53. #define A5 f8
  54. #define A6 f9
  55. #define A7 f10
  56. #define A8 f11
  /* T1..T4: temporaries holding absolute values before they are added to  */
  /* the accumulators. T3 and T4 are f14 and f15, which the routine stores */
  /* to the stack on entry and reloads before returning.                   */
  57. #define T1 f12
  58. #define T2 f13
  59. #define T3 f14
  60. #define T4 f15
  /* Sum of absolute values of a strided vector.                           */
  /*                                                                       */
  /* In:  N    = element count                                             */
  /*      X    = address of the first element                              */
  /*      INCX = stride in elements (scaled by BASE_SHIFT below)           */
  /* Out: C1 (f1) = |x[0]| + |x[1]| + ... ; zero when N <= 0 or INCX <= 0  */
  /*                                                                       */
  /* NOTE(review): PROLOGUE, PROFCODE, EPILOGUE, LL(), LDINT, LFD, LFDX,   */
  /* LFDUX, LFSDUX, LFPDUX, SIZE and BASE_SHIFT are not defined in this    */
  /* file (presumably they come from common.h). The descriptions of the    */
  /* paired ("fp"-prefixed) instructions below assume the usual two-half   */
  /* (primary/secondary) register model - confirm against the FPU manual.  */
  61. PROLOGUE
  62. PROFCODE
  /* Save f14 and f15 (used as T3/T4). Each store is update-form with an   */
  /* offset of -16, so SP moves down by 16 per store.                      */
  63. li r10, -16
  64. stfpdux f14, SP, r10
  65. stfpdux f15, SP, r10
  /* Push four zero words (16 bytes of zero) below the saved registers.    */
  /* They are the memory source for clearing the accumulators.             */
  66. li r10, 0
  67. stwu r10, -4(SP)
  68. stwu r10, -4(SP)
  69. stwu r10, -4(SP)
  70. stwu r10, -4(SP)
  /* With F_INTERFACE, N and INCX arrive as addresses; load the values.    */
  71. #ifdef F_INTERFACE
  72. LDINT N, 0(N)
  73. LDINT INCX, 0(INCX)
  74. #endif
  /* r10 is still 0, so this loads from SP, i.e. the zero words just       */
  /* pushed.                                                               */
  75. lfpdx C1, SP, r10 # Zero clear
  /* Scale INCX by BASE_SHIFT (presumably elements -> bytes, since it is   */
  /* compared against SIZE below) and form the double stride INCX2.        */
  76. slwi INCX, INCX, BASE_SHIFT
  77. add INCX2, INCX, INCX
  /* Copy the zeroed C1 into the other three accumulators.                 */
  78. fpmr C2, C1
  79. fpmr C3, C1
  80. fpmr C4, C1
  /* Nothing to do for N <= 0 or INCX <= 0: exit with all accumulators     */
  /* still zero. The branch goes through LL(999) so SP and f14/f15 are     */
  /* restored.                                                             */
  81. cmpwi cr0, N, 0
  82. ble LL(999)
  83. cmpwi cr0, INCX, 0
  84. ble LL(999)
  /* Scaled stride different from SIZE -> strided path at LL(100).         */
  85. cmpwi cr0, INCX, SIZE
  86. bne LL(100)
  /* Unit stride. If X is not a multiple of 2 * SIZE, peel one element     */
  /* into C1 (presumably so that the pair loads below see an aligned       */
  /* address).                                                             */
  87. andi. r0, X, 2 * SIZE - 1
  88. beq LL(05)
  89. LFD C1, 0(X)
  90. addi X, X, 1 * SIZE
  91. addi N, N, -1
  /* The fabs sits between the compare and the branch; it is not a record  */
  /* form, so the ble still tests the result of the cmpwi on N.            */
  92. cmpwi cr0, N, 0
  93. fabs C1, C1
  94. ble LL(999)
  95. .align 4
  96. LL(05):
  /* r0 = N / 16 = number of full 16-element blocks; it becomes the loop   */
  /* count in CTR. X is backed up by INCX2 because every load below is     */
  /* update-form with offset INCX2. The beq- tests the result of srawi.    */
  /* (sub and mtspr do not touch cr0) and skips to the remainder code when */
  /* there is no full block.                                               */
  97. srawi. r0, N, 4
  98. sub X, X, INCX2
  99. mtspr CTR, r0
  100. beq- LL(15)
  /* Pipeline prologue: load the first block into A1..A8 (eight loads,     */
  /* each advancing X by INCX2) and clear T1..T4 from C2, which is still   */
  /* zero here, so the first four adds in the loop/drain add zero.         */
  101. LFPDUX A1, X, INCX2
  102. fpmr T1, C2
  103. LFPDUX A2, X, INCX2
  104. fpmr T2, C2
  105. LFPDUX A3, X, INCX2
  106. fpmr T3, C2
  107. LFPDUX A4, X, INCX2
  108. fpmr T4, C2
  109. LFPDUX A5, X, INCX2
  110. LFPDUX A6, X, INCX2
  111. LFPDUX A7, X, INCX2
  112. LFPDUX A8, X, INCX2
  /* Decrement CTR; if this was the only block, go straight to the drain.  */
  113. bdz LL(13)
  114. .align 4
  115. LL(12):
  /* Steady state. Each group of instructions: add the previously computed */
  /* absolute value into an accumulator, take the absolute value of an     */
  /* already-loaded register, then reload that register for the next       */
  /* iteration. The nops are presumably scheduling padding.                */
  116. fpadd C1, C1, T1
  117. nop
  118. fpabs T1, A1
  119. LFPDUX A1, X, INCX2
  120. fpadd C2, C2, T2
  121. nop
  122. fpabs T2, A2
  123. LFPDUX A2, X, INCX2
  124. fpadd C3, C3, T3
  125. nop
  126. fpabs T3, A3
  127. LFPDUX A3, X, INCX2
  128. fpadd C4, C4, T4
  129. nop
  130. fpabs T4, A4
  131. LFPDUX A4, X, INCX2
  132. fpadd C1, C1, T1
  133. nop
  134. fpabs T1, A5
  135. LFPDUX A5, X, INCX2
  136. fpadd C2, C2, T2
  137. nop
  138. fpabs T2, A6
  139. LFPDUX A6, X, INCX2
  140. fpadd C3, C3, T3
  141. nop
  142. fpabs T3, A7
  143. LFPDUX A7, X, INCX2
  144. fpadd C4, C4, T4
  145. fpabs T4, A8
  146. LFPDUX A8, X, INCX2
  147. bdnz LL(12)
  148. .align 4
  149. LL(13):
  /* Pipeline drain: same add/abs sequence for the block already held in   */
  /* A1..A8, without issuing further loads, followed by the final adds of  */
  /* T1..T4.                                                               */
  150. fpadd C1, C1, T1
  151. fpabs T1, A1
  152. fpadd C2, C2, T2
  153. fpabs T2, A2
  154. fpadd C3, C3, T3
  155. fpabs T3, A3
  156. fpadd C4, C4, T4
  157. fpabs T4, A4
  158. fpadd C1, C1, T1
  159. fpabs T1, A5
  160. fpadd C2, C2, T2
  161. fpabs T2, A6
  162. fpadd C3, C3, T3
  163. fpabs T3, A7
  164. fpadd C4, C4, T4
  165. fpabs T4, A8
  166. fpadd C1, C1, T1
  167. fpadd C2, C2, T2
  168. fpadd C3, C3, T3
  169. fpadd C4, C4, T4
  170. .align 4
  171. LL(15):
  /* Unit-stride remainder: N & 15 elements, handled by testing the 8, 4,  */
  /* 2 and 1 bits of N in turn.                                            */
  172. andi. r0, N, 15
  173. beq LL(999)
  /* Bit 3 set: four pair loads (8 elements, assuming LFPDUX loads two     */
  /* consecutive elements).                                                */
  174. andi. r0, N, 8
  175. beq LL(16)
  176. LFPDUX A1, X, INCX2
  177. LFPDUX A2, X, INCX2
  178. LFPDUX A3, X, INCX2
  179. LFPDUX A4, X, INCX2
  180. fpabs T1, A1
  181. fpabs T2, A2
  182. fpabs T3, A3
  183. fpabs T4, A4
  184. fpadd C1, C1, T1
  185. fpadd C2, C2, T2
  186. fpadd C3, C3, T3
  187. fpadd C4, C4, T4
  188. .align 4
  189. LL(16):
  /* Bit 2 set: two pair loads.                                            */
  190. andi. r0, N, 4
  191. beq LL(17)
  192. LFPDUX A1, X, INCX2
  193. LFPDUX A2, X, INCX2
  194. fpabs T1, A1
  195. fpabs T2, A2
  196. fpadd C1, C1, T1
  197. fpadd C2, C2, T2
  198. .align 4
  199. LL(17):
  /* Bit 1 set: one pair load.                                             */
  200. andi. r0, N, 2
  201. beq LL(18)
  202. LFPDUX A1, X, INCX2
  203. fpabs T1, A1
  204. fpadd C1, C1, T1
  205. .align 4
  206. LL(18):
  /* Bit 0 set: one last element at X + INCX2, read with a non-update      */
  /* load and accumulated with the scalar fabs/fadd.                       */
  207. andi. r0, N, 1
  208. beq LL(999)
  209. LFDX A1, X, INCX2
  210. fabs T1, A1
  211. fadd C1, C1, T1
  212. b LL(999)
  213. .align 4
  214. LL(100):
  /* Strided path. Two read pointers, both advanced by INCX2 per load:     */
  /* after the pre-decrements, the first update-form load through X hits   */
  /* element 0 and the first through X2 hits element 1, so X walks the     */
  /* even-indexed elements and X2 the odd-indexed ones.                    */
  215. sub X2, X, INCX
  216. sub X, X, INCX2
  /* r0 = N / 16 -> CTR; no full block -> remainder code at LL(115).       */
  217. srawi. r0, N, 4
  218. mtspr CTR, r0
  219. beq- LL(115)
  /* Pipeline prologue. Each A register is loaded twice: LFDUX through X   */
  /* and LFSDUX through X2 (presumably filling the primary and secondary   */
  /* halves respectively, so one paired abs/add covers two elements).      */
  /* Only A1..A4 get their X2 load here; A5..A8 receive theirs at the top  */
  /* of LL(112) or in LL(113). T1..T4 start at zero (copied from C2).      */
  220. LFDUX A1, X, INCX2
  221. fpmr T1, C2
  222. LFDUX A2, X, INCX2
  223. fpmr T2, C2
  224. LFDUX A3, X, INCX2
  225. fpmr T3, C2
  226. LFDUX A4, X, INCX2
  227. fpmr T4, C2
  228. LFDUX A5, X, INCX2
  229. LFSDUX A1, X2, INCX2
  230. LFDUX A6, X, INCX2
  231. LFSDUX A2, X2, INCX2
  232. LFDUX A7, X, INCX2
  233. LFSDUX A3, X2, INCX2
  234. LFDUX A8, X, INCX2
  235. LFSDUX A4, X2, INCX2
  236. bdz LL(113)
  237. .align 4
  238. LL(112):
  /* Steady state. The X2 loads run four registers ahead of the abs that   */
  /* consumes them: the first half of the body completes A5..A8 for the    */
  /* current block while reloading A1..A4 through X for the next one; the  */
  /* second half does the X2 loads for the new A1..A4 and the X loads for  */
  /* the new A5..A8.                                                       */
  239. fpadd C1, C1, T1
  240. LFSDUX A5, X2, INCX2
  241. fpabs T1, A1
  242. LFDUX A1, X, INCX2
  243. fpadd C2, C2, T2
  244. LFSDUX A6, X2, INCX2
  245. fpabs T2, A2
  246. LFDUX A2, X, INCX2
  247. fpadd C3, C3, T3
  248. LFSDUX A7, X2, INCX2
  249. fpabs T3, A3
  250. LFDUX A3, X, INCX2
  251. fpadd C4, C4, T4
  252. LFSDUX A8, X2, INCX2
  253. fpabs T4, A4
  254. LFDUX A4, X, INCX2
  255. fpadd C1, C1, T1
  256. LFSDUX A1, X2, INCX2
  257. fpabs T1, A5
  258. LFDUX A5, X, INCX2
  259. fpadd C2, C2, T2
  260. LFSDUX A2, X2, INCX2
  261. fpabs T2, A6
  262. LFDUX A6, X, INCX2
  263. fpadd C3, C3, T3
  264. LFSDUX A3, X2, INCX2
  265. fpabs T3, A7
  266. LFDUX A7, X, INCX2
  267. fpadd C4, C4, T4
  268. LFSDUX A4, X2, INCX2
  269. fpabs T4, A8
  270. LFDUX A8, X, INCX2
  271. bdnz LL(112)
  272. .align 4
  273. LL(113):
  /* Pipeline drain: issue the four outstanding X2 loads for A5..A8, then  */
  /* finish the abs/add sequence for the last block with no further loads. */
  274. fpadd C1, C1, T1
  275. nop
  276. fpabs T1, A1
  277. LFSDUX A5, X2, INCX2
  278. fpadd C2, C2, T2
  279. nop
  280. fpabs T2, A2
  281. LFSDUX A6, X2, INCX2
  282. fpadd C3, C3, T3
  283. nop
  284. fpabs T3, A3
  285. LFSDUX A7, X2, INCX2
  286. fpadd C4, C4, T4
  287. nop
  288. fpabs T4, A4
  289. LFSDUX A8, X2, INCX2
  290. fpadd C1, C1, T1
  291. fpabs T1, A5
  292. fpadd C2, C2, T2
  293. fpabs T2, A6
  294. fpadd C3, C3, T3
  295. fpabs T3, A7
  296. fpadd C4, C4, T4
  297. fpabs T4, A8
  298. fpadd C1, C1, T1
  299. fpadd C2, C2, T2
  300. fpadd C3, C3, T3
  301. fpadd C4, C4, T4
  302. .align 4
  303. LL(115):
  /* Strided remainder: N & 15 elements. Unlike the main loop this uses    */
  /* the scalar fabs/fadd, with each element loaded into its own register  */
  /* by alternating LFDUX through X (even index) and X2 (odd index).       */
  304. andi. r0, N, 15
  305. beq LL(999)
  /* Bit 3 set: eight elements.                                            */
  306. andi. r0, N, 8
  307. beq LL(116)
  308. LFDUX A1, X, INCX2
  309. LFDUX A2, X2, INCX2
  310. LFDUX A3, X, INCX2
  311. LFDUX A4, X2, INCX2
  312. fabs T1, A1
  313. LFDUX A5, X, INCX2
  314. fabs T2, A2
  315. LFDUX A6, X2, INCX2
  316. fabs T3, A3
  317. LFDUX A7, X, INCX2
  318. fabs T4, A4
  319. LFDUX A8, X2, INCX2
  320. fadd C1, C1, T1
  321. fabs T1, A5
  322. fadd C2, C2, T2
  323. fabs T2, A6
  324. fadd C3, C3, T3
  325. fabs T3, A7
  326. fadd C4, C4, T4
  327. fabs T4, A8
  328. fadd C1, C1, T1
  329. fadd C2, C2, T2
  330. fadd C3, C3, T3
  331. fadd C4, C4, T4
  332. .align 4
  333. LL(116):
  /* Bit 2 set: four elements.                                             */
  334. andi. r0, N, 4
  335. beq LL(117)
  336. LFDUX A1, X, INCX2
  337. LFDUX A2, X2, INCX2
  338. LFDUX A3, X, INCX2
  339. LFDUX A4, X2, INCX2
  340. fabs T1, A1
  341. fabs T2, A2
  342. fabs T3, A3
  343. fabs T4, A4
  344. fadd C1, C1, T1
  345. fadd C2, C2, T2
  346. fadd C3, C3, T3
  347. fadd C4, C4, T4
  348. .align 4
  349. LL(117):
  /* Bit 1 set: two elements.                                              */
  350. andi. r0, N, 2
  351. beq LL(118)
  352. LFDUX A1, X, INCX2
  353. LFDUX A2, X2, INCX2
  354. fabs T1, A1
  355. fabs T2, A2
  356. fadd C1, C1, T1
  357. fadd C2, C2, T2
  358. .align 4
  359. LL(118):
  /* Bit 0 set: one last element, at the next even-index position          */
  /* X + INCX2; falls through into the common exit.                        */
  360. andi. r0, N, 1
  361. beq LL(999)
  362. LFDX A1, X, INCX2
  363. fabs T1, A1
  364. fadd C1, C1, T1
  365. .align 4
  366. LL(999):
  /* Common exit: fold the four accumulators into C1, interleaved with the */
  /* stack unwind. Each lfpdux first advances SP by 16: the first steps    */
  /* over the 16 zero bytes onto the f15 slot, the second onto the f14     */
  /* slot, and the addi then steps past that slot, returning SP to its     */
  /* entry value.                                                          */
  367. fpadd C1, C1, C2
  368. li r10, 16
  369. fpadd C3, C3, C4
  370. fpadd C1, C1, C3
  371. lfpdux f15, SP, r10
  /* Horizontal add of the two halves of C1: fsmtp presumably copies the   */
  /* secondary half of C1 into the primary half of C2, and the scalar fadd */
  /* below then leaves the total in C1 (f1).                               */
  372. fsmtp C2, C1
  373. lfpdux f14, SP, r10
  374. addi SP, SP, 16
  375. fadd C1, C2, C1
  376. blr
  377. EPILOGUE