You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

scal_hummer.S 9.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build setup: ASSEMBLER is defined before common.h is included.        */
  /* common.h is not visible here; it is expected to supply the macros     */
  /* used by the routine below (PROLOGUE, EPILOGUE, PROFCODE, LL, SIZE,    */
  /* BASE_SHIFT and the upper-case load/store macros).                     */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer register aliases, as used by the routine below:               */
  /*   N     - element count (tested against 0, drives the loop counts)    */
  /*   X     - vector pointer; the load pointer in the scaling paths and   */
  /*           the only pointer in the zero-fill paths                     */
  /*   INCX  - stride; shifted left by BASE_SHIFT in place at entry        */
  40. #define N r3
  41. #define X r6
  42. #define INCX r7
  /*   INCX2 - computed as INCX + INCX; index step for pair accesses       */
  /*   X2    - store pointer in the scaling paths                          */
  43. #define INCX2 r4
  44. #define X2 r5
  /* ALPHA is the scale factor.                                            */
  45. #define ALPHA f1
  /* A1..A8 receive loaded vector data. A1 is first loaded with zero and   */
  /* used as the zero constant (alpha == 0 test and zero-fill stores).     */
  /* A2 is f16, one of the registers the routine saves and restores.       */
  46. #define A1 f0
  47. #define A2 f16
  48. #define A3 f2
  49. #define A4 f3
  50. #define A5 f4
  51. #define A6 f5
  52. #define A7 f6
  53. #define A8 f7
  /* B1..B8 receive the products ALPHA * A1..A8 before they are stored.    */
  /* B7 and B8 are f14 and f15, which the routine saves and restores.      */
  54. #define B1 f8
  55. #define B2 f9
  56. #define B3 f10
  57. #define B4 f11
  58. #define B5 f12
  59. #define B6 f13
  60. #define B7 f14
  61. #define B8 f15
  /* --------------------------------------------------------------------- */
  /* In-place vector scale: for N elements starting at X with stride INCX, */
  /* replace each element by ALPHA * element. When ALPHA compares equal to */
  /* zero the elements are overwritten with zero without being read.       */
  /*                                                                       */
  /* Inputs (register aliases defined above): N, ALPHA, X, INCX.           */
  /* Nothing is done when N <= 0.                                          */
  /*                                                                       */
  /* Four code paths are selected at entry:                                */
  /*   LL(11)..LL(18)    stride == SIZE, ALPHA == 0 : paired zero stores   */
  /*   LL(50)..LL(58)    stride == SIZE, ALPHA != 0 : paired load/mul/store*/
  /*   LL(112)..LL(118)  other stride,   ALPHA == 0 : scalar zero stores   */
  /*   LL(200)..LL(218)  other stride,   ALPHA != 0 : scalar load/mul/store*/
  /*                                                                       */
  /* f14, f15 and f16 are saved on entry and restored at LL(999).          */
  /* NOTE(review): the upper-case memory macros (LFDX, LFDUX, LFPDUX,      */
  /* STFDX, STFDUX, STFPDUX) come from common.h; the comments below assume */
  /* they map to the same-named indexed load/store instructions, with the  */
  /* "U" forms writing the effective address back to the base register.    */
  /* --------------------------------------------------------------------- */
  62. PROLOGUE
  63. PROFCODE
  /* Save f14, f15, f16 below SP, stepping SP down 16 bytes per register.  */
  64. li r10, -16
  65. stfpdux f14, SP, r10
  66. stfpdux f15, SP, r10
  67. stfpdux f16, SP, r10
  /* Push four zero words (16 bytes) and load them back so A1 holds zero.  */
  68. li r10, 0
  69. stwu r10, -4(SP)
  70. stwu r10, -4(SP)
  71. stwu r10, -4(SP)
  72. stwu r10, -4(SP)
  73. lfpdx A1, SP, r10 # Zero clear
  /* Presumably copies ALPHA's primary half into its secondary half so the */
  /* paired fpmul below scales both lanes - confirm against the ISA doc.   */
  74. fsmfp ALPHA, ALPHA
  /* Scale the stride by BASE_SHIFT (presumably elements -> bytes) and     */
  /* form the two-element step INCX2 = 2 * INCX.                           */
  75. slwi INCX, INCX, BASE_SHIFT
  76. add INCX2, INCX, INCX
  /* Nothing to do for N <= 0.                                             */
  77. cmpwi cr0, N, 0
  78. ble LL(999)
  /* Stride other than SIZE: use the scalar paths at LL(100).              */
  79. cmpwi cr0, INCX, SIZE
  80. bne LL(100)
  /* A1 is zero here; a not-equal result (ALPHA != 0) selects the scaling  */
  /* path at LL(50).                                                       */
  81. fcmpu cr7, ALPHA, A1
  82. bne cr7, LL(50)
  /* ---- stride == SIZE, ALPHA == 0: zero fill -------------------------- */
  /* Pre-bias X by one pair step: every access below addresses X + INCX2.  */
  83. sub X, X, INCX2
  /* If X is not a multiple of 2 * SIZE, store one scalar zero first and   */
  /* advance by one element so the paired stores start on a pair boundary. */
  84. andi. r0, X, 2 * SIZE - 1
  85. beq LL(11)
  86. STFDX A1, X, INCX2
  87. addi X, X, 1 * SIZE
  88. addi N, N, -1
  89. cmpwi cr0, N, 0
  90. ble LL(999)
  91. .align 4
  92. LL(11):
  /* CTR = N / 16; skip the main loop when that is zero. The cr0 result of */
  /* srawi. is what beq- tests.                                            */
  93. srawi. r0, N, 4
  94. mtspr CTR, r0
  95. beq- LL(15)
  96. .align 4
  97. LL(12):
  /* Main loop: 8 paired stores = 16 elements per iteration.               */
  98. STFPDUX A1, X, INCX2
  99. STFPDUX A1, X, INCX2
  100. STFPDUX A1, X, INCX2
  101. STFPDUX A1, X, INCX2
  102. STFPDUX A1, X, INCX2
  103. STFPDUX A1, X, INCX2
  104. STFPDUX A1, X, INCX2
  105. STFPDUX A1, X, INCX2
  106. bdnz LL(12)
  107. .align 4
  108. LL(15):
  /* Remainder (N mod 16), handled by testing bits 8, 4, 2, 1 of N.        */
  109. andi. r0, N, 15
  110. beq LL(999)
  111. andi. r0, N, 8
  112. beq LL(16)
  113. STFPDUX A1, X, INCX2
  114. STFPDUX A1, X, INCX2
  115. STFPDUX A1, X, INCX2
  116. STFPDUX A1, X, INCX2
  117. .align 4
  118. LL(16):
  119. andi. r0, N, 4
  120. beq LL(17)
  121. STFPDUX A1, X, INCX2
  122. STFPDUX A1, X, INCX2
  123. .align 4
  124. LL(17):
  125. andi. r0, N, 2
  126. beq LL(18)
  127. STFPDUX A1, X, INCX2
  128. .align 4
  129. LL(18):
  /* Last odd element: a single scalar store.                              */
  130. andi. r0, N, 1
  131. beq LL(999)
  132. STFDUX A1, X, INCX2
  133. b LL(999)
  134. .align 4
  135. LL(50):
  /* ---- stride == SIZE, ALPHA != 0: scale ------------------------------ */
  /* X is the load pointer and X2 the store pointer; both are pre-biased   */
  /* by one pair step because every access addresses pointer + INCX2.      */
  136. sub X2, X, INCX2
  137. sub X, X, INCX2
  /* If X is not a multiple of 2 * SIZE, scale one element with scalar     */
  /* instructions and advance both pointers by one element.                */
  138. andi. r0, X, 2 * SIZE - 1
  139. beq LL(51)
  140. LFDX A1, X, INCX2
  141. addi X, X, 1 * SIZE
  142. fmul B1, ALPHA, A1
  143. addi N, N, -1
  /* The cr0 result of this compare is consumed by the ble three lines     */
  /* down; the instructions in between are not record forms.               */
  144. cmpwi cr0, N, 0
  145. STFDX B1, X2, INCX2
  146. addi X2, X2, 1 * SIZE
  147. ble LL(999)
  148. .align 4
  149. LL(51):
  /* CTR = N / 16; skip to the remainder when that is zero.                */
  150. srawi. r0, N, 4
  151. mtspr CTR, r0
  152. beq- LL(55)
  /* Preload the first block of 8 pairs (16 elements).                     */
  153. LFPDUX A1, X, INCX2
  154. LFPDUX A2, X, INCX2
  155. LFPDUX A3, X, INCX2
  156. LFPDUX A4, X, INCX2
  157. LFPDUX A5, X, INCX2
  158. LFPDUX A6, X, INCX2
  159. LFPDUX A7, X, INCX2
  160. LFPDUX A8, X, INCX2
  /* Decrement CTR; if it reaches zero only this one block exists, so go   */
  /* straight to the drain code at LL(53).                                 */
  161. bdz LL(53)
  162. .align 4
  163. LL(52):
  /* Steady state: multiply the block already in A1..A8 while reloading    */
  /* A1..A8 with the next block, then store the eight products.            */
  164. fpmul B1, ALPHA, A1
  165. LFPDUX A1, X, INCX2
  166. fpmul B2, ALPHA, A2
  167. LFPDUX A2, X, INCX2
  168. fpmul B3, ALPHA, A3
  169. LFPDUX A3, X, INCX2
  170. fpmul B4, ALPHA, A4
  171. LFPDUX A4, X, INCX2
  172. fpmul B5, ALPHA, A5
  173. LFPDUX A5, X, INCX2
  174. fpmul B6, ALPHA, A6
  175. LFPDUX A6, X, INCX2
  176. fpmul B7, ALPHA, A7
  177. LFPDUX A7, X, INCX2
  178. fpmul B8, ALPHA, A8
  179. LFPDUX A8, X, INCX2
  180. STFPDUX B1, X2, INCX2
  181. STFPDUX B2, X2, INCX2
  182. STFPDUX B3, X2, INCX2
  183. STFPDUX B4, X2, INCX2
  184. STFPDUX B5, X2, INCX2
  185. STFPDUX B6, X2, INCX2
  186. STFPDUX B7, X2, INCX2
  187. STFPDUX B8, X2, INCX2
  188. bdnz LL(52)
  189. .align 4
  190. LL(53):
  /* Drain: multiply and store the last block that was loaded; no further  */
  /* loads are issued here.                                                */
  191. fpmul B1, ALPHA, A1
  192. fpmul B2, ALPHA, A2
  193. fpmul B3, ALPHA, A3
  194. fpmul B4, ALPHA, A4
  195. fpmul B5, ALPHA, A5
  196. fpmul B6, ALPHA, A6
  197. STFPDUX B1, X2, INCX2
  198. fpmul B7, ALPHA, A7
  199. STFPDUX B2, X2, INCX2
  200. fpmul B8, ALPHA, A8
  201. STFPDUX B3, X2, INCX2
  202. STFPDUX B4, X2, INCX2
  203. STFPDUX B5, X2, INCX2
  204. STFPDUX B6, X2, INCX2
  205. STFPDUX B7, X2, INCX2
  206. STFPDUX B8, X2, INCX2
  207. .align 4
  208. LL(55):
  /* Remainder (N mod 16), handled by testing bits 8, 4, 2, 1 of N.        */
  209. andi. r0, N, 15
  210. beq LL(999)
  211. andi. r0, N, 8
  212. beq LL(56)
  213. LFPDUX A1, X, INCX2
  214. LFPDUX A2, X, INCX2
  215. LFPDUX A3, X, INCX2
  216. LFPDUX A4, X, INCX2
  217. fpmul B1, ALPHA, A1
  218. fpmul B2, ALPHA, A2
  219. fpmul B3, ALPHA, A3
  220. fpmul B4, ALPHA, A4
  221. STFPDUX B1, X2, INCX2
  222. STFPDUX B2, X2, INCX2
  223. STFPDUX B3, X2, INCX2
  224. STFPDUX B4, X2, INCX2
  225. .align 4
  226. LL(56):
  227. andi. r0, N, 4
  228. beq LL(57)
  229. LFPDUX A1, X, INCX2
  230. LFPDUX A2, X, INCX2
  231. fpmul B1, ALPHA, A1
  232. fpmul B2, ALPHA, A2
  233. STFPDUX B1, X2, INCX2
  234. STFPDUX B2, X2, INCX2
  235. .align 4
  236. LL(57):
  237. andi. r0, N, 2
  238. beq LL(58)
  239. LFPDUX A1, X, INCX2
  240. fpmul B1, ALPHA, A1
  241. STFPDUX B1, X2, INCX2
  242. .align 4
  243. LL(58):
  /* Last odd element: scalar load, multiply and store.                    */
  244. andi. r0, N, 1
  245. beq LL(999)
  246. LFDX A1, X, INCX2
  247. fmul B1, ALPHA, A1
  248. STFDX B1, X2, INCX2
  249. b LL(999)
  250. .align 4
  251. LL(100):
  /* ---- stride != SIZE ------------------------------------------------- */
  /* A1 is still zero; a not-equal result (ALPHA != 0) selects LL(200).    */
  252. fcmpu cr7, ALPHA, A1
  253. bne cr7, LL(200)
  /* ALPHA == 0: zero fill with scalar stores. X is pre-biased by one      */
  /* stride because every store addresses X + INCX.                        */
  254. sub X, X, INCX
  /* CTR = N / 8; skip the main loop when that is zero.                    */
  255. srawi. r0, N, 3
  256. mtspr CTR, r0
  257. beq- LL(115)
  258. .align 4
  259. LL(112):
  /* Main loop: 8 elements per iteration.                                  */
  260. STFDUX A1, X, INCX
  261. STFDUX A1, X, INCX
  262. STFDUX A1, X, INCX
  263. STFDUX A1, X, INCX
  264. STFDUX A1, X, INCX
  265. STFDUX A1, X, INCX
  266. STFDUX A1, X, INCX
  267. STFDUX A1, X, INCX
  268. bdnz LL(112)
  269. .align 4
  270. LL(115):
  /* Remainder (N mod 8), handled by testing bits 4, 2, 1 of N.            */
  271. andi. r0, N, 7
  272. beq LL(999)
  273. andi. r0, N, 4
  274. beq LL(117)
  275. STFDUX A1, X, INCX
  276. STFDUX A1, X, INCX
  277. STFDUX A1, X, INCX
  278. STFDUX A1, X, INCX
  279. .align 4
  280. LL(117):
  281. andi. r0, N, 2
  282. beq LL(118)
  283. STFDUX A1, X, INCX
  284. STFDUX A1, X, INCX
  285. .align 4
  286. LL(118):
  287. andi. r0, N, 1
  288. beq LL(999)
  289. STFDUX A1, X, INCX
  290. b LL(999)
  291. .align 4
  292. LL(200):
  /* ---- stride != SIZE, ALPHA != 0: scale with scalar instructions ----- */
  /* X is the load pointer and X2 the store pointer; both are pre-biased   */
  /* by one stride because every access addresses pointer + INCX.          */
  293. sub X2, X, INCX
  294. sub X, X, INCX
  /* CTR = N / 8; skip to the remainder when that is zero.                 */
  295. srawi. r0, N, 3
  296. mtspr CTR, r0
  297. beq- LL(215)
  /* Preload the first block of 8 elements.                                */
  298. LFDUX A1, X, INCX
  299. LFDUX A2, X, INCX
  300. LFDUX A3, X, INCX
  301. LFDUX A4, X, INCX
  302. LFDUX A5, X, INCX
  303. LFDUX A6, X, INCX
  304. LFDUX A7, X, INCX
  305. LFDUX A8, X, INCX
  /* Decrement CTR; if it reaches zero only this one block exists, so go   */
  /* straight to the drain code at LL(213).                                */
  306. bdz LL(213)
  307. .align 4
  308. LL(212):
  /* Steady state: multiply the block already in A1..A8 while reloading    */
  /* A1..A8 with the next block, then store the eight products.            */
  309. fmul B1, ALPHA, A1
  310. LFDUX A1, X, INCX
  311. fmul B2, ALPHA, A2
  312. LFDUX A2, X, INCX
  313. fmul B3, ALPHA, A3
  314. LFDUX A3, X, INCX
  315. fmul B4, ALPHA, A4
  316. LFDUX A4, X, INCX
  317. fmul B5, ALPHA, A5
  318. LFDUX A5, X, INCX
  319. fmul B6, ALPHA, A6
  320. LFDUX A6, X, INCX
  321. fmul B7, ALPHA, A7
  322. LFDUX A7, X, INCX
  323. fmul B8, ALPHA, A8
  324. LFDUX A8, X, INCX
  325. STFDUX B1, X2, INCX
  326. STFDUX B2, X2, INCX
  327. STFDUX B3, X2, INCX
  328. STFDUX B4, X2, INCX
  329. STFDUX B5, X2, INCX
  330. STFDUX B6, X2, INCX
  331. STFDUX B7, X2, INCX
  332. STFDUX B8, X2, INCX
  333. bdnz LL(212)
  334. .align 4
  335. LL(213):
  /* Drain: multiply and store the last block that was loaded; no further  */
  /* loads are issued here.                                                */
  336. fmul B1, ALPHA, A1
  337. fmul B2, ALPHA, A2
  338. fmul B3, ALPHA, A3
  339. fmul B4, ALPHA, A4
  340. fmul B5, ALPHA, A5
  341. fmul B6, ALPHA, A6
  342. STFDUX B1, X2, INCX
  343. fmul B7, ALPHA, A7
  344. STFDUX B2, X2, INCX
  345. fmul B8, ALPHA, A8
  346. STFDUX B3, X2, INCX
  347. STFDUX B4, X2, INCX
  348. STFDUX B5, X2, INCX
  349. STFDUX B6, X2, INCX
  350. STFDUX B7, X2, INCX
  351. STFDUX B8, X2, INCX
  352. .align 4
  353. LL(215):
  /* Remainder (N mod 8), handled by testing bits 4, 2, 1 of N.            */
  354. andi. r0, N, 7
  355. beq LL(999)
  356. andi. r0, N, 4
  357. beq LL(217)
  358. LFDUX A1, X, INCX
  359. LFDUX A2, X, INCX
  360. LFDUX A3, X, INCX
  361. LFDUX A4, X, INCX
  362. fmul B1, ALPHA, A1
  363. fmul B2, ALPHA, A2
  364. fmul B3, ALPHA, A3
  365. fmul B4, ALPHA, A4
  366. STFDUX B1, X2, INCX
  367. STFDUX B2, X2, INCX
  368. STFDUX B3, X2, INCX
  369. STFDUX B4, X2, INCX
  370. .align 4
  371. LL(217):
  372. andi. r0, N, 2
  373. beq LL(218)
  374. LFDUX A1, X, INCX
  375. LFDUX A2, X, INCX
  376. fmul B1, ALPHA, A1
  377. fmul B2, ALPHA, A2
  378. STFDUX B1, X2, INCX
  379. STFDUX B2, X2, INCX
  380. .align 4
  381. LL(218):
  /* Last odd element; falls through into the common exit.                 */
  382. andi. r0, N, 1
  383. beq LL(999)
  384. LFDUX A1, X, INCX
  385. fmul B1, ALPHA, A1
  386. STFDUX B1, X2, INCX
  387. .align 4
  388. LL(999):
  /* Common exit: SP currently points at the 16-byte zero scratch area.    */
  /* Each update-form load steps SP up by 16 and reloads one saved         */
  /* register, in reverse order of the saves; the final addi releases the  */
  /* last 16-byte slot so SP returns to its entry value.                   */
  389. li r10, 16
  390. lfpdux f16, SP, r10
  391. lfpdux f15, SP, r10
  392. lfpdux f14, SP, r10
  393. addi SP, SP, 16
  394. blr
  395. EPILOGUE