You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

swap_hummer.S 13 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register aliases for the routine below. */
  /* N is the count tested against 0 and used to derive the loop counts. */
  40. #define N r3
  /* X / Y are the load pointers; INCX / INCY are their strides, which the */
  /* entry code shifts left by BASE_SHIFT before use. */
  41. #define X r6
  42. #define INCX r7
  43. #define Y r8
  44. #define INCY r9
  /* INCX2 / INCY2 are set to INCX+INCX and INCY+INCY by the entry code. */
  45. #define INCX2 r4
  46. #define INCY2 r5
  /* X2 / Y2 are copies of X / Y used as the store pointers. */
  /* NOTE: X2 is r10, which the entry and exit code also use directly as */
  /* the stack step for saving and restoring f14-f16. */
  47. #define X2 r10
  48. #define Y2 r11
  /* A1-A5 and B1-B5 hold loaded data. */
  49. #define A1 f0
  50. #define A2 f1
  51. #define A3 f2
  52. #define A4 f3
  53. #define A5 f4
  54. #define B1 f5
  55. #define B2 f6
  56. #define B3 f7
  57. #define B4 f8
  58. #define B5 f9
  /* T1-T7 are temporaries used by the mixed-alignment paths. */
  59. #define T1 f10
  60. #define T2 f11
  61. #define T3 f12
  62. #define T4 f13
  /* T5-T7 map to f14-f16, the three registers saved at entry and reloaded */
  /* at LL(999). The T7 alias is not referenced in the code visible here. */
  63. #define T5 f14
  64. #define T6 f15
  65. #define T7 f16
  /* Entry: save f14-f16, scale the strides, then select a code path from */
  /* the strides and the alignment of X and Y. */
  /* PROLOGUE / PROFCODE are macros from common.h (not visible here). */
  66. PROLOGUE
  67. PROFCODE
  /* r10 = -16 is the step for the three saves below; r10 is reused as X2 */
  /* once the pointers are set up. */
  68. li r10, -16
  69. stfpdux f14, SP, r10
  70. stfpdux f15, SP, r10
  71. stfpdux f16, SP, r10
  /* INCX / INCY <<= BASE_SHIFT; presumably converts element strides into */
  /* byte strides (BASE_SHIFT is defined outside this file - confirm). */
  72. slwi INCX, INCX, BASE_SHIFT
  73. slwi INCY, INCY, BASE_SHIFT
  /* INCX2 = 2 * INCX, INCY2 = 2 * INCY. */
  74. add INCX2, INCX, INCX
  75. add INCY2, INCY, INCY
  /* N <= 0: nothing to do, go straight to the exit code. */
  76. cmpwi cr0, N, 0
  77. ble LL(999)
  /* If either scaled stride differs from SIZE, use the generic path LL(100). */
  78. cmpwi cr0, INCX, SIZE
  79. bne LL(100)
  80. cmpwi cr0, INCY, SIZE
  81. bne LL(100)
  /* Both strides equal SIZE here. Move X and Y back by one paired step; */
  /* presumably because the paired loads/stores below apply the step as */
  /* part of the access (update forms) - confirm against the macros. */
  82. sub X, X, INCX2
  83. sub Y, Y, INCY2
  /* Store pointers start at the same place as the load pointers. */
  84. mr X2, X
  85. mr Y2, Y
  /* Test the low bits of X and Y against a 2*SIZE boundary: */
  /* X unaligned -> LL(30); X aligned, Y unaligned -> LL(20); */
  /* both aligned -> fall through into LL(10). */
  86. andi. r0, X, 2 * SIZE - 1
  87. bne LL(30)
  88. andi. r0, Y, 2 * SIZE - 1
  89. bne LL(20)
  90. .align 4
  /* Both pointers aligned: exchange X and Y using paired loads and stores. */
  /* Values loaded from X (A regs) are stored through Y2, and values loaded */
  /* from Y (B regs) are stored through X2. */
  /* Stores through X2 use INCY2 as the step; this path is only reached */
  /* when INCX == INCY == SIZE, so INCX2 and INCY2 hold the same value. */
  91. LL(10): /* X : aligned Y : aligned */
  /* CTR = N >> 3: each iteration moves four paired steps per vector. */
  92. srawi. r0, N, 3
  93. mtspr CTR, r0
  /* No full group of 8: go to the tail handling. */
  94. beq- LL(15)
  /* Preload the first four pairs of each vector. */
  95. LFPDUX A1, X, INCX2
  96. LFPDUX B1, Y, INCY2
  97. LFPDUX A2, X, INCX2
  98. LFPDUX B2, Y, INCY2
  99. LFPDUX A3, X, INCX2
  100. LFPDUX B3, Y, INCY2
  101. LFPDUX A4, X, INCX2
  102. LFPDUX B4, Y, INCY2
  /* Only one group: skip the steady-state loop and just drain. */
  103. bdz LL(13)
  104. .align 4
  /* Steady state: store the previously loaded pair, then load the next */
  /* pair into the register that was just stored. */
  105. LL(12):
  106. STFPDUX B1, X2, INCY2
  107. LFPDUX B1, Y, INCY2
  108. STFPDUX A1, Y2, INCY2
  109. LFPDUX A1, X, INCX2
  110. STFPDUX B2, X2, INCY2
  111. LFPDUX B2, Y, INCY2
  112. STFPDUX A2, Y2, INCY2
  113. LFPDUX A2, X, INCX2
  114. STFPDUX B3, X2, INCY2
  115. LFPDUX B3, Y, INCY2
  116. STFPDUX A3, Y2, INCY2
  117. LFPDUX A3, X, INCX2
  118. STFPDUX B4, X2, INCY2
  119. LFPDUX B4, Y, INCY2
  120. STFPDUX A4, Y2, INCY2
  121. LFPDUX A4, X, INCX2
  122. bdnz LL(12)
  123. .align 4
  /* Drain: store the last four pairs that are still held in registers. */
  124. LL(13):
  125. STFPDUX B1, X2, INCY2
  126. STFPDUX A1, Y2, INCY2
  127. STFPDUX B2, X2, INCY2
  128. STFPDUX A2, Y2, INCY2
  129. STFPDUX B3, X2, INCY2
  130. STFPDUX A3, Y2, INCY2
  131. STFPDUX B4, X2, INCY2
  132. STFPDUX A4, Y2, INCY2
  133. .align 4
  /* Tail: handle the remaining N & 7 elements by testing bits 4, 2, 1. */
  134. LL(15):
  135. andi. r0, N, 7
  136. beq LL(999)
  /* Bit 4 set: two paired steps per vector. */
  137. andi. r0, N, 4
  138. beq LL(16)
  139. LFPDUX A1, X, INCX2
  140. LFPDUX B1, Y, INCY2
  141. LFPDUX A2, X, INCX2
  142. LFPDUX B2, Y, INCY2
  143. STFPDUX B1, X2, INCY2
  144. STFPDUX A1, Y2, INCY2
  145. STFPDUX B2, X2, INCY2
  146. STFPDUX A2, Y2, INCY2
  147. .align 4
  /* Bit 2 set: one paired step per vector. */
  148. LL(16):
  149. andi. r0, N, 2
  150. beq LL(17)
  151. LFPDUX A1, X, INCX2
  152. LFPDUX B1, Y, INCY2
  153. STFPDUX B1, X2, INCY2
  154. STFPDUX A1, Y2, INCY2
  155. .align 4
  /* Bit 1 set: one last element, moved with the non-paired LFDUX/STFDUX. */
  156. LL(17):
  157. andi. r0, N, 1
  158. beq LL(999)
  159. LFDUX A1, X, INCX2
  160. LFDUX B1, Y, INCY2
  161. STFDUX B1, X2, INCY2
  162. STFDUX A1, Y2, INCY2
  163. b LL(999)
  164. .align 4
  /* X is aligned to 2*SIZE, Y is not. */
  /* In this path, registers loaded from X (A1..A5, T1, T3, T5) are stored */
  /* through Y2, and registers loaded from Y (B1..B5, T2, T4, T6) are stored */
  /* through X2. */
  /* NOTE(review): LFXDUX, STFSDX, LFSDX and the fsmr / fxmr / fpmr / fsmtp */
  /* moves appear to be the cross/secondary-half operations used to shift */
  /* the data stream by one element between the two alignments; their */
  /* exact semantics come from macros and the ISA, not from this file - */
  /* confirm before changing any ordering here. */
  165. LL(20): /* X : aligned Y : unaligned */
  /* Peel one element: load from X (with update), load from Y at Y+INCY2 */
  /* without moving Y, and store one value out through Y2. */
  166. LFXDUX A1, X, INCX2
  167. LFDX B1, Y, INCY2
  168. STFSDX A1, Y2, INCY2
  /* Advance Y and Y2 by a single stride; presumably this makes the */
  /* remaining paired accesses to Y aligned. */
  169. add Y, Y, INCY
  170. add Y2, Y2, INCY
  /* One element consumed. If none remain, LL(29) stores the pending B1. */
  171. addi N, N, -1
  172. cmpwi cr0, N, 0
  173. ble LL(29)
  174. .align 4
  /* CTR = N >> 3 for the remaining count. */
  175. srawi. r0, N, 3
  176. mtspr CTR, r0
  177. beq- LL(25)
  /* Preload for the pipelined loop. */
  178. LFXDUX T1, X, INCX2
  179. LFXDUX T2, Y, INCY2
  180. LFXDUX T3, X, INCX2
  181. LFXDUX T4, Y, INCY2
  182. LFPDUX A4, X, INCX2
  183. fsmr A1, T1
  184. LFPDUX B4, Y, INCY2
  185. fsmr B1, T2
  186. LFPDUX A5, X, INCX2
  187. fsmr T1, T3
  188. LFPDUX B5, Y, INCY2
  189. fsmr T2, T4
  /* Only one group: skip the steady-state loop and just drain. */
  190. bdz LL(23)
  191. .align 4
  /* Steady state: register moves are interleaved with the stores of the */
  /* previous group and the loads of the next group. */
  192. LL(22):
  193. fxmr T5, A4
  194. STFPDUX A1, Y2, INCY2
  195. fxmr T6, B4
  196. STFPDUX B1, X2, INCX2
  197. fxmr A1, A5
  198. STFPDUX T1, Y2, INCY2
  199. fxmr B1, B5
  200. STFPDUX T2, X2, INCX2
  201. fsmr T3, T5
  202. LFPDUX A2, X, INCX2
  203. fsmr T4, T6
  204. LFPDUX B2, Y, INCY2
  205. fsmr T5, A1
  206. LFPDUX A3, X, INCX2
  207. fsmr T6, B1
  208. LFPDUX B3, Y, INCY2
  209. fxmr T1, A2
  210. STFPDUX T3, Y2, INCY2
  211. fxmr T2, B2
  212. STFPDUX T4, X2, INCX2
  213. fxmr T3, A3
  214. STFPDUX T5, Y2, INCY2
  215. fxmr T4, B3
  216. STFPDUX T6, X2, INCX2
  217. fsmr A1, T1
  218. LFPDUX A4, X, INCX2
  219. fsmr B1, T2
  220. LFPDUX B4, Y, INCY2
  221. fsmr T1, T3
  222. LFPDUX A5, X, INCX2
  223. fsmr T2, T4
  224. LFPDUX B5, Y, INCY2
  225. bdnz LL(22)
  226. .align 4
  /* Drain: same moves and stores as the first half of the loop body, */
  /* with no further loads. A1 / B1 are left holding carried values that */
  /* the tail code and LL(29) use. */
  227. LL(23):
  228. fxmr T5, A4
  229. STFPDUX A1, Y2, INCY2
  230. fxmr T6, B4
  231. STFPDUX B1, X2, INCX2
  232. fxmr A1, A5
  233. STFPDUX T1, Y2, INCY2
  234. fxmr B1, B5
  235. STFPDUX T2, X2, INCX2
  236. fsmr T3, T5
  237. fsmr T4, T6
  238. fsmr T5, A1
  239. fsmr T6, B1
  240. STFPDUX T3, Y2, INCY2
  241. STFPDUX T4, X2, INCX2
  242. STFPDUX T5, Y2, INCY2
  243. STFPDUX T6, X2, INCX2
  244. .align 4
  /* Tail: handle the remaining N & 7 elements by testing bits 4, 2, 1. */
  245. LL(25):
  246. andi. r0, N, 7
  247. beq LL(29)
  /* Bit 4 set: two paired steps per vector; A1 / B1 are refreshed from */
  /* A3 / B3 afterwards to carry on into the next stage. */
  248. andi. r0, N, 4
  249. beq LL(27)
  250. LFXDUX A2, X, INCX2
  251. LFXDUX B2, Y, INCY2
  252. LFXDUX A3, X, INCX2
  253. LFXDUX B3, Y, INCY2
  254. fsmr A1, A2
  255. fsmr B1, B2
  256. fsmr A2, A3
  257. fsmr B2, B3
  258. STFPDUX A1, Y2, INCY2
  259. STFPDUX B1, X2, INCX2
  260. STFPDUX A2, Y2, INCY2
  261. fpmr A1, A3
  262. STFPDUX B2, X2, INCX2
  263. fpmr B1, B3
  264. .align 4
  /* Bit 2 set: one paired step per vector; A1 / B1 refreshed from A2 / B2. */
  265. LL(27):
  266. andi. r0, N, 2
  267. beq LL(28)
  268. LFXDUX A2, X, INCX2
  269. LFXDUX B2, Y, INCY2
  270. fsmr A1, A2
  271. fsmr B1, B2
  272. STFPDUX A1, Y2, INCY2
  273. fpmr A1, A2
  274. STFPDUX B1, X2, INCX2
  275. fpmr B1, B2
  276. .align 4
  /* Bit 1 set: one more value is loaded from Y, the carried A1 / B1 are */
  /* stored, X2 advances by a single stride, and B1 is rearranged so that */
  /* LL(29) can store the value just loaded. */
  277. LL(28):
  278. andi. r0, N, 1
  279. beq LL(29)
  280. LFSDX B1, Y, INCY2
  281. STFDX A1, Y2, INCY2
  282. STFDX B1, X2, INCX2
  283. add X2, X2, INCX
  284. fsmtp B1, B1
  285. .align 4
  /* Common exit for this path: store the value still pending in B1 into X. */
  286. LL(29):
  287. STFDX B1, X2, INCX2
  288. b LL(999)
  289. .align 4
  /* X is not aligned to 2*SIZE. If Y is unaligned too, go to LL(40); */
  /* otherwise run the mirror image of LL(20) with X and Y swapped. */
  /* In this path, registers loaded from Y (A1..A5, T1, T3, T5) are stored */
  /* through X2, and registers loaded from X (B1..B5, T2, T4, T6) are stored */
  /* through Y2. */
  /* NOTE(review): LFXDUX, STFSDX, LFSDX and the fsmr / fxmr / fpmr / fsmtp */
  /* moves appear to be the cross/secondary-half operations used to shift */
  /* the data stream by one element between the two alignments; their */
  /* exact semantics come from macros and the ISA, not from this file - */
  /* confirm before changing any ordering here. */
  290. LL(30): /* X : unaligned Y : aligned */
  291. andi. r0, Y, 2 * SIZE - 1
  292. bne LL(40)
  /* Peel one element: load from Y (with update), load from X at X+INCX2 */
  /* without moving X, and store one value out through X2. */
  293. LFXDUX A1, Y, INCY2
  294. LFDX B1, X, INCX2
  295. STFSDX A1, X2, INCX2
  /* Advance X and X2 by a single stride; presumably this makes the */
  /* remaining paired accesses to X aligned. */
  296. add X, X, INCX
  297. add X2, X2, INCX
  /* One element consumed. If none remain, LL(39) stores the pending B1. */
  298. addi N, N, -1
  299. cmpwi cr0, N, 0
  300. ble LL(39)
  301. .align 4
  /* CTR = N >> 3 for the remaining count. */
  302. srawi. r0, N, 3
  303. mtspr CTR, r0
  304. beq- LL(35)
  /* Preload for the pipelined loop. */
  305. LFXDUX T1, Y, INCY2
  306. LFXDUX T2, X, INCX2
  307. LFXDUX T3, Y, INCY2
  308. LFXDUX T4, X, INCX2
  309. LFPDUX A4, Y, INCY2
  310. fsmr A1, T1
  311. LFPDUX B4, X, INCX2
  312. fsmr B1, T2
  313. LFPDUX A5, Y, INCY2
  314. fsmr T1, T3
  315. LFPDUX B5, X, INCX2
  316. fsmr T2, T4
  /* Only one group: skip the steady-state loop and just drain. */
  317. bdz LL(33)
  318. .align 4
  /* Steady state: register moves are interleaved with the stores of the */
  /* previous group and the loads of the next group. */
  319. LL(32):
  320. fxmr T5, A4
  321. STFPDUX A1, X2, INCX2
  322. fxmr T6, B4
  323. STFPDUX B1, Y2, INCY2
  324. fxmr A1, A5
  325. STFPDUX T1, X2, INCX2
  326. fxmr B1, B5
  327. STFPDUX T2, Y2, INCY2
  328. fsmr T3, T5
  329. LFPDUX A2, Y, INCY2
  330. fsmr T4, T6
  331. LFPDUX B2, X, INCX2
  332. fsmr T5, A1
  333. LFPDUX A3, Y, INCY2
  334. fsmr T6, B1
  335. LFPDUX B3, X, INCX2
  336. fxmr T1, A2
  337. STFPDUX T3, X2, INCX2
  338. fxmr T2, B2
  339. STFPDUX T4, Y2, INCY2
  340. fxmr T3, A3
  341. STFPDUX T5, X2, INCX2
  342. fxmr T4, B3
  343. STFPDUX T6, Y2, INCY2
  344. fsmr A1, T1
  345. LFPDUX A4, Y, INCY2
  346. fsmr B1, T2
  347. LFPDUX B4, X, INCX2
  348. fsmr T1, T3
  349. LFPDUX A5, Y, INCY2
  350. fsmr T2, T4
  351. LFPDUX B5, X, INCX2
  352. bdnz LL(32)
  353. .align 4
  /* Drain: same moves and stores as the first half of the loop body, */
  /* with no further loads. A1 / B1 are left holding carried values that */
  /* the tail code and LL(39) use. */
  354. LL(33):
  355. fxmr T5, A4
  356. STFPDUX A1, X2, INCX2
  357. fxmr T6, B4
  358. STFPDUX B1, Y2, INCY2
  359. fxmr A1, A5
  360. STFPDUX T1, X2, INCX2
  361. fxmr B1, B5
  362. STFPDUX T2, Y2, INCY2
  363. fsmr T3, T5
  364. fsmr T4, T6
  365. fsmr T5, A1
  366. fsmr T6, B1
  367. STFPDUX T3, X2, INCX2
  368. STFPDUX T4, Y2, INCY2
  369. STFPDUX T5, X2, INCX2
  370. STFPDUX T6, Y2, INCY2
  371. .align 4
  /* Tail: handle the remaining N & 7 elements by testing bits 4, 2, 1. */
  372. LL(35):
  373. andi. r0, N, 7
  374. beq LL(39)
  /* Bit 4 set: two paired steps per vector; A1 / B1 are refreshed from */
  /* A3 / B3 afterwards to carry on into the next stage. */
  375. andi. r0, N, 4
  376. beq LL(37)
  377. LFXDUX A2, Y, INCY2
  378. LFXDUX B2, X, INCX2
  379. LFXDUX A3, Y, INCY2
  380. LFXDUX B3, X, INCX2
  381. fsmr A1, A2
  382. fsmr B1, B2
  383. fsmr A2, A3
  384. fsmr B2, B3
  385. STFPDUX A1, X2, INCX2
  386. STFPDUX B1, Y2, INCY2
  387. STFPDUX A2, X2, INCX2
  388. fpmr A1, A3
  389. STFPDUX B2, Y2, INCY2
  390. fpmr B1, B3
  391. .align 4
  /* Bit 2 set: one paired step per vector; A1 / B1 refreshed from A2 / B2. */
  392. LL(37):
  393. andi. r0, N, 2
  394. beq LL(38)
  395. LFXDUX A2, Y, INCY2
  396. LFXDUX B2, X, INCX2
  397. fsmr A1, A2
  398. fsmr B1, B2
  399. STFPDUX A1, X2, INCX2
  400. fpmr A1, A2
  401. STFPDUX B1, Y2, INCY2
  402. fpmr B1, B2
  403. .align 4
  /* Bit 1 set: one more value is loaded from X, the carried A1 / B1 are */
  /* stored, Y2 advances by a single stride, and B1 is rearranged so that */
  /* LL(39) can store the value just loaded. */
  404. LL(38):
  405. andi. r0, N, 1
  406. beq LL(39)
  407. LFSDX B1, X, INCX2
  408. STFDX A1, X2, INCX2
  409. STFDX B1, Y2, INCY2
  410. add Y2, Y2, INCY
  411. fsmtp B1, B1
  412. .align 4
  /* Common exit for this path: store the value still pending in B1 into Y. */
  413. LL(39):
  414. STFDX B1, Y2, INCY2
  415. b LL(999)
  416. .align 4
  /* Neither pointer is aligned to 2*SIZE: exchange one element with */
  /* non-paired loads/stores, step every pointer by a single stride, then */
  /* continue with the same paired scheme as the fully aligned path. */
  /* In the paired part, values loaded from X (A regs) are stored through */
  /* Y2 and values loaded from Y (B regs) are stored through X2. Stores */
  /* through X2 use INCY2 as the step; this path is only reached when */
  /* INCX == INCY == SIZE, so INCX2 and INCY2 hold the same value. */
  417. LL(40): /* X : unaligned Y : unaligned */
  /* Peeled element: here A1 comes from Y and B1 from X (opposite to the */
  /* paired code below), so A1 is stored through X2 and B1 through Y2. */
  418. LFDX A1, Y, INCY2
  419. LFDX B1, X, INCX2
  420. add X, X, INCX
  421. add Y, Y, INCY
  /* One element consumed; the ble after the stores consumes this cr0. */
  422. addi N, N, -1
  423. cmpwi cr0, N, 0
  424. STFDX A1, X2, INCX2
  425. STFDX B1, Y2, INCY2
  426. add X2, X2, INCX
  427. add Y2, Y2, INCY
  428. ble LL(999)
  /* CTR = N >> 3 for the remaining count. */
  429. srawi. r0, N, 3
  430. mtspr CTR, r0
  431. beq- LL(45)
  /* Preload the first four pairs of each vector. */
  432. LFPDUX A1, X, INCX2
  433. LFPDUX B1, Y, INCY2
  434. LFPDUX A2, X, INCX2
  435. LFPDUX B2, Y, INCY2
  436. LFPDUX A3, X, INCX2
  437. LFPDUX B3, Y, INCY2
  438. LFPDUX A4, X, INCX2
  439. LFPDUX B4, Y, INCY2
  /* Only one group: skip the steady-state loop and just drain. */
  440. bdz LL(43)
  441. .align 4
  /* Steady state: store the previously loaded pair, then load the next */
  /* pair into the register that was just stored. */
  442. LL(42):
  443. STFPDUX B1, X2, INCY2
  444. LFPDUX B1, Y, INCY2
  445. STFPDUX A1, Y2, INCY2
  446. LFPDUX A1, X, INCX2
  447. STFPDUX B2, X2, INCY2
  448. LFPDUX B2, Y, INCY2
  449. STFPDUX A2, Y2, INCY2
  450. LFPDUX A2, X, INCX2
  451. STFPDUX B3, X2, INCY2
  452. LFPDUX B3, Y, INCY2
  453. STFPDUX A3, Y2, INCY2
  454. LFPDUX A3, X, INCX2
  455. STFPDUX B4, X2, INCY2
  456. LFPDUX B4, Y, INCY2
  457. STFPDUX A4, Y2, INCY2
  458. LFPDUX A4, X, INCX2
  459. bdnz LL(42)
  460. .align 4
  /* Drain: store the last four pairs that are still held in registers. */
  461. LL(43):
  462. STFPDUX B1, X2, INCY2
  463. STFPDUX A1, Y2, INCY2
  464. STFPDUX B2, X2, INCY2
  465. STFPDUX A2, Y2, INCY2
  466. STFPDUX B3, X2, INCY2
  467. STFPDUX A3, Y2, INCY2
  468. STFPDUX B4, X2, INCY2
  469. STFPDUX A4, Y2, INCY2
  470. .align 4
  /* Tail: handle the remaining N & 7 elements by testing bits 4, 2, 1. */
  471. LL(45):
  472. andi. r0, N, 7
  473. beq LL(999)
  /* Bit 4 set: two paired steps per vector. */
  474. andi. r0, N, 4
  475. beq LL(46)
  476. LFPDUX A1, X, INCX2
  477. LFPDUX B1, Y, INCY2
  478. LFPDUX A2, X, INCX2
  479. LFPDUX B2, Y, INCY2
  480. STFPDUX B1, X2, INCY2
  481. STFPDUX A1, Y2, INCY2
  482. STFPDUX B2, X2, INCY2
  483. STFPDUX A2, Y2, INCY2
  484. .align 4
  /* Bit 2 set: one paired step per vector. */
  485. LL(46):
  486. andi. r0, N, 2
  487. beq LL(47)
  488. LFPDUX A1, X, INCX2
  489. LFPDUX B1, Y, INCY2
  490. STFPDUX B1, X2, INCY2
  491. STFPDUX A1, Y2, INCY2
  492. .align 4
  /* Bit 1 set: one last element, moved with the non-paired LFDUX/STFDUX. */
  493. LL(47):
  494. andi. r0, N, 1
  495. beq LL(999)
  496. LFDUX A1, X, INCX2
  497. LFDUX B1, Y, INCY2
  498. STFDUX B1, X2, INCY2
  499. STFDUX A1, Y2, INCY2
  500. b LL(999)
  501. .align 4
  /* Generic path, taken when INCX or INCY (after scaling) is not SIZE. */
  /* Exchanges one element at a time with LFDUX/STFDUX, stepping by INCX */
  /* and INCY, unrolled four times. Values loaded from X (A regs) go out */
  /* through Y2; values loaded from Y (B regs) go out through X2. */
  502. LL(100):
  /* Move X and Y back by one stride; the accesses below step the pointer */
  /* as part of each load/store (presumably update forms - confirm). */
  503. sub X, X, INCX
  504. sub Y, Y, INCY
  /* Store pointers start at the same place as the load pointers. */
  505. mr X2, X
  506. mr Y2, Y
  /* CTR = N >> 2: four elements per vector per iteration. */
  507. srawi. r0, N, 2
  508. mtspr CTR, r0
  509. beq- LL(115)
  /* Preload the first four elements of each vector. */
  510. LFDUX A1, X, INCX
  511. LFDUX B1, Y, INCY
  512. LFDUX A2, X, INCX
  513. LFDUX B2, Y, INCY
  514. LFDUX A3, X, INCX
  515. LFDUX B3, Y, INCY
  516. LFDUX A4, X, INCX
  517. LFDUX B4, Y, INCY
  /* Only one group: skip the steady-state loop and just drain. */
  518. bdz LL(113)
  519. .align 4
  /* Steady state: store the previously loaded element, then load the */
  /* next one into the register that was just stored. */
  520. LL(112):
  521. STFDUX B1, X2, INCX
  522. LFDUX B1, Y, INCY
  523. STFDUX A1, Y2, INCY
  524. LFDUX A1, X, INCX
  525. STFDUX B2, X2, INCX
  526. LFDUX B2, Y, INCY
  527. STFDUX A2, Y2, INCY
  528. LFDUX A2, X, INCX
  529. STFDUX B3, X2, INCX
  530. LFDUX B3, Y, INCY
  531. STFDUX A3, Y2, INCY
  532. LFDUX A3, X, INCX
  533. STFDUX B4, X2, INCX
  534. LFDUX B4, Y, INCY
  535. STFDUX A4, Y2, INCY
  536. LFDUX A4, X, INCX
  537. bdnz LL(112)
  538. .align 4
  /* Drain: store the last four elements still held in registers. */
  539. LL(113):
  540. STFDUX B1, X2, INCX
  541. STFDUX A1, Y2, INCY
  542. STFDUX B2, X2, INCX
  543. STFDUX A2, Y2, INCY
  544. STFDUX B3, X2, INCX
  545. STFDUX A3, Y2, INCY
  546. STFDUX B4, X2, INCX
  547. STFDUX A4, Y2, INCY
  548. .align 4
  /* Tail: handle the remaining N & 3 elements by testing bits 2 and 1. */
  549. LL(115):
  550. andi. r0, N, 3
  551. beq LL(999)
  /* Bit 2 set: two elements per vector. */
  552. andi. r0, N, 2
  553. beq LL(117)
  554. LFDUX A1, X, INCX
  555. LFDUX A2, X, INCX
  556. LFDUX B1, Y, INCY
  557. LFDUX B2, Y, INCY
  558. STFDUX B1, X2, INCX
  559. STFDUX B2, X2, INCX
  560. STFDUX A1, Y2, INCY
  561. STFDUX A2, Y2, INCY
  562. .align 4
  /* Bit 1 set: one last element; then fall through into the exit code. */
  563. LL(117):
  564. andi. r0, N, 1
  565. beq LL(999)
  566. LFDUX A1, X, INCX
  567. LFDUX B1, Y, INCY
  568. STFDUX B1, X2, INCX
  569. STFDUX A1, Y2, INCY
  570. .align 4
  /* Common exit: reload f16, f15, f14 (reverse of the order they were */
  /* saved at entry), put SP back, and return. */
  571. LL(999):
  /* r10 = +16 is the step for the three reloads. SP is first moved down */
  /* by 16 so that, assuming each lfpdux adds r10 to SP as part of the */
  /* access, the first reload reads the slot where f16 was saved. */
  572. li r10, 16
  573. addi SP, SP, -16
  574. lfpdux f16, SP, r10
  575. lfpdux f15, SP, r10
  576. lfpdux f14, SP, r10
  /* Final +16 undoes the remaining offset from the three -16 saves at */
  /* entry and the -16 adjustment above. */
  577. addi SP, SP, 16
  578. blr
  /* EPILOGUE is a macro from common.h (not visible here). */
  579. EPILOGUE