You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zcopy_hummer.S 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer register roles used by the kernel below. */
  /* NOTE(review): r3-r7 are presumably the incoming argument registers; */
  /* confirm against the C prototype and ABI in common.h. */
  /* N: element count (tested against 0 and shifted to form loop counts). */
  40. #define N r3
  /* X: source pointer (only ever loaded from). */
  41. #define X r4
  /* INCX: source stride; scaled in place by BASE_SHIFT at entry. */
  42. #define INCX r5
  /* Y: destination pointer (only ever stored to). */
  43. #define Y r6
  /* INCY: destination stride; scaled in place by BASE_SHIFT at entry. */
  44. #define INCY r7
  /* INCX2 / INCY2: twice the scaled strides; used as the per-access step. */
  45. #define INCX2 r8
  46. #define INCY2 r9
  /* X2 / Y2: second pointers, X + SIZE and Y + SIZE, used only on the */
  /* strided path LL(100). r10 is also used directly as a scratch offset */
  /* register by the entry and exit code. */
  47. #define X2 r10
  48. #define Y2 r11
  /* A1..A9: floating-point registers holding data in flight. */
  49. #define A1 f0
  50. #define A2 f1
  51. #define A3 f2
  52. #define A4 f3
  53. #define A5 f4
  54. #define A6 f5
  55. #define A7 f6
  56. #define A8 f7
  57. #define A9 f8
  /* T1..T7: temporaries used by the mixed-alignment paths LL(20)/LL(30). */
  58. #define T1 f9
  59. #define T2 f10
  60. #define T3 f11
  61. #define T4 f12
  62. #define T5 f13
  /* T6 / T7 are f14 / f15, which the routine saves at entry and restores */
  /* at LL(999). */
  63. #define T6 f14
  64. #define T7 f15
  /* Entry point: copies N elements from X (stride INCX) to Y (stride INCY). */
  /* NOTE(review): per the file name this is the complex (z) copy kernel for */
  /* the "hummer" paired FPU; PROLOGUE, PROFCODE, LL(), SIZE and BASE_SHIFT */
  /* come from common.h and are not visible here. */
  65. PROLOGUE
  66. PROFCODE
  /* Save f14 and f15 (aliased as T6/T7) with two update-form stores; */
  /* each one moves SP by r10 = -16. LL(999) undoes this. */
  67. li r10, -16
  68. stfpdux f14, SP, r10
  69. stfpdux f15, SP, r10
  /* Scale the strides by BASE_SHIFT, then form the doubled strides. */
  /* NOTE(review): BASE_SHIFT is presumably log2(SIZE), giving byte strides; */
  /* confirm in common.h. */
  70. slwi INCX, INCX, BASE_SHIFT
  71. slwi INCY, INCY, BASE_SHIFT
  72. add INCX2, INCX, INCX
  73. add INCY2, INCY, INCY
  /* Nothing to copy when N <= 0; still exit through LL(999) so that */
  /* f14/f15 and SP are restored. */
  74. cmpwi cr0, N, 0
  75. ble LL(999)
  /* Pre-bias both pointers by one doubled stride: every access below is */
  /* indexed by INCX2/INCY2, and the ...UX forms presumably advance the */
  /* pointer before accessing (confirm against the macro definitions). */
  76. sub X, X, INCX2
  77. sub Y, Y, INCY2
  /* If either scaled stride differs from SIZE, take the strided path. */
  78. cmpwi cr0, INCX, SIZE
  79. bne LL(100)
  80. cmpwi cr0, INCY, SIZE
  81. bne LL(100)
  /* Both strides equal SIZE: dispatch on the alignment of X and Y */
  /* relative to 2 * SIZE. */
  /*   X misaligned           -> LL(30), which tests Y itself */
  /*   X aligned, Y misaligned -> LL(20) */
  /*   both aligned           -> fall through to LL(10) */
  82. andi. r0, X, 2 * SIZE - 1
  83. bne LL(30)
  84. andi. r0, Y, 2 * SIZE - 1
  85. bne LL(20)
  86. .align 4
  /* Both X and Y pass the 2 * SIZE alignment test. */
  /* The main loop runs N >> 3 times with 8 load/store pairs per pass, so */
  /* each LFPDUX/STFPDUX pair moves one of the N elements. */
  87. LL(10): /* X : aligned Y : aligned */
  /* CTR = N / 8; skip straight to the remainder code when it is zero. */
  88. srawi. r0, N, 3
  89. mtspr CTR, r0
  90. beq- LL(15)
  /* Pipeline fill: load the first group of 8 into A1..A8. */
  91. LFPDUX A1, X, INCX2
  92. LFPDUX A2, X, INCX2
  93. LFPDUX A3, X, INCX2
  94. LFPDUX A4, X, INCX2
  95. LFPDUX A5, X, INCX2
  96. LFPDUX A6, X, INCX2
  97. LFPDUX A7, X, INCX2
  98. LFPDUX A8, X, INCX2
  /* Only one group in total: go directly to the drain at LL(13). */
  99. bdz LL(13)
  100. .align 4
  /* Steady state: store each register of the current group and */
  /* immediately reload it with the matching element of the next group. */
  101. LL(12):
  102. STFPDUX A1, Y, INCY2
  103. LFPDUX A1, X, INCX2
  104. STFPDUX A2, Y, INCY2
  105. LFPDUX A2, X, INCX2
  106. STFPDUX A3, Y, INCY2
  107. LFPDUX A3, X, INCX2
  108. STFPDUX A4, Y, INCY2
  109. LFPDUX A4, X, INCX2
  110. STFPDUX A5, Y, INCY2
  111. LFPDUX A5, X, INCX2
  112. STFPDUX A6, Y, INCY2
  113. LFPDUX A6, X, INCX2
  114. STFPDUX A7, Y, INCY2
  115. LFPDUX A7, X, INCX2
  116. STFPDUX A8, Y, INCY2
  117. LFPDUX A8, X, INCX2
  118. bdnz LL(12)
  119. .align 4
  /* Pipeline drain: store the last group that was loaded. */
  120. LL(13):
  121. STFPDUX A1, Y, INCY2
  122. STFPDUX A2, Y, INCY2
  123. STFPDUX A3, Y, INCY2
  124. STFPDUX A4, Y, INCY2
  125. STFPDUX A5, Y, INCY2
  126. STFPDUX A6, Y, INCY2
  127. STFPDUX A7, Y, INCY2
  128. STFPDUX A8, Y, INCY2
  129. .align 4
  /* Remainder (N mod 8), handled as optional chunks of 4, 2 and 1. */
  130. LL(15):
  131. andi. r0, N, 7
  132. beq LL(999)
  /* Chunk of 4 when bit 2 of N is set. */
  133. andi. r0, N, 4
  134. beq LL(16)
  135. LFPDUX A1, X, INCX2
  136. LFPDUX A2, X, INCX2
  137. LFPDUX A3, X, INCX2
  138. LFPDUX A4, X, INCX2
  139. STFPDUX A1, Y, INCY2
  140. STFPDUX A2, Y, INCY2
  141. STFPDUX A3, Y, INCY2
  142. STFPDUX A4, Y, INCY2
  143. .align 4
  /* Chunk of 2 when bit 1 of N is set. */
  144. LL(16):
  145. andi. r0, N, 2
  146. beq LL(17)
  147. LFPDUX A1, X, INCX2
  148. LFPDUX A2, X, INCX2
  149. STFPDUX A1, Y, INCY2
  150. STFPDUX A2, Y, INCY2
  151. .align 4
  /* Final single element when bit 0 of N is set. */
  152. LL(17):
  153. andi. r0, N, 1
  154. beq LL(999)
  155. LFPDUX A1, X, INCX2
  156. STFPDUX A1, Y, INCY2
  157. b LL(999)
  158. .align 4
  /* X passes the 2 * SIZE alignment test, Y does not. */
  /* Shape of this path: one leading store of a single value, then N - 1 */
  /* paired stores to Y + INCY, then one trailing single store at LL(29). */
  /* NOTE(review): LFXDUX / fxmr / fsmr / fpmr are Double Hummer cross-load */
  /* and half-register move operations used to re-pair values between */
  /* neighbouring loads; confirm the exact primary/secondary semantics */
  /* against the FPU reference before changing any ordering here. */
  159. LL(20): /* X : aligned Y : unaligned */
  /* Load the first pair, store one half of it at Y + INCY2 (no update), */
  /* and shift Y by INCY. The other half stays live in A1. */
  160. LFXDUX A1, X, INCX2
  /* N now counts the remaining paired stores. */
  161. addi N, N, -1
  162. cmpwi cr0, N, 0
  163. STFSDX A1, Y, INCY2
  164. add Y, Y, INCY
  /* Uses cr0 from the cmpwi above: no paired stores are left when the */
  /* decremented N is <= 0. */
  165. ble LL(29)
  166. .align 4
  /* CTR = N / 8 for the unrolled loop; go to the remainder code if zero. */
  167. srawi. r0, N, 3
  168. mtspr CTR, r0
  169. beq- LL(25)
  /* Pipeline fill: four cross loads into T1..T4 and four paired loads */
  /* into A6..A9, interleaved with the moves that build the first four */
  /* output registers (A1, T1, T2, T3). */
  170. LFXDUX T1, X, INCX2
  171. LFXDUX T2, X, INCX2
  172. LFXDUX T3, X, INCX2
  173. LFXDUX T4, X, INCX2
  174. LFPDUX A6, X, INCX2
  175. fsmr A1, T1
  176. LFPDUX A7, X, INCX2
  177. fsmr T1, T2
  178. LFPDUX A8, X, INCX2
  179. fsmr T2, T3
  180. LFPDUX A9, X, INCX2
  181. fsmr T3, T4
  182. bdz LL(23)
  183. .align 4
  /* Steady state: each pass stores 8 re-paired registers and loads the */
  /* next 8. Statement order is part of the schedule; keep it as is. */
  184. LL(22):
  /* First half: store A1, T1..T3 while converting A6..A9 with fxmr. */
  185. STFPDUX A1, Y, INCY2
  186. fxmr T5, A6
  187. STFPDUX T1, Y, INCY2
  188. fxmr T6, A7
  189. STFPDUX T2, Y, INCY2
  190. fxmr T7, A8
  191. STFPDUX T3, Y, INCY2
  192. fxmr A1, A9
  /* Build T4..T7 for the second half while loading A2..A5. */
  193. fsmr T4, T5
  194. LFPDUX A2, X, INCX2
  195. fsmr T5, T6
  196. LFPDUX A3, X, INCX2
  197. fsmr T6, T7
  198. LFPDUX A4, X, INCX2
  199. fsmr T7, A1
  200. LFPDUX A5, X, INCX2
  /* Second half: store T4..T7 while converting A2..A5 with fxmr. */
  201. STFPDUX T4, Y, INCY2
  202. fxmr T1, A2
  203. STFPDUX T5, Y, INCY2
  204. fxmr T2, A3
  205. STFPDUX T6, Y, INCY2
  206. fxmr T3, A4
  207. STFPDUX T7, Y, INCY2
  208. fxmr T4, A5
  /* Reload A6..A9 and rebuild A1, T1..T3 for the next pass. */
  209. LFPDUX A6, X, INCX2
  210. fsmr A1, T1
  211. LFPDUX A7, X, INCX2
  212. fsmr T1, T2
  213. LFPDUX A8, X, INCX2
  214. fsmr T2, T3
  215. LFPDUX A9, X, INCX2
  216. fsmr T3, T4
  217. bdnz LL(22)
  218. .align 4
  /* Pipeline drain: same stores as one loop pass, without further loads. */
  219. LL(23):
  220. STFPDUX A1, Y, INCY2
  221. fxmr T5, A6
  222. STFPDUX T1, Y, INCY2
  223. fxmr T6, A7
  224. STFPDUX T2, Y, INCY2
  225. fxmr T7, A8
  226. STFPDUX T3, Y, INCY2
  227. fxmr A1, A9
  228. fsmr T4, T5
  229. fsmr T5, T6
  230. fsmr T6, T7
  231. fsmr T7, A1
  232. STFPDUX T4, Y, INCY2
  233. STFPDUX T5, Y, INCY2
  234. STFPDUX T6, Y, INCY2
  235. STFPDUX T7, Y, INCY2
  236. .align 4
  /* Remainder of the decremented N (mod 8) in chunks of 4, 2 and 1. */
  /* A1 carries the pending value from chunk to chunk; each chunk ends */
  /* with fpmr to move the newest loaded register into A1. */
  237. LL(25):
  238. andi. r0, N, 7
  239. beq LL(29)
  240. andi. r0, N, 4
  241. beq LL(26)
  242. LFXDUX A2, X, INCX2
  243. LFXDUX A3, X, INCX2
  244. LFXDUX A4, X, INCX2
  245. LFXDUX A5, X, INCX2
  246. fsmr A1, A2
  247. fsmr A2, A3
  248. fsmr A3, A4
  249. fsmr A4, A5
  250. STFPDUX A1, Y, INCY2
  251. STFPDUX A2, Y, INCY2
  252. STFPDUX A3, Y, INCY2
  253. STFPDUX A4, Y, INCY2
  254. fpmr A1, A5
  255. .align 4
  /* Chunk of 2. */
  256. LL(26):
  257. andi. r0, N, 2
  258. beq LL(27)
  259. LFXDUX A2, X, INCX2
  260. LFXDUX A3, X, INCX2
  261. fsmr A1, A2
  262. fsmr A2, A3
  263. STFPDUX A1, Y, INCY2
  264. STFPDUX A2, Y, INCY2
  265. fpmr A1, A3
  266. .align 4
  /* Chunk of 1. */
  267. LL(27):
  268. andi. r0, N, 1
  269. beq LL(29)
  270. LFXDUX A2, X, INCX2
  271. fsmr A1, A2
  272. STFPDUX A1, Y, INCY2
  273. fpmr A1, A2
  274. .align 4
  /* Trailing single store of the value still pending in A1. */
  275. LL(29):
  276. STFDUX A1, Y, INCY2
  277. b LL(999)
  278. .align 4
  /* X fails the 2 * SIZE alignment test. Y has not been tested yet, so */
  /* test it here and hand off to LL(40) when Y is misaligned as well. */
  /* Otherwise: one leading single load from X, then N paired stores to Y. */
  /* NOTE(review): LFXDUX / fxmr / fsmr / fpmr are Double Hummer cross-load */
  /* and half-register move operations; confirm the exact */
  /* primary/secondary semantics against the FPU reference before */
  /* changing any ordering here. */
  279. LL(30): /* X : unaligned Y : aligned */
  280. andi. r0, Y, 2 * SIZE - 1
  281. bne LL(40)
  /* Load one value from X + INCX2 (no update) into A1, then shift X by */
  /* INCX for the following paired loads. */
  282. LFDX A1, X, INCX2
  283. add X, X, INCX
  /* CTR = N / 8 (N is not decremented on this path). */
  284. srawi. r0, N, 3
  285. mtspr CTR, r0
  286. beq- LL(35)
  /* Pipeline fill: four cross loads into T1..T4 and four paired loads */
  /* into A6..A9, interleaved with the moves that build A1, T1, T2, T3. */
  287. LFXDUX T1, X, INCX2
  288. LFXDUX T2, X, INCX2
  289. LFXDUX T3, X, INCX2
  290. LFXDUX T4, X, INCX2
  291. LFPDUX A6, X, INCX2
  292. fsmr A1, T1
  293. LFPDUX A7, X, INCX2
  294. fsmr T1, T2
  295. LFPDUX A8, X, INCX2
  296. fsmr T2, T3
  297. LFPDUX A9, X, INCX2
  298. fsmr T3, T4
  299. bdz LL(33)
  300. .align 4
  /* Steady state: 8 stores and 8 loads per pass. Statement order is part */
  /* of the schedule; keep it as is. */
  301. LL(32):
  /* First half: convert A6..A9 with fxmr while storing A1, T1..T3. */
  302. fxmr T5, A6
  303. STFPDUX A1, Y, INCY2
  304. fxmr T6, A7
  305. STFPDUX T1, Y, INCY2
  306. fxmr T7, A8
  307. STFPDUX T2, Y, INCY2
  308. fxmr A1, A9
  309. STFPDUX T3, Y, INCY2
  /* Load A2..A5 while building T4..T7 for the second half. */
  310. LFPDUX A2, X, INCX2
  311. fsmr T4, T5
  312. LFPDUX A3, X, INCX2
  313. fsmr T5, T6
  314. LFPDUX A4, X, INCX2
  315. fsmr T6, T7
  316. LFPDUX A5, X, INCX2
  317. fsmr T7, A1
  /* Second half: convert A2..A5 with fxmr while storing T4..T7. */
  318. fxmr T1, A2
  319. STFPDUX T4, Y, INCY2
  320. fxmr T2, A3
  321. STFPDUX T5, Y, INCY2
  322. fxmr T3, A4
  323. STFPDUX T6, Y, INCY2
  324. fxmr T4, A5
  325. STFPDUX T7, Y, INCY2
  /* Rebuild A1, T1..T3 and reload A6..A9 for the next pass. */
  326. fsmr A1, T1
  327. LFPDUX A6, X, INCX2
  328. fsmr T1, T2
  329. LFPDUX A7, X, INCX2
  330. fsmr T2, T3
  331. LFPDUX A8, X, INCX2
  332. fsmr T3, T4
  333. LFPDUX A9, X, INCX2
  334. bdnz LL(32)
  335. .align 4
  /* Pipeline drain: same stores as one loop pass, without further loads. */
  336. LL(33):
  337. STFPDUX A1, Y, INCY2
  338. fxmr T5, A6
  339. STFPDUX T1, Y, INCY2
  340. fxmr T6, A7
  341. STFPDUX T2, Y, INCY2
  342. fxmr T7, A8
  343. STFPDUX T3, Y, INCY2
  344. fxmr A1, A9
  345. fsmr T4, T5
  346. fsmr T5, T6
  347. fsmr T6, T7
  348. fsmr T7, A1
  349. STFPDUX T4, Y, INCY2
  350. STFPDUX T5, Y, INCY2
  351. STFPDUX T6, Y, INCY2
  352. STFPDUX T7, Y, INCY2
  353. .align 4
  /* Remainder (N mod 8) in chunks of 4, 2 and 1. A1 carries the pending */
  /* value between chunks via the closing fpmr of each chunk. */
  354. LL(35):
  355. andi. r0, N, 7
  356. beq LL(999)
  357. andi. r0, N, 4
  358. beq LL(36)
  359. LFXDUX A2, X, INCX2
  360. LFXDUX A3, X, INCX2
  361. LFXDUX A4, X, INCX2
  362. LFXDUX A5, X, INCX2
  363. fsmr A1, A2
  364. fsmr A2, A3
  365. fsmr A3, A4
  366. fsmr A4, A5
  367. STFPDUX A1, Y, INCY2
  368. STFPDUX A2, Y, INCY2
  369. STFPDUX A3, Y, INCY2
  370. STFPDUX A4, Y, INCY2
  371. fpmr A1, A5
  372. .align 4
  /* Chunk of 2. */
  373. LL(36):
  374. andi. r0, N, 2
  375. beq LL(37)
  376. LFXDUX A2, X, INCX2
  377. LFXDUX A3, X, INCX2
  378. fsmr A1, A2
  379. fsmr A2, A3
  380. STFPDUX A1, Y, INCY2
  381. STFPDUX A2, Y, INCY2
  382. fpmr A1, A3
  383. .align 4
  /* Chunk of 1. Unlike LL(20) there is no trailing single store on this */
  /* path: the last paired store completes the copy. */
  384. LL(37):
  385. andi. r0, N, 1
  386. beq LL(999)
  387. LFXDUX A2, X, INCX2
  388. fsmr A1, A2
  389. STFPDUX A1, Y, INCY2
  390. b LL(999)
  391. .align 4
  /* Neither X nor Y passes the 2 * SIZE alignment test (reached from */
  /* LL(30)). Shape of this path: copy one single value, shift both */
  /* pointers by one scaled stride, copy N - 1 pairs with the same */
  /* unrolled scheme as LL(10), then copy one final single value at LL(49). */
  392. LL(40): /* X : unaligned Y : unaligned */
  /* Leading single value: load from X + INCX2 and store to Y + INCY2 */
  /* without update, then advance X and Y by INCX / INCY. */
  393. LFDX A1, X, INCX2
  394. add X, X, INCX
  /* N now counts the paired copies that remain. */
  395. addi N, N, -1
  396. cmpwi cr0, N, 0
  397. STFDX A1, Y, INCY2
  398. add Y, Y, INCY
  /* Uses cr0 from the cmpwi above: skip to the final single copy when */
  /* the decremented N is <= 0. */
  399. ble LL(49)
  /* CTR = N / 8; go to the remainder code if zero. */
  400. srawi. r0, N, 3
  401. mtspr CTR, r0
  402. beq- LL(45)
  /* Pipeline fill: load the first group of 8 pairs. */
  403. LFPDUX A1, X, INCX2
  404. LFPDUX A2, X, INCX2
  405. LFPDUX A3, X, INCX2
  406. LFPDUX A4, X, INCX2
  407. LFPDUX A5, X, INCX2
  408. LFPDUX A6, X, INCX2
  409. LFPDUX A7, X, INCX2
  410. LFPDUX A8, X, INCX2
  411. bdz LL(43)
  412. .align 4
  /* Steady state: store each register and reload it for the next group. */
  413. LL(42):
  414. STFPDUX A1, Y, INCY2
  415. LFPDUX A1, X, INCX2
  416. STFPDUX A2, Y, INCY2
  417. LFPDUX A2, X, INCX2
  418. STFPDUX A3, Y, INCY2
  419. LFPDUX A3, X, INCX2
  420. STFPDUX A4, Y, INCY2
  421. LFPDUX A4, X, INCX2
  422. STFPDUX A5, Y, INCY2
  423. LFPDUX A5, X, INCX2
  424. STFPDUX A6, Y, INCY2
  425. LFPDUX A6, X, INCX2
  426. STFPDUX A7, Y, INCY2
  427. LFPDUX A7, X, INCX2
  428. STFPDUX A8, Y, INCY2
  429. LFPDUX A8, X, INCX2
  430. bdnz LL(42)
  431. .align 4
  /* Pipeline drain: store the last group that was loaded. */
  432. LL(43):
  433. STFPDUX A1, Y, INCY2
  434. STFPDUX A2, Y, INCY2
  435. STFPDUX A3, Y, INCY2
  436. STFPDUX A4, Y, INCY2
  437. STFPDUX A5, Y, INCY2
  438. STFPDUX A6, Y, INCY2
  439. STFPDUX A7, Y, INCY2
  440. STFPDUX A8, Y, INCY2
  441. .align 4
  /* Remainder of the decremented N (mod 8) in chunks of 4, 2 and 1. */
  442. LL(45):
  443. andi. r0, N, 7
  444. beq LL(49)
  445. andi. r0, N, 4
  446. beq LL(46)
  447. LFPDUX A1, X, INCX2
  448. LFPDUX A2, X, INCX2
  449. LFPDUX A3, X, INCX2
  450. LFPDUX A4, X, INCX2
  451. STFPDUX A1, Y, INCY2
  452. STFPDUX A2, Y, INCY2
  453. STFPDUX A3, Y, INCY2
  454. STFPDUX A4, Y, INCY2
  455. .align 4
  /* Chunk of 2. */
  456. LL(46):
  457. andi. r0, N, 2
  458. beq LL(47)
  459. LFPDUX A1, X, INCX2
  460. LFPDUX A2, X, INCX2
  461. STFPDUX A1, Y, INCY2
  462. STFPDUX A2, Y, INCY2
  463. .align 4
  /* Chunk of 1. */
  464. LL(47):
  465. andi. r0, N, 1
  466. beq LL(49)
  467. LFPDUX A1, X, INCX2
  468. STFPDUX A1, Y, INCY2
  /* Final single value, copied with update-form scalar load and store. */
  469. LL(49):
  470. LFDUX A1, X, INCX2
  471. STFDUX A1, Y, INCY2
  472. b LL(999)
  473. .align 4
  /* Generic strided path, taken when either scaled stride differs from */
  /* SIZE. Uses scalar loads/stores on two parallel pointer streams: */
  /* X / Y, and X2 / Y2 which sit SIZE bytes above them. Every access */
  /* steps its pointer by INCX2 / INCY2. One X + X2 load pair and one */
  /* Y + Y2 store pair are issued per element (4 of each per pass, with */
  /* CTR = N / 4). */
  474. LL(100):
  475. addi X2, X, SIZE
  476. addi Y2, Y, SIZE
  /* CTR = N / 4; go to the remainder code if zero. */
  477. srawi. r0, N, 2
  478. mtspr CTR, r0
  479. beq- LL(115)
  /* Pipeline fill: load the first group of 4 elements (8 values). */
  480. LFDUX A1, X, INCX2
  481. LFDUX A2, X2, INCX2
  482. LFDUX A3, X, INCX2
  483. LFDUX A4, X2, INCX2
  484. LFDUX A5, X, INCX2
  485. LFDUX A6, X2, INCX2
  486. LFDUX A7, X, INCX2
  487. LFDUX A8, X2, INCX2
  488. bdz LL(113)
  489. .align 4
  /* Steady state: store each register and reload it for the next group. */
  490. LL(112):
  491. STFDUX A1, Y, INCY2
  492. LFDUX A1, X, INCX2
  493. STFDUX A2, Y2, INCY2
  494. LFDUX A2, X2, INCX2
  495. STFDUX A3, Y, INCY2
  496. LFDUX A3, X, INCX2
  497. STFDUX A4, Y2, INCY2
  498. LFDUX A4, X2, INCX2
  499. STFDUX A5, Y, INCY2
  500. LFDUX A5, X, INCX2
  501. STFDUX A6, Y2, INCY2
  502. LFDUX A6, X2, INCX2
  503. STFDUX A7, Y, INCY2
  504. LFDUX A7, X, INCX2
  505. STFDUX A8, Y2, INCY2
  506. LFDUX A8, X2, INCX2
  507. bdnz LL(112)
  508. .align 4
  /* Pipeline drain: store the last group that was loaded. */
  509. LL(113):
  510. STFDUX A1, Y, INCY2
  511. STFDUX A2, Y2, INCY2
  512. STFDUX A3, Y, INCY2
  513. STFDUX A4, Y2, INCY2
  514. STFDUX A5, Y, INCY2
  515. STFDUX A6, Y2, INCY2
  516. STFDUX A7, Y, INCY2
  517. STFDUX A8, Y2, INCY2
  518. .align 4
  /* Remainder (N mod 4) in chunks of 2 and 1 elements. */
  519. LL(115):
  520. andi. r0, N, 3
  521. beq LL(999)
  522. andi. r0, N, 2
  523. beq LL(117)
  524. LFDUX A1, X, INCX2
  525. LFDUX A2, X2, INCX2
  526. LFDUX A3, X, INCX2
  527. LFDUX A4, X2, INCX2
  528. STFDUX A1, Y, INCY2
  529. STFDUX A2, Y2, INCY2
  530. STFDUX A3, Y, INCY2
  531. STFDUX A4, Y2, INCY2
  532. .align 4
  /* Final single element; falls through into LL(999) afterwards. */
  533. LL(117):
  534. andi. r0, N, 1
  535. beq LL(999)
  536. LFDUX A1, X, INCX2
  537. LFDUX A2, X2, INCX2
  538. STFDUX A1, Y, INCY2
  539. STFDUX A2, Y2, INCY2
  540. .align 4
  /* Common exit: restore f15 and f14 in reverse order of the entry saves, */
  /* put SP back to its entry value, and return. */
  /* With S = SP at entry, the saves left f14 at S-16, f15 at S-32 and */
  /* SP = S-32. Stepping SP back to S-48 lets the two update-form loads */
  /* (+16 each) land on S-32 (f15) and S-16 (f14); the final addi gives S. */
  541. LL(999):
  542. li r10, 16
  543. addi SP, SP, -16
  544. lfpdux f15, SP, r10
  545. lfpdux f14, SP, r10
  546. addi SP, SP, 16
  547. blr
  548. EPILOGUE