You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

axpy_hummer.S 13 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Symbolic names for the registers used by this kernel.                  */
  /* NOTE(review): r3, r6-r9 and f1 are presumably the registers in which   */
  /* the caller passes n, x, incx, y, incy and alpha -- confirm against the */
  /* calling convention and the interface pulled in through common.h.       */
  /* N: number of elements still to be processed.                           */
  40. #define N r3
  /* X / Y: read pointers into the two vectors; INCX / INCY: their          */
  /* increments (scaled by BASE_SHIFT at the start of the routine).         */
  41. #define X r6
  42. #define INCX r7
  43. #define Y r8
  44. #define INCY r9
  /* YY: separate pointer used for storing results back to y; it is copied  */
  /* from Y and trails the Y read pointer in the pipelined loops.           */
  45. #define YY r4
  /* INCX2 / INCY2: twice the scaled increments, used as the step of the    */
  /* paired loads and stores. r10 (INCY2) doubles as the scratch register   */
  /* holding the +/-16 stack step while f14-f25 are saved and restored.     */
  46. #define INCX2 r5
  47. #define INCY2 r10
  /* ALPHA: the scalar multiplier of every multiply-add below.              */
  48. #define ALPHA f1
  /* A1-A8: values loaded from x. A2 is f8 because f1 is taken by ALPHA.    */
  /* A9 (f25) is only used by the misaligned-X path as a ninth load target. */
  49. #define A1 f0
  50. #define A2 f8
  51. #define A3 f2
  52. #define A4 f3
  53. #define A5 f4
  54. #define A6 f5
  55. #define A7 f6
  56. #define A8 f7
  57. #define A9 f25
  /* B1-B8: values loaded from y. B6-B8 (f14-f16) fall in the f14-f25 range */
  /* that this routine saves on entry and restores on exit.                 */
  58. #define B1 f9
  59. #define B2 f10
  60. #define B3 f11
  61. #define B4 f12
  62. #define B5 f13
  63. #define B6 f14
  64. #define B7 f15
  65. #define B8 f16
  /* C1-C8: results ALPHA * A + B that are stored back through YY           */
  /* (f17-f24, also inside the saved range).                                */
  66. #define C1 f17
  67. #define C2 f18
  68. #define C3 f19
  69. #define C4 f20
  70. #define C5 f21
  71. #define C6 f22
  72. #define C7 f23
  73. #define C8 f24
  /* Entry of the axpy kernel: every path below stores ALPHA * x[i] + y[i]  */
  /* back into y. PROLOGUE and PROFCODE are macros that are not defined in  */
  /* this file (presumably supplied through common.h).                      */
  74. PROLOGUE
  75. PROFCODE
  /* Save f14-f25 below SP. r10 = -16 is the step per register; there are   */
  /* 12 stores, so SP ends 192 bytes lower. r10 is also INCY2, which is     */
  /* only computed after these stores.                                      */
  /* NOTE(review): stfpdux presumably stores both halves of a paired FP     */
  /* register at SP + r10 and writes that address back to SP -- confirm     */
  /* against the FPU documentation.                                         */
  76. li r10, -16
  77. stfpdux f14, SP, r10
  78. stfpdux f15, SP, r10
  79. stfpdux f16, SP, r10
  80. stfpdux f17, SP, r10
  81. stfpdux f18, SP, r10
  82. stfpdux f19, SP, r10
  83. stfpdux f20, SP, r10
  84. stfpdux f21, SP, r10
  85. stfpdux f22, SP, r10
  86. stfpdux f23, SP, r10
  87. stfpdux f24, SP, r10
  88. stfpdux f25, SP, r10
  /* NOTE(review): fsmfp presumably copies the primary half of ALPHA into   */
  /* its secondary half so the paired fpmadd scales both elements -- confirm. */
  89. fsmfp ALPHA, ALPHA
  /* Scale both increments by BASE_SHIFT (presumably elements -> bytes,     */
  /* since they are compared against SIZE below), then form the doubled     */
  /* strides used by the paired accesses.                                   */
  90. slwi INCX, INCX, BASE_SHIFT
  91. slwi INCY, INCY, BASE_SHIFT
  92. add INCX2, INCX, INCX
  93. add INCY2, INCY, INCY
  /* Nothing to do for N <= 0; LL(999) still restores the saved registers.  */
  94. cmpwi cr0, N, 0
  95. ble LL(999)
  /* The paired code requires both scaled increments to equal SIZE; any     */
  /* other stride is handled by the scalar loop at LL(100).                 */
  96. cmpwi cr0, INCX, SIZE
  97. bne LL(100)
  98. cmpwi cr0, INCY, SIZE
  99. bne LL(100)
  /* If Y is not a multiple of 2 * SIZE, process one element with scalar    */
  /* instructions first and advance both pointers by one element.           */
  /* NOTE(review): this assumes Y is at least SIZE-aligned, so that         */
  /* Y + SIZE is 2 * SIZE-aligned -- confirm.                               */
  100. andi. r0, Y, 2 * SIZE - 1
  101. beq LL(05)
  102. LFD A1, 0 * SIZE(X)
  103. LFD B1, 0 * SIZE(Y)
  104. addi X, X, SIZE
  105. addi Y, Y, SIZE
  106. fmadd C1, ALPHA, A1, B1
  107. addi N, N, -1
  /* Y has already been advanced, so the element just computed is at -SIZE. */
  108. STFD C1, -1 * SIZE(Y)
  /* Unit-stride path, reached with Y either already 2 * SIZE-aligned or    */
  /* advanced by one element. Main loop for X also 2 * SIZE-aligned:        */
  /* 16 elements (8 pairs) per iteration.                                   */
  109. LL(05):
  /* X not a multiple of 2 * SIZE -> use the cross-load variant at LL(20).  */
  110. andi. r0, X, 2 * SIZE - 1
  111. bne LL(20)
  /* Bias the pointers down by one paired step: every load/store below is   */
  /* an indexed update form that is given INCX2 / INCY2 as its index.       */
  /* NOTE(review): this relies on the update forms adding the index before  */
  /* the access -- confirm. YY is the store pointer, starting equal to Y.   */
  112. sub X, X, INCX2
  113. sub Y, Y, INCY2
  114. mr YY, Y
  /* CTR = N / 16 iterations; skip to the tail when that is zero.           */
  115. srawi. r0, N, 4
  116. mtspr CTR, r0
  117. beq- LL(15)
  /* Preload the first 8 pairs of x (A1-A8) and y (B1-B8).                  */
  118. LFPDUX A1, X, INCX2
  119. LFPDUX B1, Y, INCY2
  120. LFPDUX A2, X, INCX2
  121. LFPDUX B2, Y, INCY2
  122. LFPDUX A3, X, INCX2
  123. LFPDUX B3, Y, INCY2
  124. LFPDUX A4, X, INCX2
  125. LFPDUX B4, Y, INCY2
  126. LFPDUX A5, X, INCX2
  127. LFPDUX B5, Y, INCY2
  128. LFPDUX A6, X, INCX2
  129. LFPDUX B6, Y, INCY2
  130. LFPDUX A7, X, INCX2
  131. LFPDUX B7, Y, INCY2
  132. LFPDUX A8, X, INCX2
  133. LFPDUX B8, Y, INCY2
  /* Only one group to do: go straight to the drain block.                  */
  134. bdz LL(13)
  135. .align 4
  /* Steady state: each fpmadd consumes a preloaded A/B pair, and the two   */
  /* loads after it refill those registers for the next iteration; the      */
  /* eight results are stored at the end of the iteration.                  */
  136. LL(12):
  137. fpmadd C1, ALPHA, A1, B1
  138. LFPDUX A1, X, INCX2
  139. LFPDUX B1, Y, INCY2
  140. fpmadd C2, ALPHA, A2, B2
  141. LFPDUX A2, X, INCX2
  142. LFPDUX B2, Y, INCY2
  143. fpmadd C3, ALPHA, A3, B3
  144. LFPDUX A3, X, INCX2
  145. LFPDUX B3, Y, INCY2
  146. fpmadd C4, ALPHA, A4, B4
  147. LFPDUX A4, X, INCX2
  148. LFPDUX B4, Y, INCY2
  149. fpmadd C5, ALPHA, A5, B5
  150. LFPDUX A5, X, INCX2
  151. LFPDUX B5, Y, INCY2
  152. fpmadd C6, ALPHA, A6, B6
  153. LFPDUX A6, X, INCX2
  154. LFPDUX B6, Y, INCY2
  155. fpmadd C7, ALPHA, A7, B7
  156. LFPDUX A7, X, INCX2
  157. LFPDUX B7, Y, INCY2
  158. fpmadd C8, ALPHA, A8, B8
  159. LFPDUX A8, X, INCX2
  160. LFPDUX B8, Y, INCY2
  161. STFPDUX C1, YY, INCY2
  162. STFPDUX C2, YY, INCY2
  163. STFPDUX C3, YY, INCY2
  164. STFPDUX C4, YY, INCY2
  165. STFPDUX C5, YY, INCY2
  166. STFPDUX C6, YY, INCY2
  167. STFPDUX C7, YY, INCY2
  168. STFPDUX C8, YY, INCY2
  169. bdnz LL(12)
  170. .align 4
  /* Drain: compute and store the last preloaded group without loading      */
  /* anything further.                                                      */
  171. LL(13):
  172. fpmadd C1, ALPHA, A1, B1
  173. fpmadd C2, ALPHA, A2, B2
  174. fpmadd C3, ALPHA, A3, B3
  175. fpmadd C4, ALPHA, A4, B4
  176. fpmadd C5, ALPHA, A5, B5
  177. fpmadd C6, ALPHA, A6, B6
  178. STFPDUX C1, YY, INCY2
  179. fpmadd C7, ALPHA, A7, B7
  180. STFPDUX C2, YY, INCY2
  181. fpmadd C8, ALPHA, A8, B8
  182. STFPDUX C3, YY, INCY2
  183. STFPDUX C4, YY, INCY2
  184. STFPDUX C5, YY, INCY2
  185. STFPDUX C6, YY, INCY2
  186. STFPDUX C7, YY, INCY2
  187. STFPDUX C8, YY, INCY2
  188. .align 4
  /* Tail of the aligned unit-stride path: the remaining N mod 16 elements  */
  /* are handled by testing bits 8, 4, 2 and 1 of N in turn.                */
  189. LL(15):
  190. andi. r0, N, 15
  191. beq LL(999)
  /* 8 elements = 4 pairs.                                                  */
  192. andi. r0, N, 8
  193. beq LL(16)
  194. LFPDUX A1, X, INCX2
  195. LFPDUX B1, Y, INCY2
  196. LFPDUX A2, X, INCX2
  197. LFPDUX B2, Y, INCY2
  198. LFPDUX A3, X, INCX2
  199. LFPDUX B3, Y, INCY2
  200. LFPDUX A4, X, INCX2
  201. LFPDUX B4, Y, INCY2
  202. fpmadd C1, ALPHA, A1, B1
  203. fpmadd C2, ALPHA, A2, B2
  204. fpmadd C3, ALPHA, A3, B3
  205. fpmadd C4, ALPHA, A4, B4
  206. STFPDUX C1, YY, INCY2
  207. STFPDUX C2, YY, INCY2
  208. STFPDUX C3, YY, INCY2
  209. STFPDUX C4, YY, INCY2
  210. .align 4
  /* 4 elements = 2 pairs.                                                  */
  211. LL(16):
  212. andi. r0, N, 4
  213. beq LL(17)
  214. LFPDUX A1, X, INCX2
  215. LFPDUX B1, Y, INCY2
  216. LFPDUX A2, X, INCX2
  217. LFPDUX B2, Y, INCY2
  218. fpmadd C1, ALPHA, A1, B1
  219. fpmadd C2, ALPHA, A2, B2
  220. STFPDUX C1, YY, INCY2
  221. STFPDUX C2, YY, INCY2
  222. .align 4
  /* 2 elements = 1 pair.                                                   */
  223. LL(17):
  224. andi. r0, N, 2
  225. beq LL(18)
  226. LFPDUX A1, X, INCX2
  227. LFPDUX B1, Y, INCY2
  228. fpmadd C1, ALPHA, A1, B1
  229. STFPDUX C1, YY, INCY2
  230. .align 4
  /* Last single element: scalar load / fmadd / store. The doubled step is  */
  /* still used because the pointers are biased by one paired step.         */
  231. LL(18):
  232. andi. r0, N, 1
  233. beq LL(999)
  234. LFDUX A1, X, INCX2
  235. LFDUX B1, Y, INCY2
  236. fmadd C1, ALPHA, A1, B1
  237. STFDUX C1, YY, INCY2
  238. b LL(999)
  239. .align 4
  240. /* X is unaligned */
  /* Unit-stride path for X not on a 2 * SIZE boundary (Y is handled as in  */
  /* the aligned path). One x element is loaded with a scalar load, X is    */
  /* advanced by SIZE, and the rest of x is read with LFXDUX.               */
  /* NOTE(review): the scheme appears to rely on LFXDUX loading a pair with */
  /* its halves swapped, fsmr copying only the secondary half and fpmr      */
  /* copying both halves, so that each A register ends up holding two       */
  /* consecutive x elements that straddle a 2 * SIZE boundary -- confirm    */
  /* against the FPU documentation.                                         */
  /* NOTE(review): the LFD below executes even when N is already 0 after    */
  /* the Y alignment step, and the last LFXDUX of a run reads one element   */
  /* beyond those consumed; the loaded values are unused in those cases,    */
  /* but the reads still happen -- confirm this is acceptable.              */
  241. LL(20):
  242. LFD A1, 0 * SIZE(X)
  243. addi X, X, SIZE
  /* Bias the pointers by one paired step for the indexed update forms;     */
  /* YY is the store pointer.                                               */
  244. sub X, X, INCX2
  245. sub Y, Y, INCY2
  246. mr YY, Y
  /* CTR = N / 16 iterations; skip to the tail when that is zero.           */
  247. srawi. r0, N, 4
  248. mtspr CTR, r0
  249. beq- LL(25)
  /* Preload 8 x pairs into A2-A9 and 8 y pairs into B1-B8. The fsmr        */
  /* instructions fix up A1-A4 here; A5-A8 are fixed up at the start of     */
  /* LL(22) / LL(23).                                                       */
  250. LFXDUX A2, X, INCX2
  251. LFPDUX B1, Y, INCY2
  252. LFXDUX A3, X, INCX2
  253. LFPDUX B2, Y, INCY2
  254. LFXDUX A4, X, INCX2
  255. LFPDUX B3, Y, INCY2
  256. LFXDUX A5, X, INCX2
  257. LFPDUX B4, Y, INCY2
  258. LFXDUX A6, X, INCX2
  259. LFPDUX B5, Y, INCY2
  260. LFXDUX A7, X, INCX2
  261. LFPDUX B6, Y, INCY2
  262. fsmr A1, A2
  263. LFXDUX A8, X, INCX2
  264. fsmr A2, A3
  265. LFPDUX B7, Y, INCY2
  266. fsmr A3, A4
  267. LFXDUX A9, X, INCX2
  268. fsmr A4, A5
  269. LFPDUX B8, Y, INCY2
  /* Only one group to do: go straight to the drain block.                  */
  270. bdz LL(23)
  271. .align 4
  /* Steady state: finish the fix-up of A5-A8, compute the 8 results,       */
  /* reload A2-A9 / B1-B8 for the next iteration, carry A9 over into A1,    */
  /* and fix up A1-A4 again while the results are being stored.             */
  272. LL(22):
  273. fpmadd C1, ALPHA, A1, B1
  274. fsmr A5, A6
  275. LFPDUX B1, Y, INCY2
  276. fpmadd C2, ALPHA, A2, B2
  277. LFXDUX A2, X, INCX2
  278. fsmr A6, A7
  279. LFPDUX B2, Y, INCY2
  280. fpmadd C3, ALPHA, A3, B3
  281. LFXDUX A3, X, INCX2
  282. fsmr A7, A8
  283. LFPDUX B3, Y, INCY2
  284. fpmadd C4, ALPHA, A4, B4
  285. LFXDUX A4, X, INCX2
  286. fsmr A8, A9
  287. LFPDUX B4, Y, INCY2
  288. fpmadd C5, ALPHA, A5, B5
  289. LFXDUX A5, X, INCX2
  290. LFPDUX B5, Y, INCY2
  291. fpmadd C6, ALPHA, A6, B6
  292. LFXDUX A6, X, INCX2
  293. LFPDUX B6, Y, INCY2
  294. fpmadd C7, ALPHA, A7, B7
  295. LFXDUX A7, X, INCX2
  296. LFPDUX B7, Y, INCY2
  297. fpmadd C8, ALPHA, A8, B8
  298. LFXDUX A8, X, INCX2
  299. LFPDUX B8, Y, INCY2
  /* A9 must be copied into A1 before it is overwritten by the next load.   */
  300. fpmr A1, A9
  301. LFXDUX A9, X, INCX2
  302. STFPDUX C1, YY, INCY2
  303. STFPDUX C2, YY, INCY2
  304. STFPDUX C3, YY, INCY2
  305. STFPDUX C4, YY, INCY2
  306. fsmr A1, A2
  307. STFPDUX C5, YY, INCY2
  308. fsmr A2, A3
  309. STFPDUX C6, YY, INCY2
  310. fsmr A3, A4
  311. STFPDUX C7, YY, INCY2
  312. fsmr A4, A5
  313. STFPDUX C8, YY, INCY2
  314. bdnz LL(22)
  315. .align 4
  /* Drain: finish the last preloaded group without loading anything        */
  /* further, and leave A9 copied into A1 for the tail code at LL(25).      */
  316. LL(23):
  317. fpmadd C1, ALPHA, A1, B1
  318. fsmr A5, A6
  319. fpmadd C2, ALPHA, A2, B2
  320. fsmr A6, A7
  321. fpmadd C3, ALPHA, A3, B3
  322. fsmr A7, A8
  323. fpmadd C4, ALPHA, A4, B4
  324. fsmr A8, A9
  325. fpmadd C5, ALPHA, A5, B5
  326. fpmadd C6, ALPHA, A6, B6
  327. fpmadd C7, ALPHA, A7, B7
  328. fpmadd C8, ALPHA, A8, B8
  329. fpmr A1, A9
  330. STFPDUX C1, YY, INCY2
  331. STFPDUX C2, YY, INCY2
  332. STFPDUX C3, YY, INCY2
  333. STFPDUX C4, YY, INCY2
  334. STFPDUX C5, YY, INCY2
  335. STFPDUX C6, YY, INCY2
  336. STFPDUX C7, YY, INCY2
  337. STFPDUX C8, YY, INCY2
  338. .align 4
  /* Tail of the misaligned-X path: the remaining N mod 16 elements are     */
  /* handled by testing bits 8, 4, 2 and 1 of N in turn. On entry to each   */
  /* step A1 holds the x value carried over from the code before it; each   */
  /* step ends with an fpmr that refreshes A1 from its last loaded pair.    */
  /* NOTE(review): the element is presumably carried in the primary half    */
  /* of A1 -- confirm against the LFXDUX / fsmr / fpmr semantics.           */
  339. LL(25):
  340. andi. r0, N, 15
  341. beq LL(999)
  /* 8 elements = 4 pairs.                                                  */
  342. andi. r0, N, 8
  343. beq LL(26)
  344. LFXDUX A2, X, INCX2
  345. LFPDUX B1, Y, INCY2
  346. LFXDUX A3, X, INCX2
  347. LFPDUX B2, Y, INCY2
  348. LFXDUX A4, X, INCX2
  349. LFPDUX B3, Y, INCY2
  350. LFXDUX A5, X, INCX2
  351. LFPDUX B4, Y, INCY2
  352. fsmr A1, A2
  353. fsmr A2, A3
  354. fsmr A3, A4
  355. fsmr A4, A5
  356. fpmadd C1, ALPHA, A1, B1
  357. fpmadd C2, ALPHA, A2, B2
  358. fpmadd C3, ALPHA, A3, B3
  359. fpmadd C4, ALPHA, A4, B4
  360. fpmr A1, A5
  361. STFPDUX C1, YY, INCY2
  362. STFPDUX C2, YY, INCY2
  363. STFPDUX C3, YY, INCY2
  364. STFPDUX C4, YY, INCY2
  365. .align 4
  /* 4 elements = 2 pairs.                                                  */
  366. LL(26):
  367. andi. r0, N, 4
  368. beq LL(27)
  369. LFXDUX A2, X, INCX2
  370. LFPDUX B1, Y, INCY2
  371. LFXDUX A3, X, INCX2
  372. LFPDUX B2, Y, INCY2
  373. fsmr A1, A2
  374. fsmr A2, A3
  375. fpmadd C1, ALPHA, A1, B1
  376. fpmadd C2, ALPHA, A2, B2
  377. fpmr A1, A3
  378. STFPDUX C1, YY, INCY2
  379. STFPDUX C2, YY, INCY2
  380. .align 4
  /* 2 elements = 1 pair.                                                   */
  381. LL(27):
  382. andi. r0, N, 2
  383. beq LL(28)
  384. LFXDUX A2, X, INCX2
  385. LFPDUX B1, Y, INCY2
  386. fsmr A1, A2
  387. fpmadd C1, ALPHA, A1, B1
  388. fpmr A1, A2
  389. STFPDUX C1, YY, INCY2
  390. .align 4
  /* Last single element: no further load from x is issued; the scalar      */
  /* fmadd uses the value already carried in A1, and only y is loaded.      */
  391. LL(28):
  392. andi. r0, N, 1
  393. beq LL(999)
  394. LFDUX B1, Y, INCY2
  395. fmadd C1, ALPHA, A1, B1
  396. STFDUX C1, YY, INCY2
  397. b LL(999)
  398. .align 4
  399. ####
  /* General-stride path, taken when either scaled increment differs from   */
  /* SIZE. Purely scalar: LFDUX / fmadd / STFDUX stepping by INCX and INCY, */
  /* unrolled 8 elements per iteration.                                     */
  400. LL(100):
  /* Bias the pointers by one step for the indexed update forms; YY is the  */
  /* store pointer, starting equal to Y.                                    */
  401. sub X, X, INCX
  402. sub Y, Y, INCY
  403. mr YY, Y
  /* CTR = N / 8 iterations; skip to the tail when that is zero.            */
  404. srawi. r0, N, 3
  405. mtspr CTR, r0
  406. beq- LL(115)
  /* Preload the first 8 elements of x (A1-A8) and y (B1-B8).               */
  407. LFDUX A1, X, INCX
  408. LFDUX B1, Y, INCY
  409. LFDUX A2, X, INCX
  410. LFDUX B2, Y, INCY
  411. LFDUX A3, X, INCX
  412. LFDUX B3, Y, INCY
  413. LFDUX A4, X, INCX
  414. LFDUX B4, Y, INCY
  415. LFDUX A5, X, INCX
  416. LFDUX B5, Y, INCY
  417. LFDUX A6, X, INCX
  418. LFDUX B6, Y, INCY
  419. LFDUX A7, X, INCX
  420. LFDUX B7, Y, INCY
  421. LFDUX A8, X, INCX
  422. LFDUX B8, Y, INCY
  /* Only one group to do: go straight to the drain block.                  */
  423. bdz LL(113)
  424. .align 4
  /* Steady state: each fmadd consumes a preloaded A/B element, and the two */
  /* loads after it refill those registers for the next iteration; the      */
  /* eight results are stored at the end of the iteration.                  */
  425. LL(112):
  426. fmadd C1, ALPHA, A1, B1
  427. LFDUX A1, X, INCX
  428. LFDUX B1, Y, INCY
  429. fmadd C2, ALPHA, A2, B2
  430. LFDUX A2, X, INCX
  431. LFDUX B2, Y, INCY
  432. fmadd C3, ALPHA, A3, B3
  433. LFDUX A3, X, INCX
  434. LFDUX B3, Y, INCY
  435. fmadd C4, ALPHA, A4, B4
  436. LFDUX A4, X, INCX
  437. LFDUX B4, Y, INCY
  438. fmadd C5, ALPHA, A5, B5
  439. LFDUX A5, X, INCX
  440. LFDUX B5, Y, INCY
  441. fmadd C6, ALPHA, A6, B6
  442. LFDUX A6, X, INCX
  443. LFDUX B6, Y, INCY
  444. fmadd C7, ALPHA, A7, B7
  445. LFDUX A7, X, INCX
  446. LFDUX B7, Y, INCY
  447. fmadd C8, ALPHA, A8, B8
  448. LFDUX A8, X, INCX
  449. LFDUX B8, Y, INCY
  450. STFDUX C1, YY, INCY
  451. STFDUX C2, YY, INCY
  452. STFDUX C3, YY, INCY
  453. STFDUX C4, YY, INCY
  454. STFDUX C5, YY, INCY
  455. STFDUX C6, YY, INCY
  456. STFDUX C7, YY, INCY
  457. STFDUX C8, YY, INCY
  458. bdnz LL(112)
  459. .align 4
  /* Drain: compute and store the last preloaded group without loading      */
  /* anything further.                                                      */
  460. LL(113):
  461. fmadd C1, ALPHA, A1, B1
  462. fmadd C2, ALPHA, A2, B2
  463. fmadd C3, ALPHA, A3, B3
  464. fmadd C4, ALPHA, A4, B4
  465. fmadd C5, ALPHA, A5, B5
  466. fmadd C6, ALPHA, A6, B6
  467. STFDUX C1, YY, INCY
  468. fmadd C7, ALPHA, A7, B7
  469. STFDUX C2, YY, INCY
  470. fmadd C8, ALPHA, A8, B8
  471. STFDUX C3, YY, INCY
  472. STFDUX C4, YY, INCY
  473. STFDUX C5, YY, INCY
  474. STFDUX C6, YY, INCY
  475. STFDUX C7, YY, INCY
  476. STFDUX C8, YY, INCY
  477. .align 4
  /* Tail of the general-stride path: the remaining N mod 8 elements are    */
  /* handled by testing bits 4, 2 and 1 of N in turn.                       */
  478. LL(115):
  479. andi. r0, N, 7
  480. beq LL(999)
  /* 4 elements.                                                            */
  481. andi. r0, N, 4
  482. beq LL(117)
  483. LFDUX A1, X, INCX
  484. LFDUX B1, Y, INCY
  485. LFDUX A2, X, INCX
  486. LFDUX B2, Y, INCY
  487. LFDUX A3, X, INCX
  488. LFDUX B3, Y, INCY
  489. LFDUX A4, X, INCX
  490. LFDUX B4, Y, INCY
  491. fmadd C1, ALPHA, A1, B1
  492. fmadd C2, ALPHA, A2, B2
  493. fmadd C3, ALPHA, A3, B3
  494. fmadd C4, ALPHA, A4, B4
  495. STFDUX C1, YY, INCY
  496. STFDUX C2, YY, INCY
  497. STFDUX C3, YY, INCY
  498. STFDUX C4, YY, INCY
  499. .align 4
  /* 2 elements.                                                            */
  500. LL(117):
  501. andi. r0, N, 2
  502. beq LL(118)
  503. LFDUX A1, X, INCX
  504. LFDUX B1, Y, INCY
  505. LFDUX A2, X, INCX
  506. LFDUX B2, Y, INCY
  507. fmadd C1, ALPHA, A1, B1
  508. fmadd C2, ALPHA, A2, B2
  509. STFDUX C1, YY, INCY
  510. STFDUX C2, YY, INCY
  511. .align 4
  /* Last single element; execution then falls through to LL(999).          */
  512. LL(118):
  513. andi. r0, N, 1
  514. beq LL(999)
  515. LFDUX A1, X, INCX
  516. LFDUX B1, Y, INCY
  517. fmadd C1, ALPHA, A1, B1
  518. STFDUX C1, YY, INCY
  519. .align 4
  /* Common exit: restore f25 down to f14 in the reverse of the order in    */
  /* which they were saved on entry, then return.                           */
  520. LL(999):
  /* r10 = +16 is the step per register. SP is first lowered by 16 so that  */
  /* the first indexed update load addresses the most recently saved slot   */
  /* (f25); after the 12 loads, the final addi undoes the remaining offset, */
  /* so the net change to SP across save and restore is zero.               */
  521. li r10, 16
  522. subi SP, SP, 16
  523. lfpdux f25, SP, r10
  524. lfpdux f24, SP, r10
  525. lfpdux f23, SP, r10
  526. lfpdux f22, SP, r10
  527. lfpdux f21, SP, r10
  528. lfpdux f20, SP, r10
  529. lfpdux f19, SP, r10
  530. lfpdux f18, SP, r10
  531. lfpdux f17, SP, r10
  532. lfpdux f16, SP, r10
  533. lfpdux f15, SP, r10
  534. lfpdux f14, SP, r10
  535. addi SP, SP, 16
  536. blr
  /* EPILOGUE is a macro that is not defined in this file (presumably       */
  /* supplied through common.h).                                            */
  537. EPILOGUE