You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

rot.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer-register aliases used by the routine below. */
  /* N is compared against zero and split into N >> 3 and N & 7, X and Y */
  /* are used as load/store base addresses, and INCX/INCY are shifted by */
  /* BASE_SHIFT and then added to X/Y as byte strides. */
  40. #define N %i0
  41. #define X %i1
  42. #define INCX %i2
  43. #define Y %i3
  44. #define INCY %i4
  /* I is the loop counter. In the 32-bit build the incoming value of %i5 */
  /* is stored to the stack before I is first written. */
  45. #define I %i5
  /* XX and YY are the store cursors of the non-unit-stride path (.LL50); */
  /* they start as copies of X and Y. */
  46. #define XX %l0
  47. #define YY %l1
  /* Floating-point working set. */
  /* a1..a8 receive values loaded from X, b1..b8 values loaded from Y, */
  /* c1..c8 hold FMUL products, and t1..t4 hold the FADD/FSUB results that */
  /* the 8-way unrolled paths store back. */
  /* DOUBLE build: every alias is an even-numbered register, %f4..%f58 */
  /* (presumably because a double occupies an even/odd register pair). */
  48. #ifdef DOUBLE
  49. #define a1 %f4
  50. #define a2 %f6
  51. #define a3 %f8
  52. #define a4 %f10
  53. #define a5 %f12
  54. #define a6 %f14
  55. #define a7 %f16
  56. #define a8 %f18
  57. #define b1 %f20
  58. #define b2 %f22
  59. #define b3 %f24
  60. #define b4 %f26
  61. #define b5 %f28
  62. #define b6 %f30
  63. #define b7 %f32
  64. #define b8 %f34
  65. #define c1 %f36
  66. #define c2 %f38
  67. #define c3 %f40
  68. #define c4 %f42
  69. #define c5 %f44
  70. #define c6 %f46
  71. #define c7 %f48
  72. #define c8 %f50
  73. #define t1 %f52
  74. #define t2 %f54
  75. #define t3 %f56
  76. #define t4 %f58
  /* Non-DOUBLE build: the same aliases packed into consecutive registers */
  /* %f2..%f29. */
  77. #else
  78. #define a1 %f2
  79. #define a2 %f3
  80. #define a3 %f4
  81. #define a4 %f5
  82. #define a5 %f6
  83. #define a6 %f7
  84. #define a7 %f8
  85. #define a8 %f9
  86. #define b1 %f10
  87. #define b2 %f11
  88. #define b3 %f12
  89. #define b4 %f13
  90. #define b5 %f14
  91. #define b6 %f15
  92. #define b7 %f16
  93. #define b8 %f17
  94. #define c1 %f18
  95. #define c2 %f19
  96. #define c3 %f20
  97. #define c4 %f21
  98. #define c5 %f22
  99. #define c6 %f23
  100. #define c7 %f24
  101. #define c8 %f25
  102. #define t1 %f26
  103. #define t2 %f27
  104. #define t3 %f28
  105. #define t4 %f29
  106. #endif
  /* C and S are the two scalar coefficients of the update */
  /*     x_new = C * x + S * y,   y_new = C * y - S * x. */
  /* They sit in the registers just below a1 in both builds, so they do */
  /* not collide with any alias above. */
  /* NOTE(review): presumably the cosine and sine of a plane rotation - */
  /* confirm against the caller. */
  107. #ifdef DOUBLE
  108. #define C %f0
  109. #define S %f2
  110. #else
  111. #define C %f0
  112. #define S %f1
  113. #endif
  /* Applies, to N pairs of elements taken from X and Y, */
  /*     x_new = C * x + S * y */
  /*     y_new = C * y - S * x     (x is the value read before the update) */
  /* and writes x_new back through X and y_new back through Y. */
  /* */
  /* The symbol name and the entry/exit boilerplate come from the PROLOGUE, */
  /* SAVESP and EPILOGUE macros, which are not defined in this file */
  /* (presumably in common.h - confirm there). */
  /* */
  /* Control flow: */
  /*   N <= 0                     -> return at .LL19, memory untouched */
  /*   INCX == INCY == 1 element  -> .LL11/.LL12 (8 pairs per pass, software */
  /*                                 pipelined), then .LL16 for N & 7 pairs */
  /*   any other stride           -> .LL51 (8 pairs per pass), then .LL56 */
  /* */
  /* NOTE(review): when N >> 3 is at least 1, the unit-stride pipeline loads */
  /* (never stores) the X and Y elements with index 8*I and 8*I+1, where */
  /* I = N >> 3, and discards them. When (N & 7) < 2 those indices are >= N; */
  /* confirm that reading up to two elements past the end is acceptable. */
  114. PROLOGUE
  115. SAVESP
  /* Fetch the two scalar coefficients into C and S. */
  /* 32-bit build: store incoming %i5 to the stack slot at STACK_START + 24, */
  /* load C from that slot, and load S from STACK_START + 32 (DOUBLE) or */
  /* STACK_START + 28 (otherwise). */
  /* NOTE(review): presumably C arrives at least partly in %i5 and S on the */
  /* stack under the 32-bit calling convention - confirm against common.h. */
  116. #ifndef __64BIT__
  117. #ifdef DOUBLE
  118. st %i5, [%sp + STACK_START + 24]
  119. LDF [%sp + STACK_START + 24], C
  120. LDF [%sp + STACK_START + 32], S
  121. #else
  122. st %i5, [%sp + STACK_START + 24]
  123. LDF [%sp + STACK_START + 24], C
  124. LDF [%sp + STACK_START + 28], S
  125. #endif
  /* 64-bit build: copy the coefficients from %f10/%f12 (DOUBLE) or */
  /* %f11/%f13 (otherwise) into the working registers. */
  /* NOTE(review): presumably the FP argument registers - confirm. */
  126. #else
  127. #ifdef DOUBLE
  128. FMOV %f10, C
  129. FMOV %f12, S
  130. #else
  131. FMOV %f11, C
  132. FMOV %f13, S
  133. #endif
  134. #endif
  /* Nothing to do for N <= 0. The nop fills the branch delay slot. */
  135. cmp N, 0
  136. ble .LL19
  137. nop
  /* Convert both strides from elements to bytes. */
  138. sll INCX, BASE_SHIFT, INCX
  139. sll INCY, BASE_SHIFT, INCY
  /* The contiguous fast path needs both byte strides to equal SIZE; */
  /* anything else goes to the strided code at .LL50. */
  140. cmp INCX, SIZE
  141. bne .LL50
  142. nop
  143. cmp INCY, SIZE
  144. bne .LL50
  145. nop
  /* I = number of full 8-element blocks. With none, go to the remainder. */
  146. sra N, 3, I
  147. cmp I, 0
  148. ble,pn %icc, .LL15
  149. nop
  /* Pipeline prologue: load elements 0..7 of both vectors. */
  150. LDF [X + 0 * SIZE], a1
  151. LDF [Y + 0 * SIZE], b1
  152. LDF [X + 1 * SIZE], a2
  153. LDF [Y + 1 * SIZE], b2
  154. LDF [X + 2 * SIZE], a3
  155. LDF [Y + 2 * SIZE], b3
  156. LDF [X + 3 * SIZE], a4
  157. LDF [Y + 3 * SIZE], b4
  158. LDF [X + 4 * SIZE], a5
  159. LDF [Y + 4 * SIZE], b5
  160. LDF [X + 5 * SIZE], a6
  161. LDF [Y + 5 * SIZE], b6
  162. LDF [X + 6 * SIZE], a7
  163. LDF [Y + 6 * SIZE], b7
  164. LDF [X + 7 * SIZE], a8
  165. LDF [Y + 7 * SIZE], b8
  /* Start the arithmetic for elements 0 and 1 while already reloading */
  /* a1/b1 and a2/b2 with elements 8 and 9 (the next block). Afterwards */
  /* t1/t2 hold the results for element 0 and c5..c8 the four products */
  /* for element 1. */
  166. FMUL C, a1, c1
  167. FMUL S, b1, c2
  168. FMUL C, b1, c3
  169. LDF [Y + 8 * SIZE], b1
  170. FMUL S, a1, c4
  171. LDF [X + 8 * SIZE], a1
  172. FMUL C, a2, c5
  173. FMUL S, b2, c6
  174. FADD c1, c2, t1
  175. FMUL C, b2, c7
  176. LDF [Y + 9 * SIZE], b2
  177. FMUL S, a2, c8
  178. LDF [X + 9 * SIZE], a2
  179. FSUB c3, c4, t2
  /* One block consumed by the prologue. If it was the only one, skip the */
  /* steady-state loop and drain at .LL12. */
  180. addcc I, -1, I
  181. ble,pt %icc, .LL12
  182. nop
  /* Prefetch distance, in elements, used inside .LL11. */
  183. #define PREFETCHSIZE 64
  /* Steady state, one pass per 8-element block. On entry: */
  /*   t1/t2          finished results for element 0 of the current block */
  /*   c5..c8         products for element 1 of the current block */
  /*   a3..a8, b3..b8 elements 2..7 of the current block */
  /*   a1,a2, b1,b2   elements 0 and 1 of the next block */
  /* Each element's four FMULs are interleaved with the store of an earlier */
  /* result, the FADD/FSUB for the previous element, and a load that */
  /* refills a register for the next block after its last use. */
  /* The bare nops are padding (presumably for instruction scheduling). */
  184. .LL11:
  /* Element 2 products. Only Y is prefetched in this loop. */
  185. FMUL C, a3, c1
  186. nop
  187. prefetch [Y + PREFETCHSIZE * SIZE], 1
  188. nop
  189. FMUL S, b3, c2
  190. STF t1, [X + 0 * SIZE]
  191. FADD c5, c6, t3
  192. nop
  193. FMUL C, b3, c3
  194. LDF [Y + 10 * SIZE], b3
  195. nop
  196. nop
  197. FMUL S, a3, c4
  198. STF t2, [Y + 0 * SIZE]
  199. FSUB c7, c8, t4
  200. nop
  /* Element 3 products. */
  201. FMUL C, a4, c5
  202. LDF [X + 10 * SIZE], a3
  203. nop
  204. nop
  205. FMUL S, b4, c6
  206. STF t3, [X + 1 * SIZE]
  207. FADD c1, c2, t1
  208. nop
  209. FMUL C, b4, c7
  210. LDF [Y + 11 * SIZE], b4
  211. nop
  212. nop
  213. FMUL S, a4, c8
  214. STF t4, [Y + 1 * SIZE]
  215. FSUB c3, c4, t2
  216. nop
  /* Element 4 products. */
  217. FMUL C, a5, c1
  218. LDF [X + 11 * SIZE], a4
  219. nop
  220. nop
  221. FMUL S, b5, c2
  222. STF t1, [X + 2 * SIZE]
  223. FADD c5, c6, t3
  224. nop
  225. FMUL C, b5, c3
  226. LDF [Y + 12 * SIZE], b5
  227. nop
  228. nop
  229. FMUL S, a5, c4
  230. STF t2, [Y + 2 * SIZE]
  231. FSUB c7, c8, t4
  232. nop
  /* Element 5 products. */
  233. FMUL C, a6, c5
  234. LDF [X + 12 * SIZE], a5
  235. nop
  236. nop
  237. FMUL S, b6, c6
  238. STF t3, [X + 3 * SIZE]
  239. FADD c1, c2, t1
  240. nop
  241. FMUL C, b6, c7
  242. LDF [Y + 13 * SIZE], b6
  243. nop
  244. nop
  245. FMUL S, a6, c8
  246. STF t4, [Y + 3 * SIZE]
  247. FSUB c3, c4, t2
  248. nop
  /* Element 6 products. */
  249. FMUL C, a7, c1
  250. LDF [X + 13 * SIZE], a6
  251. nop
  252. nop
  253. FMUL S, b7, c2
  254. STF t1, [X + 4 * SIZE]
  255. FADD c5, c6, t3
  256. nop
  257. FMUL C, b7, c3
  258. LDF [Y + 14 * SIZE], b7
  259. nop
  260. nop
  261. FMUL S, a7, c4
  262. STF t2, [Y + 4 * SIZE]
  263. FSUB c7, c8, t4
  264. nop
  /* Element 7 products. */
  265. FMUL C, a8, c5
  266. LDF [X + 14 * SIZE], a7
  267. nop
  268. nop
  269. FMUL S, b8, c6
  270. STF t3, [X + 5 * SIZE]
  271. FADD c1, c2, t1
  272. nop
  273. FMUL C, b8, c7
  274. LDF [Y + 15 * SIZE], b8
  275. nop
  276. nop
  277. FMUL S, a8, c8
  278. STF t4, [Y + 5 * SIZE]
  279. FSUB c3, c4, t2
  280. nop
  /* Element 0 of the next block (a1/b1 were loaded one pass earlier). */
  /* The block counter is decremented here; the condition codes it sets */
  /* are the ones tested by the bg at the bottom of the loop, and no */
  /* instruction in between writes them. */
  281. FMUL C, a1, c1
  282. LDF [X + 15 * SIZE], a8
  283. addcc I, -1, I
  284. nop
  285. FMUL S, b1, c2
  286. STF t1, [X + 6 * SIZE]
  287. FADD c5, c6, t3
  288. nop
  289. FMUL C, b1, c3
  290. LDF [Y + 16 * SIZE], b1
  291. nop
  292. nop
  293. FMUL S, a1, c4
  294. STF t2, [Y + 6 * SIZE]
  295. FSUB c7, c8, t4
  296. nop
  /* Element 1 of the next block. Y advances by one block here, so the */
  /* later Y + 9 load is old element 17 and the Y - 1 store is old */
  /* element 7. */
  297. FMUL C, a2, c5
  298. LDF [X + 16 * SIZE], a1
  299. add Y, 8 * SIZE, Y
  300. nop
  301. FMUL S, b2, c6
  302. STF t3, [X + 7 * SIZE]
  303. FADD c1, c2, t1
  304. nop
  /* X advances only after the X + 7 store above. */
  305. FMUL C, b2, c7
  306. LDF [Y + 9 * SIZE], b2
  307. add X, 8 * SIZE, X
  308. nop
  309. FMUL S, a2, c8
  310. STF t4, [Y - 1 * SIZE]
  311. FSUB c3, c4, t2
  312. nop
  /* The load below sits in the branch delay slot and runs whether or not */
  /* the branch is taken; X + 9 is relative to the already advanced X. */
  313. bg,pt %icc, .LL11
  314. LDF [X + 9 * SIZE], a2
  /* Drain: finish the last block with the same interleaving as .LL11 but */
  /* without further loads. The entry state is the one described at .LL11; */
  /* the preloaded a1/a2/b1/b2 are not used. */
  315. .LL12:
  316. FMUL C, a3, c1
  317. FMUL S, b3, c2
  318. STF t1, [X + 0 * SIZE]
  319. FADD c5, c6, t3
  320. FMUL C, b3, c3
  321. FMUL S, a3, c4
  322. STF t2, [Y + 0 * SIZE]
  323. FSUB c7, c8, t4
  324. FMUL C, a4, c5
  325. FMUL S, b4, c6
  326. STF t3, [X + 1 * SIZE]
  327. FADD c1, c2, t1
  328. FMUL C, b4, c7
  329. FMUL S, a4, c8
  330. STF t4, [Y + 1 * SIZE]
  331. FSUB c3, c4, t2
  332. FMUL C, a5, c1
  333. FMUL S, b5, c2
  334. STF t1, [X + 2 * SIZE]
  335. FADD c5, c6, t3
  336. FMUL C, b5, c3
  337. FMUL S, a5, c4
  338. STF t2, [Y + 2 * SIZE]
  339. FSUB c7, c8, t4
  340. FMUL C, a6, c5
  341. FMUL S, b6, c6
  342. STF t3, [X + 3 * SIZE]
  343. FADD c1, c2, t1
  344. FMUL C, b6, c7
  345. FMUL S, a6, c8
  346. STF t4, [Y + 3 * SIZE]
  347. FSUB c3, c4, t2
  348. FMUL C, a7, c1
  349. FMUL S, b7, c2
  350. STF t1, [X + 4 * SIZE]
  351. FADD c5, c6, t3
  352. FMUL C, b7, c3
  353. FMUL S, a7, c4
  354. STF t2, [Y + 4 * SIZE]
  355. FSUB c7, c8, t4
  356. FMUL C, a8, c5
  357. FMUL S, b8, c6
  358. STF t3, [X + 5 * SIZE]
  359. FADD c1, c2, t1
  360. FMUL C, b8, c7
  361. FMUL S, a8, c8
  362. STF t4, [Y + 5 * SIZE]
  363. FSUB c3, c4, t2
  364. FADD c5, c6, t3
  365. STF t1, [X + 6 * SIZE]
  366. FSUB c7, c8, t4
  367. STF t2, [Y + 6 * SIZE]
  368. STF t3, [X + 7 * SIZE]
  369. STF t4, [Y + 7 * SIZE]
  /* Step both pointers past the block just completed. */
  370. add X, 8 * SIZE, X
  371. add Y, 8 * SIZE, Y
  /* Unit-stride remainder: I = N & 7. The masked value is never negative, */
  /* so the ble is taken only when it is zero. */
  372. .LL15:
  373. andcc N, 7, I
  374. nop
  375. ble,a,pn %icc, .LL19
  376. nop
  /* One pair per pass. The pointers are bumped right after the loads, */
  /* which is why the stores use offset -1. Results are built in c2/c4. */
  377. .LL16:
  378. LDF [X + 0 * SIZE], a1
  379. add X, 1 * SIZE, X
  380. LDF [Y + 0 * SIZE], b1
  381. add Y, 1 * SIZE, Y
  382. FMUL C, a1, c1
  383. FMUL S, b1, c2
  384. FMUL C, b1, c3
  385. FMUL S, a1, c4
  386. FADD c1, c2, c2
  387. addcc I, -1, I
  388. FSUB c3, c4, c4
  389. nop
  390. STF c2, [X - 1 * SIZE]
  391. STF c4, [Y - 1 * SIZE]
  392. bg,pt %icc, .LL16
  393. nop
  /* Exit for N <= 0 and for the unit-stride path; nop in the delay slot. */
  394. .LL19:
  395. return %i7 + 8
  396. nop
  /* Strided path, entered when either byte stride differs from SIZE. */
  /* XX/YY trail X/Y as store cursors, so all eight pairs of a block can */
  /* be loaded before the first result is written. */
  397. .LL50:
  398. mov X, XX
  399. mov Y, YY
  400. sra N, 3, I
  401. cmp I, 0
  402. ble,pn %icc, .LL55
  403. nop
  /* Load eight pairs, stepping X and Y by their byte strides. */
  404. .LL51:
  405. LDF [X + 0 * SIZE], a1
  406. add X, INCX, X
  407. LDF [Y + 0 * SIZE], b1
  408. add Y, INCY, Y
  409. LDF [X + 0 * SIZE], a2
  410. add X, INCX, X
  411. LDF [Y + 0 * SIZE], b2
  412. add Y, INCY, Y
  413. LDF [X + 0 * SIZE], a3
  414. add X, INCX, X
  415. LDF [Y + 0 * SIZE], b3
  416. add Y, INCY, Y
  417. LDF [X + 0 * SIZE], a4
  418. add X, INCX, X
  419. LDF [Y + 0 * SIZE], b4
  420. add Y, INCY, Y
  421. LDF [X + 0 * SIZE], a5
  422. add X, INCX, X
  423. LDF [Y + 0 * SIZE], b5
  424. add Y, INCY, Y
  425. LDF [X + 0 * SIZE], a6
  426. add X, INCX, X
  427. LDF [Y + 0 * SIZE], b6
  428. add Y, INCY, Y
  429. LDF [X + 0 * SIZE], a7
  430. add X, INCX, X
  431. LDF [Y + 0 * SIZE], b7
  432. add Y, INCY, Y
  433. LDF [X + 0 * SIZE], a8
  434. add X, INCX, X
  435. LDF [Y + 0 * SIZE], b8
  436. add Y, INCY, Y
  /* Compute and store the eight pairs in order through XX/YY. Pairs */
  /* alternate between the c1..c4/t1,t2 and c5..c8/t3,t4 register sets. */
  /* Pair 1. */
  437. FMUL C, a1, c1
  438. FMUL S, b1, c2
  439. FMUL C, b1, c3
  440. FMUL S, a1, c4
  441. FADD c1, c2, t1
  442. FSUB c3, c4, t2
  443. STF t1, [XX + 0 * SIZE]
  444. add XX, INCX, XX
  445. STF t2, [YY + 0 * SIZE]
  446. add YY, INCY, YY
  /* Pair 2. */
  447. FMUL C, a2, c5
  448. FMUL S, b2, c6
  449. FMUL C, b2, c7
  450. FMUL S, a2, c8
  451. FADD c5, c6, t3
  452. FSUB c7, c8, t4
  453. STF t3, [XX + 0 * SIZE]
  454. add XX, INCX, XX
  455. STF t4, [YY + 0 * SIZE]
  456. add YY, INCY, YY
  /* Pair 3. */
  457. FMUL C, a3, c1
  458. FMUL S, b3, c2
  459. FMUL C, b3, c3
  460. FMUL S, a3, c4
  461. FADD c1, c2, t1
  462. FSUB c3, c4, t2
  463. STF t1, [XX + 0 * SIZE]
  464. add XX, INCX, XX
  465. STF t2, [YY + 0 * SIZE]
  466. add YY, INCY, YY
  /* Pair 4. */
  467. FMUL C, a4, c5
  468. FMUL S, b4, c6
  469. FMUL C, b4, c7
  470. FMUL S, a4, c8
  471. FADD c5, c6, t3
  472. FSUB c7, c8, t4
  473. STF t3, [XX + 0 * SIZE]
  474. add XX, INCX, XX
  475. STF t4, [YY + 0 * SIZE]
  476. add YY, INCY, YY
  /* Pair 5. */
  477. FMUL C, a5, c1
  478. FMUL S, b5, c2
  479. FMUL C, b5, c3
  480. FMUL S, a5, c4
  481. FADD c1, c2, t1
  482. FSUB c3, c4, t2
  483. STF t1, [XX + 0 * SIZE]
  484. add XX, INCX, XX
  485. STF t2, [YY + 0 * SIZE]
  486. add YY, INCY, YY
  /* Pair 6. */
  487. FMUL C, a6, c5
  488. FMUL S, b6, c6
  489. FMUL C, b6, c7
  490. FMUL S, a6, c8
  491. FADD c5, c6, t3
  492. FSUB c7, c8, t4
  493. STF t3, [XX + 0 * SIZE]
  494. add XX, INCX, XX
  495. STF t4, [YY + 0 * SIZE]
  496. add YY, INCY, YY
  /* Pair 7. */
  497. FMUL C, a7, c1
  498. FMUL S, b7, c2
  499. FMUL C, b7, c3
  500. FMUL S, a7, c4
  501. FADD c1, c2, t1
  502. FSUB c3, c4, t2
  503. STF t1, [XX + 0 * SIZE]
  504. add XX, INCX, XX
  505. STF t2, [YY + 0 * SIZE]
  506. add YY, INCY, YY
  /* Pair 8. */
  507. FMUL C, a8, c5
  508. FMUL S, b8, c6
  509. FMUL C, b8, c7
  510. FMUL S, a8, c8
  511. FADD c5, c6, t3
  512. FSUB c7, c8, t4
  513. STF t3, [XX + 0 * SIZE]
  514. add XX, INCX, XX
  515. STF t4, [YY + 0 * SIZE]
  516. add YY, INCY, YY
  /* Next block while the counter stays positive. */
  517. addcc I, -1, I
  518. bg,pt %icc, .LL51
  519. nop
  /* Strided remainder: I = N & 7, skipped when zero. */
  520. .LL55:
  521. andcc N, 7, I
  522. nop
  523. ble %icc, .LL59
  524. nop
  /* One pair per pass. X/Y serve as both load and store address because */
  /* each pair is written back before the pointers advance. The Y bump */
  /* sits in the branch delay slot and runs on every pass, including the */
  /* last. */
  525. .LL56:
  526. LDF [X + 0 * SIZE], a1
  527. LDF [Y + 0 * SIZE], b1
  528. FMUL C, a1, c1
  529. FMUL S, b1, c2
  530. FMUL C, b1, c3
  531. FMUL S, a1, c4
  532. FADD c1, c2, c2
  533. FSUB c3, c4, c4
  534. STF c2, [X + 0 * SIZE]
  535. add X, INCX, X
  536. STF c4, [Y + 0 * SIZE]
  537. addcc I, -1, I
  538. bg %icc, .LL56
  539. add Y, INCY, Y
  /* Exit for the strided path; nop in the delay slot. */
  540. .LL59:
  541. return %i7 + 8
  542. nop
  543. EPILOGUE