You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zrot.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "version.h"
  /* Register aliases used throughout this routine; roles are described as the code below uses them. */
  41. #define N $16 /* element count; the routine returns at once when N <= 0 */
  42. #define X $17 /* address of the first vector, loaded from and stored back to in place */
  43. #define INCX $18 /* stride of X; doubled on entry before any use */
  44. #define Y $19 /* address of the second vector, loaded from and stored back to in place */
  45. #define INCY $20 /* stride of Y; doubled on entry before any use */
  46. #define I $21 /* loop counter */
  47. #define XX $23 /* store pointer for X in the strided path; $23 is also a scratch flag on entry */
  48. #define YY $24 /* store pointer for Y in the strided path; $24 is also a scratch flag on entry */
  49. #define C $f10 /* first rotation coefficient, copied from $f21 on entry */
  50. #define S $f11 /* second rotation coefficient, loaded from 0($sp) on entry */
  51. #define PREFETCH_SIZE 80 /* look-ahead distance, in units of SIZE, for the lds-to-$f31 touches in the unrolled loop */
  /* Entry: fetch the two rotation coefficients, double the strides, and choose between the contiguous and the strided code path. */
  /* PROLOGUE and PROFCODE are macros defined outside this file. */
  52. PROLOGUE
  53. PROFCODE
  54. .frame $sp, 0, $26, 0 /* frame size 0: this routine allocates no stack space */
  55. #ifndef PROFILE
  56. .prologue 0
  57. #else
  58. .prologue 1
  59. #endif
  60. fmov $f21, C /* C arrives in $f21 */
  61. LD S, 0($sp) /* S is read from the first stack slot */
  62. addq INCX, INCX, INCX /* INCX *= 2 (presumably two SIZE-wide values per complex element -- confirm) */
  63. addq INCY, INCY, INCY /* INCY *= 2 */
  64. cmpeq INCX, 2, $23 /* $23 = 1 when the doubled INCX is 2, i.e. the incoming INCX was 1 */
  65. cmpeq INCY, 2, $24 /* $24 = 1 when the doubled INCY is 2 */
  66. ble N, $L998 /* N <= 0: nothing to do */
  67. and $23, $24, $23 /* $23 = 1 only when both vectors have unit stride */
  68. beq $23, $L50 /* otherwise take the strided path */
  /* Contiguous path, pipeline fill: load the first values of X and Y and start their products so that */
  /* the $L12 loop (or the $L13 drain) can begin by storing finished results. */
  /* Per value the code forms X' = C*X + S*Y and Y' = C*Y - S*X. */
  69. sra N, 2, I /* I = N >> 2: number of unrolled passes, each covering 8 SIZE-wide values per vector */
  70. ble I, $L15 /* fewer than 4 elements: only the remainder loop runs */
  71. LD $f12, 0*SIZE(X)
  72. LD $f13, 0*SIZE(Y)
  73. LD $f14, 1*SIZE(X)
  74. LD $f15, 1*SIZE(Y)
  75. LD $f16, 2*SIZE(X)
  76. LD $f17, 2*SIZE(Y)
  77. LD $f18, 3*SIZE(X)
  78. LD $f19, 3*SIZE(Y)
  79. MUL C, $f12, $f21 /* C * X[0] */
  80. unop
  81. MUL S, $f13, $f22 /* S * Y[0] */
  82. MUL C, $f13, $f23 /* C * Y[0] */
  83. LD $f13, 4*SIZE(Y) /* $f13 is free again: reload it with Y[4] */
  84. MUL S, $f12, $f24 /* S * X[0] */
  85. LD $f12, 4*SIZE(X) /* $f12 is free again: reload it with X[4] */
  86. MUL C, $f14, $f25
  87. lda I, -1(I) /* the drain at $L13 accounts for one pass, so the loop runs I-1 times */
  88. MUL S, $f15, $f26
  89. ADD $f21, $f22, $f22 /* $f22 = new X[0] */
  90. MUL C, $f15, $f27
  91. LD $f15, 5*SIZE(Y)
  92. MUL S, $f14, $f28
  93. SUB $f23, $f24, $f24 /* $f24 = new Y[0] */
  94. ble I, $L13 /* only one pass in total: skip the steady-state loop */
  /* Steady-state loop of the contiguous path. Each pass stores 8 rotated values per vector (offsets 0..7) */
  /* while loading and multiplying the inputs of the following pass, so results are always stored some */
  /* instructions after they were computed. The unop instructions are padding; the exact instruction */
  /* order is presumably tuned for issue slots and should not be rearranged. */
  95. .align 4
  96. $L12:
  97. MUL C, $f16, $f21
  98. lds $f31, (PREFETCH_SIZE) * SIZE(X) /* touch X ahead of the current position; the loaded value is discarded */
  99. unop
  100. LD $f14, 5*SIZE(X)
  101. ST $f22, 0*SIZE(X) /* result started by the previous pass (or by the pipeline fill) */
  102. MUL S, $f17, $f22
  103. unop
  104. ADD $f25, $f26, $f26
  105. MUL C, $f17, $f23
  106. lds $f31, (PREFETCH_SIZE) * SIZE(Y) /* touch Y ahead of the current position; the loaded value is discarded */
  107. unop
  108. LD $f17, 6*SIZE(Y)
  109. ST $f24, 0*SIZE(Y)
  110. MUL S, $f16, $f24
  111. unop
  112. SUB $f27, $f28, $f28
  113. MUL C, $f18, $f25
  114. LD $f16, 6*SIZE(X)
  115. unop
  116. unop
  117. ST $f26, 1*SIZE(X)
  118. MUL S, $f19, $f26
  119. unop
  120. ADD $f21, $f22, $f22
  121. MUL C, $f19, $f27
  122. unop
  123. unop
  124. LD $f19, 7*SIZE(Y)
  125. ST $f28, 1*SIZE(Y)
  126. MUL S, $f18, $f28
  127. unop
  128. SUB $f23, $f24, $f24
  129. MUL C, $f12, $f21
  130. LD $f18, 7*SIZE(X)
  131. unop
  132. unop
  133. ST $f22, 2*SIZE(X)
  134. unop
  135. MUL S, $f13, $f22
  136. ADD $f25, $f26, $f26
  137. MUL C, $f13, $f23
  138. LD $f13, 8*SIZE(Y) /* offsets 8 and above are inputs of the next pass */
  139. unop
  140. unop
  141. ST $f24, 2*SIZE(Y)
  142. MUL S, $f12, $f24
  143. unop
  144. SUB $f27, $f28, $f28
  145. MUL C, $f14, $f25
  146. LD $f12, 8*SIZE(X)
  147. unop
  148. unop
  149. ST $f26, 3*SIZE(X)
  150. MUL S, $f15, $f26
  151. unop
  152. ADD $f21, $f22, $f22
  153. MUL C, $f15, $f27
  154. LD $f15, 9*SIZE(Y)
  155. unop
  156. unop
  157. ST $f28, 3*SIZE(Y)
  158. MUL S, $f14, $f28
  159. unop
  160. SUB $f23, $f24, $f24
  161. MUL C, $f16, $f21
  162. LD $f14, 9*SIZE(X)
  163. unop
  164. unop
  165. ST $f22, 4*SIZE(X)
  166. MUL S, $f17, $f22
  167. unop
  168. ADD $f25, $f26, $f26
  169. MUL C, $f17, $f23
  170. LD $f17, 10*SIZE(Y)
  171. unop
  172. unop
  173. ST $f24, 4*SIZE(Y)
  174. MUL S, $f16, $f24
  175. unop
  176. SUB $f27, $f28, $f28
  177. MUL C, $f18, $f25
  178. LD $f16, 10*SIZE(X)
  179. unop
  180. unop
  181. ST $f26, 5*SIZE(X)
  182. MUL S, $f19, $f26
  183. unop
  184. ADD $f21, $f22, $f22
  185. MUL C, $f19, $f27
  186. LD $f19, 11*SIZE(Y)
  187. unop
  188. unop
  189. ST $f28, 5*SIZE(Y)
  190. MUL S, $f18, $f28
  191. lda I, -1(I) /* one fewer pass remaining */
  192. SUB $f23, $f24, $f24
  193. MUL C, $f12, $f21
  194. LD $f18, 11*SIZE(X)
  195. unop
  196. unop
  197. ST $f22, 6*SIZE(X)
  198. MUL S, $f13, $f22
  199. unop
  200. ADD $f25, $f26, $f26
  201. MUL C, $f13, $f23
  202. LD $f13, 12*SIZE(Y) /* Y has not been advanced yet, so this is still relative to the old base */
  203. lda X, 8*SIZE(X) /* advance X by 8 values; X offsets below are relative to the new base */
  204. unop
  205. ST $f24, 6*SIZE(Y)
  206. MUL S, $f12, $f24
  207. unop
  208. SUB $f27, $f28, $f28
  209. MUL C, $f14, $f25
  210. LD $f12, 4*SIZE(X) /* same address as 12*SIZE from the old X base */
  211. lda Y, 8*SIZE(Y) /* advance Y by 8 values; Y offsets below are relative to the new base */
  212. unop
  213. ST $f26, -1*SIZE(X) /* value 7 of this pass, addressed from the advanced base */
  214. MUL S, $f15, $f26
  215. unop
  216. ADD $f21, $f22, $f22
  217. MUL C, $f15, $f27
  218. LD $f15, 5*SIZE(Y) /* same address as 13*SIZE from the old Y base */
  219. unop
  220. unop
  221. ST $f28, -1*SIZE(Y) /* value 7 of this pass, addressed from the advanced base */
  222. MUL S, $f14, $f28
  223. SUB $f23, $f24, $f24
  224. bgt I, $L12 /* repeat while passes remain; the final pass is completed at $L13 */
  /* Pipeline drain of the contiguous path: finishes the last 8 values per vector. It follows the same */
  /* schedule as the $L12 loop but loads only offsets 5..7, so nothing beyond the final unrolled pass is read. */
  225. .align 4
  226. $L13:
  227. MUL C, $f16, $f21
  228. LD $f14, 5*SIZE(X)
  229. unop
  230. unop
  231. ST $f22, 0*SIZE(X)
  232. MUL S, $f17, $f22
  233. unop
  234. ADD $f25, $f26, $f26
  235. MUL C, $f17, $f23
  236. unop
  237. unop
  238. LD $f17, 6*SIZE(Y)
  239. ST $f24, 0*SIZE(Y)
  240. MUL S, $f16, $f24
  241. LD $f16, 6*SIZE(X)
  242. SUB $f27, $f28, $f28
  243. MUL C, $f18, $f25
  244. unop
  245. unop
  246. unop
  247. ST $f26, 1*SIZE(X)
  248. MUL S, $f19, $f26
  249. unop
  250. ADD $f21, $f22, $f22
  251. MUL C, $f19, $f27
  252. unop
  253. unop
  254. LD $f19, 7*SIZE(Y)
  255. ST $f28, 1*SIZE(Y)
  256. MUL S, $f18, $f28
  257. LD $f18, 7*SIZE(X) /* last load of the contiguous unrolled path */
  258. SUB $f23, $f24, $f24
  259. MUL C, $f12, $f21
  260. unop
  261. unop
  262. unop
  263. ST $f22, 2*SIZE(X)
  264. unop
  265. MUL S, $f13, $f22
  266. ADD $f25, $f26, $f26
  267. MUL C, $f13, $f23
  268. unop
  269. unop
  270. unop
  271. ST $f24, 2*SIZE(Y)
  272. MUL S, $f12, $f24
  273. unop
  274. SUB $f27, $f28, $f28
  275. MUL C, $f14, $f25
  276. unop
  277. unop
  278. unop
  279. ST $f26, 3*SIZE(X)
  280. MUL S, $f15, $f26
  281. unop
  282. ADD $f21, $f22, $f22
  283. MUL C, $f15, $f27
  284. unop
  285. unop
  286. unop
  287. ST $f28, 3*SIZE(Y)
  288. MUL S, $f14, $f28
  289. unop
  290. SUB $f23, $f24, $f24
  291. MUL C, $f16, $f21
  292. unop
  293. unop
  294. unop
  295. ST $f22, 4*SIZE(X)
  296. MUL S, $f17, $f22
  297. unop
  298. ADD $f25, $f26, $f26
  299. MUL C, $f17, $f23
  300. unop
  301. unop
  302. unop
  303. ST $f24, 4*SIZE(Y)
  304. MUL S, $f16, $f24
  305. unop
  306. SUB $f27, $f28, $f28
  307. MUL C, $f18, $f25
  308. unop
  309. unop
  310. unop
  311. ST $f26, 5*SIZE(X)
  312. MUL S, $f19, $f26
  313. unop
  314. ADD $f21, $f22, $f22
  315. MUL C, $f19, $f27
  316. unop
  317. unop
  318. unop
  319. ST $f28, 5*SIZE(Y)
  320. MUL S, $f18, $f28
  321. unop
  322. SUB $f23, $f24, $f24
  323. ST $f22, 6*SIZE(X)
  324. ADD $f25, $f26, $f26
  325. ST $f24, 6*SIZE(Y)
  326. SUB $f27, $f28, $f28
  327. ST $f26, 7*SIZE(X)
  328. lda X, 8*SIZE(X) /* leave X just past the unrolled part, ready for the remainder loop */
  329. ST $f28, 7*SIZE(Y) /* still relative to the old Y base: Y is advanced on the next line */
  330. lda Y, 8*SIZE(Y) /* leave Y just past the unrolled part */
  /* Contiguous remainder: handles the N & 3 elements left after the unrolled passes, one element */
  /* (two SIZE-wide values per vector) per iteration, then returns with $0 cleared. */
  331. .align 4
  332. $L15:
  333. and N, 3, I /* I = N & 3: elements not covered by the unrolled passes */
  334. ble I, $L998 /* none left: return */
  335. .align 4
  336. $L16:
  337. LD $f12, 0*SIZE(X)
  338. LD $f13, 0*SIZE(Y)
  339. LD $f14, 1*SIZE(X)
  340. LD $f15, 1*SIZE(Y)
  341. MUL C, $f12, $f21
  342. MUL S, $f13, $f22
  343. MUL C, $f13, $f23
  344. MUL S, $f12, $f24
  345. ADD $f21, $f22, $f22 /* new X[0] = C*X[0] + S*Y[0] */
  346. SUB $f23, $f24, $f24 /* new Y[0] = C*Y[0] - S*X[0] */
  347. MUL C, $f14, $f25
  348. MUL S, $f15, $f26
  349. MUL C, $f15, $f27
  350. MUL S, $f14, $f28
  351. ADD $f25, $f26, $f26 /* new X[1] = C*X[1] + S*Y[1] */
  352. SUB $f27, $f28, $f28 /* new Y[1] = C*Y[1] - S*X[1] */
  353. ST $f22, 0*SIZE(X)
  354. ST $f24, 0*SIZE(Y)
  355. lda I, -1(I)
  356. ST $f26, 1*SIZE(X)
  357. lda X, 2 * SIZE(X) /* step to the next element of X */
  358. ST $f28, 1*SIZE(Y) /* Y has not been advanced yet */
  359. lda Y, 2 * SIZE(Y) /* step to the next element of Y */
  360. bgt I, $L16
  361. .align 4
  362. $L998:
  363. clr $0 /* $0 = 0 */
  364. ret
  /* Strided path, taken when either stride is not 1. X and Y act as load pointers and XX and YY as */
  /* store pointers; the load pointers advance right after each element's loads and the store pointers */
  /* right after its stores. The loop body is the same element-wise rotation repeated four times. */
  /* SXADDQ is a macro defined outside this file; presumably SXADDQ a, b, c computes c = b + a * SIZE -- confirm. */
  365. .align 4
  366. $L50:
  367. mov X, XX /* store pointer starts at the same address as the load pointer */
  368. mov Y, YY
  369. sra N, 2, I /* I = N >> 2: number of 4-element passes */
  370. ble I, $L55 /* fewer than 4 elements: only the remainder loop runs */
  371. .align 4
  372. $L51:
  373. LD $f12, 0*SIZE(X) /* element 1 of 4 */
  374. LD $f13, 0*SIZE(Y)
  375. LD $f14, 1*SIZE(X)
  376. SXADDQ INCX, X, X /* advance the X load pointer by the doubled stride */
  377. LD $f15, 1*SIZE(Y)
  378. SXADDQ INCY, Y, Y /* advance the Y load pointer by the doubled stride */
  379. MUL C, $f12, $f21
  380. MUL S, $f13, $f22
  381. MUL C, $f13, $f23
  382. MUL S, $f12, $f24
  383. ADD $f21, $f22, $f22 /* C*x + S*y */
  384. SUB $f23, $f24, $f24 /* C*y - S*x */
  385. MUL C, $f14, $f25
  386. MUL S, $f15, $f26
  387. MUL C, $f15, $f27
  388. MUL S, $f14, $f28
  389. ADD $f25, $f26, $f26
  390. SUB $f27, $f28, $f28
  391. ST $f22, 0*SIZE(XX)
  392. ST $f24, 0*SIZE(YY)
  393. ST $f26, 1*SIZE(XX)
  394. SXADDQ INCX, XX, XX /* advance the X store pointer */
  395. ST $f28, 1*SIZE(YY)
  396. SXADDQ INCY, YY, YY /* advance the Y store pointer */
  397. LD $f12, 0*SIZE(X) /* element 2 of 4 */
  398. LD $f13, 0*SIZE(Y)
  399. LD $f14, 1*SIZE(X)
  400. SXADDQ INCX, X, X
  401. LD $f15, 1*SIZE(Y)
  402. SXADDQ INCY, Y, Y
  403. MUL C, $f12, $f21
  404. MUL S, $f13, $f22
  405. MUL C, $f13, $f23
  406. MUL S, $f12, $f24
  407. ADD $f21, $f22, $f22
  408. SUB $f23, $f24, $f24
  409. MUL C, $f14, $f25
  410. MUL S, $f15, $f26
  411. MUL C, $f15, $f27
  412. MUL S, $f14, $f28
  413. ADD $f25, $f26, $f26
  414. SUB $f27, $f28, $f28
  415. ST $f22, 0*SIZE(XX)
  416. ST $f24, 0*SIZE(YY)
  417. ST $f26, 1*SIZE(XX)
  418. SXADDQ INCX, XX, XX
  419. ST $f28, 1*SIZE(YY)
  420. SXADDQ INCY, YY, YY
  421. LD $f12, 0*SIZE(X) /* element 3 of 4 */
  422. LD $f13, 0*SIZE(Y)
  423. LD $f14, 1*SIZE(X)
  424. SXADDQ INCX, X, X
  425. LD $f15, 1*SIZE(Y)
  426. SXADDQ INCY, Y, Y
  427. MUL C, $f12, $f21
  428. MUL S, $f13, $f22
  429. MUL C, $f13, $f23
  430. MUL S, $f12, $f24
  431. ADD $f21, $f22, $f22
  432. SUB $f23, $f24, $f24
  433. MUL C, $f14, $f25
  434. MUL S, $f15, $f26
  435. MUL C, $f15, $f27
  436. MUL S, $f14, $f28
  437. ADD $f25, $f26, $f26
  438. SUB $f27, $f28, $f28
  439. ST $f22, 0*SIZE(XX)
  440. ST $f24, 0*SIZE(YY)
  441. ST $f26, 1*SIZE(XX)
  442. SXADDQ INCX, XX, XX
  443. ST $f28, 1*SIZE(YY)
  444. SXADDQ INCY, YY, YY
  445. LD $f12, 0*SIZE(X) /* element 4 of 4 */
  446. LD $f13, 0*SIZE(Y)
  447. LD $f14, 1*SIZE(X)
  448. SXADDQ INCX, X, X
  449. LD $f15, 1*SIZE(Y)
  450. SXADDQ INCY, Y, Y
  451. MUL C, $f12, $f21
  452. MUL S, $f13, $f22
  453. MUL C, $f13, $f23
  454. MUL S, $f12, $f24
  455. ADD $f21, $f22, $f22
  456. SUB $f23, $f24, $f24
  457. MUL C, $f14, $f25
  458. MUL S, $f15, $f26
  459. MUL C, $f15, $f27
  460. MUL S, $f14, $f28
  461. ADD $f25, $f26, $f26
  462. SUB $f27, $f28, $f28
  463. ST $f22, 0*SIZE(XX)
  464. ST $f24, 0*SIZE(YY)
  465. ST $f26, 1*SIZE(XX)
  466. SXADDQ INCX, XX, XX
  467. ST $f28, 1*SIZE(YY)
  468. SXADDQ INCY, YY, YY
  469. lda I, -1(I)
  470. bgt I, $L51
  /* Strided remainder: handles the N & 3 elements left after the 4-element passes, one element per */
  /* iteration. Loads and stores both go through X and Y here (XX and YY are not used), then the */
  /* routine returns with $0 cleared. SXADDQ and EPILOGUE are macros defined outside this file. */
  471. .align 4
  472. $L55:
  473. and N, 3, I /* I = N & 3: elements not covered by the 4-element passes */
  474. ble I, $L999 /* none left: return */
  475. .align 4
  476. $L56:
  477. LD $f12, 0*SIZE(X)
  478. LD $f13, 0*SIZE(Y)
  479. LD $f14, 1*SIZE(X)
  480. LD $f15, 1*SIZE(Y)
  481. MUL C, $f12, $f21
  482. MUL S, $f13, $f22
  483. MUL C, $f13, $f23
  484. MUL S, $f12, $f24
  485. ADD $f21, $f22, $f22 /* new X[0] = C*X[0] + S*Y[0] */
  486. SUB $f23, $f24, $f24 /* new Y[0] = C*Y[0] - S*X[0] */
  487. MUL C, $f14, $f25
  488. MUL S, $f15, $f26
  489. MUL C, $f15, $f27
  490. MUL S, $f14, $f28
  491. ADD $f25, $f26, $f26 /* new X[1] = C*X[1] + S*Y[1] */
  492. SUB $f27, $f28, $f28 /* new Y[1] = C*Y[1] - S*X[1] */
  493. ST $f22, 0*SIZE(X)
  494. ST $f24, 0*SIZE(Y)
  495. lda I, -1(I)
  496. ST $f26, 1*SIZE(X)
  497. ST $f28, 1*SIZE(Y)
  498. SXADDQ INCX, X, X /* step X by the doubled stride, after its stores */
  499. SXADDQ INCY, Y, Y /* step Y by the doubled stride, after its stores */
  500. bgt I, $L56
  501. .align 4
  502. $L999:
  503. clr $0 /* $0 = 0 */
  504. ret
  505. EPILOGUE