You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

zrot.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register aliases used by the routine below.                        */
  /* N    : element count; tested with ble and split into N>>2 / N&3.   */
  /* X, Y : base pointers of the two vectors; every LD/ST goes through  */
  /*        them (or through XX/YY on the strided path).                */
  /* INCX, INCY : strides; doubled on entry and then applied with       */
  /*        SXADDQ to step the pointers.                                */
  /* I    : loop counter.                                               */
  40. #define N $16
  41. #define X $17
  42. #define INCX $18
  43. #define Y $19
  44. #define INCY $20
  45. #define I $21
  /* XX, YY : store pointers on the strided path ($L50 onwards).  The   */
  /* same registers ($23/$24) are also used directly as scratch flags   */
  /* during the stride test at entry.                                   */
  46. #define XX $23
  47. #define YY $24
  /* C, S : the two rotation coefficients.  The routine computes        */
  /* C*x + S*y and C*y - S*x with them.                                 */
  48. #define C $f10
  49. #define S $f11
  /* Look-ahead distance, in units of SIZE, for the lds-into-$f31       */
  /* accesses in the main unit-stride loop.                             */
  50. #define PREFETCH_SIZE 80
  /*-------------------------------------------------------------------*/
  /* In-place rotation of two vectors.  For every pair of values        */
  /* (x, y) taken at the same offset from X and Y the routine stores    */
  /*     x <- C*x + S*y                                                 */
  /*     y <- C*y - S*x                                                 */
  /* Each of the N elements consists of two consecutive SIZE-sized      */
  /* values (offsets 0*SIZE and 1*SIZE), both rotated the same way.     */
  /*                                                                    */
  /* Inputs : N, X, INCX, Y, INCY in the registers named by the         */
  /*          #defines above; C arrives in $f21; S is read from         */
  /*          0($sp).                                                   */
  /* Output : $0 is cleared before every ret.                           */
  /* Temps  : $f12-$f19 hold loaded values, $f21-$f28 hold products     */
  /*          and results, $23/$24 double as flags and as XX/YY.        */
  /*                                                                    */
  /* PROLOGUE, PROFCODE, EPILOGUE, LD, ST, MUL, ADD, SUB, SXADDQ and    */
  /* SIZE are not defined in this file; presumably they come from       */
  /* common.h -- confirm there.                                         */
  /*-------------------------------------------------------------------*/
  51. PROLOGUE
  52. PROFCODE
  53. .frame $sp, 0, $26, 0
  54. #ifndef PROFILE
  55. .prologue 0
  56. #else
  57. .prologue 1
  58. #endif
  /* Move C out of $f21: $f21 is reused below as a product temporary.   */
  59. fmov $f21, C
  60. LD S, 0($sp)
  /* Double both strides: one element is two SIZE-sized values.         */
  61. addq INCX, INCX, INCX
  62. addq INCY, INCY, INCY
  /* $23/$24 = 1 when the doubled stride is 2 (i.e. original stride 1). */
  /* These are the XX/YY registers, used here only as scratch flags.    */
  63. cmpeq INCX, 2, $23
  64. cmpeq INCY, 2, $24
  /* Nothing to do for N <= 0.                                          */
  65. ble N, $L998
  /* Take the strided path unless both vectors are unit-stride.         */
  66. and $23, $24, $23
  67. beq $23, $L50
  /* ---- Unit-stride path: I = N / 4 blocks of 4 elements (8 values).  */
  68. sra N, 2, I
  69. ble I, $L15
  /* Pipeline fill: load values 0..3 of X and Y ...                     */
  70. LD $f12, 0*SIZE(X)
  71. LD $f13, 0*SIZE(Y)
  72. LD $f14, 1*SIZE(X)
  73. LD $f15, 1*SIZE(Y)
  74. LD $f16, 2*SIZE(X)
  75. LD $f17, 2*SIZE(Y)
  76. LD $f18, 3*SIZE(X)
  77. LD $f19, 3*SIZE(Y)
  /* ... start the products for values 0 and 1, and reload the freed    */
  /* input registers with values 4 (X and Y) and 5 (Y).                 */
  78. MUL C, $f12, $f21
  79. unop
  80. MUL S, $f13, $f22
  81. MUL C, $f13, $f23
  82. LD $f13, 4*SIZE(Y)
  83. MUL S, $f12, $f24
  84. LD $f12, 4*SIZE(X)
  85. MUL C, $f14, $f25
  /* One block is now in flight; I counts the blocks that remain.       */
  86. lda I, -1(I)
  87. MUL S, $f15, $f26
  88. ADD $f21, $f22, $f22
  89. MUL C, $f15, $f27
  90. LD $f15, 5*SIZE(Y)
  91. MUL S, $f14, $f28
  92. SUB $f23, $f24, $f24
  /* Only one block in total: go straight to the drain code.            */
  93. ble I, $L13
  94. .align 4
  /* Steady-state loop.  Each iteration stores the results for values   */
  /* 0..7 of X and Y, loads ahead (up to offset 12 of X and 13 of Y,    */
  /* relative to the pointers at the top of the iteration) and advances */
  /* X and Y by 8*SIZE.  On exit the register state relative to the     */
  /* new X/Y is the same as at loop entry.                              */
  /* NOTE(review): the unop instructions look like issue-slot padding   */
  /* for scheduling; they have no effect on the results.                */
  95. $L12:
  96. MUL C, $f16, $f21
  /* Load into $f31 with the result unused: presumably a prefetch hint  */
  /* ($f31 is the FP zero register on Alpha) -- confirm in the ISA doc. */
  97. lds $f31, (PREFETCH_SIZE) * SIZE(X)
  98. unop
  99. LD $f14, 5*SIZE(X)
  100. ST $f22, 0*SIZE(X)
  101. MUL S, $f17, $f22
  102. unop
  103. ADD $f25, $f26, $f26
  104. MUL C, $f17, $f23
  105. lds $f31, (PREFETCH_SIZE) * SIZE(Y)
  106. unop
  107. LD $f17, 6*SIZE(Y)
  108. ST $f24, 0*SIZE(Y)
  109. MUL S, $f16, $f24
  110. unop
  111. SUB $f27, $f28, $f28
  112. MUL C, $f18, $f25
  113. LD $f16, 6*SIZE(X)
  114. unop
  115. unop
  116. ST $f26, 1*SIZE(X)
  117. MUL S, $f19, $f26
  118. unop
  119. ADD $f21, $f22, $f22
  120. MUL C, $f19, $f27
  121. unop
  122. unop
  123. LD $f19, 7*SIZE(Y)
  124. ST $f28, 1*SIZE(Y)
  125. MUL S, $f18, $f28
  126. unop
  127. SUB $f23, $f24, $f24
  128. MUL C, $f12, $f21
  129. LD $f18, 7*SIZE(X)
  130. unop
  131. unop
  132. ST $f22, 2*SIZE(X)
  133. unop
  134. MUL S, $f13, $f22
  135. ADD $f25, $f26, $f26
  136. MUL C, $f13, $f23
  137. LD $f13, 8*SIZE(Y)
  138. unop
  139. unop
  140. ST $f24, 2*SIZE(Y)
  141. MUL S, $f12, $f24
  142. unop
  143. SUB $f27, $f28, $f28
  144. MUL C, $f14, $f25
  145. LD $f12, 8*SIZE(X)
  146. unop
  147. unop
  148. ST $f26, 3*SIZE(X)
  149. MUL S, $f15, $f26
  150. unop
  151. ADD $f21, $f22, $f22
  152. MUL C, $f15, $f27
  153. LD $f15, 9*SIZE(Y)
  154. unop
  155. unop
  156. ST $f28, 3*SIZE(Y)
  157. MUL S, $f14, $f28
  158. unop
  159. SUB $f23, $f24, $f24
  160. MUL C, $f16, $f21
  161. LD $f14, 9*SIZE(X)
  162. unop
  163. unop
  164. ST $f22, 4*SIZE(X)
  165. MUL S, $f17, $f22
  166. unop
  167. ADD $f25, $f26, $f26
  168. MUL C, $f17, $f23
  169. LD $f17, 10*SIZE(Y)
  170. unop
  171. unop
  172. ST $f24, 4*SIZE(Y)
  173. MUL S, $f16, $f24
  174. unop
  175. SUB $f27, $f28, $f28
  176. MUL C, $f18, $f25
  177. LD $f16, 10*SIZE(X)
  178. unop
  179. unop
  180. ST $f26, 5*SIZE(X)
  181. MUL S, $f19, $f26
  182. unop
  183. ADD $f21, $f22, $f22
  184. MUL C, $f19, $f27
  185. LD $f19, 11*SIZE(Y)
  186. unop
  187. unop
  188. ST $f28, 5*SIZE(Y)
  189. MUL S, $f18, $f28
  190. lda I, -1(I)
  191. SUB $f23, $f24, $f24
  192. MUL C, $f12, $f21
  193. LD $f18, 11*SIZE(X)
  194. unop
  195. unop
  196. ST $f22, 6*SIZE(X)
  197. MUL S, $f13, $f22
  198. unop
  199. ADD $f25, $f26, $f26
  200. MUL C, $f13, $f23
  201. LD $f13, 12*SIZE(Y)
  /* X advances here, in the middle of the iteration: later X offsets   */
  /* in this iteration are relative to the new pointer (4 == old 12,    */
  /* -1 == old 7).                                                      */
  202. lda X, 8*SIZE(X)
  203. unop
  204. ST $f24, 6*SIZE(Y)
  205. MUL S, $f12, $f24
  206. unop
  207. SUB $f27, $f28, $f28
  208. MUL C, $f14, $f25
  209. LD $f12, 4*SIZE(X)
  /* Same for Y: from here on 5 == old 13 and -1 == old 7.              */
  210. lda Y, 8*SIZE(Y)
  211. unop
  212. ST $f26, -1*SIZE(X)
  213. MUL S, $f15, $f26
  214. unop
  215. ADD $f21, $f22, $f22
  216. MUL C, $f15, $f27
  217. LD $f15, 5*SIZE(Y)
  218. unop
  219. unop
  220. ST $f28, -1*SIZE(Y)
  221. MUL S, $f14, $f28
  222. SUB $f23, $f24, $f24
  223. bgt I, $L12
  224. .align 4
  /* Pipeline drain: finish the last block of 8 values.  Only the       */
  /* not-yet-loaded values 5..7 are read; nothing beyond offset 7 is    */
  /* touched.                                                           */
  225. $L13:
  226. MUL C, $f16, $f21
  227. LD $f14, 5*SIZE(X)
  228. unop
  229. unop
  230. ST $f22, 0*SIZE(X)
  231. MUL S, $f17, $f22
  232. unop
  233. ADD $f25, $f26, $f26
  234. MUL C, $f17, $f23
  235. unop
  236. unop
  237. LD $f17, 6*SIZE(Y)
  238. ST $f24, 0*SIZE(Y)
  239. MUL S, $f16, $f24
  240. LD $f16, 6*SIZE(X)
  241. SUB $f27, $f28, $f28
  242. MUL C, $f18, $f25
  243. unop
  244. unop
  245. unop
  246. ST $f26, 1*SIZE(X)
  247. MUL S, $f19, $f26
  248. unop
  249. ADD $f21, $f22, $f22
  250. MUL C, $f19, $f27
  251. unop
  252. unop
  253. LD $f19, 7*SIZE(Y)
  254. ST $f28, 1*SIZE(Y)
  255. MUL S, $f18, $f28
  256. LD $f18, 7*SIZE(X)
  257. SUB $f23, $f24, $f24
  258. MUL C, $f12, $f21
  259. unop
  260. unop
  261. unop
  262. ST $f22, 2*SIZE(X)
  263. unop
  264. MUL S, $f13, $f22
  265. ADD $f25, $f26, $f26
  266. MUL C, $f13, $f23
  267. unop
  268. unop
  269. unop
  270. ST $f24, 2*SIZE(Y)
  271. MUL S, $f12, $f24
  272. unop
  273. SUB $f27, $f28, $f28
  274. MUL C, $f14, $f25
  275. unop
  276. unop
  277. unop
  278. ST $f26, 3*SIZE(X)
  279. MUL S, $f15, $f26
  280. unop
  281. ADD $f21, $f22, $f22
  282. MUL C, $f15, $f27
  283. unop
  284. unop
  285. unop
  286. ST $f28, 3*SIZE(Y)
  287. MUL S, $f14, $f28
  288. unop
  289. SUB $f23, $f24, $f24
  290. MUL C, $f16, $f21
  291. unop
  292. unop
  293. unop
  294. ST $f22, 4*SIZE(X)
  295. MUL S, $f17, $f22
  296. unop
  297. ADD $f25, $f26, $f26
  298. MUL C, $f17, $f23
  299. unop
  300. unop
  301. unop
  302. ST $f24, 4*SIZE(Y)
  303. MUL S, $f16, $f24
  304. unop
  305. SUB $f27, $f28, $f28
  306. MUL C, $f18, $f25
  307. unop
  308. unop
  309. unop
  310. ST $f26, 5*SIZE(X)
  311. MUL S, $f19, $f26
  312. unop
  313. ADD $f21, $f22, $f22
  314. MUL C, $f19, $f27
  315. unop
  316. unop
  317. unop
  318. ST $f28, 5*SIZE(Y)
  319. MUL S, $f18, $f28
  320. unop
  321. SUB $f23, $f24, $f24
  322. ST $f22, 6*SIZE(X)
  323. ADD $f25, $f26, $f26
  324. ST $f24, 6*SIZE(Y)
  325. SUB $f27, $f28, $f28
  326. ST $f26, 7*SIZE(X)
  /* Step past the block just finished so the remainder loop starts     */
  /* at the first unprocessed element.                                  */
  327. lda X, 8*SIZE(X)
  328. ST $f28, 7*SIZE(Y)
  329. lda Y, 8*SIZE(Y)
  330. .align 4
  /* Unit-stride remainder: I = N & 3 elements still to do.             */
  331. $L15:
  332. and N, 3, I
  333. ble I, $L998
  334. .align 4
  /* One element (two values) per iteration, no pipelining.             */
  335. $L16:
  336. LD $f12, 0*SIZE(X)
  337. LD $f13, 0*SIZE(Y)
  338. LD $f14, 1*SIZE(X)
  339. LD $f15, 1*SIZE(Y)
  340. MUL C, $f12, $f21
  341. MUL S, $f13, $f22
  342. MUL C, $f13, $f23
  343. MUL S, $f12, $f24
  344. ADD $f21, $f22, $f22
  345. SUB $f23, $f24, $f24
  346. MUL C, $f14, $f25
  347. MUL S, $f15, $f26
  348. MUL C, $f15, $f27
  349. MUL S, $f14, $f28
  350. ADD $f25, $f26, $f26
  351. SUB $f27, $f28, $f28
  352. ST $f22, 0*SIZE(X)
  353. ST $f24, 0*SIZE(Y)
  354. lda I, -1(I)
  355. ST $f26, 1*SIZE(X)
  356. lda X, 2 * SIZE(X)
  357. ST $f28, 1*SIZE(Y)
  358. lda Y, 2 * SIZE(Y)
  359. bgt I, $L16
  360. .align 4
  /* Exit for the unit-stride path and for N <= 0: return 0 in $0.      */
  361. $L998:
  362. clr $0
  363. ret
  364. .align 4
  /* ---- Strided path (at least one stride is not 1).                  */
  /* X/Y are the load pointers; XX/YY are separate store pointers that  */
  /* start at the same addresses and are stepped by the same strides.   */
  /* This overwrites the flag values left in $23/$24 at entry.          */
  365. $L50:
  366. mov X, XX
  367. mov Y, YY
  368. sra N, 2, I
  369. ble I, $L55
  370. .align 4
  /* Four elements per iteration, written as four copies of the same    */
  /* sequence: load one element from X/Y and step the load pointers,    */
  /* rotate both values, store through XX/YY and step the store         */
  /* pointers.                                                          */
  371. $L51:
  /* element 1 of 4 */
  372. LD $f12, 0*SIZE(X)
  373. LD $f13, 0*SIZE(Y)
  374. LD $f14, 1*SIZE(X)
  375. SXADDQ INCX, X, X
  376. LD $f15, 1*SIZE(Y)
  377. SXADDQ INCY, Y, Y
  378. MUL C, $f12, $f21
  379. MUL S, $f13, $f22
  380. MUL C, $f13, $f23
  381. MUL S, $f12, $f24
  382. ADD $f21, $f22, $f22
  383. SUB $f23, $f24, $f24
  384. MUL C, $f14, $f25
  385. MUL S, $f15, $f26
  386. MUL C, $f15, $f27
  387. MUL S, $f14, $f28
  388. ADD $f25, $f26, $f26
  389. SUB $f27, $f28, $f28
  390. ST $f22, 0*SIZE(XX)
  391. ST $f24, 0*SIZE(YY)
  392. ST $f26, 1*SIZE(XX)
  393. SXADDQ INCX, XX, XX
  394. ST $f28, 1*SIZE(YY)
  395. SXADDQ INCY, YY, YY
  /* element 2 of 4 */
  396. LD $f12, 0*SIZE(X)
  397. LD $f13, 0*SIZE(Y)
  398. LD $f14, 1*SIZE(X)
  399. SXADDQ INCX, X, X
  400. LD $f15, 1*SIZE(Y)
  401. SXADDQ INCY, Y, Y
  402. MUL C, $f12, $f21
  403. MUL S, $f13, $f22
  404. MUL C, $f13, $f23
  405. MUL S, $f12, $f24
  406. ADD $f21, $f22, $f22
  407. SUB $f23, $f24, $f24
  408. MUL C, $f14, $f25
  409. MUL S, $f15, $f26
  410. MUL C, $f15, $f27
  411. MUL S, $f14, $f28
  412. ADD $f25, $f26, $f26
  413. SUB $f27, $f28, $f28
  414. ST $f22, 0*SIZE(XX)
  415. ST $f24, 0*SIZE(YY)
  416. ST $f26, 1*SIZE(XX)
  417. SXADDQ INCX, XX, XX
  418. ST $f28, 1*SIZE(YY)
  419. SXADDQ INCY, YY, YY
  /* element 3 of 4 */
  420. LD $f12, 0*SIZE(X)
  421. LD $f13, 0*SIZE(Y)
  422. LD $f14, 1*SIZE(X)
  423. SXADDQ INCX, X, X
  424. LD $f15, 1*SIZE(Y)
  425. SXADDQ INCY, Y, Y
  426. MUL C, $f12, $f21
  427. MUL S, $f13, $f22
  428. MUL C, $f13, $f23
  429. MUL S, $f12, $f24
  430. ADD $f21, $f22, $f22
  431. SUB $f23, $f24, $f24
  432. MUL C, $f14, $f25
  433. MUL S, $f15, $f26
  434. MUL C, $f15, $f27
  435. MUL S, $f14, $f28
  436. ADD $f25, $f26, $f26
  437. SUB $f27, $f28, $f28
  438. ST $f22, 0*SIZE(XX)
  439. ST $f24, 0*SIZE(YY)
  440. ST $f26, 1*SIZE(XX)
  441. SXADDQ INCX, XX, XX
  442. ST $f28, 1*SIZE(YY)
  443. SXADDQ INCY, YY, YY
  /* element 4 of 4 */
  444. LD $f12, 0*SIZE(X)
  445. LD $f13, 0*SIZE(Y)
  446. LD $f14, 1*SIZE(X)
  447. SXADDQ INCX, X, X
  448. LD $f15, 1*SIZE(Y)
  449. SXADDQ INCY, Y, Y
  450. MUL C, $f12, $f21
  451. MUL S, $f13, $f22
  452. MUL C, $f13, $f23
  453. MUL S, $f12, $f24
  454. ADD $f21, $f22, $f22
  455. SUB $f23, $f24, $f24
  456. MUL C, $f14, $f25
  457. MUL S, $f15, $f26
  458. MUL C, $f15, $f27
  459. MUL S, $f14, $f28
  460. ADD $f25, $f26, $f26
  461. SUB $f27, $f28, $f28
  462. ST $f22, 0*SIZE(XX)
  463. ST $f24, 0*SIZE(YY)
  464. ST $f26, 1*SIZE(XX)
  465. SXADDQ INCX, XX, XX
  466. ST $f28, 1*SIZE(YY)
  467. SXADDQ INCY, YY, YY
  468. lda I, -1(I)
  469. bgt I, $L51
  470. .align 4
  /* Strided remainder: I = N & 3 elements still to do.                 */
  471. $L55:
  472. and N, 3, I
  473. ble I, $L999
  474. .align 4
  /* One element per iteration.  Loads and stores both go through X/Y   */
  /* here (XX/YY are not used); the pointers are stepped only after     */
  /* the stores.                                                        */
  475. $L56:
  476. LD $f12, 0*SIZE(X)
  477. LD $f13, 0*SIZE(Y)
  478. LD $f14, 1*SIZE(X)
  479. LD $f15, 1*SIZE(Y)
  480. MUL C, $f12, $f21
  481. MUL S, $f13, $f22
  482. MUL C, $f13, $f23
  483. MUL S, $f12, $f24
  484. ADD $f21, $f22, $f22
  485. SUB $f23, $f24, $f24
  486. MUL C, $f14, $f25
  487. MUL S, $f15, $f26
  488. MUL C, $f15, $f27
  489. MUL S, $f14, $f28
  490. ADD $f25, $f26, $f26
  491. SUB $f27, $f28, $f28
  492. ST $f22, 0*SIZE(X)
  493. ST $f24, 0*SIZE(Y)
  494. lda I, -1(I)
  495. ST $f26, 1*SIZE(X)
  496. ST $f28, 1*SIZE(Y)
  497. SXADDQ INCX, X, X
  498. SXADDQ INCY, Y, Y
  499. bgt I, $L56
  500. .align 4
  /* Exit for the strided path: return 0 in $0.                         */
  501. $L999:
  502. clr $0
  503. ret
  504. EPILOGUE