You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

rot.S 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build setup and register aliases for the plane-rotation kernel in this file. */
  /* ASSEMBLER is defined before common.h is included; that header is not shown   */
  /* here and presumably supplies PROLOGUE, EPILOGUE, PROFCODE, SIZE, LD, ST,     */
  /* MUL, ADD, SUB and SXADDQ, which the code below uses (TODO confirm).          */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* N: element count. Tested with ble at entry, then split into N >> 3 groups    */
  /* of eight and N & 7 leftover elements.                                        */
  40. #define N $16
  /* X: pointer to the first vector; elements are loaded from it and overwritten. */
  41. #define X $17
  /* INCX: stride of X; compared against 1 to choose the unit-stride path.        */
  42. #define INCX $18
  /* Y: pointer to the second vector; elements are loaded from it and overwritten.*/
  43. #define Y $19
  /* INCY: stride of Y; compared against 1 to choose the unit-stride path.        */
  44. #define INCY $20
  /* I: loop counter for both the grouped loops and the leftover loops.           */
  45. #define I $21
  /* XX, YY: store pointers used by the strided main loop ($L51). The same two    */
  /* registers ($23, $24) are also written directly by the cmpeq stride tests at  */
  /* entry, before XX/YY are initialised.                                         */
  46. #define XX $23
  47. #define YY $24
  /* C: first rotation coefficient, copied out of $f21 at entry.                  */
  48. #define C $f10
  /* S: second rotation coefficient, loaded from 0($sp) at entry.                 */
  49. #define S $f11
  /* PREFETCH_SIZE: distance, in elements, ahead of X and Y that the lds $f31     */
  /* instructions in the unit-stride main loop touch.                             */
  50. #define PREFETCH_SIZE 80
  /* Kernel entry. For each of N element pairs the code below computes, in place, */
  /*     x_new = C*x + S*y                                                        */
  /*     y_new = C*y - S*x                                                        */
  /* (assuming MUL/ADD/SUB are the multiply/add/subtract macros with the usual    */
  /* Alpha destination-last operand order; they are defined outside this file).   */
  /* The exported symbol name comes from the PROLOGUE macro, not shown here.      */
  51. PROLOGUE
  52. PROFCODE
  /* No stack frame is allocated (frame size 0); return address is in $26.        */
  53. .frame $sp, 0, $26, 0
  54. #ifndef PROFILE
  55. .prologue 0
  56. #else
  57. .prologue 1
  58. #endif
  /* Copy C out of $f21 because $f21 is reused as a product temporary below.      */
  59. fmov $f21, C
  /* S is read from the top of the caller's stack area.                           */
  60. LD S, 0($sp)
  /* $23 = (INCX == 1), $24 = (INCY == 1). These are the XX/YY registers, used    */
  /* here only as scratch flags.                                                  */
  61. cmpeq INCX, 1, $23
  62. cmpeq INCY, 1, $24
  /* Nothing to do when N <= 0.                                                   */
  63. ble N, $L998
  /* Take the strided path ($L50) unless both strides are exactly 1.              */
  64. and $23, $24, $23
  65. beq $23, $L50
  /* I = number of full groups of eight; with none, go to the leftover loop.      */
  66. sra N, 3, I
  67. ble I, $L15
  /* Pipeline priming: load elements 0..3 of X and Y.                             */
  68. LD $f12, 0*SIZE(X)
  69. LD $f13, 0*SIZE(Y)
  70. LD $f14, 1*SIZE(X)
  71. LD $f15, 1*SIZE(Y)
  72. LD $f16, 2*SIZE(X)
  73. LD $f17, 2*SIZE(Y)
  74. LD $f18, 3*SIZE(X)
  75. LD $f19, 3*SIZE(Y)
  /* Four products of element 0: $f21 = C*x0, $f22 = S*y0, $f23 = C*y0,           */
  /* $f24 = S*x0. Each input register is reloaded with element 4 as soon as its   */
  /* last use for element 0 has been issued.                                      */
  76. MUL C, $f12, $f21
  77. unop
  78. MUL S, $f13, $f22
  79. MUL C, $f13, $f23
  80. LD $f13, 4*SIZE(Y)
  81. MUL S, $f12, $f24
  82. LD $f12, 4*SIZE(X)
  /* Four products of element 1 go to $f25..$f28, interleaved with the final      */
  /* add/subtract for element 0 ($f22 = x_new[0], $f24 = y_new[0]).               */
  83. MUL C, $f14, $f25
  /* One group is now in flight, so count it off.                                 */
  84. lda I, -1(I)
  85. MUL S, $f15, $f26
  86. ADD $f21, $f22, $f22
  87. MUL C, $f15, $f27
  88. LD $f15, 5*SIZE(Y)
  89. MUL S, $f14, $f28
  90. SUB $f23, $f24, $f24
  /* If that was the only full group, skip the steady-state loop and drain.       */
  91. ble I, $L13
  92. .align 4
  /* Unit-stride steady-state loop: one iteration finishes and stores eight       */
  /* elements of X and Y while starting the first two elements of the next group. */
  /* State on entry to each iteration (offsets relative to the current X/Y):      */
  /*   $f22, $f24     finished x_new[0], y_new[0], not yet stored                 */
  /*   $f25..$f28     products C*x1, S*y1, C*y1, S*x1                             */
  /*   $f16..$f19     x2, y2, x3, y3                                              */
  /*   $f12, $f13     x4, y4                                                      */
  /*   $f15           y5 (x5 is loaded into $f14 at the top of the loop)          */
  /* The repeating pattern per element is: MUL C*x, MUL S*y, ADD -> store to X;   */
  /* MUL C*y, MUL S*x, SUB -> store to Y. The unop lines are no-ops, presumably   */
  /* padding for instruction-issue grouping.                                      */
  93. $L12:
  94. MUL C, $f16, $f21
  /* Load into $f31 (the Alpha FP register that always reads as zero): the value  */
  /* is discarded, so this only touches memory PREFETCH_SIZE elements ahead of X. */
  95. lds $f31, (PREFETCH_SIZE) * SIZE(X)
  96. unop
  97. LD $f14, 5*SIZE(X)
  98. ST $f22, 0*SIZE(X)
  99. MUL S, $f17, $f22
  100. unop
  101. ADD $f25, $f26, $f26
  102. MUL C, $f17, $f23
  /* Same discarded-load touch, PREFETCH_SIZE elements ahead of Y.                */
  103. lds $f31, (PREFETCH_SIZE) * SIZE(Y)
  104. unop
  105. LD $f17, 6*SIZE(Y)
  106. ST $f24, 0*SIZE(Y)
  107. MUL S, $f16, $f24
  108. unop
  109. SUB $f27, $f28, $f28
  110. MUL C, $f18, $f25
  111. LD $f16, 6*SIZE(X)
  112. unop
  113. unop
  114. ST $f26, 1*SIZE(X)
  115. MUL S, $f19, $f26
  116. unop
  117. ADD $f21, $f22, $f22
  118. MUL C, $f19, $f27
  119. unop
  120. unop
  121. LD $f19, 7*SIZE(Y)
  122. ST $f28, 1*SIZE(Y)
  123. MUL S, $f18, $f28
  124. unop
  125. SUB $f23, $f24, $f24
  126. MUL C, $f12, $f21
  127. LD $f18, 7*SIZE(X)
  128. unop
  129. unop
  130. ST $f22, 2*SIZE(X)
  131. unop
  132. MUL S, $f13, $f22
  133. ADD $f25, $f26, $f26
  134. MUL C, $f13, $f23
  /* From here on the loads fetch elements 8 and up, i.e. inputs of the next      */
  /* group of eight.                                                              */
  135. LD $f13, 8*SIZE(Y)
  136. unop
  137. unop
  138. ST $f24, 2*SIZE(Y)
  139. MUL S, $f12, $f24
  140. unop
  141. SUB $f27, $f28, $f28
  142. MUL C, $f14, $f25
  143. LD $f12, 8*SIZE(X)
  144. unop
  145. unop
  146. ST $f26, 3*SIZE(X)
  147. MUL S, $f15, $f26
  148. unop
  149. ADD $f21, $f22, $f22
  150. MUL C, $f15, $f27
  151. LD $f15, 9*SIZE(Y)
  152. unop
  153. unop
  154. ST $f28, 3*SIZE(Y)
  155. MUL S, $f14, $f28
  156. unop
  157. SUB $f23, $f24, $f24
  158. MUL C, $f16, $f21
  159. LD $f14, 9*SIZE(X)
  160. unop
  161. unop
  162. ST $f22, 4*SIZE(X)
  163. MUL S, $f17, $f22
  164. unop
  165. ADD $f25, $f26, $f26
  166. MUL C, $f17, $f23
  167. LD $f17, 10*SIZE(Y)
  168. unop
  169. unop
  170. ST $f24, 4*SIZE(Y)
  171. MUL S, $f16, $f24
  172. unop
  173. SUB $f27, $f28, $f28
  174. MUL C, $f18, $f25
  175. LD $f16, 10*SIZE(X)
  176. unop
  177. unop
  178. ST $f26, 5*SIZE(X)
  179. MUL S, $f19, $f26
  180. unop
  181. ADD $f21, $f22, $f22
  182. MUL C, $f19, $f27
  183. LD $f19, 11*SIZE(Y)
  184. unop
  185. unop
  186. ST $f28, 5*SIZE(Y)
  187. MUL S, $f18, $f28
  /* Count this group off early; the branch at the bottom tests the result.       */
  188. lda I, -1(I)
  189. SUB $f23, $f24, $f24
  190. MUL C, $f12, $f21
  191. LD $f18, 11*SIZE(X)
  192. unop
  193. unop
  194. ST $f22, 6*SIZE(X)
  195. MUL S, $f13, $f22
  196. unop
  197. ADD $f25, $f26, $f26
  198. MUL C, $f13, $f23
  199. LD $f13, 12*SIZE(Y)
  /* X moves on to the next group here. Accesses to X below use offsets relative  */
  /* to the new pointer: 4*SIZE(X) is old element 12, -1*SIZE(X) is old element 7.*/
  200. lda X, 8*SIZE(X)
  201. unop
  202. ST $f24, 6*SIZE(Y)
  203. MUL S, $f12, $f24
  204. unop
  205. SUB $f27, $f28, $f28
  206. MUL C, $f14, $f25
  207. LD $f12, 4*SIZE(X)
  /* Y moves on likewise: 5*SIZE(Y) below is old element 13 and -1*SIZE(Y) is     */
  /* old element 7.                                                               */
  208. lda Y, 8*SIZE(Y)
  209. unop
  210. ST $f26, -1*SIZE(X)
  211. MUL S, $f15, $f26
  212. unop
  213. ADD $f21, $f22, $f22
  214. MUL C, $f15, $f27
  215. LD $f15, 5*SIZE(Y)
  216. unop
  217. unop
  218. ST $f28, -1*SIZE(Y)
  219. MUL S, $f14, $f28
  220. SUB $f23, $f24, $f24
  /* Repeat while another full group follows; otherwise fall into the drain code. */
  221. bgt I, $L12
  222. .align 4
  /* Pipeline drain for the last full group of eight (unit stride). The register  */
  /* state on entry is the same as at the top of $L12. The instruction sequence   */
  /* mirrors one $L12 iteration, except that only the inputs still missing for    */
  /* this group are loaded (x5, y6, x6, y7, x7): nothing beyond element 7 is      */
  /* read and there are no lds touches.                                           */
  223. $L13:
  224. MUL C, $f16, $f21
  225. LD $f14, 5*SIZE(X)
  226. unop
  227. unop
  228. ST $f22, 0*SIZE(X)
  229. MUL S, $f17, $f22
  230. unop
  231. ADD $f25, $f26, $f26
  232. MUL C, $f17, $f23
  233. unop
  234. unop
  235. LD $f17, 6*SIZE(Y)
  236. ST $f24, 0*SIZE(Y)
  237. MUL S, $f16, $f24
  238. LD $f16, 6*SIZE(X)
  239. SUB $f27, $f28, $f28
  240. MUL C, $f18, $f25
  241. unop
  242. unop
  243. unop
  244. ST $f26, 1*SIZE(X)
  245. MUL S, $f19, $f26
  246. unop
  247. ADD $f21, $f22, $f22
  248. MUL C, $f19, $f27
  249. unop
  250. unop
  251. LD $f19, 7*SIZE(Y)
  252. ST $f28, 1*SIZE(Y)
  253. MUL S, $f18, $f28
  254. LD $f18, 7*SIZE(X)
  255. SUB $f23, $f24, $f24
  /* All eight inputs of the group are now in registers; the rest is arithmetic   */
  /* and stores only.                                                             */
  256. MUL C, $f12, $f21
  257. unop
  258. unop
  259. unop
  260. ST $f22, 2*SIZE(X)
  261. unop
  262. MUL S, $f13, $f22
  263. ADD $f25, $f26, $f26
  264. MUL C, $f13, $f23
  265. unop
  266. unop
  267. unop
  268. ST $f24, 2*SIZE(Y)
  269. MUL S, $f12, $f24
  270. unop
  271. SUB $f27, $f28, $f28
  272. MUL C, $f14, $f25
  273. unop
  274. unop
  275. unop
  276. ST $f26, 3*SIZE(X)
  277. MUL S, $f15, $f26
  278. unop
  279. ADD $f21, $f22, $f22
  280. MUL C, $f15, $f27
  281. unop
  282. unop
  283. unop
  284. ST $f28, 3*SIZE(Y)
  285. MUL S, $f14, $f28
  286. unop
  287. SUB $f23, $f24, $f24
  288. MUL C, $f16, $f21
  289. unop
  290. unop
  291. unop
  292. ST $f22, 4*SIZE(X)
  293. MUL S, $f17, $f22
  294. unop
  295. ADD $f25, $f26, $f26
  296. MUL C, $f17, $f23
  297. unop
  298. unop
  299. unop
  300. ST $f24, 4*SIZE(Y)
  301. MUL S, $f16, $f24
  302. unop
  303. SUB $f27, $f28, $f28
  304. MUL C, $f18, $f25
  305. unop
  306. unop
  307. unop
  308. ST $f26, 5*SIZE(X)
  309. MUL S, $f19, $f26
  310. unop
  311. ADD $f21, $f22, $f22
  312. MUL C, $f19, $f27
  313. unop
  314. unop
  315. unop
  316. ST $f28, 5*SIZE(Y)
  317. MUL S, $f18, $f28
  318. unop
  319. SUB $f23, $f24, $f24
  /* Final combines and stores for elements 6 and 7. X and Y are advanced past    */
  /* the group after their element-7 values have been stored.                     */
  320. ST $f22, 6*SIZE(X)
  321. ADD $f25, $f26, $f26
  322. ST $f24, 6*SIZE(Y)
  323. SUB $f27, $f28, $f28
  324. ST $f26, 7*SIZE(X)
  325. lda X, 8*SIZE(X)
  326. ST $f28, 7*SIZE(Y)
  327. lda Y, 8*SIZE(Y)
  328. .align 4
  /* Unit-stride leftovers: I = N & 7 elements remain after the groups of eight.  */
  329. $L15:
  330. and N, 7, I
  331. ble I, $L998
  332. .align 4
  /* One element pair per iteration, no software pipelining.                      */
  333. $L16:
  334. LD $f12, 0*SIZE(X)
  335. LD $f13, 0*SIZE(Y)
  /* $f21 = C*x, $f22 = S*y, $f23 = C*y, $f24 = S*x.                              */
  336. MUL C, $f12, $f21
  337. MUL S, $f13, $f22
  338. MUL C, $f13, $f23
  339. MUL S, $f12, $f24
  /* $f25 = C*x + S*y (new x), $f26 = C*y - S*x (new y).                          */
  340. ADD $f21, $f22, $f25
  341. SUB $f23, $f24, $f26
  342. lda I, -1(I)
  /* Store each result, then step its pointer by one element.                     */
  343. ST $f25, 0*SIZE(X)
  344. lda X, 1 * SIZE(X)
  345. ST $f26, 0*SIZE(Y)
  346. lda Y, 1 * SIZE(Y)
  347. bgt I, $L16
  348. .align 4
  /* Exit for the unit-stride path and for N <= 0: clear $0 and return.           */
  349. $L998:
  350. clr $0
  351. ret
  352. .align 4
  /* Strided path, taken when INCX or INCY is not 1. X/Y act as load pointers and */
  /* XX/YY as store pointers that trail them; each is stepped with SXADDQ         */
  /* (a macro defined outside this file, presumably ptr += INC * SIZE - TODO      */
  /* confirm) after every access.                                                 */
  353. $L50:
  354. mov X, XX
  355. mov Y, YY
  /* I = number of full groups of eight; with none, go to the leftover loop.      */
  356. sra N, 3, I
  357. ble I, $L55
  358. .align 4
  /* Each iteration handles eight element pairs as two batches of four: load four */
  /* pairs through X/Y, then compute and store them one pair at a time via XX/YY. */
  359. $L51:
  /* Batch 1 loads: x in $f12/$f14/$f16/$f18, y in $f13/$f15/$f17/$f19.           */
  360. LD $f12, 0*SIZE(X)
  361. SXADDQ INCX, X, X
  362. LD $f13, 0*SIZE(Y)
  363. SXADDQ INCY, Y, Y
  364. LD $f14, 0*SIZE(X)
  365. SXADDQ INCX, X, X
  366. LD $f15, 0*SIZE(Y)
  367. SXADDQ INCY, Y, Y
  368. LD $f16, 0*SIZE(X)
  369. SXADDQ INCX, X, X
  370. LD $f17, 0*SIZE(Y)
  371. SXADDQ INCY, Y, Y
  372. LD $f18, 0*SIZE(X)
  373. SXADDQ INCX, X, X
  374. LD $f19, 0*SIZE(Y)
  375. SXADDQ INCY, Y, Y
  /* Pair 1 of batch 1: new x = C*x + S*y, new y = C*y - S*x.                     */
  376. MUL C, $f12, $f21
  377. MUL S, $f13, $f22
  378. MUL C, $f13, $f23
  379. MUL S, $f12, $f24
  380. ADD $f21, $f22, $f22
  381. SUB $f23, $f24, $f24
  382. ST $f22, 0*SIZE(XX)
  383. SXADDQ INCX, XX, XX
  384. ST $f24, 0*SIZE(YY)
  385. SXADDQ INCY, YY, YY
  /* Pair 2 of batch 1.                                                           */
  386. MUL C, $f14, $f25
  387. MUL S, $f15, $f26
  388. MUL C, $f15, $f27
  389. MUL S, $f14, $f28
  390. ADD $f25, $f26, $f26
  391. SUB $f27, $f28, $f28
  392. ST $f26, 0*SIZE(XX)
  393. SXADDQ INCX, XX, XX
  394. ST $f28, 0*SIZE(YY)
  395. SXADDQ INCY, YY, YY
  /* Pair 3 of batch 1.                                                           */
  396. MUL C, $f16, $f21
  397. MUL S, $f17, $f22
  398. MUL C, $f17, $f23
  399. MUL S, $f16, $f24
  400. ADD $f21, $f22, $f22
  401. SUB $f23, $f24, $f24
  402. ST $f22, 0*SIZE(XX)
  403. SXADDQ INCX, XX, XX
  404. ST $f24, 0*SIZE(YY)
  405. SXADDQ INCY, YY, YY
  /* Pair 4 of batch 1.                                                           */
  406. MUL C, $f18, $f25
  407. MUL S, $f19, $f26
  408. MUL C, $f19, $f27
  409. MUL S, $f18, $f28
  410. ADD $f25, $f26, $f26
  411. SUB $f27, $f28, $f28
  412. ST $f26, 0*SIZE(XX)
  413. SXADDQ INCX, XX, XX
  414. ST $f28, 0*SIZE(YY)
  415. SXADDQ INCY, YY, YY
  /* Batch 2 loads: the next four pairs, into the same registers as batch 1.      */
  416. LD $f12, 0*SIZE(X)
  417. SXADDQ INCX, X, X
  418. LD $f13, 0*SIZE(Y)
  419. SXADDQ INCY, Y, Y
  420. LD $f14, 0*SIZE(X)
  421. SXADDQ INCX, X, X
  422. LD $f15, 0*SIZE(Y)
  423. SXADDQ INCY, Y, Y
  424. LD $f16, 0*SIZE(X)
  425. SXADDQ INCX, X, X
  426. LD $f17, 0*SIZE(Y)
  427. SXADDQ INCY, Y, Y
  428. LD $f18, 0*SIZE(X)
  429. SXADDQ INCX, X, X
  430. LD $f19, 0*SIZE(Y)
  431. SXADDQ INCY, Y, Y
  /* Pair 1 of batch 2.                                                           */
  432. MUL C, $f12, $f21
  433. MUL S, $f13, $f22
  434. MUL C, $f13, $f23
  435. MUL S, $f12, $f24
  436. ADD $f21, $f22, $f22
  437. SUB $f23, $f24, $f24
  438. ST $f22, 0*SIZE(XX)
  439. SXADDQ INCX, XX, XX
  440. ST $f24, 0*SIZE(YY)
  441. SXADDQ INCY, YY, YY
  /* Pair 2 of batch 2.                                                           */
  442. MUL C, $f14, $f25
  443. MUL S, $f15, $f26
  444. MUL C, $f15, $f27
  445. MUL S, $f14, $f28
  446. ADD $f25, $f26, $f26
  447. SUB $f27, $f28, $f28
  448. ST $f26, 0*SIZE(XX)
  449. SXADDQ INCX, XX, XX
  450. ST $f28, 0*SIZE(YY)
  451. SXADDQ INCY, YY, YY
  /* Pair 3 of batch 2.                                                           */
  452. MUL C, $f16, $f21
  453. MUL S, $f17, $f22
  454. MUL C, $f17, $f23
  455. MUL S, $f16, $f24
  456. ADD $f21, $f22, $f22
  457. SUB $f23, $f24, $f24
  458. ST $f22, 0*SIZE(XX)
  459. SXADDQ INCX, XX, XX
  460. ST $f24, 0*SIZE(YY)
  461. SXADDQ INCY, YY, YY
  /* Pair 4 of batch 2.                                                           */
  462. MUL C, $f18, $f25
  463. MUL S, $f19, $f26
  464. MUL C, $f19, $f27
  465. MUL S, $f18, $f28
  466. ADD $f25, $f26, $f26
  467. SUB $f27, $f28, $f28
  468. ST $f26, 0*SIZE(XX)
  469. SXADDQ INCX, XX, XX
  470. ST $f28, 0*SIZE(YY)
  471. SXADDQ INCY, YY, YY
  /* Next group of eight, if any.                                                 */
  472. lda I, -1(I)
  473. bgt I, $L51
  474. .align 4
  /* Strided leftovers: I = N & 7 elements remain after the groups of eight.      */
  475. $L55:
  476. and N, 7, I
  477. ble I, $L999
  478. .align 4
  /* One element pair per iteration. This loop loads and stores through X/Y       */
  /* directly; XX/YY are not used here.                                           */
  479. $L56:
  480. LD $f12, 0*SIZE(X)
  481. LD $f13, 0*SIZE(Y)
  /* $f21 = C*x, $f22 = S*y, $f23 = C*y, $f24 = S*x.                              */
  482. MUL C, $f12, $f21
  483. MUL S, $f13, $f22
  484. MUL C, $f13, $f23
  485. MUL S, $f12, $f24
  /* $f25 = C*x + S*y (new x), $f26 = C*y - S*x (new y).                          */
  486. ADD $f21, $f22, $f25
  487. SUB $f23, $f24, $f26
  488. lda I, -1(I)
  /* Store each result, then step its pointer with SXADDQ (macro defined outside  */
  /* this file; presumably ptr += INC * SIZE - TODO confirm).                     */
  489. ST $f25, 0*SIZE(X)
  490. SXADDQ INCX, X, X
  491. ST $f26, 0*SIZE(Y)
  492. SXADDQ INCY, Y, Y
  493. bgt I, $L56
  494. .align 4
  /* Exit for the strided path: clear $0 and return.                              */
  495. $L999:
  496. clr $0
  497. ret
  498. EPILOGUE