You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

rot.S 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
  /*
   * Preprocessor preamble and register aliases for the rot kernel.
   * The rendered line-number gutter that preceded every directive in the
   * listing has been removed; macro names and values are unchanged.
   */
#define ASSEMBLER

#include "common.h"
#include "version.h"

/* Integer registers. */
#define N       $16             /* element count; the kernel returns at once when N <= 0 */
#define X       $17             /* address of the current x element (advanced as the loops run) */
#define INCX    $18             /* x stride; compared with 1 to choose the contiguous path */
#define Y       $19             /* address of the current y element (advanced as the loops run) */
#define INCY    $20             /* y stride; compared with 1 to choose the contiguous path */

#define I       $21             /* loop counter (N >> 3 for the unrolled loops, N & 7 for the tails) */

/* $23/$24 first hold the two stride tests, then the strided-loop store pointers. */
#define XX      $23             /* store pointer into x in the strided loop */
#define YY      $24             /* store pointer into y in the strided loop */

/* Floating-point registers. */
#define C       $f10            /* first rotation coefficient, copied from $f21 on entry */
#define S       $f11            /* second rotation coefficient, loaded from 0($sp) on entry */

/* Distance, in elements (it is multiplied by SIZE), of the look-ahead loads
   into $f31 issued in the contiguous main loop; presumably prefetch hints. */
#define PREFETCH_SIZE 80
  /*
   * Plane-rotation kernel (rot): updates the vectors x and y in place.
   *
   * For each of the N elements the code computes
   *       x_new = C * x + S * y
   *       y_new = C * y - S * x
   * and stores x_new over x and y_new over y.  (This reads MUL/ADD/SUB with
   * the usual Alpha "source, source, destination" operand order; the macros
   * themselves, like LD, ST, SXADDQ, SIZE, PROLOGUE, PROFCODE and EPILOGUE,
   * are expected to come from common.h and are not visible in this file.)
   *
   * Inputs as consumed below:
   *   N, X, INCX, Y, INCY   integer registers $16..$20
   *   C                     arrives in $f21, copied to $f10
   *   S                     loaded from 0($sp)
   * On exit $0 is cleared (the integer result register under the usual
   * Alpha calling convention - confirm against the caller).
   *
   * Layout:
   *   INCX == 1 and INCY == 1:
   *       prologue / $L12 / $L13   software-pipelined loop, 8 elements per pass
   *       $L15 / $L16              tail, one element per pass (N & 7 elements)
   *   any other stride:
   *       $L50 / $L51              8 elements per pass, loads through X/Y,
   *                                stores through the trailing pointers XX/YY
   *       $L55 / $L56              tail, one element per pass
   *
   * Register use in the pipelined code:
   *   $f12/$f13, $f14/$f15, $f16/$f17, $f18/$f19   (x, y) input pairs
   *   $f21..$f24, $f25..$f28                       two alternating product sets;
   *                                                the results land in $f22/$f24
   *                                                and $f26/$f28 respectively
   *
   * The instruction sequence, operands and unop padding are exactly those of
   * the original listing; only the rendered line-number gutter was removed,
   * whitespace normalised and comments added.  Do not reorder: loads of the
   * next inputs are deliberately placed right after the last use of the
   * register they overwrite.
   */
        PROLOGUE
        PROFCODE
        .frame  $sp, 0, $26, 0          /* frame size 0: no stack space is allocated */

#ifndef PROFILE
        .prologue 0
#else
        .prologue 1
#endif

        fmov    $f21, C                 /* free $f21: it is used as a product temporary below */
        LD      S, 0($sp)               /* S is taken from the first stack slot */

        cmpeq   INCX, 1, $23            /* $23 = (INCX == 1) */
        cmpeq   INCY, 1, $24            /* $24 = (INCY == 1) */
        ble     N, $L998                /* nothing to do for N <= 0 */

        and     $23, $24, $23
        beq     $23, $L50               /* not both unit stride: take the strided path */

        /* ---- contiguous path: pipeline prologue ------------------------ */
        sra     N, 3, I                 /* I = number of complete 8-element blocks */
        ble     I, $L15                 /* fewer than 8 elements: tail loop only */

        LD      $f12, 0*SIZE(X)         /* x[0] */
        LD      $f13, 0*SIZE(Y)         /* y[0] */
        LD      $f14, 1*SIZE(X)         /* x[1] */
        LD      $f15, 1*SIZE(Y)         /* y[1] */

        LD      $f16, 2*SIZE(X)         /* x[2] */
        LD      $f17, 2*SIZE(Y)         /* y[2] */
        LD      $f18, 3*SIZE(X)         /* x[3] */
        LD      $f19, 3*SIZE(Y)         /* y[3] */

        MUL     C, $f12, $f21           /* C*x[0] */
        unop
        MUL     S, $f13, $f22           /* S*y[0] */
        MUL     C, $f13, $f23           /* C*y[0] */

        LD      $f13, 4*SIZE(Y)         /* y[4] */
        MUL     S, $f12, $f24           /* S*x[0] */
        LD      $f12, 4*SIZE(X)         /* x[4] */
        MUL     C, $f14, $f25           /* C*x[1] */

        lda     I, -1(I)                /* the last block is always finished by $L13 */
        MUL     S, $f15, $f26           /* S*y[1] */
        ADD     $f21, $f22, $f22        /* new x[0] */
        MUL     C, $f15, $f27           /* C*y[1] */

        LD      $f15, 5*SIZE(Y)         /* y[5] */
        MUL     S, $f14, $f28           /* S*x[1] */
        SUB     $f23, $f24, $f24        /* new y[0] */
        ble     I, $L13                 /* exactly one block: skip the steady-state loop */
        .align 4

        /*
         * Steady state.  On entry: new x[0]/y[0] wait in $f22/$f24, the four
         * products of element 1 wait in $f25..$f28, elements 2 and 3 are in
         * $f16..$f19, element 4 in $f12/$f13 and y[5] in $f15.  Each pass
         * stores elements 0..7 of the current block and leaves the same
         * state for the next block.  Indices are relative to X/Y at loop
         * entry until the two lda instructions near the end rebase them.
         */
$L12:
        MUL     C, $f16, $f21           /* C*x[2] */
        lds     $f31, (PREFETCH_SIZE) * SIZE(X) /* look-ahead touch of x; result discarded into $f31 */
        unop
        LD      $f14, 5*SIZE(X)         /* x[5] */

        ST      $f22, 0*SIZE(X)         /* store x[0] */
        MUL     S, $f17, $f22           /* S*y[2] */
        unop
        ADD     $f25, $f26, $f26        /* new x[1] */

        MUL     C, $f17, $f23           /* C*y[2] */
        lds     $f31, (PREFETCH_SIZE) * SIZE(Y) /* look-ahead touch of y */
        unop
        LD      $f17, 6*SIZE(Y)         /* y[6] */

        ST      $f24, 0*SIZE(Y)         /* store y[0] */
        MUL     S, $f16, $f24           /* S*x[2] */
        unop
        SUB     $f27, $f28, $f28        /* new y[1] */

        MUL     C, $f18, $f25           /* C*x[3] */
        LD      $f16, 6*SIZE(X)         /* x[6] */
        unop
        unop

        ST      $f26, 1*SIZE(X)         /* store x[1] */
        MUL     S, $f19, $f26           /* S*y[3] */
        unop
        ADD     $f21, $f22, $f22        /* new x[2] */

        MUL     C, $f19, $f27           /* C*y[3] */
        unop
        unop
        LD      $f19, 7*SIZE(Y)         /* y[7] */

        ST      $f28, 1*SIZE(Y)         /* store y[1] */
        MUL     S, $f18, $f28           /* S*x[3] */
        unop
        SUB     $f23, $f24, $f24        /* new y[2] */

        MUL     C, $f12, $f21           /* C*x[4] */
        LD      $f18, 7*SIZE(X)         /* x[7] */
        unop
        unop

        ST      $f22, 2*SIZE(X)         /* store x[2] */
        unop
        MUL     S, $f13, $f22           /* S*y[4] */
        ADD     $f25, $f26, $f26        /* new x[3] */

        MUL     C, $f13, $f23           /* C*y[4] */
        LD      $f13, 8*SIZE(Y)         /* y[8]: first element of the next block */
        unop
        unop

        ST      $f24, 2*SIZE(Y)         /* store y[2] */
        MUL     S, $f12, $f24           /* S*x[4] */
        unop
        SUB     $f27, $f28, $f28        /* new y[3] */

        MUL     C, $f14, $f25           /* C*x[5] */
        LD      $f12, 8*SIZE(X)         /* x[8] */
        unop
        unop

        ST      $f26, 3*SIZE(X)         /* store x[3] */
        MUL     S, $f15, $f26           /* S*y[5] */
        unop
        ADD     $f21, $f22, $f22        /* new x[4] */

        MUL     C, $f15, $f27           /* C*y[5] */
        LD      $f15, 9*SIZE(Y)         /* y[9] */
        unop
        unop

        ST      $f28, 3*SIZE(Y)         /* store y[3] */
        MUL     S, $f14, $f28           /* S*x[5] */
        unop
        SUB     $f23, $f24, $f24        /* new y[4] */

        MUL     C, $f16, $f21           /* C*x[6] */
        LD      $f14, 9*SIZE(X)         /* x[9] */
        unop
        unop

        ST      $f22, 4*SIZE(X)         /* store x[4] */
        MUL     S, $f17, $f22           /* S*y[6] */
        unop
        ADD     $f25, $f26, $f26        /* new x[5] */

        MUL     C, $f17, $f23           /* C*y[6] */
        LD      $f17, 10*SIZE(Y)        /* y[10] */
        unop
        unop

        ST      $f24, 4*SIZE(Y)         /* store y[4] */
        MUL     S, $f16, $f24           /* S*x[6] */
        unop
        SUB     $f27, $f28, $f28        /* new y[5] */

        MUL     C, $f18, $f25           /* C*x[7] */
        LD      $f16, 10*SIZE(X)        /* x[10] */
        unop
        unop

        ST      $f26, 5*SIZE(X)         /* store x[5] */
        MUL     S, $f19, $f26           /* S*y[7] */
        unop
        ADD     $f21, $f22, $f22        /* new x[6] */

        MUL     C, $f19, $f27           /* C*y[7] */
        LD      $f19, 11*SIZE(Y)        /* y[11] */
        unop
        unop

        ST      $f28, 5*SIZE(Y)         /* store y[5] */
        MUL     S, $f18, $f28           /* S*x[7] */
        lda     I, -1(I)                /* one more block accounted for */
        SUB     $f23, $f24, $f24        /* new y[6] */

        MUL     C, $f12, $f21           /* C*x[8]: the next block starts here */
        LD      $f18, 11*SIZE(X)        /* x[11] */
        unop
        unop

        ST      $f22, 6*SIZE(X)         /* store x[6] */
        MUL     S, $f13, $f22           /* S*y[8] */
        unop
        ADD     $f25, $f26, $f26        /* new x[7] */

        MUL     C, $f13, $f23           /* C*y[8] */
        LD      $f13, 12*SIZE(Y)        /* y[12] */
        lda     X, 8*SIZE(X)            /* X now points at the next block */
        unop

        ST      $f24, 6*SIZE(Y)         /* store y[6] (Y is not advanced yet) */
        MUL     S, $f12, $f24           /* S*x[8] */
        unop
        SUB     $f27, $f28, $f28        /* new y[7] */

        MUL     C, $f14, $f25           /* C*x[9] */
        LD      $f12, 4*SIZE(X)         /* x[12], addressed from the advanced X */
        lda     Y, 8*SIZE(Y)            /* Y now points at the next block */
        unop

        ST      $f26, -1*SIZE(X)        /* store x[7] of the block just finished */
        MUL     S, $f15, $f26           /* S*y[9] */
        unop
        ADD     $f21, $f22, $f22        /* new x[8], i.e. x[0] of the next block */

        MUL     C, $f15, $f27           /* C*y[9] */
        LD      $f15, 5*SIZE(Y)         /* y[13], addressed from the advanced Y */
        unop
        unop

        ST      $f28, -1*SIZE(Y)        /* store y[7] of the block just finished */
        MUL     S, $f14, $f28           /* S*x[9] */
        SUB     $f23, $f24, $f24        /* new y[8], i.e. y[0] of the next block */
        bgt     I, $L12
        .align 4

        /*
         * Pipeline drain: same schedule as $L12 for the final block, but
         * only elements 5..7 of this block are still loaded; nothing beyond
         * the block is read.
         */
$L13:
        MUL     C, $f16, $f21           /* C*x[2] */
        LD      $f14, 5*SIZE(X)         /* x[5] */
        unop
        unop

        ST      $f22, 0*SIZE(X)         /* store x[0] */
        MUL     S, $f17, $f22           /* S*y[2] */
        unop
        ADD     $f25, $f26, $f26        /* new x[1] */

        MUL     C, $f17, $f23           /* C*y[2] */
        unop
        unop
        LD      $f17, 6*SIZE(Y)         /* y[6] */

        ST      $f24, 0*SIZE(Y)         /* store y[0] */
        MUL     S, $f16, $f24           /* S*x[2] */
        LD      $f16, 6*SIZE(X)         /* x[6] */
        SUB     $f27, $f28, $f28        /* new y[1] */

        MUL     C, $f18, $f25           /* C*x[3] */
        unop
        unop
        unop

        ST      $f26, 1*SIZE(X)         /* store x[1] */
        MUL     S, $f19, $f26           /* S*y[3] */
        unop
        ADD     $f21, $f22, $f22        /* new x[2] */

        MUL     C, $f19, $f27           /* C*y[3] */
        unop
        unop
        LD      $f19, 7*SIZE(Y)         /* y[7] */

        ST      $f28, 1*SIZE(Y)         /* store y[1] */
        MUL     S, $f18, $f28           /* S*x[3] */
        LD      $f18, 7*SIZE(X)         /* x[7] */
        SUB     $f23, $f24, $f24        /* new y[2] */

        MUL     C, $f12, $f21           /* C*x[4] */
        unop
        unop
        unop

        ST      $f22, 2*SIZE(X)         /* store x[2] */
        unop
        MUL     S, $f13, $f22           /* S*y[4] */
        ADD     $f25, $f26, $f26        /* new x[3] */

        MUL     C, $f13, $f23           /* C*y[4] */
        unop
        unop
        unop

        ST      $f24, 2*SIZE(Y)         /* store y[2] */
        MUL     S, $f12, $f24           /* S*x[4] */
        unop
        SUB     $f27, $f28, $f28        /* new y[3] */

        MUL     C, $f14, $f25           /* C*x[5] */
        unop
        unop
        unop

        ST      $f26, 3*SIZE(X)         /* store x[3] */
        MUL     S, $f15, $f26           /* S*y[5] */
        unop
        ADD     $f21, $f22, $f22        /* new x[4] */

        MUL     C, $f15, $f27           /* C*y[5] */
        unop
        unop
        unop

        ST      $f28, 3*SIZE(Y)         /* store y[3] */
        MUL     S, $f14, $f28           /* S*x[5] */
        unop
        SUB     $f23, $f24, $f24        /* new y[4] */

        MUL     C, $f16, $f21           /* C*x[6] */
        unop
        unop
        unop

        ST      $f22, 4*SIZE(X)         /* store x[4] */
        MUL     S, $f17, $f22           /* S*y[6] */
        unop
        ADD     $f25, $f26, $f26        /* new x[5] */

        MUL     C, $f17, $f23           /* C*y[6] */
        unop
        unop
        unop

        ST      $f24, 4*SIZE(Y)         /* store y[4] */
        MUL     S, $f16, $f24           /* S*x[6] */
        unop
        SUB     $f27, $f28, $f28        /* new y[5] */

        MUL     C, $f18, $f25           /* C*x[7] */
        unop
        unop
        unop

        ST      $f26, 5*SIZE(X)         /* store x[5] */
        MUL     S, $f19, $f26           /* S*y[7] */
        unop
        ADD     $f21, $f22, $f22        /* new x[6] */

        MUL     C, $f19, $f27           /* C*y[7] */
        unop
        unop
        unop

        ST      $f28, 5*SIZE(Y)         /* store y[5] */
        MUL     S, $f18, $f28           /* S*x[7] */
        unop
        SUB     $f23, $f24, $f24        /* new y[6] */

        ST      $f22, 6*SIZE(X)         /* store x[6] */
        ADD     $f25, $f26, $f26        /* new x[7] */
        ST      $f24, 6*SIZE(Y)         /* store y[6] */
        SUB     $f27, $f28, $f28        /* new y[7] */

        ST      $f26, 7*SIZE(X)         /* store x[7] */
        lda     X, 8*SIZE(X)            /* step past the finished block */
        ST      $f28, 7*SIZE(Y)         /* store y[7] */
        lda     Y, 8*SIZE(Y)
        .align 4

        /* ---- contiguous path: tail, N & 7 elements, one per pass -------- */
$L15:
        and     N, 7, I
        ble     I, $L998
        .align 4

$L16:
        LD      $f12, 0*SIZE(X)         /* x */
        LD      $f13, 0*SIZE(Y)         /* y */

        MUL     C, $f12, $f21           /* C*x */
        MUL     S, $f13, $f22           /* S*y */
        MUL     C, $f13, $f23           /* C*y */
        MUL     S, $f12, $f24           /* S*x */

        ADD     $f21, $f22, $f25        /* new x */
        SUB     $f23, $f24, $f26        /* new y */
        lda     I, -1(I)

        ST      $f25, 0*SIZE(X)
        lda     X, 1 * SIZE(X)
        ST      $f26, 0*SIZE(Y)
        lda     Y, 1 * SIZE(Y)

        bgt     I, $L16
        .align 4

$L998:
        clr     $0
        ret
        .align 4

        /*
         * ---- strided path ---------------------------------------------
         * X/Y run ahead as load pointers, XX/YY follow as store pointers.
         * SXADDQ INCX, X, X is read here as "X += INCX scaled by the element
         * size" (macro from common.h - confirm).  Each pass loads four
         * (x, y) pairs, rotates and stores them, then does the same for four
         * more pairs; the two halves are identical.
         */
$L50:
        mov     X, XX
        mov     Y, YY

        sra     N, 3, I                 /* I = number of complete 8-element blocks */
        ble     I, $L55
        .align 4

$L51:
        /* first half: load four pairs */
        LD      $f12, 0*SIZE(X)
        SXADDQ  INCX, X, X
        LD      $f13, 0*SIZE(Y)
        SXADDQ  INCY, Y, Y

        LD      $f14, 0*SIZE(X)
        SXADDQ  INCX, X, X
        LD      $f15, 0*SIZE(Y)
        SXADDQ  INCY, Y, Y

        LD      $f16, 0*SIZE(X)
        SXADDQ  INCX, X, X
        LD      $f17, 0*SIZE(Y)
        SXADDQ  INCY, Y, Y

        LD      $f18, 0*SIZE(X)
        SXADDQ  INCX, X, X
        LD      $f19, 0*SIZE(Y)
        SXADDQ  INCY, Y, Y

        /* pair 0 */
        MUL     C, $f12, $f21
        MUL     S, $f13, $f22
        MUL     C, $f13, $f23
        MUL     S, $f12, $f24

        ADD     $f21, $f22, $f22        /* new x */
        SUB     $f23, $f24, $f24        /* new y */

        ST      $f22, 0*SIZE(XX)
        SXADDQ  INCX, XX, XX
        ST      $f24, 0*SIZE(YY)
        SXADDQ  INCY, YY, YY

        /* pair 1 */
        MUL     C, $f14, $f25
        MUL     S, $f15, $f26
        MUL     C, $f15, $f27
        MUL     S, $f14, $f28

        ADD     $f25, $f26, $f26
        SUB     $f27, $f28, $f28

        ST      $f26, 0*SIZE(XX)
        SXADDQ  INCX, XX, XX
        ST      $f28, 0*SIZE(YY)
        SXADDQ  INCY, YY, YY

        /* pair 2 */
        MUL     C, $f16, $f21
        MUL     S, $f17, $f22
        MUL     C, $f17, $f23
        MUL     S, $f16, $f24

        ADD     $f21, $f22, $f22
        SUB     $f23, $f24, $f24

        ST      $f22, 0*SIZE(XX)
        SXADDQ  INCX, XX, XX
        ST      $f24, 0*SIZE(YY)
        SXADDQ  INCY, YY, YY

        /* pair 3 */
        MUL     C, $f18, $f25
        MUL     S, $f19, $f26
        MUL     C, $f19, $f27
        MUL     S, $f18, $f28

        ADD     $f25, $f26, $f26
        SUB     $f27, $f28, $f28

        ST      $f26, 0*SIZE(XX)
        SXADDQ  INCX, XX, XX
        ST      $f28, 0*SIZE(YY)
        SXADDQ  INCY, YY, YY

        /* second half: load four more pairs */
        LD      $f12, 0*SIZE(X)
        SXADDQ  INCX, X, X
        LD      $f13, 0*SIZE(Y)
        SXADDQ  INCY, Y, Y

        LD      $f14, 0*SIZE(X)
        SXADDQ  INCX, X, X
        LD      $f15, 0*SIZE(Y)
        SXADDQ  INCY, Y, Y

        LD      $f16, 0*SIZE(X)
        SXADDQ  INCX, X, X
        LD      $f17, 0*SIZE(Y)
        SXADDQ  INCY, Y, Y

        LD      $f18, 0*SIZE(X)
        SXADDQ  INCX, X, X
        LD      $f19, 0*SIZE(Y)
        SXADDQ  INCY, Y, Y

        /* pair 4 */
        MUL     C, $f12, $f21
        MUL     S, $f13, $f22
        MUL     C, $f13, $f23
        MUL     S, $f12, $f24

        ADD     $f21, $f22, $f22
        SUB     $f23, $f24, $f24

        ST      $f22, 0*SIZE(XX)
        SXADDQ  INCX, XX, XX
        ST      $f24, 0*SIZE(YY)
        SXADDQ  INCY, YY, YY

        /* pair 5 */
        MUL     C, $f14, $f25
        MUL     S, $f15, $f26
        MUL     C, $f15, $f27
        MUL     S, $f14, $f28

        ADD     $f25, $f26, $f26
        SUB     $f27, $f28, $f28

        ST      $f26, 0*SIZE(XX)
        SXADDQ  INCX, XX, XX
        ST      $f28, 0*SIZE(YY)
        SXADDQ  INCY, YY, YY

        /* pair 6 */
        MUL     C, $f16, $f21
        MUL     S, $f17, $f22
        MUL     C, $f17, $f23
        MUL     S, $f16, $f24

        ADD     $f21, $f22, $f22
        SUB     $f23, $f24, $f24

        ST      $f22, 0*SIZE(XX)
        SXADDQ  INCX, XX, XX
        ST      $f24, 0*SIZE(YY)
        SXADDQ  INCY, YY, YY

        /* pair 7 */
        MUL     C, $f18, $f25
        MUL     S, $f19, $f26
        MUL     C, $f19, $f27
        MUL     S, $f18, $f28

        ADD     $f25, $f26, $f26
        SUB     $f27, $f28, $f28

        ST      $f26, 0*SIZE(XX)
        SXADDQ  INCX, XX, XX
        ST      $f28, 0*SIZE(YY)
        SXADDQ  INCY, YY, YY

        lda     I, -1(I)
        bgt     I, $L51
        .align 4

        /*
         * ---- strided path: tail, N & 7 elements -------------------------
         * X and Y already point at the first unprocessed element (they were
         * advanced by the loads in $L51), so the tail loads and stores
         * through X/Y directly and XX/YY are no longer needed.
         */
$L55:
        and     N, 7, I
        ble     I, $L999
        .align 4

$L56:
        LD      $f12, 0*SIZE(X)         /* x */
        LD      $f13, 0*SIZE(Y)         /* y */

        MUL     C, $f12, $f21           /* C*x */
        MUL     S, $f13, $f22           /* S*y */
        MUL     C, $f13, $f23           /* C*y */
        MUL     S, $f12, $f24           /* S*x */

        ADD     $f21, $f22, $f25        /* new x */
        SUB     $f23, $f24, $f26        /* new y */
        lda     I, -1(I)

        ST      $f25, 0*SIZE(X)
        SXADDQ  INCX, X, X
        ST      $f26, 0*SIZE(Y)
        SXADDQ  INCY, Y, Y

        bgt     I, $L56
        .align 4

$L999:
        clr     $0
        ret
        EPILOGUE