You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

axpy.S 9.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "version.h"
  /* Look-ahead distance, in elements, used by the two zero-register loads at the top of $Loop. */
  41. #define PREFETCHSIZE 40
  /*
   * AXPY-style kernel: every element at the $24 cursor is replaced by
   *     (element at $24) + $f30 * (element at $20).
   *
   * Inputs as read by the code below:
   *   $16      element count n
   *   $f19     scalar multiplier ("da" in the inline comments), copied to $f30
   *   $20      source pointer (x), only ever loaded from
   *   $21      source stride, compared against 1
   *   0($sp)   destination pointer (y), loaded into $24
   *   8($sp)   destination stride, loaded into $23
   *
   * LD, ST, MUL, ADD, SIZE, SXADDQ, SXSUBL, PROLOGUE, PROFCODE and EPILOGUE are
   * not defined in this file; presumably precision-specific macros from
   * common.h (SIZE = element size in bytes) - TODO confirm.
   */
  42. PROLOGUE
  43. PROFCODE
  44. .frame $sp, 16, $26, 0
  /* Fetch the two stack arguments before $sp is moved. */
  45. ldq $24, 0($sp)
  /* Keep the scalar in $f30; $f19 is reused below as a destination-element temporary. */
  46. fmov $f19, $f30
  /* NOTE(review): ldl is a 32-bit sign-extending load, whereas the source stride in $21 is used at full register width - confirm the caller's stride type. */
  47. ldl $23, 8($sp)
  /* Allocate the 16-byte frame declared by .frame; it holds the saved $f2 and $f3. */
  48. lda $sp, -16($sp)
  49. #ifndef PROFILE
  50. .prologue 0
  51. #else
  52. .prologue 1
  53. #endif
  54. nop
  /* $1 = n >> 3: number of full 8-element blocks. */
  55. sra $16, 3, $1
  /* $f2 and $f3 are used as result temporaries; save them here, they are reloaded before each ret. */
  56. stt $f2, 0($sp)
  /* $3 = (source stride == 1). */
  57. cmpeq $21, 1, $3
  58. stt $f3, 8($sp)
  /* $4 = (destination stride == 1). */
  59. cmpeq $23, 1, $4
  /* $2 = n & 7: elements left over after the full blocks. */
  60. and $16, 7, $2
  /* n <= 0: nothing to do. */
  61. ble $16, $End
  /* $3 = both strides are 1. */
  62. and $3, $4, $3
  /* Scalar is zero: return without touching the destination. */
  63. fbeq $f30, $End
  /* At least one stride is not 1: take the strided path. */
  64. beq $3, $Sub
  /* Fewer than 8 elements: only the one-at-a-time loop at $Remain runs. */
  65. ble $1, $Remain
  66. .align 4
  /*
   * Pipeline preload for the unit-stride path: the first 8 source elements go
   * to $f10-$f17 and the matching 8 destination elements to $f18-$f25.
   */
  67. LD $f10, 0*SIZE($20)
  68. LD $f11, 1*SIZE($20)
  69. LD $f12, 2*SIZE($20)
  70. LD $f13, 3*SIZE($20)
  71. LD $f18, 0*SIZE($24)
  72. LD $f19, 1*SIZE($24)
  73. LD $f20, 2*SIZE($24)
  74. LD $f21, 3*SIZE($24)
  75. LD $f14, 4*SIZE($20)
  76. LD $f15, 5*SIZE($20)
  77. LD $f16, 6*SIZE($20)
  78. LD $f17, 7*SIZE($20)
  79. LD $f22, 4*SIZE($24)
  80. LD $f23, 5*SIZE($24)
  81. LD $f24, 6*SIZE($24)
  82. LD $f25, 7*SIZE($24)
  /* One block is now in flight. */
  83. subq $1, 1, $1
  /* Only the source cursor advances here; $24 stays on the block that is still to be stored. */
  84. addq $20, 8*SIZE, $20
  85. unop
  /* No further full block: skip the steady-state loop and just drain. */
  86. ble $1, $LoopEnd
  87. .align 4
  /*
   * Unit-stride steady-state loop, 8 elements per iteration.
   * On entry $f10-$f17 hold 8 source elements and $f18-$f25 the 8 destination
   * elements at offsets 0..7 of $24; $20 already points at the next source block.
   * Each iteration finishes that block while loading the next one into the
   * same registers. $1 counts the blocks still to be loaded.
   */
  88. $Loop:
  /* Loads into $f31 / $31 (the Alpha zero registers) discard their result; presumably prefetch hints for the destination and source streams, given the PREFETCHSIZE name. */
  89. ldt $f31, PREFETCHSIZE * SIZE($24)
  90. ldl $31, PREFETCHSIZE * SIZE($20)
  /* Products for elements 0..3 go to $f26-$f29; each consumed source register is refilled from the next block. */
  91. MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
  92. LD $f10, 0*SIZE($20)
  93. MUL $f30, $f11, $f27
  94. LD $f11, 1*SIZE($20)
  95. MUL $f30, $f12, $f28
  96. LD $f12, 2*SIZE($20)
  97. MUL $f30, $f13, $f29
  98. LD $f13, 3*SIZE($20)
  /*
   * Sums for elements 0..3 go to $f0-$f3, interleaved with the products for
   * elements 4..7 (reusing $f26-$f29). Destination loads use offsets 8..11
   * because $24 has not been advanced yet.
   */
  99. ADD $f18, $f26, $f0
  100. LD $f18, 8*SIZE($24)
  101. MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
  102. LD $f14, 4*SIZE($20)
  103. ADD $f19, $f27, $f1
  104. LD $f19, 9*SIZE($24)
  105. MUL $f30, $f15, $f27
  106. LD $f15, 5*SIZE($20)
  107. ADD $f20, $f28, $f2
  108. LD $f20, 10*SIZE($24)
  109. MUL $f30, $f16, $f28
  110. LD $f16, 6*SIZE($20)
  111. ADD $f21, $f29, $f3
  112. LD $f21, 11*SIZE($24)
  113. MUL $f30, $f17, $f29
  114. LD $f17, 7*SIZE($20)
  /* Store results 0..3; each of $f0-$f3 is reused for the sum of elements 4..7 right after its store. */
  115. ST $f0, 0*SIZE($24)
  116. ADD $f22, $f26, $f0
  117. ST $f1, 1*SIZE($24)
  118. ADD $f23, $f27, $f1
  119. ST $f2, 2*SIZE($24)
  120. ADD $f24, $f28, $f2
  121. ST $f3, 3*SIZE($24)
  122. ADD $f25, $f29, $f3
  /* Refill destination elements 4..7 of the next block (offsets 12..15). */
  123. LD $f22, 12*SIZE($24)
  124. LD $f23, 13*SIZE($24)
  125. LD $f24, 14*SIZE($24)
  126. LD $f25, 15*SIZE($24)
  /* Store results 4..7 of the current block. */
  127. ST $f0, 4*SIZE($24)
  128. ST $f1, 5*SIZE($24)
  129. ST $f2, 6*SIZE($24)
  130. ST $f3, 7*SIZE($24)
  /* One block done: advance both cursors by 8 elements and repeat while blocks remain. */
  131. subq $1, 1, $1
  132. addq $24, 8*SIZE, $24
  133. addq $20, 8*SIZE, $20
  134. bgt $1, $Loop
  135. .align 4
  /*
   * Unit-stride pipeline drain: finish the last block already held in
   * $f10-$f17 (source) and $f18-$f25 (destination) without loading anything
   * further.
   */
  136. $LoopEnd:
  /* Products for elements 0..3. */
  137. MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
  138. MUL $f30, $f11, $f27
  139. MUL $f30, $f12, $f28
  140. MUL $f30, $f13, $f29
  /* Sums for elements 0..3 into $f0-$f3, interleaved with products for elements 4..7. */
  141. ADD $f18, $f26, $f0
  142. MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
  143. ADD $f19, $f27, $f1
  144. MUL $f30, $f15, $f27
  145. ADD $f20, $f28, $f2
  146. MUL $f30, $f16, $f28
  147. ADD $f21, $f29, $f3
  148. MUL $f30, $f17, $f29
  /* Store results 0..3, reusing $f0-$f3 for the sums of elements 4..7. */
  149. ST $f0, 0*SIZE($24)
  150. ADD $f22, $f26, $f0
  151. ST $f1, 1*SIZE($24)
  152. ADD $f23, $f27, $f1
  153. ST $f2, 2*SIZE($24)
  154. ADD $f24, $f28, $f2
  155. ST $f3, 3*SIZE($24)
  156. ADD $f25, $f29, $f3
  /* Store results 4..7. */
  157. ST $f0, 4*SIZE($24)
  158. ST $f1, 5*SIZE($24)
  159. ST $f2, 6*SIZE($24)
  160. ST $f3, 7*SIZE($24)
  /* Step the destination cursor past the finished block; the source cursor was advanced when the block was loaded. */
  161. addq $24, 8*SIZE, $24
  162. .align 4
  /*
   * Unit-stride tail: handle the $2 = n & 7 leftover elements one at a time.
   * $20 is the source cursor, $24 the destination cursor, $f30 the scalar.
   */
  163. $Remain:
  /* No leftover elements: go straight to the exit. */
  164. ble $2, $End
  165. .align 4
  166. $RemainLoop:
  167. LD $f10, 0*SIZE($20)
  168. LD $f11, 0*SIZE($24)
  /* Both cursors are bumped before the store, which is why the ST below uses a -1*SIZE offset. */
  169. addq $20, SIZE, $20
  170. addq $24, SIZE, $24
  /* $f12 = scalar * source element. */
  171. MUL $f30, $f10, $f12
  172. subq $2, 1, $2
  /* $f13 = destination element + product. */
  173. ADD $f11, $f12, $f13
  174. ST $f13, -1*SIZE($24)
  175. bgt $2, $RemainLoop
  176. .align 4
  /*
   * Exit for the unit-stride path and for the early-outs (n <= 0 or zero
   * scalar): reload $f2/$f3 from the two frame slots, release the 16-byte
   * frame and return.
   */
  177. $End:
  178. ldt $f2, 0($sp)
  179. ldt $f3, 8($sp)
  180. lda $sp, 16($sp)
  181. ret
  182. .align 4
  /*
   * Strided path, taken when at least one stride is not 1.
   * $20/$21 = source cursor and stride, $24/$23 = destination cursor and
   * stride, $1 = number of full 8-element blocks, $2 = leftover count.
   * SXADDQ s, p, d is not defined in this file; presumably d = p + s * SIZE
   * (advance a pointer by one stride) - TODO confirm.
   */
  183. $Sub:
  /* NOTE(review): the $22 value produced here is overwritten by the SXADDQ $23, $24, $22 below, and the $SubRemain path never reads $22; this looks like a dead computation. */
  184. SXSUBL $16, SIZE, $22
  /* $4 = blocks remaining after the preloaded one; its earlier stride-compare value has already been consumed. */
  185. subq $1, 1, $4
  /* Fewer than 8 elements: only the one-at-a-time loop at $SubRemain runs. */
  186. ble $1, $SubRemain
  187. .align 4
  /* Preload source elements 0..3 into $f10-$f13, stepping $20 by one stride after each load. */
  188. LD $f10, 0($20)
  189. SXADDQ $21, $20, $20
  190. LD $f11, 0($20)
  191. SXADDQ $21, $20, $20
  192. LD $f12, 0($20)
  193. SXADDQ $21, $20, $20
  194. LD $f13, 0($20)
  195. SXADDQ $21, $20, $20
  /*
   * Preload destination elements 0..3 into $f18-$f21. From here on $22 is the
   * destination read cursor running ahead, while $24 stays behind as the write
   * cursor on the first element still to be stored.
   */
  196. LD $f18, 0($24)
  197. SXADDQ $23, $24, $22
  198. LD $f19, 0($22)
  199. SXADDQ $23, $22, $22
  200. LD $f20, 0($22)
  201. SXADDQ $23, $22, $22
  202. LD $f21, 0($22)
  203. SXADDQ $23, $22, $22
  /* Preload source elements 4..7 into $f14-$f17. */
  204. LD $f14, 0($20)
  205. SXADDQ $21, $20, $20
  206. LD $f15, 0($20)
  207. SXADDQ $21, $20, $20
  208. LD $f16, 0($20)
  209. SXADDQ $21, $20, $20
  210. LD $f17, 0($20)
  211. SXADDQ $21, $20, $20
  /* Preload destination elements 4..7 into $f22-$f25. */
  212. LD $f22, 0($22)
  213. SXADDQ $23, $22, $22
  214. LD $f23, 0($22)
  215. SXADDQ $23, $22, $22
  216. LD $f24, 0($22)
  217. SXADDQ $23, $22, $22
  218. LD $f25, 0($22)
  219. SXADDQ $23, $22, $22
  220. unop
  /* Only one full block: skip the steady-state loop and just drain. */
  221. ble $4, $SubLoopEnd
  222. .align 4
  /*
   * Strided steady-state loop, 8 elements per iteration.
   * On entry $f10-$f17 hold 8 source elements and $f18-$f25 the matching 8
   * destination elements. $20 (source) and $22 (destination read cursor)
   * already point at the next block; $24 (destination write cursor) points at
   * the first element of the block held in registers. $21 / $23 are the
   * source / destination strides and $4 counts the blocks still to be loaded.
   */
  223. $SubLoop:
  /* Products for elements 0..3 go to $f26-$f29; each consumed source register is refilled from the next block. */
  224. MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
  225. LD $f10, 0($20)
  226. unop
  227. SXADDQ $21, $20, $20
  228. MUL $f30, $f11, $f27
  229. LD $f11, 0($20)
  230. unop
  231. SXADDQ $21, $20, $20
  232. MUL $f30, $f12, $f28
  233. LD $f12, 0($20)
  234. unop
  235. SXADDQ $21, $20, $20
  236. MUL $f30, $f13, $f29
  237. LD $f13, 0($20)
  238. unop
  239. SXADDQ $21, $20, $20
  /* Sums for elements 0..3 into $f0-$f3, interleaved with products for elements 4..7 and refills of $f14-$f17. */
  240. ADD $f18, $f26, $f0
  241. MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
  242. LD $f14, 0($20)
  243. SXADDQ $21, $20, $20
  244. ADD $f19, $f27, $f1
  245. MUL $f30, $f15, $f27
  246. LD $f15, 0($20)
  247. SXADDQ $21, $20, $20
  248. ADD $f20, $f28, $f2
  249. MUL $f30, $f16, $f28
  250. LD $f16, 0($20)
  251. SXADDQ $21, $20, $20
  252. ADD $f21, $f29, $f3
  253. MUL $f30, $f17, $f29
  254. LD $f17, 0($20)
  255. SXADDQ $21, $20, $20
  /* Store results 0..3 through the write cursor; each of $f0-$f3 is then reused for the sum of elements 4..7. */
  256. ST $f0, 0($24)
  257. SXADDQ $23, $24, $24
  258. ADD $f22, $f26, $f0
  259. unop
  260. ST $f1, 0($24)
  261. SXADDQ $23, $24, $24
  262. ADD $f23, $f27, $f1
  263. unop
  264. ST $f2, 0($24)
  265. SXADDQ $23, $24, $24
  266. ADD $f24, $f28, $f2
  267. unop
  268. ST $f3, 0($24)
  269. SXADDQ $23, $24, $24
  270. ADD $f25, $f29, $f3
  271. unop
  /* Refill all 8 destination registers from the next block through the read cursor $22. */
  272. LD $f18, 0($22)
  273. SXADDQ $23, $22, $22
  274. LD $f19, 0($22)
  275. SXADDQ $23, $22, $22
  276. LD $f20, 0($22)
  277. SXADDQ $23, $22, $22
  278. LD $f21, 0($22)
  279. SXADDQ $23, $22, $22
  280. LD $f22, 0($22)
  281. SXADDQ $23, $22, $22
  282. LD $f23, 0($22)
  283. SXADDQ $23, $22, $22
  284. LD $f24, 0($22)
  285. SXADDQ $23, $22, $22
  286. LD $f25, 0($22)
  287. SXADDQ $23, $22, $22
  /* Store results 4..7 of the current block. */
  288. ST $f0, 0($24)
  289. SXADDQ $23, $24, $24
  290. ST $f1, 0($24)
  291. SXADDQ $23, $24, $24
  292. ST $f2, 0($24)
  293. SXADDQ $23, $24, $24
  294. ST $f3, 0($24)
  295. SXADDQ $23, $24, $24
  /* One block done; repeat while blocks remain. */
  296. subq $4, 1, $4
  297. bgt $4, $SubLoop
  298. .align 4
  /*
   * Strided pipeline drain: finish the last block already held in $f10-$f17
   * (source) and $f18-$f25 (destination) without loading anything further.
   * $24 is the destination write cursor and $23 its stride.
   */
  299. $SubLoopEnd:
  /* Products for elements 0..3. */
  300. MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
  301. MUL $f30, $f11, $f27
  302. MUL $f30, $f12, $f28
  303. MUL $f30, $f13, $f29
  /* Sums for elements 0..3 into $f0-$f3, interleaved with products for elements 4..7. */
  304. ADD $f18, $f26, $f0
  305. MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
  306. ADD $f19, $f27, $f1
  307. MUL $f30, $f15, $f27
  308. ADD $f20, $f28, $f2
  309. MUL $f30, $f16, $f28
  310. ADD $f21, $f29, $f3
  311. MUL $f30, $f17, $f29
  /* Store results 0..3, stepping the write cursor by one stride after each. */
  312. ST $f0, 0($24)
  313. SXADDQ $23, $24, $24
  314. ST $f1, 0($24)
  315. SXADDQ $23, $24, $24
  316. ST $f2, 0($24)
  317. SXADDQ $23, $24, $24
  318. ST $f3, 0($24)
  319. SXADDQ $23, $24, $24
  /* Sums for elements 4..7, reusing $f0-$f3 now that they have been stored. */
  320. ADD $f22, $f26, $f0
  321. ADD $f23, $f27, $f1
  322. ADD $f24, $f28, $f2
  323. ADD $f25, $f29, $f3
  /* Store results 4..7; $24 ends up one stride past the last element of the block. */
  324. ST $f0, 0($24)
  325. SXADDQ $23, $24, $24
  326. ST $f1, 0($24)
  327. SXADDQ $23, $24, $24
  328. ST $f2, 0($24)
  329. SXADDQ $23, $24, $24
  330. ST $f3, 0($24)
  331. SXADDQ $23, $24, $24
  332. .align 4
  /*
   * Strided tail: handle the $2 = n & 7 leftover elements one at a time.
   * $20/$21 = source cursor and stride, $24/$23 = destination cursor and
   * stride, $f30 = scalar.
   */
  333. $SubRemain:
  /* No leftover elements: go straight to the exit. */
  334. ble $2, $SubEnd
  335. .align 4
  336. $SubRemainLoop:
  337. LD $f10, 0($20)
  338. LD $f11, 0($24)
  /* Advance the source cursor by one stride. */
  339. SXADDQ $21, $20, $20
  /* $f12 = scalar * source element. */
  340. MUL $f30, $f10, $f12
  341. subq $2, 1, $2
  /* $f13 = destination element + product. */
  342. ADD $f11, $f12, $f13
  /* Store first, then advance the destination cursor by one stride. */
  343. ST $f13, 0($24)
  344. SXADDQ $23, $24, $24
  345. bgt $2, $SubRemainLoop
  346. .align 4
  /*
   * Exit for the strided path: reload $f2/$f3 from the two frame slots,
   * release the 16-byte frame and return. Same instruction sequence as $End.
   */
  347. $SubEnd:
  348. ldt $f2, 0($sp)
  349. ldt $f3, 8($sp)
  350. lda $sp, 16($sp)
  351. ret
  /* EPILOGUE is a macro defined outside this file; presumably it closes the function opened by PROLOGUE. */
  352. EPILOGUE