You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

axpy.S 9.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define PREFETCHSIZE 40
  /*
   * AXPY kernel: for each of n elements, y[i] = y[i] + alpha * x[i].
   *
   * Register use as visible in this routine:
   *   $16   element count n
   *   $f19  scalar alpha on entry, copied to $f30
   *   $20   x pointer (only read)
   *   $21   x stride; compared against 1 to select the contiguous path
   *   $24   y pointer, loaded from 0($sp) on entry (read and written)
   *   $23   y stride, loaded from 8($sp) on entry
   *   $1    n >> 3, number of 8-element groups
   *   $2    n & 7, leftover elements
   *   $f2/$f3 are spilled to a 16-byte frame and reloaded before each ret.
   *
   * LD, ST, MUL, ADD, SIZE, SXADDQ and SXSUBL are not defined in this
   * file; presumably they come from common.h and select the
   * precision-specific instructions and element size -- confirm there.
   */
  41. PROLOGUE
  42. PROFCODE
  43. .frame $sp, 16, $26, 0
  44. ldq $24, 0($sp) /* $24 = y pointer, read from the incoming stack area */
  45. fmov $f19, $f30 /* keep alpha in $f30; $f19 is reused below for y data */
  46. ldl $23, 8($sp) /* $23 = y stride (32-bit load, sign-extended) */
  47. lda $sp, -16($sp) /* reserve 16 bytes to save $f2 and $f3 */
  48. #ifndef PROFILE
  49. .prologue 0
  50. #else
  51. .prologue 1
  52. #endif
  53. nop
  54. sra $16, 3, $1 /* $1 = n >> 3 (count of 8-element groups) */
  55. stt $f2, 0($sp) /* save $f2; it is used as a temporary below */
  56. cmpeq $21, 1, $3 /* $3 = (x stride == 1) */
  57. stt $f3, 8($sp) /* save $f3; it is used as a temporary below */
  58. cmpeq $23, 1, $4 /* $4 = (y stride == 1) */
  59. and $16, 7, $2 /* $2 = n & 7 (leftover elements) */
  60. ble $16, $End /* n <= 0: nothing to do */
  61. and $3, $4, $3 /* $3 = both strides are 1 */
  62. fbeq $f30, $End /* alpha == 0: y is left untouched */
  63. beq $3, $Sub /* some stride != 1: take the strided path */
  64. ble $1, $Remain /* fewer than 8 elements: only the scalar tail */
  65. .align 4
  /* Contiguous path: preload the first group, x[0..7] into $f10-$f17 */
  /* and y[0..7] into $f18-$f25. */
  66. LD $f10, 0*SIZE($20)
  67. LD $f11, 1*SIZE($20)
  68. LD $f12, 2*SIZE($20)
  69. LD $f13, 3*SIZE($20)
  70. LD $f18, 0*SIZE($24)
  71. LD $f19, 1*SIZE($24)
  72. LD $f20, 2*SIZE($24)
  73. LD $f21, 3*SIZE($24)
  74. LD $f14, 4*SIZE($20)
  75. LD $f15, 5*SIZE($20)
  76. LD $f16, 6*SIZE($20)
  77. LD $f17, 7*SIZE($20)
  78. LD $f22, 4*SIZE($24)
  79. LD $f23, 5*SIZE($24)
  80. LD $f24, 6*SIZE($24)
  81. LD $f25, 7*SIZE($24)
  82. subq $1, 1, $1 /* one group is already in registers */
  83. addq $20, 8*SIZE, $20 /* x pointer now addresses the next group; $24 still addresses the current y group */
  84. unop
  85. ble $1, $LoopEnd /* only one group: skip straight to the drain code */
  86. .align 4
  /* Main loop: computes and stores the group held in registers while */
  /* loading the next group (x at 0..7 from $20, y at 8..15 from $24). */
  87. $Loop:
  /* The next two loads target $f31 / $31, so their results are */
  /* discarded; presumably prefetch hints PREFETCHSIZE elements ahead. */
  88. ldt $f31, PREFETCHSIZE * SIZE($24)
  89. ldl $31, PREFETCHSIZE * SIZE($20)
  90. MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
  91. LD $f10, 0*SIZE($20)
  92. MUL $f30, $f11, $f27
  93. LD $f11, 1*SIZE($20)
  94. MUL $f30, $f12, $f28
  95. LD $f12, 2*SIZE($20)
  96. MUL $f30, $f13, $f29
  97. LD $f13, 3*SIZE($20)
  98. ADD $f18, $f26, $f0 /* $f0 = y[0] + alpha*x[0]; $f18 is free to reload */
  99. LD $f18, 8*SIZE($24)
  100. MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
  101. LD $f14, 4*SIZE($20)
  102. ADD $f19, $f27, $f1
  103. LD $f19, 9*SIZE($24)
  104. MUL $f30, $f15, $f27
  105. LD $f15, 5*SIZE($20)
  106. ADD $f20, $f28, $f2
  107. LD $f20, 10*SIZE($24)
  108. MUL $f30, $f16, $f28
  109. LD $f16, 6*SIZE($20)
  110. ADD $f21, $f29, $f3
  111. LD $f21, 11*SIZE($24)
  112. MUL $f30, $f17, $f29
  113. LD $f17, 7*SIZE($20)
  /* Store results 0..3, reusing $f0-$f3 for results 4..7 right after */
  /* each store. */
  114. ST $f0, 0*SIZE($24)
  115. ADD $f22, $f26, $f0
  116. ST $f1, 1*SIZE($24)
  117. ADD $f23, $f27, $f1
  118. ST $f2, 2*SIZE($24)
  119. ADD $f24, $f28, $f2
  120. ST $f3, 3*SIZE($24)
  121. ADD $f25, $f29, $f3
  122. LD $f22, 12*SIZE($24)
  123. LD $f23, 13*SIZE($24)
  124. LD $f24, 14*SIZE($24)
  125. LD $f25, 15*SIZE($24)
  126. ST $f0, 4*SIZE($24)
  127. ST $f1, 5*SIZE($24)
  128. ST $f2, 6*SIZE($24)
  129. ST $f3, 7*SIZE($24)
  130. subq $1, 1, $1
  131. addq $24, 8*SIZE, $24
  132. addq $20, 8*SIZE, $20
  133. bgt $1, $Loop
  134. .align 4
  /* Drain: compute and store the last preloaded group, with no further */
  /* loads. */
  135. $LoopEnd:
  136. MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
  137. MUL $f30, $f11, $f27
  138. MUL $f30, $f12, $f28
  139. MUL $f30, $f13, $f29
  140. ADD $f18, $f26, $f0
  141. MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
  142. ADD $f19, $f27, $f1
  143. MUL $f30, $f15, $f27
  144. ADD $f20, $f28, $f2
  145. MUL $f30, $f16, $f28
  146. ADD $f21, $f29, $f3
  147. MUL $f30, $f17, $f29
  148. ST $f0, 0*SIZE($24)
  149. ADD $f22, $f26, $f0
  150. ST $f1, 1*SIZE($24)
  151. ADD $f23, $f27, $f1
  152. ST $f2, 2*SIZE($24)
  153. ADD $f24, $f28, $f2
  154. ST $f3, 3*SIZE($24)
  155. ADD $f25, $f29, $f3
  156. ST $f0, 4*SIZE($24)
  157. ST $f1, 5*SIZE($24)
  158. ST $f2, 6*SIZE($24)
  159. ST $f3, 7*SIZE($24)
  160. addq $24, 8*SIZE, $24
  161. .align 4
  /* Contiguous tail: handle the remaining n & 7 elements one at a time. */
  162. $Remain:
  163. ble $2, $End
  164. .align 4
  165. $RemainLoop:
  166. LD $f10, 0*SIZE($20)
  167. LD $f11, 0*SIZE($24)
  168. addq $20, SIZE, $20
  169. addq $24, SIZE, $24
  170. MUL $f30, $f10, $f12
  171. subq $2, 1, $2
  172. ADD $f11, $f12, $f13
  173. ST $f13, -1*SIZE($24) /* $24 was already advanced, hence the -1 offset */
  174. bgt $2, $RemainLoop
  175. .align 4
  /* Exit for the contiguous path and for the early-outs: reload $f2/$f3 */
  /* and release the frame. */
  176. $End:
  177. ldt $f2, 0($sp)
  178. ldt $f3, 8($sp)
  179. lda $sp, 16($sp)
  180. ret
  181. .align 4
  /* Strided path: taken when either stride differs from 1. Pointers are */
  /* advanced with SXADDQ (stride, pointer, destination); presumably it */
  /* scales the stride by the element size -- confirm in common.h. */
  182. $Sub:
  /* NOTE(review): the value written to $22 here looks unused -- $22 is */
  /* overwritten by the first SXADDQ into $22 below before any read, and */
  /* $SubRemainLoop does not reference $22. */
  183. SXSUBL $16, SIZE, $22
  184. subq $1, 1, $4 /* $4 = groups - 1; the first group is preloaded below */
  185. ble $1, $SubRemain /* fewer than 8 elements: only the scalar tail */
  186. .align 4
  /* Preload x[0..7] into $f10-$f17 (advancing $20) and y[0..7] into */
  /* $f18-$f25. y is read through $22, which runs ahead of $24; $24 */
  /* stays put and is used later as the store pointer. */
  187. LD $f10, 0($20)
  188. SXADDQ $21, $20, $20
  189. LD $f11, 0($20)
  190. SXADDQ $21, $20, $20
  191. LD $f12, 0($20)
  192. SXADDQ $21, $20, $20
  193. LD $f13, 0($20)
  194. SXADDQ $21, $20, $20
  195. LD $f18, 0($24)
  196. SXADDQ $23, $24, $22 /* $22 = y read pointer, one step past $24 */
  197. LD $f19, 0($22)
  198. SXADDQ $23, $22, $22
  199. LD $f20, 0($22)
  200. SXADDQ $23, $22, $22
  201. LD $f21, 0($22)
  202. SXADDQ $23, $22, $22
  203. LD $f14, 0($20)
  204. SXADDQ $21, $20, $20
  205. LD $f15, 0($20)
  206. SXADDQ $21, $20, $20
  207. LD $f16, 0($20)
  208. SXADDQ $21, $20, $20
  209. LD $f17, 0($20)
  210. SXADDQ $21, $20, $20
  211. LD $f22, 0($22)
  212. SXADDQ $23, $22, $22
  213. LD $f23, 0($22)
  214. SXADDQ $23, $22, $22
  215. LD $f24, 0($22)
  216. SXADDQ $23, $22, $22
  217. LD $f25, 0($22)
  218. SXADDQ $23, $22, $22
  219. unop
  220. ble $4, $SubLoopEnd /* only one group: skip straight to the drain code */
  221. .align 4
  /* Strided main loop: same scheme as $Loop. x is loaded via $20, y is */
  /* loaded via $22 and stored via $24. */
  222. $SubLoop:
  223. MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
  224. LD $f10, 0($20)
  225. unop
  226. SXADDQ $21, $20, $20
  227. MUL $f30, $f11, $f27
  228. LD $f11, 0($20)
  229. unop
  230. SXADDQ $21, $20, $20
  231. MUL $f30, $f12, $f28
  232. LD $f12, 0($20)
  233. unop
  234. SXADDQ $21, $20, $20
  235. MUL $f30, $f13, $f29
  236. LD $f13, 0($20)
  237. unop
  238. SXADDQ $21, $20, $20
  239. ADD $f18, $f26, $f0
  240. MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
  241. LD $f14, 0($20)
  242. SXADDQ $21, $20, $20
  243. ADD $f19, $f27, $f1
  244. MUL $f30, $f15, $f27
  245. LD $f15, 0($20)
  246. SXADDQ $21, $20, $20
  247. ADD $f20, $f28, $f2
  248. MUL $f30, $f16, $f28
  249. LD $f16, 0($20)
  250. SXADDQ $21, $20, $20
  251. ADD $f21, $f29, $f3
  252. MUL $f30, $f17, $f29
  253. LD $f17, 0($20)
  254. SXADDQ $21, $20, $20
  /* Store results 0..3 through $24, reusing $f0-$f3 for results 4..7. */
  255. ST $f0, 0($24)
  256. SXADDQ $23, $24, $24
  257. ADD $f22, $f26, $f0
  258. unop
  259. ST $f1, 0($24)
  260. SXADDQ $23, $24, $24
  261. ADD $f23, $f27, $f1
  262. unop
  263. ST $f2, 0($24)
  264. SXADDQ $23, $24, $24
  265. ADD $f24, $f28, $f2
  266. unop
  267. ST $f3, 0($24)
  268. SXADDQ $23, $24, $24
  269. ADD $f25, $f29, $f3
  270. unop
  /* Load the next eight y values through the read pointer $22. */
  271. LD $f18, 0($22)
  272. SXADDQ $23, $22, $22
  273. LD $f19, 0($22)
  274. SXADDQ $23, $22, $22
  275. LD $f20, 0($22)
  276. SXADDQ $23, $22, $22
  277. LD $f21, 0($22)
  278. SXADDQ $23, $22, $22
  279. LD $f22, 0($22)
  280. SXADDQ $23, $22, $22
  281. LD $f23, 0($22)
  282. SXADDQ $23, $22, $22
  283. LD $f24, 0($22)
  284. SXADDQ $23, $22, $22
  285. LD $f25, 0($22)
  286. SXADDQ $23, $22, $22
  /* Store results 4..7. */
  287. ST $f0, 0($24)
  288. SXADDQ $23, $24, $24
  289. ST $f1, 0($24)
  290. SXADDQ $23, $24, $24
  291. ST $f2, 0($24)
  292. SXADDQ $23, $24, $24
  293. ST $f3, 0($24)
  294. SXADDQ $23, $24, $24
  295. subq $4, 1, $4
  296. bgt $4, $SubLoop
  297. .align 4
  /* Strided drain: compute and store the last preloaded group, with no */
  /* further loads. */
  298. $SubLoopEnd:
  299. MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
  300. MUL $f30, $f11, $f27
  301. MUL $f30, $f12, $f28
  302. MUL $f30, $f13, $f29
  303. ADD $f18, $f26, $f0
  304. MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
  305. ADD $f19, $f27, $f1
  306. MUL $f30, $f15, $f27
  307. ADD $f20, $f28, $f2
  308. MUL $f30, $f16, $f28
  309. ADD $f21, $f29, $f3
  310. MUL $f30, $f17, $f29
  311. ST $f0, 0($24)
  312. SXADDQ $23, $24, $24
  313. ST $f1, 0($24)
  314. SXADDQ $23, $24, $24
  315. ST $f2, 0($24)
  316. SXADDQ $23, $24, $24
  317. ST $f3, 0($24)
  318. SXADDQ $23, $24, $24
  319. ADD $f22, $f26, $f0
  320. ADD $f23, $f27, $f1
  321. ADD $f24, $f28, $f2
  322. ADD $f25, $f29, $f3
  323. ST $f0, 0($24)
  324. SXADDQ $23, $24, $24
  325. ST $f1, 0($24)
  326. SXADDQ $23, $24, $24
  327. ST $f2, 0($24)
  328. SXADDQ $23, $24, $24
  329. ST $f3, 0($24)
  330. SXADDQ $23, $24, $24
  331. .align 4
  /* Strided tail: handle the remaining n & 7 elements one at a time, */
  /* reading and writing y through $24. */
  332. $SubRemain:
  333. ble $2, $SubEnd
  334. .align 4
  335. $SubRemainLoop:
  336. LD $f10, 0($20)
  337. LD $f11, 0($24)
  338. SXADDQ $21, $20, $20
  339. MUL $f30, $f10, $f12
  340. subq $2, 1, $2
  341. ADD $f11, $f12, $f13
  342. ST $f13, 0($24) /* store before advancing $24 */
  343. SXADDQ $23, $24, $24
  344. bgt $2, $SubRemainLoop
  345. .align 4
  /* Exit for the strided path: reload $f2/$f3 and release the frame. */
  346. $SubEnd:
  347. ldt $f2, 0($sp)
  348. ldt $f3, 8($sp)
  349. lda $sp, 16($sp)
  350. ret
  351. EPILOGUE