You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

izamax.S 8.3 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before including the shared header. The macros used */
  /* below but not defined in this file (PROLOGUE, SAVESP, FCLR, LDF, FABS, */
  /* FADD, FMOV, FCMP, FMOVG/FMOVL, SIZE, ZBASE_SHIFT, EPILOGUE) presumably */
  /* come from common.h -- confirm there. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer registers. N, X and INCX are read by the routine before it ever */
  /* writes them (incoming element count, data pointer and stride); I is only */
  /* used as a loop counter. */
  40. #define N %i0
  41. #define X %i1
  42. #define INCX %i2
  43. #define I %i3
  /* v1..v4: candidate result index held by each of the four comparison lanes. */
  /* count:  running 1-based index of the element group being examined. */
  44. #define v1 %o0
  45. #define v2 %o1
  46. #define v3 %o2
  47. #define v4 %o3
  48. #define count %o4
  /* Floating-point registers: */
  /*   c1..c4  current extreme value kept by each lane */
  /*   t1..t8  temporaries holding absolute values and their sums */
  /*   a1..a8  values loaded from X (two per element) */
  /* With DOUBLE defined the aliases step through even-numbered registers */
  /* (%f0, %f2, ...); otherwise they use consecutive registers. */
  49. #ifdef DOUBLE
  50. #define c1 %f0
  51. #define c2 %f2
  52. #define c3 %f4
  53. #define c4 %f6
  54. #define t1 %f8
  55. #define t2 %f10
  56. #define t3 %f12
  57. #define t4 %f14
  58. #define t5 %f16
  59. #define t6 %f18
  60. #define t7 %f20
  61. #define t8 %f22
  62. #define a1 %f24
  63. #define a2 %f26
  64. #define a3 %f28
  65. #define a4 %f30
  66. #define a5 %f32
  67. #define a6 %f34
  68. #define a7 %f36
  69. #define a8 %f38
  70. #else
  71. #define c1 %f0
  72. #define c2 %f1
  73. #define c3 %f2
  74. #define c4 %f3
  75. #define t1 %f4
  76. #define t2 %f5
  77. #define t3 %f6
  78. #define t4 %f7
  79. #define t5 %f8
  80. #define t6 %f9
  81. #define t7 %f10
  82. #define t8 %f11
  83. #define a1 %f12
  84. #define a2 %f13
  85. #define a3 %f14
  86. #define a4 %f15
  87. #define a5 %f16
  88. #define a6 %f17
  89. #define a7 %f18
  90. #define a8 %f19
  91. #endif
  /* Direction of the search. Without USE_MIN the conditional moves fire on */
  /* "greater" (FMOVG / movg), so the largest value is tracked; with USE_MIN */
  /* they fire on "less" (FMOVL / movl), so the smallest value is tracked. */
  92. #ifndef USE_MIN
  93. #define FCMOV FMOVG
  94. #define CMOV movg
  95. #else
  96. #define FCMOV FMOVL
  97. #define CMOV movl
  98. #endif
  /*
   * Scan N elements starting at X, each element being two adjacent values
   * at [X + 0 * SIZE] and [X + 1 * SIZE], and return the 1-based index of
   * the element whose |first| + |second| is the extreme selected by
   * FCMOV/CMOV ("greater" by default, "less" under USE_MIN).
   *
   *   In:   N    = element count
   *         X    = address of the first element
   *         INCX = stride between elements, in elements (shifted left by
   *                ZBASE_SHIFT below; presumably that yields a byte stride,
   *                confirm in common.h)
   *   Out:  %i0  = v1, copied just before `return %i7 + 8`
   *                0 when N <= 0 or INCX <= 0, 1 when N == 1
   *
   * Four independent lanes (c1/v1 .. c4/v4) each track one element of every
   * group of four; the lanes are merged at .LL19 / .LL59.
   *
   * NOTE(review): every update uses a strict conditional move, so when two
   * lanes hold equal values the merge keeps lane 1's index even if another
   * lane's index is smaller. Confirm this tie behaviour matches what callers
   * expect (e.g. first-occurrence semantics).
   */
  99. PROLOGUE
  100. SAVESP
  /* FCLR is not defined in this file; presumably clears FP register 0 -- confirm in common.h. */
  101. FCLR(0)
  /* N <= 0: leave through .LL20; v1 is cleared in the branch delay slot, so 0 is returned. */
  102. cmp N, 0
  103. ble .LL20
  104. clr v1
  /* INCX <= 0: also leave with v1 == 0. The delay slot scales INCX whether or not the branch is taken. */
  105. cmp INCX, 0
  106. ble .LL20
  107. sll INCX, ZBASE_SHIFT, INCX
  /* Seed the search with element 1: v1 = 1, c1/c2 = |its two values|, then step X and N past it. */
  108. mov 1, v1
  109. LDF [X + 0 * SIZE], c1
  110. LDF [X + 1 * SIZE], c2
  111. add N, -1, N
  112. FABS c1, c1
  113. add X, INCX, X
  114. FABS c2, c2
  /* No elements left: return index 1. The delay-slot FADD finishes the seed value c1 = |re| + |im|. */
  115. cmp N, 0
  116. ble .LL20
  117. FADD c1, c2, c1
  /* Copy the seed value and index into lanes 2..4; the next element to examine has index 2. */
  118. FMOV c1, c2
  119. mov 1, v2
  120. FMOV c1, c3
  121. mov 1, v3
  122. FMOV c1, c4
  123. mov 1, v4
  124. mov 2, count
  /* Scaled stride other than 2 * SIZE (elements not back to back): use the strided path at .LL50. */
  125. cmp INCX, 2 * SIZE
  126. bne .LL50
  127. nop
  /* ---- Contiguous path ---- */
  /* I = number of full groups of four elements; none -> go straight to the remainder loop. */
  128. sra N, 2, I
  129. cmp I, 0
  130. ble,pn %icc, .LL15
  131. nop
  /* Preload the first group (eight values) into a1..a8 and count that group off I. */
  132. LDF [X + 0 * SIZE], a1
  133. LDF [X + 1 * SIZE], a2
  134. LDF [X + 2 * SIZE], a3
  135. LDF [X + 3 * SIZE], a4
  136. LDF [X + 4 * SIZE], a5
  137. add I, -1, I
  138. LDF [X + 5 * SIZE], a6
  139. cmp I, 0
  140. LDF [X + 6 * SIZE], a7
  141. LDF [X + 7 * SIZE], a8
  /* Only one group: skip the loop and process the preloaded values at .LL12. X advances in the delay slot. */
  142. ble,pt %icc, .LL12
  143. add X, 8 * SIZE, X
  144. #define PREFETCHSIZE 32
  /* Main loop: take absolute values of the group already in a1..a8 while loading the next group into the same registers. */
  145. .LL11:
  /* Prefetch hint for data PREFETCHSIZE values past X. */
  146. prefetch [X + PREFETCHSIZE * SIZE], 0
  147. FABS a1, t1
  148. LDF [X + 0 * SIZE], a1
  149. FABS a2, t2
  150. LDF [X + 1 * SIZE], a2
  151. FABS a3, t3
  152. LDF [X + 2 * SIZE], a3
  153. FABS a4, t4
  154. LDF [X + 3 * SIZE], a4
  155. FABS a5, t5
  156. LDF [X + 4 * SIZE], a5
  157. FABS a6, t6
  158. LDF [X + 5 * SIZE], a6
  159. FABS a7, t7
  160. LDF [X + 6 * SIZE], a7
  161. FABS a8, t8
  162. LDF [X + 7 * SIZE], a8
  /* Per-element magnitude: t1, t3, t5, t7 = |first| + |second| for the four elements. */
  163. FADD t1, t2, t1
  164. FADD t3, t4, t3
  165. FADD t5, t6, t5
  166. FADD t7, t8, t7
  /* Compare each magnitude with its lane's current extreme, one condition-code field per lane. */
  167. FCMP %fcc0, t1, c1
  168. FCMP %fcc1, t3, c2
  169. FCMP %fcc2, t5, c3
  170. FCMP %fcc3, t7, c4
  /* Conditionally replace each lane's value and index. All lanes store the same `count`; */
  /* the per-lane offsets (+1, +2, +3) are added once at .LL19. Loop bookkeeping is interleaved. */
  171. FCMOV %fcc0, t1, c1
  172. CMOV %fcc0, count, v1
  173. add I, -1, I
  174. FCMOV %fcc1, t3, c2
  175. CMOV %fcc1, count, v2
  176. cmp I, 0
  177. FCMOV %fcc2, t5, c3
  178. CMOV %fcc2, count, v3
  179. FCMOV %fcc3, t7, c4
  180. CMOV %fcc3, count, v4
  181. add count, 4, count
  /* Loop while groups remain; X advances by one group in the delay slot. */
  182. bg,pt %icc, .LL11
  183. add X, 8 * SIZE, X
  /* Drain: process the last group already loaded into a1..a8 (no further loads). */
  184. .LL12:
  185. FABS a1, t1
  186. FABS a2, t2
  187. FABS a3, t3
  188. FABS a4, t4
  189. FABS a5, t5
  190. FABS a6, t6
  191. FABS a7, t7
  192. FABS a8, t8
  193. FADD t1, t2, t1
  194. FADD t3, t4, t3
  195. FADD t5, t6, t5
  196. FADD t7, t8, t7
  197. FCMP %fcc0, t1, c1
  198. FCMP %fcc1, t3, c2
  199. FCMP %fcc2, t5, c3
  200. FCMP %fcc3, t7, c4
  201. FCMOV %fcc0, t1, c1
  202. CMOV %fcc0, count, v1
  203. FCMOV %fcc1, t3, c2
  204. CMOV %fcc1, count, v2
  205. FCMOV %fcc2, t5, c3
  206. CMOV %fcc2, count, v3
  207. FCMOV %fcc3, t7, c4
  208. CMOV %fcc3, count, v4
  209. add count, 4, count
  /* Remainder: I = N & 3 leftover elements; none -> go merge the lanes. */
  210. .LL15:
  211. and N, 3, I
  212. cmp I, 0
  213. ble,a,pn %icc, .LL19
  214. nop
  /* One element per iteration, folded into lane 1 only; count is the element's true index here. */
  215. .LL16:
  216. LDF [X + 0 * SIZE], a1
  217. LDF [X + 1 * SIZE], a2
  218. FABS a1, t1
  219. FABS a2, t2
  220. FADD t1, t2, t1
  221. FCMP %fcc0, t1, c1
  222. FCMOV %fcc0, t1, c1
  223. CMOV %fcc0, count, v1
  224. add count, 1, count
  225. add I, -1, I
  226. cmp I, 0
  227. bg,pt %icc, .LL16
  228. add X, 2 * SIZE, X
  /* Merge the lanes. Lanes 2..4 first get their position offsets within a group (+1, +2, +3). */
  /* Then lane 2 is folded into lane 1, lane 4 into lane 3, and finally lane 3 into lane 1. */
  229. .LL19:
  230. FCMP %fcc0, c2, c1
  231. add v2, 1, v2
  232. FCMP %fcc1, c4, c3
  233. add v3, 2, v3
  234. add v4, 3, v4
  235. FCMOV %fcc0, c2, c1
  236. CMOV %fcc0, v2, v1
  237. FCMOV %fcc1, c4, c3
  238. CMOV %fcc1, v4, v3
  /* Last comparison only needs to pick the index; c1 is not read again afterwards. */
  239. FCMP %fcc0, c3, c1
  240. CMOV %fcc0, v3, v1
  /* Common exit for the contiguous path and all early-outs: result index v1 -> %i0. */
  241. .LL20:
  242. mov v1, %i0
  243. return %i7 + 8
  244. nop
  /* ---- Strided path: same algorithm, but X advances by INCX after each element ---- */
  245. .LL50:
  /* I = number of full groups of four elements; none -> remainder loop at .LL55. */
  246. sra N, 2, I
  247. cmp I, 0
  248. ble,pn %icc, .LL55
  249. nop
  /* Preload the first group of four elements, stepping X by INCX between elements. */
  250. LDF [X + 0 * SIZE], a1
  251. LDF [X + 1 * SIZE], a2
  252. add X, INCX, X
  253. LDF [X + 0 * SIZE], a3
  254. LDF [X + 1 * SIZE], a4
  255. add X, INCX, X
  256. LDF [X + 0 * SIZE], a5
  257. LDF [X + 1 * SIZE], a6
  258. add X, INCX, X
  259. add I, -1, I
  260. LDF [X + 0 * SIZE], a7
  261. cmp I, 0
  262. LDF [X + 1 * SIZE], a8
  /* Only one group: process it at .LL52. The delay slot steps X past the fourth element. */
  263. ble,pt %icc, .LL52
  264. add X, INCX, X
  /* Main strided loop: absolute values of the current group interleaved with loads of the next group. */
  265. .LL51:
  266. FABS a1, t1
  267. LDF [X + 0 * SIZE], a1
  268. FABS a2, t2
  269. LDF [X + 1 * SIZE], a2
  270. add X, INCX, X
  271. FABS a3, t3
  272. LDF [X + 0 * SIZE], a3
  273. FABS a4, t4
  274. LDF [X + 1 * SIZE], a4
  275. add X, INCX, X
  276. FABS a5, t5
  277. LDF [X + 0 * SIZE], a5
  278. FABS a6, t6
  279. LDF [X + 1 * SIZE], a6
  280. add X, INCX, X
  281. FABS a7, t7
  282. LDF [X + 0 * SIZE], a7
  283. FABS a8, t8
  284. LDF [X + 1 * SIZE], a8
  /* Per-element magnitudes, then per-lane compare and conditional update as in the contiguous loop. */
  285. FADD t1, t2, t1
  286. FADD t3, t4, t3
  287. FADD t5, t6, t5
  288. FADD t7, t8, t7
  289. FCMP %fcc0, t1, c1
  290. FCMP %fcc1, t3, c2
  291. FCMP %fcc2, t5, c3
  292. FCMP %fcc3, t7, c4
  293. FCMOV %fcc0, t1, c1
  294. CMOV %fcc0, count, v1
  295. add I, -1, I
  296. FCMOV %fcc1, t3, c2
  297. CMOV %fcc1, count, v2
  298. cmp I, 0
  299. FCMOV %fcc2, t5, c3
  300. CMOV %fcc2, count, v3
  301. FCMOV %fcc3, t7, c4
  302. CMOV %fcc3, count, v4
  303. add count, 4, count
  /* Loop while groups remain; the delay slot steps X past the fourth element just loaded. */
  304. bg,pt %icc, .LL51
  305. add X, INCX, X
  /* Drain: process the last group already held in a1..a8. */
  306. .LL52:
  307. FABS a1, t1
  308. FABS a2, t2
  309. FABS a3, t3
  310. FABS a4, t4
  311. FABS a5, t5
  312. FABS a6, t6
  313. FABS a7, t7
  314. FABS a8, t8
  315. FADD t1, t2, t1
  316. FADD t3, t4, t3
  317. FADD t5, t6, t5
  318. FADD t7, t8, t7
  319. FCMP %fcc0, t1, c1
  320. FCMP %fcc1, t3, c2
  321. FCMP %fcc2, t5, c3
  322. FCMP %fcc3, t7, c4
  323. FCMOV %fcc0, t1, c1
  324. CMOV %fcc0, count, v1
  325. FCMOV %fcc1, t3, c2
  326. CMOV %fcc1, count, v2
  327. FCMOV %fcc2, t5, c3
  328. CMOV %fcc2, count, v3
  329. FCMOV %fcc3, t7, c4
  330. CMOV %fcc3, count, v4
  331. add count, 4, count
  /* Remainder: I = N & 3 leftover elements; none -> merge the lanes at .LL59. */
  332. .LL55:
  333. and N, 3, I
  334. cmp I, 0
  335. ble,a,pn %icc, .LL59
  336. nop
  /* One element per iteration into lane 1; the counter update is interleaved with the arithmetic. */
  337. .LL56:
  338. LDF [X + 0 * SIZE], a1
  339. LDF [X + 1 * SIZE], a2
  340. FABS a1, t1
  341. add I, -1, I
  342. FABS a2, t2
  343. cmp I, 0
  344. FADD t1, t2, t1
  345. FCMP %fcc0, t1, c1
  346. FCMOV %fcc0, t1, c1
  347. CMOV %fcc0, count, v1
  348. add count, 1, count
  349. bg,pt %icc, .LL56
  350. add X, INCX, X
  /* Merge the lanes exactly as at .LL19: apply offsets +1/+2/+3, fold 2 into 1 and 4 into 3, then 3 into 1. */
  351. .LL59:
  352. FCMP %fcc0, c2, c1
  353. add v2, 1, v2
  354. FCMP %fcc1, c4, c3
  355. add v3, 2, v3
  356. add v4, 3, v4
  357. FCMOV %fcc0, c2, c1
  358. CMOV %fcc0, v2, v1
  359. FCMOV %fcc1, c4, c3
  360. CMOV %fcc1, v4, v3
  361. FCMP %fcc0, c3, c1
  362. CMOV %fcc0, v3, v1
  /* Strided-path exit: result index v1 -> %i0, then return. */
  363. mov v1, %i0
  364. return %i7 + 8
  365. nop
  366. EPILOGUE