You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

rot.S 6.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
  /*
   * Preamble for the x87 plane-rotation kernel: build switches, stack
   * offsets of the arguments, register roles and prefetch selection.
   * Syntax: GNU as, AT&T operand order, run through the C preprocessor.
   */
#define ASSEMBLER
#include "common.h"

/* STACK is the number of bytes the routine pushes before it reads its
   arguments: three pushl (edi, esi, ebx) of 4 bytes each. */
#define STACK	12
#define ARGS	0

/* Arguments are read from the stack relative to %esp after those pushes.
   The extra 4 in each offset presumably skips the return address. */
#define STACK_N		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)
#define STACK_Y		16 + STACK + ARGS(%esp)
#define STACK_INCY	20 + STACK + ARGS(%esp)
#define STACK_C		24 + STACK + ARGS(%esp)

/* S is stored directly after C, so its offset depends on how many bytes
   C occupies: 16 for XDOUBLE, 8 for DOUBLE, 4 otherwise. */
#ifdef XDOUBLE
#define STACK_S		40 + STACK + ARGS(%esp)
#elif defined DOUBLE
#define STACK_S		32 + STACK + ARGS(%esp)
#else
#define STACK_S		28 + STACK + ARGS(%esp)
#endif

/* Register roles used by the routine body. */
#define N	%ebx		/* element count */
#define X	%esi		/* current position in x */
#define INCX	%ecx		/* x stride */
#define Y	%edi		/* current position in y */
#define INCY	%edx		/* y stride */
#define I	%eax		/* loop counter */

/* Prefetch instruction and distance (in elements) per CPU target.
   When neither target is defined the routine issues no prefetches. */
#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCH_SIZE	144
#endif

#ifdef OPTERON
#define PREFETCH	prefetchw
#define PREFETCH_SIZE	144
#endif
  /*
   * Plane rotation of two vectors on the x87 FPU (i386, AT&T syntax).
   *
   * For each of the n element pairs:
   *     x[i] <- c * x[i] + s * y[i]
   *     y[i] <- c * y[i] - s * x[i]      (uses the x[i] value read before
   *                                        the update)
   *
   * Arguments (all read from the stack, see the STACK_* offsets):
   *     n, x, incx, y, incy, c, s  -- c and s are loaded directly from
   *     their stack slots, i.e. they are passed by value.
   *
   * Registers: N=%ebx, X=%esi, Y=%edi are saved and restored here;
   *     I=%eax, INCX=%ecx, INCY=%edx are overwritten and not restored.
   * x87 stack between elements: st(0) = c, st(1) = s.
   *
   * n <= 0 returns without touching x or y.  X and Y are used exactly as
   * passed in; no start-pointer adjustment for negative strides is done
   * in this routine.
   *
   * PROLOGUE, PROFCODE, EMMS, EPILOGUE, ALIGN_4, FLD, FST, SIZE and
   * BASE_SHIFT come from common.h (not shown here).  The stack-state
   * comments assume FLD pushes one value and FST stores st(0) and pops it;
   * the routine's stack balance only works out under that assumption.
   */
	PROLOGUE

	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
	EMMS				/* presumably clears MMX state before x87 use -- confirm in common.h */
#endif

	movl	STACK_N,    N
	movl	STACK_X,    X
	movl	STACK_INCX, INCX
	movl	STACK_Y,    Y
	movl	STACK_INCY, INCY

	FLD	STACK_S			/* x87: s */
	FLD	STACK_C			/* x87: c, s */

	sall	$BASE_SHIFT, INCX	/* strides are compared with SIZE and added */
	sall	$BASE_SHIFT, INCY	/* to the pointers below, so scale them first */

	testl	N, N
	jle	.L999			/* nothing to do; .L999 still pops c and s */

	cmpl	$SIZE, INCX		/* both strides one element wide? */
	jne	.L50
	cmpl	$SIZE, INCY
	jne	.L50

/* ---- Contiguous path: four elements per iteration ---- */
	movl	N, I
	sarl	$2, I			/* I = n / 4 */
	jle	.L15
	ALIGN_4

.L10:
#ifdef PENTIUM4
	PREFETCH (PREFETCH_SIZE + 0) * SIZE(X)
#endif
#ifdef OPTERON
	PREFETCH (PREFETCH_SIZE + 0) * SIZE(X)
#endif

	/* element 0 -- x87 stack shown top-first after each instruction */
	FLD	0 * SIZE(X)		/* x, c, s */
	FLD	0 * SIZE(Y)		/* y, x, c, s */
	fld	%st(1)			/* x, y, x, c, s */
	fmul	%st(3), %st		/* c*x, y, x, c, s */
	fld	%st(1)			/* y, c*x, y, x, c, s */
	fmul	%st(5), %st		/* s*y, c*x, y, x, c, s */
	faddp	%st, %st(1)		/* c*x + s*y, y, x, c, s */
	FST	0 * SIZE(X)		/* write new x -> y, x, c, s */
	fmul	%st(2), %st		/* c*y, x, c, s */
	fxch	%st(1)			/* x, c*y, c, s */
	fmul	%st(3), %st		/* s*x, c*y, c, s */
	fsubrp	%st, %st(1)		/* c*y - s*x, c, s (GAS AT&T form: st(1) - st(0)) */
	FST	0 * SIZE(Y)		/* write new y -> c, s */

	/* element 1 -- same sequence */
	FLD	1 * SIZE(X)
	FLD	1 * SIZE(Y)
	fld	%st(1)
	fmul	%st(3), %st
	fld	%st(1)
	fmul	%st(5), %st
	faddp	%st, %st(1)
	FST	1 * SIZE(X)
	fmul	%st(2), %st
	fxch	%st(1)
	fmul	%st(3), %st
	fsubrp	%st, %st(1)
	FST	1 * SIZE(Y)

#ifdef PENTIUM4
	PREFETCH (PREFETCH_SIZE + 0) * SIZE(Y)
#endif
#ifdef OPTERON
	PREFETCH (PREFETCH_SIZE + 0) * SIZE(Y)
#endif

	/* element 2 -- same sequence */
	FLD	2 * SIZE(X)
	FLD	2 * SIZE(Y)
	fld	%st(1)
	fmul	%st(3), %st
	fld	%st(1)
	fmul	%st(5), %st
	faddp	%st, %st(1)
	FST	2 * SIZE(X)
	fmul	%st(2), %st
	fxch	%st(1)
	fmul	%st(3), %st
	fsubrp	%st, %st(1)
	FST	2 * SIZE(Y)

	/* element 3 -- same sequence */
	FLD	3 * SIZE(X)
	FLD	3 * SIZE(Y)
	fld	%st(1)
	fmul	%st(3), %st
	fld	%st(1)
	fmul	%st(5), %st
	faddp	%st, %st(1)
	FST	3 * SIZE(X)
	fmul	%st(2), %st
	fxch	%st(1)
	fmul	%st(3), %st
	fsubrp	%st, %st(1)
	FST	3 * SIZE(Y)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	decl	I
	jg	.L10
	ALIGN_4

/* ---- Contiguous path: remaining n & 3 elements, one per iteration ---- */
.L15:
	movl	N,  I
	andl	$3, I
	jle	.L999			/* taken when the remainder is zero */
	ALIGN_4

.L16:
	FLD	0 * SIZE(X)		/* x, c, s */
	FLD	0 * SIZE(Y)		/* y, x, c, s */
	fld	%st(1)
	fmul	%st(3), %st		/* c*x */
	fld	%st(1)
	fmul	%st(5), %st		/* s*y */
	faddp	%st, %st(1)		/* c*x + s*y */
	FST	0 * SIZE(X)
	fmul	%st(2), %st		/* c*y */
	fxch	%st(1)
	fmul	%st(3), %st		/* s*x */
	fsubrp	%st, %st(1)		/* c*y - s*x */
	FST	0 * SIZE(Y)

	addl	$SIZE, X
	addl	$SIZE, Y
	decl	I
	jg	.L16
	jmp	.L999
	ALIGN_4

/* ---- Strided path: four elements per iteration, pointers advance by
        INCX / INCY after every element; no prefetching here ---- */
.L50:
	movl	N, I
	sarl	$2, I			/* I = n / 4 */
	jle	.L55
	ALIGN_4

.L51:
	/* element 0 */
	FLD	0 * SIZE(X)
	FLD	0 * SIZE(Y)
	fld	%st(1)
	fmul	%st(3), %st
	fld	%st(1)
	fmul	%st(5), %st
	faddp	%st, %st(1)
	FST	0 * SIZE(X)
	fmul	%st(2), %st
	fxch	%st(1)
	fmul	%st(3), %st
	fsubrp	%st, %st(1)
	FST	0 * SIZE(Y)
	addl	INCX, X
	addl	INCY, Y

	/* element 1 */
	FLD	0 * SIZE(X)
	FLD	0 * SIZE(Y)
	fld	%st(1)
	fmul	%st(3), %st
	fld	%st(1)
	fmul	%st(5), %st
	faddp	%st, %st(1)
	FST	0 * SIZE(X)
	fmul	%st(2), %st
	fxch	%st(1)
	fmul	%st(3), %st
	fsubrp	%st, %st(1)
	FST	0 * SIZE(Y)
	addl	INCX, X
	addl	INCY, Y

	/* element 2 */
	FLD	0 * SIZE(X)
	FLD	0 * SIZE(Y)
	fld	%st(1)
	fmul	%st(3), %st
	fld	%st(1)
	fmul	%st(5), %st
	faddp	%st, %st(1)
	FST	0 * SIZE(X)
	fmul	%st(2), %st
	fxch	%st(1)
	fmul	%st(3), %st
	fsubrp	%st, %st(1)
	FST	0 * SIZE(Y)
	addl	INCX, X
	addl	INCY, Y

	/* element 3 */
	FLD	0 * SIZE(X)
	FLD	0 * SIZE(Y)
	fld	%st(1)
	fmul	%st(3), %st
	fld	%st(1)
	fmul	%st(5), %st
	faddp	%st, %st(1)
	FST	0 * SIZE(X)
	fmul	%st(2), %st
	fxch	%st(1)
	fmul	%st(3), %st
	fsubrp	%st, %st(1)
	FST	0 * SIZE(Y)
	addl	INCX, X
	addl	INCY, Y

	decl	I
	jg	.L51
	ALIGN_4

/* ---- Strided path: remaining n & 3 elements ---- */
.L55:
	movl	N,  I
	andl	$3, I
	jle	.L999			/* taken when the remainder is zero */
	ALIGN_4

.L56:
	FLD	0 * SIZE(X)		/* x, c, s */
	FLD	0 * SIZE(Y)		/* y, x, c, s */
	fld	%st(1)
	fmul	%st(3), %st		/* c*x */
	fld	%st(1)
	fmul	%st(5), %st		/* s*y */
	faddp	%st, %st(1)		/* c*x + s*y */
	FST	0 * SIZE(X)
	fmul	%st(2), %st		/* c*y */
	fxch	%st(1)
	fmul	%st(3), %st		/* s*x */
	fsubrp	%st, %st(1)		/* c*y - s*x */
	FST	0 * SIZE(Y)

	addl	INCX, X
	addl	INCY, Y
	decl	I
	jg	.L56			/* falls through to the exit when done */
	ALIGN_4

/* ---- Common exit: drop c and s from the x87 stack, restore registers ---- */
.L999:
	ffreep	%st(0)
	ffreep	%st(0)

	popl	%ebx
	popl	%esi
	popl	%edi
	ret

	EPILOGUE