You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zrot.S 7.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Stack-frame bookkeeping.
   *
   * STACK is the number of bytes occupied by the three 4-byte registers the
   * kernel pushes in its prologue (%edi, %esi, %ebx); ARGS is an additional
   * adjustment that is zero here.  STACK_ARG(OFFSET) expands to the
   * %esp-relative operand "OFFSET + STACK + ARGS(%esp)"; OFFSET is the
   * argument's position as seen at function entry (presumably 4 skips the
   * return address -- confirm against the calling convention in use).
   */
#define STACK	12
#define ARGS	 0

#define STACK_ARG(OFFSET)	OFFSET + STACK + ARGS(%esp)

#define STACK_N		STACK_ARG(4)
#define STACK_X		STACK_ARG(8)
#define STACK_INCX	STACK_ARG(12)
#define STACK_Y		STACK_ARG(16)
#define STACK_INCY	STACK_ARG(20)
#define STACK_C		STACK_ARG(24)

/*
 * S is read 16, 8 or 4 bytes after C, depending on whether XDOUBLE,
 * DOUBLE or neither is defined.
 */
#if defined(XDOUBLE)
#define STACK_S		STACK_ARG(40)
#elif defined(DOUBLE)
#define STACK_S		STACK_ARG(32)
#else
#define STACK_S		STACK_ARG(28)
#endif

/* Register aliases used by the kernel body. */
#define N	%ebx
#define X	%esi
#define INCX	%ecx
#define Y	%edi
#define INCY	%edx
#define I	%eax

/*
 * Prefetch configuration.  Only PENTIUM4 and OPTERON builds define
 * PREFETCH; the kernel multiplies PREFETCH_SIZE by SIZE to form the
 * look-ahead offset.
 */
#if defined(PENTIUM4)
#define PREFETCH	prefetcht0
#define PREFETCH_SIZE	144
#endif

#if defined(OPTERON)
#define PREFETCH	prefetchw
#define PREFETCH_SIZE	144
#endif
  /*
   * Rotation kernel body (32-bit x86, x87 FPU, AT&T syntax).
   *
   * Reads N, X, INCX, Y, INCY, C and S from the stack through the STACK_*
   * macros and updates the X and Y buffers in place.  For every pair of
   * values x (from X) and y (from Y) the code stores
   *
   *       x <- C * x + S * y
   *       y <- C * y - S * x      (using the x that was loaded, not the
   *                                freshly stored one)
   *
   * Each X/Y entry is processed as two SIZE-wide values at offsets 0 and 1
   * (presumably the real and imaginary parts of a complex number); the same
   * C and S are applied to both.
   *
   * Register roles: N = number of entries, X / Y = running pointers,
   * INCX / INCY = strides after shifting by ZBASE_SHIFT, I = loop counter.
   * %edi, %esi and %ebx are saved on entry and restored before returning.
   *
   * NOTE(review): FLD, FST, SIZE, ZBASE_SHIFT, EMMS, PROFCODE, ALIGN_4,
   * PROLOGUE and EPILOGUE come from common.h, which is not visible here.
   * The x87 stack bookkeeping described below assumes FLD pushes one value
   * and FST stores and pops one value -- confirm against common.h.
   */

/*
 * ROT_STEP(OFF): rotate the single value pair at OFF * SIZE(X) and
 * OFF * SIZE(Y).  On entry and on exit st(0) = C and st(1) = S.
 *
 * x87 stack, top first, after the listed instruction:
 *   FLD X, FLD Y          y, x, C, S
 *   fld / fmul st(3)      C*x, y, x, C, S
 *   fld / fmul st(5)      S*y, C*x, y, x, C, S
 *   faddp                 C*x + S*y, y, x, C, S
 *   FST X                 y, x, C, S
 *   fmul st(2)            C*y, x, C, S
 *   fxch / fmul st(3)     S*x, C*y, C, S
 *   fsubrp                C*y - S*x, C, S
 *   FST Y                 C, S
 *
 * The fsubrp result relies on the AT&T-syntax convention in which the
 * non-commutative x87 mnemonics are reversed when the source is %st and
 * the destination is %st(i) (see the GNU as i386 notes), so st(1) receives
 * st(1) - st(0).
 *
 * The statements are joined with ';' so that the preprocessor can emit the
 * whole sequence from one macro invocation.
 */
#define ROT_STEP(OFF) \
	FLD	OFF * SIZE(X)		;\
	FLD	OFF * SIZE(Y)		;\
	fld	%st(1)			;\
	fmul	%st(3), %st		;\
	fld	%st(1)			;\
	fmul	%st(5), %st		;\
	faddp	%st, %st(1)		;\
	FST	OFF * SIZE(X)		;\
	fmul	%st(2), %st		;\
	fxch	%st(1)			;\
	fmul	%st(3), %st		;\
	fsubrp	%st, %st(1)		;\
	FST	OFF * SIZE(Y)

	PROLOGUE

	/* Save the registers this kernel uses as N, X and Y. */
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
	/* Only for gfortran / g95 interface builds (EMMS is defined in common.h). */
	EMMS
#endif

	/* Fetch the integer and pointer arguments from the stack. */
	movl	STACK_N,    N
	movl	STACK_X,    X
	movl	STACK_INCX, INCX
	movl	STACK_Y,    Y
	movl	STACK_INCY, INCY

	/* Load S first and C second, leaving st(0) = C and st(1) = S. */
	FLD	STACK_S
	FLD	STACK_C

	/* Scale both increments by 2^ZBASE_SHIFT (presumably entries -> bytes). */
	sall	$ZBASE_SHIFT, INCX
	sall	$ZBASE_SHIFT, INCY

	/* Nothing to do for N <= 0; .L999 still drops C and S from the FPU stack. */
	testl	N, N
	jle	.L999

	/* The contiguous path needs both scaled strides to equal 2 * SIZE. */
	cmpl	$2 * SIZE, INCX
	jne	.L50
	cmpl	$2 * SIZE, INCY
	jne	.L50

	/* Contiguous path: I = N / 2 iterations, two entries per iteration. */
	movl	N, I
	sarl	$1, I
	jle	.L15
	ALIGN_4

.L10:
#if defined(PENTIUM4)
	PREFETCH (PREFETCH_SIZE + 0) * SIZE(X)
#endif
#if defined(OPTERON)
	PREFETCH (PREFETCH_SIZE + 0) * SIZE(X)
#endif

	/* First entry of the pair: values 0 and 1. */
	ROT_STEP(0)
	ROT_STEP(1)

#if defined(PENTIUM4)
	PREFETCH (PREFETCH_SIZE + 0) * SIZE(Y)
#endif
#if defined(OPTERON)
	PREFETCH (PREFETCH_SIZE + 0) * SIZE(Y)
#endif

	/* Second entry of the pair: values 2 and 3. */
	ROT_STEP(2)
	ROT_STEP(3)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y

	decl	I
	jg	.L10
	ALIGN_4

.L15:
	/* Contiguous path remainder: one more entry when N is odd. */
	movl	N, I
	andl	$1, I
	jle	.L999
	ALIGN_4

.L16:
	ROT_STEP(0)
	ROT_STEP(1)
	jmp	.L999
	ALIGN_4

.L50:
	/* General-stride path: I = N / 2 iterations, two entries per iteration. */
	movl	N, I
	sarl	$1, I
	jle	.L55
	ALIGN_4

.L51:
	ROT_STEP(0)
	ROT_STEP(1)

	/* Step to the next entry using the scaled strides. */
	addl	INCX, X
	addl	INCY, Y

	ROT_STEP(0)
	ROT_STEP(1)

	addl	INCX, X
	addl	INCY, Y

	decl	I
	jg	.L51
	ALIGN_4

.L55:
	/* General-stride remainder: one more entry when N is odd. */
	movl	N, I
	andl	$1, I
	jle	.L999
	ALIGN_4

.L56:
	ROT_STEP(0)
	ROT_STEP(1)
	ALIGN_4

.L999:
	/* Discard the two x87 registers loaded at entry (C, then S). */
	ffreep	%st(0)
	ffreep	%st(0)

	/* Restore the saved registers in reverse push order. */
	popl	%ebx
	popl	%esi
	popl	%edi
	ret

	EPILOGUE