You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

axpy.S 5.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Argument operands and register roles.
   *
   * STACK is the number of bytes the prologue pushes before any argument
   * is read (three pushl instructions, 4 bytes each).  ARGS is a further
   * adjustment that is zero here.  Each STACK_* macro expands to an
   * %esp-relative memory operand of the form
   *
   *         <offset> + STACK + ARGS(%esp)
   *
   * so the <offset> part is presumably the distance from %esp as it was on
   * entry, before the pushes -- confirm against the caller.
   */
#define STACK	12
#define ARGS	 0

/* Tail shared by every argument operand. */
#define AXPY_ARG_FRAME	STACK + ARGS(%esp)

#define STACK_M		 4 + AXPY_ARG_FRAME
#define STACK_ALPHA	16 + AXPY_ARG_FRAME

/*
 * The arguments that follow alpha start 8 bytes after its offset when
 * DOUBLE is defined and 4 bytes after it otherwise.
 */
#ifdef DOUBLE
#define AXPY_AFTER_ALPHA	24
#else
#define AXPY_AFTER_ALPHA	20
#endif

#define STACK_X		AXPY_AFTER_ALPHA +  0 + AXPY_ARG_FRAME
#define STACK_INCX	AXPY_AFTER_ALPHA +  4 + AXPY_ARG_FRAME
#define STACK_Y		AXPY_AFTER_ALPHA +  8 + AXPY_ARG_FRAME
#define STACK_INCY	AXPY_AFTER_ALPHA + 12 + AXPY_ARG_FRAME

/* Register assignment used by the routine body. */
#define M	%ebx	/* element count                 */
#define X	%esi	/* current position in X         */
#define INCX	%ecx	/* stride applied to X           */
#define Y	%edi	/* current position in Y         */
#define INCY	%edx	/* stride applied to Y           */
  /*
   * AXPY routine body (32-bit x86, x87 FPU, AT&T syntax).
   *
   * PROLOGUE, PROFCODE, EMMS, ALIGN_3, EPILOGUE, SIZE, FLD, FADD and FST
   * are supplied by common.h and are not visible in this file.  The
   * comments below assume FLD is an x87 load, FADD an x87 add from memory
   * and FST a store that also pops -- TODO confirm against common.h.
   * Under that reading every element step performs
   *
   *         y[i] = alpha * x[i] + y[i]
   *
   * Control flow:
   *   M <= 0                          -> straight to the exit
   *   INCX*SIZE == SIZE and
   *   INCY*SIZE == SIZE               -> 8x unrolled loop, then 1x tail
   *   anything else                   -> 4x unrolled strided loop, then 1x tail
   *
   * Register use: M/X/Y/INCX/INCY as defined above, %eax as loop counter
   * and return value (zeroed before ret).  %edi, %esi and %ebx are pushed
   * on entry and popped on exit.  alpha is loaded once and read as %st(1)
   * by every fmul; ffreep releases it at the exit.
   */

/* Look-ahead, in elements, for the prefetcht0 hints on X. */
#define PRESIZE 33

/* One contiguous element at index idx: load X, scale by alpha, add Y, store Y. */
#define AXPY_UNIT_ELEM(idx)		\
	FLD	idx * SIZE(X);		\
	fmul	%st(1), %st;		\
	FADD	idx * SIZE(Y);		\
	FST	idx * SIZE(Y)

/* One strided element: same arithmetic at (X)/(Y), then advance both pointers. */
#define AXPY_STRIDED_ELEM		\
	FLD	(X);			\
	fmul	%st(1), %st;		\
	FADD	(Y);			\
	FST	(Y);			\
	addl	INCX, X;		\
	addl	INCY, Y

	PROLOGUE

	/* Save the three registers this routine overwrites (3 * 4 = STACK bytes). */
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
	EMMS
#endif

	/* alpha goes onto the x87 stack first and stays there until the exit. */
	FLD	STACK_ALPHA

	movl	STACK_M,    M
	movl	STACK_X,    X
	movl	STACK_INCX, INCX
	movl	STACK_Y,    Y
	movl	STACK_INCY, INCY

	/* Multiply both strides by SIZE so they can be added to the pointers. */
	leal	(, INCX, SIZE), INCX
	leal	(, INCY, SIZE), INCY

	/* Nothing to do for a non-positive count. */
	testl	M, M
	jle	.Laxpy_exit

	/* The contiguous path needs both scaled strides to equal SIZE. */
	cmpl	$SIZE, INCX
	jne	.Laxpy_strided
	cmpl	$SIZE, INCY
	jne	.Laxpy_strided

	/* ---- contiguous path: M >> 3 blocks of eight elements ---- */
	movl	M,  %eax
	sarl	$3, %eax
	jle	.Laxpy_unit_rem
	ALIGN_3

.Laxpy_unit8:
#ifdef HAS_PREFETCH
	prefetcht0	PRESIZE * SIZE(X)
#endif

	AXPY_UNIT_ELEM(0)
	AXPY_UNIT_ELEM(1)
	AXPY_UNIT_ELEM(2)
	AXPY_UNIT_ELEM(3)

#ifdef HAS_PREFETCH
	prefetcht0	(4 + PRESIZE) * SIZE(X)
#endif

	AXPY_UNIT_ELEM(4)
	AXPY_UNIT_ELEM(5)
	AXPY_UNIT_ELEM(6)
	AXPY_UNIT_ELEM(7)

#ifdef HAVE_3DNOW
	prefetchw	24 * SIZE(Y)
#endif

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	decl	%eax
	jg	.Laxpy_unit8
	ALIGN_3

	/* ---- contiguous path: remaining M & 7 elements, one at a time ---- */
.Laxpy_unit_rem:
	movl	M,  %eax
	andl	$7, %eax
	jle	.Laxpy_exit
	ALIGN_3

.Laxpy_unit1:
	AXPY_UNIT_ELEM(0)

	addl	$SIZE, X
	addl	$SIZE, Y
	decl	%eax
	jg	.Laxpy_unit1
	jmp	.Laxpy_exit
	ALIGN_3

	/* ---- strided path: M >> 2 blocks of four elements ---- */
.Laxpy_strided:
	movl	M,  %eax
	sarl	$2, %eax
	jle	.Laxpy_strided_rem
	ALIGN_3

.Laxpy_strided4:
	AXPY_STRIDED_ELEM
	AXPY_STRIDED_ELEM
	AXPY_STRIDED_ELEM
	AXPY_STRIDED_ELEM

	decl	%eax
	jg	.Laxpy_strided4
	ALIGN_3

	/* ---- strided path: remaining M & 3 elements, one at a time ---- */
.Laxpy_strided_rem:
	movl	M,  %eax
	andl	$3, %eax
	jle	.Laxpy_exit
	ALIGN_3

.Laxpy_strided1:
	AXPY_STRIDED_ELEM

	decl	%eax
	jg	.Laxpy_strided1
	ALIGN_3

	/* ---- common exit: drop alpha, return 0, restore saved registers ---- */
.Laxpy_exit:
	ffreep	%st(0)

	xorl	%eax, %eax

	popl	%ebx
	popl	%esi
	popl	%edi
	ret

	EPILOGUE