You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_beta.S 5.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 16
  41. #define ARGS 0
  42. #define M 4 + STACK + ARGS(%esp)
  43. #define N 8 + STACK + ARGS(%esp)
  44. #ifdef DOUBLE
  45. #define BETA 16 + STACK + ARGS(%esp)
  46. #define C 40 + STACK + ARGS(%esp)
  47. #define LDC 44 + STACK + ARGS(%esp)
  48. #else
  49. #define BETA 16 + STACK + ARGS(%esp)
  50. #define C 36 + STACK + ARGS(%esp)
  51. #define LDC 40 + STACK + ARGS(%esp)
  52. #endif
  53. PROLOGUE
  54. pushl %ebp
  55. pushl %edi
  56. pushl %esi
  57. pushl %ebx
  58. PROFCODE
  59. movl M, %esi # m
  60. movl N, %ecx # n
  61. FLD BETA # beta
  62. movl C, %edi # C
  63. movl LDC, %ebp # ldc
  64. testl %esi, %esi # if n <= 0 goto End
  65. jle .L999
  66. testl %ecx, %ecx # if m <= 0 goto End
  67. jle .L999
  68. ftst
  69. fnstsw %ax
  70. andb $68, %ah
  71. je .L201
  72. ALIGN_4
  73. .L101:
  74. movl %edi, %eax # c_offset = c
  75. leal (%edi, %ebp, SIZE), %edi # c += ldc
  76. movl %esi, %edx
  77. sarl $3, %edx
  78. jle .L103
  79. ALIGN_4
  80. .L102:
  81. #ifdef HAS_PREFETCH
  82. #ifndef ATHLON
  83. prefetchnta 12 * SIZE(%eax)
  84. prefetchnta 16 * SIZE(%eax)
  85. #else
  86. prefetchw 32 * SIZE(%eax)
  87. #endif
  88. #endif
  89. FSTU 0 * SIZE(%eax)
  90. FSTU 1 * SIZE(%eax)
  91. FSTU 2 * SIZE(%eax)
  92. FSTU 3 * SIZE(%eax)
  93. FSTU 4 * SIZE(%eax)
  94. FSTU 5 * SIZE(%eax)
  95. FSTU 6 * SIZE(%eax)
  96. FSTU 7 * SIZE(%eax)
  97. addl $8 * SIZE, %eax
  98. decl %edx
  99. jg .L102
  100. ALIGN_4
  101. .L103:
  102. movl %esi, %edx
  103. andl $7, %edx
  104. jle .L105
  105. ALIGN_4
  106. .L104:
  107. FSTU 0 * SIZE(%eax)
  108. addl $SIZE, %eax
  109. decl %edx
  110. jg .L104
  111. ALIGN_4
  112. .L105:
  113. decl %ecx
  114. jg .L101
  115. jmp .L999
  116. ALIGN_3
  117. .L201:
  118. movl %edi, %eax # c_offset = c
  119. leal (%edi, %ebp, SIZE), %edi # c += ldc
  120. movl %esi, %edx
  121. sarl $3, %edx
  122. jle .L203
  123. ALIGN_4
  124. .L202:
  125. #ifdef HAS_PREFETCH
  126. #ifndef ATHLON
  127. prefetchnta 16 * SIZE(%eax)
  128. prefetchnta 20 * SIZE(%eax)
  129. #else
  130. prefetchw 32 * SIZE(%eax)
  131. #endif
  132. #endif
  133. FLD 0 * SIZE(%eax)
  134. fmul %st(1),%st
  135. FST 0 * SIZE(%eax)
  136. FLD 1 * SIZE(%eax)
  137. fmul %st(1),%st
  138. FST 1 * SIZE(%eax)
  139. FLD 2 * SIZE(%eax)
  140. fmul %st(1),%st
  141. FST 2 * SIZE(%eax)
  142. FLD 3 * SIZE(%eax)
  143. fmul %st(1),%st
  144. FST 3 * SIZE(%eax)
  145. FLD 4 * SIZE(%eax)
  146. fmul %st(1),%st
  147. FST 4 * SIZE(%eax)
  148. FLD 5 * SIZE(%eax)
  149. fmul %st(1),%st
  150. FST 5 * SIZE(%eax)
  151. FLD 6 * SIZE(%eax)
  152. fmul %st(1),%st
  153. FST 6 * SIZE(%eax)
  154. FLD 7 * SIZE(%eax)
  155. fmul %st(1),%st
  156. FST 7 * SIZE(%eax)
  157. addl $8 * SIZE, %eax
  158. decl %edx
  159. jg .L202
  160. ALIGN_4
  161. .L203:
  162. movl %esi, %edx
  163. andl $7, %edx
  164. jle .L205
  165. ALIGN_4
  166. .L204:
  167. FLD 0 * SIZE(%eax)
  168. fmul %st(1), %st
  169. FST 0 * SIZE(%eax)
  170. addl $SIZE, %eax
  171. decl %edx
  172. jg .L204
  173. ALIGN_4
  174. .L205:
  175. decl %ecx
  176. jg .L201
  177. ALIGN_3
  178. .L999:
  179. #ifndef C_SUN
  180. ffreep %st(0)
  181. #else
  182. .byte 0xdf
  183. .byte 0xc0
  184. #endif
  185. xorl %eax,%eax
  186. popl %ebx
  187. popl %esi
  188. popl %edi
  189. popl %ebp
  190. ret
  191. EPILOGUE