You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

gemm_tcopy_2.S 6.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined ahead of the include; presumably it selects the   */
  /* assembler-side definitions inside common.h (confirm against common.h). */
#define ASSEMBLER
#include "common.h"

/* Frame geometry, in bytes, as seen from %esp once the prologue has run:  */
/*   STACK : space taken by the four 32-bit registers the prologue pushes. */
/*   ARGS  : space the prologue reserves with "subl $ARGS, %esp"; it holds */
/*           the two local slots J and BOFFSET2.                           */
#define ARGS      8
#define STACK    16

/* Incoming stack arguments.  They sit above the saved registers, the      */
/* local slots and the 4-byte return address, hence "4 + STACK + ARGS"     */
/* for the first one.                                                      */
#define M         4 + STACK + ARGS(%esp)
#define N         8 + STACK + ARGS(%esp)
#define A        12 + STACK + ARGS(%esp)
#define LDA      16 + STACK + ARGS(%esp)
#define B        20 + STACK + ARGS(%esp)

/* Local slots, located directly above the saved registers.                */
#define J         0 + STACK(%esp)
#define BOFFSET2  4 + STACK(%esp)
  /* Copy routine bracketed by the PROLOGUE / EPILOGUE macros of common.h.  */
  /*                                                                        */
  /* Stack arguments, in order: m, n, a, lda, b.  Elements are SIZE bytes   */
  /* wide (SIZE comes from common.h); lda is scaled by SIZE when used.      */
  /*                                                                        */
  /* Main pass: for each of the m >> 1 pairs of source lines (aoffset1 and  */
  /* aoffset2 = aoffset1 + lda) the n elements are walked four at a time.   */
  /* Every step writes two 4-element groups                                 */
  /*     a1[0] a1[1] a2[0] a2[1]    and    a1[2] a1[3] a2[2] a2[3]          */
  /* and the destination pointer is advanced by %esi bytes after each one.  */
  /* A remainder of two elements writes one more such group.  A remainder   */
  /* of one element is written as the pair (a1[0], a2[0]) through BOFFSET2, */
  /* which starts at b + m * (n & ~1) and grows by two elements per use.    */
  /*                                                                        */
  /* Tail pass: when m is odd, the last source line is copied in 2-element  */
  /* groups with the same %esi spacing; a final odd element is written      */
  /* through BOFFSET2.                                                      */
  /*                                                                        */
  /* Register roles:                                                        */
  /*   %ebp  aoffset  (start of the next source line pair)                  */
  /*   %edi  boffset  (start of the next destination group column)          */
  /*   %esi  destination spacing in bytes: m << 4 with DOUBLE, else m << 3  */
  /*   %edx  aoffset1     %ecx  aoffset2     %eax  boffset1                 */
  /*   %ebx  scratch / inner loop counter                                   */
  /*                                                                        */
  /* The x87 paths issue their FLDs in the reverse of the store order;      */
  /* this presumably relies on FST popping the x87 stack (FLD and FST are   */
  /* macros from common.h -- confirm there).                                */

        PROLOGUE

        subl    $ARGS, %esp                     /* room for J and BOFFSET2 */
        pushl   %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx

        PROFCODE

        EMMS

        movl    M, %ebx
        movl    N, %eax
        movl    A, %ebp                         /* aoffset = a */
        movl    B, %edi                         /* boffset = b */

        andl    $-2, %eax                       /* n & ~1 */
        imull   %ebx, %eax                      /* m * (n & ~1) */
        leal    (%edi, %eax, SIZE), %eax        /* boffset2 = b + m * (n & ~1) */
        movl    %eax, BOFFSET2

        movl    M, %esi                         /* destination spacing in bytes */
#ifndef DOUBLE
        sall    $3, %esi                        /* m * 8 */
#else
        sall    $4, %esi                        /* m * 16 */
#endif

        sarl    $1, %ebx                        /* j = m >> 1 */
        movl    %ebx, J                         /* movl keeps the flags set by sarl */
        jle     .Lsingle_line                   /* j <= 0: no line pair to copy */
        ALIGN_4

/* ---- one pair of source lines per iteration ---------------------------- */
.Lpair_loop:
        movl    LDA, %eax
        movl    N, %ebx
        movl    %ebp, %edx                      /* aoffset1 = aoffset */
        leal    (%ebp, %eax, SIZE), %ecx        /* aoffset2 = aoffset + lda */
        leal    (%ecx, %eax, SIZE), %ebp        /* aoffset += 2 * lda */

        movl    %edi, %eax                      /* boffset1 = boffset */
        addl    $4 * SIZE, %edi                 /* boffset += 4 */

        sarl    $2, %ebx                        /* i = n >> 2 */
        jle     .Lpair_rem2
        ALIGN_4

/* Four elements from each of the two lines per iteration. */
.Lpair_quad:
#ifndef HAVE_MMX
        FLD     1 * SIZE(%ecx)
        FLD     0 * SIZE(%ecx)
        FLD     1 * SIZE(%edx)
        FLD     0 * SIZE(%edx)

        FST     0 * SIZE(%eax)
        FST     1 * SIZE(%eax)
        FST     2 * SIZE(%eax)
        FST     3 * SIZE(%eax)

        addl    %esi, %eax                      /* second group of this step */

        FLD     3 * SIZE(%ecx)
        FLD     2 * SIZE(%ecx)
        FLD     3 * SIZE(%edx)
        FLD     2 * SIZE(%edx)

        FST     0 * SIZE(%eax)
        FST     1 * SIZE(%eax)
        FST     2 * SIZE(%eax)
        FST     3 * SIZE(%eax)
#else
        /* All eight loads are issued before the first store. */
        MMXLOAD 0 * SIZE(%edx), %mm0
        MMXLOAD 1 * SIZE(%edx), %mm1
        MMXLOAD 2 * SIZE(%edx), %mm4
        MMXLOAD 3 * SIZE(%edx), %mm5
        MMXLOAD 0 * SIZE(%ecx), %mm2
        MMXLOAD 1 * SIZE(%ecx), %mm3
        MMXLOAD 2 * SIZE(%ecx), %mm6
        MMXLOAD 3 * SIZE(%ecx), %mm7

        MMXSTORE %mm0, 0 * SIZE(%eax)
        MMXSTORE %mm1, 1 * SIZE(%eax)
        MMXSTORE %mm2, 2 * SIZE(%eax)
        MMXSTORE %mm3, 3 * SIZE(%eax)

        addl    %esi, %eax                      /* second group of this step */

        MMXSTORE %mm4, 0 * SIZE(%eax)
        MMXSTORE %mm5, 1 * SIZE(%eax)
        MMXSTORE %mm6, 2 * SIZE(%eax)
        MMXSTORE %mm7, 3 * SIZE(%eax)
#endif

        addl    %esi, %eax                      /* next destination group */
        addl    $4 * SIZE, %edx
        addl    $4 * SIZE, %ecx
        decl    %ebx
        jne     .Lpair_quad
        ALIGN_4

/* Remainder of two elements per line: one more 4-element group. */
.Lpair_rem2:
        movl    N, %ebx
        testl   $2, %ebx
        je      .Lpair_rem1

#ifndef HAVE_MMX
        FLD     1 * SIZE(%ecx)
        FLD     0 * SIZE(%ecx)
        FLD     1 * SIZE(%edx)
        FLD     0 * SIZE(%edx)

        FST     0 * SIZE(%eax)
        FST     1 * SIZE(%eax)
        FST     2 * SIZE(%eax)
        FST     3 * SIZE(%eax)
#else
        MMXLOAD 0 * SIZE(%edx), %mm0
        MMXLOAD 1 * SIZE(%edx), %mm1
        MMXLOAD 0 * SIZE(%ecx), %mm2
        MMXLOAD 1 * SIZE(%ecx), %mm3

        MMXSTORE %mm0, 0 * SIZE(%eax)
        MMXSTORE %mm1, 1 * SIZE(%eax)
        MMXSTORE %mm2, 2 * SIZE(%eax)
        MMXSTORE %mm3, 3 * SIZE(%eax)
#endif

        addl    $2 * SIZE, %edx
        addl    $2 * SIZE, %ecx
        ALIGN_4

/* Remainder of one element per line: the pair goes out through BOFFSET2. */
.Lpair_rem1:
        movl    N, %ebx
        testl   $1, %ebx
        je      .Lpair_next

        movl    BOFFSET2, %eax

#ifndef HAVE_MMX
        FLD     0 * SIZE(%edx)
        FST     0 * SIZE(%eax)
        FLD     0 * SIZE(%ecx)
        FST     1 * SIZE(%eax)
#else
        MMXLOAD 0 * SIZE(%edx), %mm0
        MMXLOAD 0 * SIZE(%ecx), %mm1
        MMXSTORE %mm0, 0 * SIZE(%eax)
        MMXSTORE %mm1, 1 * SIZE(%eax)
#endif

        addl    $2 * SIZE, %eax
        movl    %eax, BOFFSET2                  /* boffset2 += 2 */
        ALIGN_4

.Lpair_next:
        decl    J
        jg      .Lpair_loop
        ALIGN_4

/* ---- last source line, only when m is odd ------------------------------ */
.Lsingle_line:
        movl    M, %eax
        movl    N, %ebx

        testb   $1, %al                         /* m even: nothing left to do */
        je      .Ldone

        sarl    $2, %ebx                        /* i = n >> 2 */
        jle     .Lsingle_rem2
        ALIGN_4

/* Four elements per iteration, written as two 2-element groups. */
.Lsingle_quad:
#ifndef HAVE_MMX
        FLD     0 * SIZE(%ebp)
        FST     0 * SIZE(%edi)
        FLD     1 * SIZE(%ebp)
        FST     1 * SIZE(%edi)

        addl    %esi, %edi                      /* second group of this step */

        FLD     2 * SIZE(%ebp)
        FST     0 * SIZE(%edi)
        FLD     3 * SIZE(%ebp)
        FST     1 * SIZE(%edi)
#else
        MMXLOAD 0 * SIZE(%ebp), %mm0
        MMXLOAD 1 * SIZE(%ebp), %mm1
        MMXLOAD 2 * SIZE(%ebp), %mm2
        MMXLOAD 3 * SIZE(%ebp), %mm3

        MMXSTORE %mm0, 0 * SIZE(%edi)
        MMXSTORE %mm1, 1 * SIZE(%edi)

        addl    %esi, %edi                      /* second group of this step */

        MMXSTORE %mm2, 0 * SIZE(%edi)
        MMXSTORE %mm3, 1 * SIZE(%edi)
#endif

        addl    $4 * SIZE, %ebp
        addl    %esi, %edi                      /* next destination group */
        decl    %ebx
        jg      .Lsingle_quad
        ALIGN_4

/* Remainder of two elements: one more 2-element group. */
.Lsingle_rem2:
        movl    N, %ebx
        testl   $2, %ebx
        je      .Lsingle_rem1

#ifndef HAVE_MMX
        FLD     1 * SIZE(%ebp)
        FLD     0 * SIZE(%ebp)
        FST     0 * SIZE(%edi)
        FST     1 * SIZE(%edi)
#else
        MMXLOAD 0 * SIZE(%ebp), %mm0
        MMXSTORE %mm0, 0 * SIZE(%edi)
        MMXLOAD 1 * SIZE(%ebp), %mm1
        MMXSTORE %mm1, 1 * SIZE(%edi)
#endif

        addl    $2 * SIZE, %ebp
        ALIGN_4

/* Remainder of one element: written through BOFFSET2. */
.Lsingle_rem1:
        movl    N, %ebx
        testl   $1, %ebx
        je      .Ldone

        movl    BOFFSET2, %eax

#ifndef HAVE_MMX
        FLD     (%ebp)
        FST     (%eax)
#else
        MMXLOAD 0 * SIZE(%ebp), %mm0
        MMXSTORE %mm0, 0 * SIZE(%eax)
#endif
        ALIGN_4

/* ---- common exit: undo the prologue in reverse order ------------------- */
.Ldone:
        EMMS

        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        addl    $ARGS, %esp
        ret

        EPILOGUE