You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

gemm_tcopy_2_sse.S 5.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Prefetch tuning.
   *
   * RPREFETCHSIZE is the read-ahead distance, in elements, used with
   * PREFETCH on the source cursors.  WPREFETCHSIZE and PREFETCHW are
   * defined here but are not referenced by the code visible in this file.
   */
#define RPREFETCHSIZE   12
#define WPREFETCHSIZE   (RPREFETCHSIZE * 2)

#define PREFETCH        prefetcht0
#define PREFETCHW       prefetcht2

  /*
   * Stack frame, relative to %esp once the prologue has finished
   * (subl $ARGS, %esp followed by four register pushes):
   *
   *    0 .. STACK-1           saved %ebx, %esi, %edi, %ebp (4 x 4 bytes)
   *    STACK + 0              J         local: remaining line-pair count
   *    STACK + 4              BOFFSET2  local: odd-column output cursor
   *    STACK + ARGS           return address
   *    STACK + ARGS + 4 ...   incoming arguments M, N, A, LDA, B
   *
   * Each macro expands to a displacement(%esp) memory operand, so it is
   * only valid while %esp holds the post-prologue value.
   */
#define STACK           16
#define ARGS            8

#define J               0 + STACK(%esp)
#define BOFFSET2        4 + STACK(%esp)

#define M               4 + STACK + ARGS(%esp)
#define N               8 + STACK + ARGS(%esp)
#define A               12 + STACK + ARGS(%esp)
#define LDA             16 + STACK + ARGS(%esp)
#define B               20 + STACK + ARGS(%esp)
  /*
   * Panel-packing copy, unroll 2 (32-bit x86, AT&T syntax, SSE2).
   *
   * Stack arguments (see the M / N / A / LDA / B macros):
   *   M    number of source "lines"
   *   N    number of contiguous elements in each line
   *   A    first source element; line i starts at A + i * LDA elements
   *   LDA  distance between consecutive lines, in elements
   *   B    destination buffer
   *
   * Output layout (offsets in elements):
   *   - For line pair p and element pair q (q < N / 2), the 2 x 2 tile
   *       line(2p)[2q], line(2p)[2q+1], line(2p+1)[2q], line(2p+1)[2q+1]
   *     is stored at B + q * (2 * M) + 4 * p.
   *   - If M is odd, the last line contributes line[2q], line[2q+1] at
   *     B + q * (2 * M) + 4 * (M / 2).
   *   - If N is odd, the final element of every line is appended
   *     sequentially starting at B + M * (N & ~1): two elements per line
   *     pair, then one element for the odd line.
   *
   * Register roles:
   *   %ebp  start of the next unread line (pair) in A
   *   %edi  destination of the next line pair inside the first panel
   *   %esi  byte distance between consecutive panels (M << 4 or M << 3)
   *   %edx  read cursor, first line of the pair
   *   %ecx  read cursor, second line of the pair
   *   %eax  write cursor / scratch
   *   %ebx  inner loop counter / scratch
   *
   * Each movsd + movhps pair builds one 16-byte register from two 8-byte
   * loads, so the source needs no 16-byte alignment.  Every 16-byte store
   * uses movaps, which faults unless the destination address is 16-byte
   * aligned.
   * NOTE(review): the 8-byte load granularity matches SIZE == 8; SIZE comes
   * from common.h, so confirm this file is only built with DOUBLE defined.
   *
   * %eax is not set to any particular value before returning.
   */
        PROLOGUE

        subl    $ARGS, %esp                     /* reserve the J and BOFFSET2 slots */
        pushl   %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx

        PROFCODE

        movl    A, %ebp                         /* aoffset = a */
        movl    B, %edi                         /* boffset = b */

        movl    M, %ebx
        movl    N, %eax
        andl    $-2, %eax
        imull   %ebx, %eax                      /* m * (n & ~1) */
        leal    (%edi,%eax,SIZE), %eax          /* boffset2 = b + m * (n & ~1) */
        movl    %eax, BOFFSET2

        movl    M, %esi                         /* panel stride in bytes */
#ifdef DOUBLE
        sall    $4, %esi
#else
        sall    $3, %esi
#endif

        sarl    $1, %ebx                        /* j = m >> 1 (1-bit shift: flags fully defined) */
        movl    %ebx, J                         /* movl does not modify the flags set by sarl */
        jle     .Lodd_line                      /* no complete line pair */
        ALIGN_4

/* ---- one iteration per pair of lines ---------------------------------- */
.Lpair_loop:
        movl    %ebp, %edx                      /* aoffset1 = aoffset */
        movl    LDA, %eax
        movl    N, %ebx

        leal    (%ebp,%eax,SIZE), %ecx          /* aoffset2 = aoffset + lda */
        leal    (%ecx,%eax,SIZE), %ebp          /* aoffset += 2 * lda */

        movl    %edi, %eax                      /* boffset1 = boffset */
        addl    $4 * SIZE, %edi                 /* boffset += 4 */

        sarl    $2, %ebx                        /* i = n >> 2 */
        testl   %ebx, %ebx                      /* OF is undefined after a shift by 2; re-derive flags for jle */
        jle     .Lpair_two
        ALIGN_4

/* four elements from each line -> two 2 x 2 tiles in consecutive panels */
.Lpair_quad:
        PREFETCH RPREFETCHSIZE * SIZE(%edx)

        movsd   0 * SIZE(%edx), %xmm0
        movhps  1 * SIZE(%edx), %xmm0
        movsd   0 * SIZE(%ecx), %xmm2
        movhps  1 * SIZE(%ecx), %xmm2

        PREFETCH RPREFETCHSIZE * SIZE(%ecx)

        movsd   2 * SIZE(%edx), %xmm4
        movhps  3 * SIZE(%edx), %xmm4
        movsd   2 * SIZE(%ecx), %xmm6
        movhps  3 * SIZE(%ecx), %xmm6

        movaps  %xmm0, 0 * SIZE(%eax)
        movaps  %xmm2, 2 * SIZE(%eax)
        addl    %esi, %eax                      /* next panel */
        movaps  %xmm4, 0 * SIZE(%eax)
        movaps  %xmm6, 2 * SIZE(%eax)

        addl    $4 * SIZE, %ecx
        addl    $4 * SIZE, %edx
        addl    %esi, %eax                      /* next panel */
        decl    %ebx
        jne     .Lpair_quad
        ALIGN_4

/* n & 2: one more 2 x 2 tile */
.Lpair_two:
        movl    N, %ebx
        testl   $2, %ebx
        je      .Lpair_one

        PREFETCH RPREFETCHSIZE * SIZE(%edx)
        movsd   0 * SIZE(%edx), %xmm0
        movhps  1 * SIZE(%edx), %xmm0

        PREFETCH RPREFETCHSIZE * SIZE(%ecx)
        movsd   0 * SIZE(%ecx), %xmm2
        movhps  1 * SIZE(%ecx), %xmm2

        movaps  %xmm0, 0 * SIZE(%eax)
        movaps  %xmm2, 2 * SIZE(%eax)

        addl    $2 * SIZE, %ecx
        addl    $2 * SIZE, %edx
        ALIGN_4

/* n & 1: last element of both lines goes to the boffset2 region */
.Lpair_one:
        movl    N, %ebx
        testl   $1, %ebx
        je      .Lpair_next

        movl    BOFFSET2, %eax

        movsd   0 * SIZE(%edx), %xmm0           /* low half  = last element of line 1 */
        movhps  0 * SIZE(%ecx), %xmm0           /* high half = last element of line 2 */
        movaps  %xmm0, 0 * SIZE(%eax)

        addl    $2 * SIZE, %eax
        movl    %eax, BOFFSET2                  /* boffset2 += 2 */
        ALIGN_4

.Lpair_next:
        decl    J
        jg      .Lpair_loop
        ALIGN_4

/* ---- trailing single line, only when m is odd -------------------------- */
.Lodd_line:
        movl    M, %eax
        movl    N, %ebx

        testb   $1, %al
        je      .Lreturn

        sarl    $2, %ebx                        /* i = n >> 2 */
        testl   %ebx, %ebx                      /* OF is undefined after a shift by 2; re-derive flags for jle */
        jle     .Lodd_two
        ALIGN_4

/* four elements -> two element pairs in consecutive panels */
.Lodd_quad:
        movsd   0 * SIZE(%ebp), %xmm0
        movhps  1 * SIZE(%ebp), %xmm0
        movsd   2 * SIZE(%ebp), %xmm2
        movhps  3 * SIZE(%ebp), %xmm2

        movaps  %xmm0, 0 * SIZE(%edi)
        addl    %esi, %edi                      /* next panel */
        movaps  %xmm2, 0 * SIZE(%edi)
        addl    %esi, %edi                      /* next panel */

        addl    $4 * SIZE, %ebp
        decl    %ebx
        jg      .Lodd_quad
        ALIGN_4

/* n & 2: one more element pair */
.Lodd_two:
        movl    N, %ebx
        testl   $2, %ebx
        je      .Lodd_one

        movsd   0 * SIZE(%ebp), %xmm0
        movhps  1 * SIZE(%ebp), %xmm0
        movaps  %xmm0, 0 * SIZE(%edi)

        addl    $2 * SIZE, %ebp
        ALIGN_4

/* n & 1: the final element is stored at boffset2 with an 8-byte movsd */
.Lodd_one:
        movl    N, %ebx
        testl   $1, %ebx
        je      .Lreturn

        movl    BOFFSET2, %eax

        movsd   0 * SIZE(%ebp), %xmm0
        movsd   %xmm0, 0 * SIZE(%eax)
        ALIGN_4

.Lreturn:
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        addl    $ARGS, %esp
        ret

        EPILOGUE