You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zgemm_beta.S 6.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Register and stack-slot names used by this routine.
   * ARG1..ARG5 come from common.h.
   */

  /* Names that are the same for both ABIs. */
  #define M	ARG1		/* complex elements handled per column	*/
  #define N	ARG2		/* number of columns			*/
  #define C	ARG3		/* start of the current column		*/
  #define LDC	ARG4		/* added to C after each column		*/
  #define I	%rax		/* inner loop counter			*/

  #ifdef WINDOWS_ABI

  /* Bytes the entry code subtracts from %rsp before spilling xmm6-xmm15. */
  #define STACKSIZE	256

  #define C1	%r10		/* running pointer inside a column	*/

  /*
   * These slots are read after %rsp has been lowered by STACKSIZE,
   * hence the STACKSIZE term in every offset.
   */
  #define STACK_ALPHA_I	40 + STACKSIZE(%rsp)	/* loaded into %xmm1	*/
  #define STACK_C		80 + STACKSIZE(%rsp)	/* loaded into C	*/
  #define STACK_LDC	88 + STACKSIZE(%rsp)	/* loaded into LDC	*/

  #else

  #define C1	ARG5		/* running pointer inside a column	*/

  /* Read at function entry, before %rsp is modified. */
  #define STACK_C		16(%rsp)		/* loaded into C	*/
  #define STACK_LDC	24(%rsp)		/* loaded into LDC	*/

  #endif
  /*
   * Scale a complex matrix C in place by the complex factor held in
   * %xmm0 (real part) and %xmm1 (imaginary part).
   *
   *   M       complex elements processed per column (inner loop count)
   *   N       number of columns (outer loop count)
   *   C       start of the current column, loaded from STACK_C
   *   LDC     column stride, loaded from STACK_LDC and shifted left by
   *           ZBASE_SHIFT (from common.h) before use
   *   C1      running pointer inside the current column
   *   I       inner loop counter
   *
   * Element offsets are written in units of SIZE (from common.h); one
   * complex element is the pair at 0 * SIZE (re) and 1 * SIZE (im).
   *
   * Two paths:
   *   .L53    both parts of the factor compare equal to zero: every
   *           element is overwritten and C is never read
   *   .L71    anything else, including NaN: each (re, im) becomes
   *           (re * xmm0 - im * xmm1,  re * xmm1 + im * xmm0)
   *
   * Returns without touching C when M <= 0 or N <= 0.
   * MOVSD / MULSD / SUBSD / ADDPD are macros supplied by common.h.
   */
  	PROLOGUE
  	PROFCODE

  #ifdef WINDOWS_ABI
  	subq	$STACKSIZE, %rsp

  	/* xmm6-xmm15 are callee-saved in the Microsoft x64 ABI. */
  	movups	%xmm6,    0(%rsp)
  	movups	%xmm7,   16(%rsp)
  	movups	%xmm8,   32(%rsp)
  	movups	%xmm9,   48(%rsp)
  	movups	%xmm10,  64(%rsp)
  	movups	%xmm11,  80(%rsp)
  	movups	%xmm12,  96(%rsp)
  	movups	%xmm13, 112(%rsp)
  	movups	%xmm14, 128(%rsp)
  	movups	%xmm15, 144(%rsp)

  	/* Bring the factor into the registers the common code expects. */
  	movaps	%xmm3, %xmm0			/* real part		*/
  	movsd	STACK_ALPHA_I, %xmm1		/* imaginary part	*/
  #endif

  	pxor	%xmm15, %xmm15			/* xmm15 = 0 for the compares */

  	movq	STACK_C,   C
  	movq	STACK_LDC, LDC

  	/* Nothing to do for an empty matrix. */
  	testq	M, M
  	jle	.L999
  	testq	N, N
  	jle	.L999

  	salq	$ZBASE_SHIFT, LDC

  	/*
  	 * Take the zero-fill path only when both parts are really zero.
  	 * An unordered compare (NaN operand) sets ZF as well as PF, so
  	 * jne alone would mistake NaN for zero; jp sends it to the
  	 * multiply path so the NaN propagates into C.
  	 */
  #ifdef DOUBLE
  	ucomisd	%xmm15, %xmm0
  	jne	.L71
  	jp	.L71
  	ucomisd	%xmm15, %xmm1
  	jne	.L71
  	jp	.L71
  #else
  	ucomiss	%xmm15, %xmm0
  	jne	.L71
  	jp	.L71
  	ucomiss	%xmm15, %xmm1
  	jne	.L71
  	jp	.L71
  #endif
  	ALIGN_2

  /* ---- factor == 0: overwrite one column per outer iteration ---------- */
  .L53:
  	movq	C, C1				/* C1 = start of this column	*/
  	addq	LDC, C				/* C  = start of next column	*/

  	movq	M, I
  	sarq	$2, I				/* I = M / 4			*/
  	/*
  	 * M > 0 here, so I >= 0 and only ZF matters.  OF is undefined
  	 * after a multi-bit shift, so do not use a signed condition.
  	 */
  	je	.L56
  	ALIGN_2

  .L57:						/* four complex elements per pass */
  #ifdef OPTERON
  	prefetchw	64 * SIZE(C1)
  #endif

  	/* xmm0 is the real part of the factor, which compared equal to 0. */
  	MOVSD	%xmm0,  0 * SIZE(C1)
  	MOVSD	%xmm0,  1 * SIZE(C1)
  	MOVSD	%xmm0,  2 * SIZE(C1)
  	MOVSD	%xmm0,  3 * SIZE(C1)
  	MOVSD	%xmm0,  4 * SIZE(C1)
  	MOVSD	%xmm0,  5 * SIZE(C1)
  	MOVSD	%xmm0,  6 * SIZE(C1)
  	MOVSD	%xmm0,  7 * SIZE(C1)
  	addq	$8 * SIZE, C1
  	decq	I
  	jg	.L57
  	ALIGN_2

  .L56:
  	movq	M, I
  	andq	$3, I				/* I = M % 4			*/
  	je	.L62
  	ALIGN_2

  .L63:						/* one complex element per pass	*/
  	MOVSD	%xmm0,  0 * SIZE(C1)
  	MOVSD	%xmm0,  1 * SIZE(C1)
  	addq	$2 * SIZE, C1
  	decq	I
  	jg	.L63
  	ALIGN_2

  .L62:
  	decq	N				/* next column			*/
  	jg	.L53
  	jmp	.L999
  	ALIGN_3

  /* ---- factor != 0: complex multiply, one column per outer iteration -- */
  .L71:
  	movq	C, C1				/* C1 = start of this column	*/
  	addq	LDC, C				/* C  = start of next column	*/

  	movq	M, I
  	sarq	$1, I				/* I = M / 2			*/
  	je	.L84				/* M > 0, so I >= 0		*/
  	ALIGN_3

  .L85:						/* two complex elements per pass */
  #ifdef OPTERON
  	prefetchw	16 * SIZE(C1)
  #endif

  	/* Each part is loaded twice: once per product it takes part in. */
  	MOVSD	0 * SIZE(C1), %xmm2		/* re0 */
  	MOVSD	1 * SIZE(C1), %xmm3		/* im0 */
  	MOVSD	0 * SIZE(C1), %xmm4		/* re0 */
  	MOVSD	1 * SIZE(C1), %xmm5		/* im0 */

  	MOVSD	2 * SIZE(C1), %xmm6		/* re1 */
  	MOVSD	3 * SIZE(C1), %xmm7		/* im1 */
  	MOVSD	2 * SIZE(C1), %xmm8		/* re1 */
  	MOVSD	3 * SIZE(C1), %xmm9		/* im1 */

  	MULSD	%xmm0, %xmm2			/* re0 * f_re */
  	MULSD	%xmm1, %xmm3			/* im0 * f_im */
  	MULSD	%xmm1, %xmm4			/* re0 * f_im */
  	MULSD	%xmm0, %xmm5			/* im0 * f_re */

  	MULSD	%xmm0, %xmm6			/* re1 * f_re */
  	MULSD	%xmm1, %xmm7			/* im1 * f_im */
  	MULSD	%xmm1, %xmm8			/* re1 * f_im */
  	MULSD	%xmm0, %xmm9			/* im1 * f_re */

  	/* ADDPD is a packed add; only what MOVSD stores below is kept. */
  	SUBSD	%xmm3, %xmm2			/* new re0 */
  	ADDPD	%xmm5, %xmm4			/* new im0 */
  	SUBSD	%xmm7, %xmm6			/* new re1 */
  	ADDPD	%xmm9, %xmm8			/* new im1 */

  	MOVSD	%xmm2, 0 * SIZE(C1)
  	MOVSD	%xmm4, 1 * SIZE(C1)
  	MOVSD	%xmm6, 2 * SIZE(C1)
  	MOVSD	%xmm8, 3 * SIZE(C1)

  	addq	$4 * SIZE, C1
  	decq	I
  	jg	.L85
  	ALIGN_3

  .L84:
  	testq	$1, M				/* odd M: one element left	*/
  	je	.L74
  	ALIGN_3

  .L75:
  	prefetchnta	80 * SIZE(C1)

  	MOVSD	0 * SIZE(C1), %xmm2		/* re */
  	MULSD	%xmm0, %xmm2			/* re * f_re */
  	MOVSD	1 * SIZE(C1), %xmm3		/* im */
  	MULSD	%xmm1, %xmm3			/* im * f_im */
  	MOVSD	0 * SIZE(C1), %xmm4		/* re */
  	MULSD	%xmm1, %xmm4			/* re * f_im */
  	MOVSD	1 * SIZE(C1), %xmm5		/* im */
  	MULSD	%xmm0, %xmm5			/* im * f_re */

  	SUBSD	%xmm3, %xmm2			/* new re */
  	ADDPD	%xmm5, %xmm4			/* new im */

  	MOVSD	%xmm2, 0 * SIZE(C1)
  	MOVSD	%xmm4, 1 * SIZE(C1)
  	ALIGN_2

  .L74:
  	decq	N				/* next column			*/
  	jg	.L71
  	ALIGN_2

  /* ---- common exit ---------------------------------------------------- */
  .L999:
  #ifdef WINDOWS_ABI
  	/* Restore the registers spilled on entry and release the frame. */
  	movups	  0(%rsp), %xmm6
  	movups	 16(%rsp), %xmm7
  	movups	 32(%rsp), %xmm8
  	movups	 48(%rsp), %xmm9
  	movups	 64(%rsp), %xmm10
  	movups	 80(%rsp), %xmm11
  	movups	 96(%rsp), %xmm12
  	movups	112(%rsp), %xmm13
  	movups	128(%rsp), %xmm14
  	movups	144(%rsp), %xmm15

  	addq	$STACKSIZE, %rsp
  #endif

  	ret

  	EPILOGUE