You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_tcopy_1.S 5.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Argument and scratch-register aliases for the copy routine below.
   *
   *   M    count for the inner loop (handled two elements at a time)
   *   N    count for the outer loop (copied into J)
   *   A    source pointer
   *   LDA  source stride; shifted left by ZBASE_SHIFT before use
   *   B    destination pointer
   *   I,J  inner / outer loop counters
   *   AO1  source cursor for the current outer pass
   *   AO2  second source cursor; not referenced by the code in this file
   *
   * ARG1..ARG5 are supplied by common.h; the register named next to
   * each one is what it is expected to resolve to under that ABI.
   */

/* The first four arguments arrive in registers under both ABIs. */
#define M	ARG1	/* rdi (System V) / rcx (Windows) */
#define N	ARG2	/* rsi (System V) / rdx (Windows) */
#define A	ARG3	/* rdx (System V) / r8  (Windows) */
#define LDA	ARG4	/* rcx (System V) / r9  (Windows) */

#ifdef WINDOWS_ABI

/* Bytes reserved below the saved registers for the xmm save area. */
#define STACKSIZE	256

/*
 * The fifth argument is passed on the stack.  Relative to %rsp after
 * the prologue it sits above: the STACKSIZE scratch area, three pushed
 * registers (24 bytes), and the return address plus 32-byte shadow
 * space (40 bytes).
 */
#define OLD_B		40 + 24 + STACKSIZE(%rsp)
#define B	%r10

#define I	%r11
#define J	%r12

#define AO1	%r13
#define AO2	%r14

#else

#define B	ARG5	/* r8 */

#define I	%r9
#define J	%r10

#define AO1	%r11
#define AO2	%r12

#endif

/*
 * Prefetch distances, multiplied by SIZE at the point of use: R for
 * reads from the source, W for writes to the destination.  Only the
 * DOUBLE code path issues prefetches.
 */
#define RPREFETCHSIZE	4
#define WPREFETCHSIZE	4
  /*
   * Copy routine: gathers complex elements from A into the contiguous
   * buffer B.  In element terms (one element = 2 * SIZE bytes):
   *
   *     for (j = 0; j < N; j++)
   *         for (i = 0; i < M; i++)
   *             *B++ = A[j + i * LDA];
   *
   * The inner loop is unrolled by two; an odd M is finished by a
   * single-element tail.  Nothing is copied when N <= 0.
   *
   * Inputs : M, N, A, LDA, B (see the alias definitions above)
   * Writes : xmm0 (and xmm1 in the DOUBLE build), I, J, AO1, A, B, LDA,
   *          flags
   * Stack  : saves %r12 (plus %r13/%r14 and xmm6-xmm15 on WINDOWS_ABI)
   *
   * Alignment: the DOUBLE path stores with movapd and therefore relies
   * on B being 16-byte aligned on entry.  The single-precision path
   * uses movups, because its odd-M tail advances B by only 2 * SIZE
   * (8 bytes, assuming SIZE is 4 in that build -- SIZE comes from
   * common.h), which would leave every following outer pass off a
   * 16-byte boundary and make an aligned store fault.
   */
	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	pushq	%r14
	pushq	%r13
#endif
	pushq	%r12

#ifdef WINDOWS_ABI
	subq	$STACKSIZE, %rsp

	/* xmm6-xmm15 are callee-saved under the Windows x64 ABI. */
	movups	%xmm6,    0(%rsp)
	movups	%xmm7,   16(%rsp)
	movups	%xmm8,   32(%rsp)
	movups	%xmm9,   48(%rsp)
	movups	%xmm10,  64(%rsp)
	movups	%xmm11,  80(%rsp)
	movups	%xmm12,  96(%rsp)
	movups	%xmm13, 112(%rsp)
	movups	%xmm14, 128(%rsp)
	movups	%xmm15, 144(%rsp)

	movq	OLD_B, B		/* fifth argument is on the stack */
#endif

	/* Scale LDA; presumably elements -> bytes (ZBASE_SHIFT is from common.h). */
	salq	$ZBASE_SHIFT, LDA

	testq	N, N
	movq	N, J			/* J = outer passes remaining */
	jle	.L999			/* N <= 0: nothing to do */
	ALIGN_4

.L12:					/* ---- one outer pass ---- */
	movq	A, AO1			/* AO1 walks the source with stride LDA */
	addq	$2 * SIZE, A		/* next pass starts one element further */

	movq	M, I
	sarq	$1, I			/* I = number of element pairs */
	jle	.L14
	ALIGN_4

.L13:					/* ---- two elements per iteration ---- */
#ifndef DOUBLE
	movsd	0 * SIZE(AO1), %xmm0		/* low 8 bytes : element at AO1 */
	movhps	0 * SIZE(AO1, LDA, 1), %xmm0	/* high 8 bytes: element at AO1 + LDA */
	/* Unaligned store: B may sit 8 bytes off a 16-byte boundary here. */
	movups	%xmm0, 0 * SIZE(B)
#else
	prefetcht0	RPREFETCHSIZE * SIZE(AO1)
	movsd	0 * SIZE(AO1), %xmm0
	movhpd	1 * SIZE(AO1), %xmm0

	prefetcht0	RPREFETCHSIZE * SIZE(AO1, LDA)
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movhpd	1 * SIZE(AO1, LDA), %xmm1

	movapd	%xmm0, 0 * SIZE(B)
	movapd	%xmm1, 2 * SIZE(B)

	prefetcht0	WPREFETCHSIZE * SIZE(B)
#endif

	leaq	(AO1, LDA, 2), AO1	/* AO1 += 2 * LDA */
	addq	$4 * SIZE, B		/* two elements written */
	decq	I
	jg	.L13
	ALIGN_4

.L14:					/* ---- tail: odd M ---- */
	testq	$1, M
	je	.L16			/* M even: no leftover element */

#ifndef DOUBLE
	movsd	0 * SIZE(AO1), %xmm0
	movsd	%xmm0, 0 * SIZE(B)	/* 8-byte store, no 16-byte alignment needed */
#else
	movsd	0 * SIZE(AO1), %xmm0
	movhpd	1 * SIZE(AO1), %xmm0
	movapd	%xmm0, 0 * SIZE(B)
#endif
	addq	$2 * SIZE, B
	ALIGN_4

.L16:
	decq	J
	jg	.L12
	ALIGN_4

.L999:
#ifdef WINDOWS_ABI
	movups	  0(%rsp), %xmm6
	movups	 16(%rsp), %xmm7
	movups	 32(%rsp), %xmm8
	movups	 48(%rsp), %xmm9
	movups	 64(%rsp), %xmm10
	movups	 80(%rsp), %xmm11
	movups	 96(%rsp), %xmm12
	movups	112(%rsp), %xmm13
	movups	128(%rsp), %xmm14
	movups	144(%rsp), %xmm15

	addq	$STACKSIZE, %rsp
#endif

	/* Restore in reverse push order. */
	popq	%r12
#ifdef WINDOWS_ABI
	popq	%r13
	popq	%r14
#endif
	ret

	EPILOGUE