You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_kernel_1x1_atom.S 8.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /*
   * Stack-frame layout and register aliases (32-bit x86, AT&T syntax,
   * run through the C preprocessor).
   *
   * The prologue below executes "subl $ARGS, %esp" and then pushes
   * four 4-byte registers, which is where ARGS = 16 and STACK = 16
   * come from.  Relative to %esp after the prologue:
   *
   *   0 .. STACK-1              saved %ebx, %esi, %edi, %ebp
   *   STACK .. STACK+ARGS-1     scratch slots J, BX, KK, KKK
   *   STACK+ARGS                presumably the return address
   *   4 + STACK + ARGS ...      caller-supplied arguments
   */
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 16
  41. #define ARGS 16
  /*
   * Incoming arguments.  M, N and K are read with movl; ALPHA_R and
   * ALPHA_I sit 8 bytes apart and are read with movsd; A, ARG_B and C
   * are loaded into address registers; ARG_LDC is scaled by
   * ZBASE_SHIFT before use; OFFSET is only read under TRMMKERNEL.
   */
  42. #define M 4 + STACK + ARGS(%esp)
  43. #define N 8 + STACK + ARGS(%esp)
  44. #define K 12 + STACK + ARGS(%esp)
  45. #define ALPHA_R 16 + STACK + ARGS(%esp)
  46. #define ALPHA_I 24 + STACK + ARGS(%esp)
  47. #define A 32 + STACK + ARGS(%esp)
  48. #define ARG_B 36 + STACK + ARGS(%esp)
  49. #define C 40 + STACK + ARGS(%esp)
  50. #define ARG_LDC 44 + STACK + ARGS(%esp)
  51. #define OFFSET 48 + STACK + ARGS(%esp)
  /*
   * Scratch slots inside the ARGS area:
   *   J    outer-loop counter, initialised from N and decremented
   *   BX   prefetch cursor, set to B per outer pass, += 8*SIZE per tile
   *   KK   running offset used only by the TRMMKERNEL paths
   *   KKK  per-tile k count saved by the TRMMKERNEL paths
   */
  52. #define J 0 + STACK(%esp)
  53. #define BX 4 + STACK(%esp)
  54. #define KK 8 + STACK(%esp)
  55. #define KKK 12 + STACK(%esp)
  /* Prefetch instruction and look-ahead (in SIZE units) used on AA. */
  56. #define PREFETCH prefetcht0
  57. #define PREFETCHSIZE 84
  /*
   * Register roles:
   *   AA   read cursor in A        BB   read cursor in B
   *   CO1  write cursor in C       LDC  stride added to C per outer pass
   *   B    start of the B data used by the current outer pass
   */
  58. #define AA %edx
  59. #define BB %ecx
  60. #define CO1 %esi
  61. #define LDC %ebp
  62. #define B %edi
  /*
   * Sign selection for the four partial products accumulated by the
   * kernel.  With a0/a1 the two values loaded from AA and b0/b1 the
   * two values loaded from BB in one k-step, the kernel does
   *
   *   ADDSD1: xmm4 (+/-)= a0*b0      ADDSD2: xmm5 (+/-)= a0*b1
   *   ADDSD3: xmm6 (+/-)= a1*b0      ADDSD4: xmm7 (+/-)= a1*b1
   *
   * and afterwards combines P = xmm4 + xmm7 and Q = xmm6 + xmm5.
   * Reading element 0 as the real and element 1 as the imaginary part
   * (which is what the arithmetic implies), the four groups yield
   *
   *   NN/NT/TN/TT   P = a0*b0 - a1*b1,  Q =  a0*b1 + a1*b0   a * b
   *   NR/NC/TR/TC   P = a0*b0 + a1*b1,  Q = -a0*b1 + a1*b0   a * conj(b)
   *   RN/RT/CN/CT   P = a0*b0 + a1*b1,  Q =  a0*b1 - a1*b0   conj(a) * b
   *   otherwise     P = a0*b0 - a1*b1,  Q = -a0*b1 - a1*b0   conj(a) * conj(b)
   *
   * The NN..CT macros are not defined in this file; presumably the
   * build sets exactly one per compiled variant -- confirm against the
   * build rules.
   */
  63. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  64. #define ADDSD1 addsd
  65. #define ADDSD2 addsd
  66. #define ADDSD3 addsd
  67. #define ADDSD4 subsd
  68. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  69. #define ADDSD1 addsd
  70. #define ADDSD2 subsd
  71. #define ADDSD3 addsd
  72. #define ADDSD4 addsd
  73. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  74. #define ADDSD1 addsd
  75. #define ADDSD2 addsd
  76. #define ADDSD3 subsd
  77. #define ADDSD4 addsd
  78. #else
  79. #define ADDSD1 addsd
  80. #define ADDSD2 subsd
  81. #define ADDSD3 subsd
  82. #define ADDSD4 subsd
  83. #endif
  /*
   * Kernel body.  The entry symbol and any profiling hooks come from
   * the PROLOGUE / PROFCODE / EPILOGUE macros (defined outside this
   * file, presumably in common.h).
   *
   * Structure:
   *   .L01  outer loop, runs N times (counter J); reloads AA from A,
   *         sets CO1 from C and then advances C by LDC.
   *   .L10  inner loop, runs M times (counter %ebx); produces one
   *         output pair at CO1 and advances CO1 by 2 * SIZE.
   *   .L12  k loop unrolled four times (K >> 2 passes).
   *   .L16  remaining K & 3 k-steps, one per pass.
   *   .L18  combine the accumulators, scale by alpha, update C.
   *
   * Each k-step reads two values from AA and two from BB and advances
   * both cursors by 2 * SIZE.
   *
   * Accumulators: xmm4..xmm7 (see the ADDSD1..ADDSD4 definitions).
   * xmm0/xmm2 hold the two A values of the current step, xmm1/xmm3
   * are copies that get multiplied by the second B value.
   */
  84. PROLOGUE
  /* Reserve the J/BX/KK/KKK slots, then save the callee-saved registers. */
  85. subl $ARGS, %esp
  86. pushl %ebp
  87. pushl %edi
  88. pushl %esi
  89. pushl %ebx
  90. PROFCODE
  91. movl ARG_B, B
  92. movl ARG_LDC, LDC
  /* TRMM only: KK = OFFSET, or -OFFSET when LEFT is not defined. */
  93. #ifdef TRMMKERNEL
  94. movl OFFSET, %eax
  95. #ifndef LEFT
  96. negl %eax
  97. #endif
  98. movl %eax, KK
  99. #endif
  /* LDC <<= ZBASE_SHIFT; presumably converts the stride from complex
     elements to bytes (ZBASE_SHIFT is defined outside this file). */
  100. sall $ZBASE_SHIFT, LDC
  /* Nothing to do when M <= 0 or N <= 0. */
  101. movl M, %ebx
  102. testl %ebx, %ebx
  103. jle .L999
  104. movl N, %eax
  105. testl %eax, %eax
  /* movl leaves the flags from testl intact; J = N. */
  106. movl %eax, J
  107. jle .L999
  108. ALIGN_4
  /* ---- outer loop: one pass per value of J ---- */
  109. .L01:
  110. #if defined(TRMMKERNEL) && defined(LEFT)
  111. movl OFFSET, %eax
  112. movl %eax, KK
  113. #endif
  /* Restart the B prefetch cursor, pick up the C pointer for this
     pass and step the stored C argument by LDC for the next one. */
  114. movl B, BX
  115. movl C, CO1
  116. addl LDC, C
  /* A is re-read from its start on every outer pass. */
  117. movl A, AA
  118. movl M, %ebx
  119. ALIGN_4
  /* ---- inner loop: one output pair per pass ---- */
  120. .L10:
  /* Either start at the beginning of B, or (remaining TRMM variants)
     skip KK * 2 * SIZE bytes in both AA and BB. */
  121. #if !defined(TRMMKERNEL) || \
  122. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  123. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  124. movl B, BB
  125. #else
  126. movl KK, %eax
  127. leal (, %eax, SIZE), %eax
  128. leal (AA, %eax, 2), AA
  129. leal (B, %eax, 2), BB
  130. #endif
  /* Prefetch at the BX cursor, then BX += 8 * SIZE. */
  131. movl BX, %eax
  132. prefetcht0 0 * SIZE(%eax)
  133. subl $-8 * SIZE, BX
  /* Preload the first A value and clear the pipeline registers
     (xmm2, xmm3) and the four accumulators (xmm4..xmm7). */
  134. movsd 0 * SIZE(AA), %xmm0
  135. xorps %xmm2, %xmm2
  136. xorps %xmm3, %xmm3
  137. xorps %xmm4, %xmm4
  /* Prefetch the destination in C. */
  138. prefetcht0 1 * SIZE(CO1)
  139. xorps %xmm5, %xmm5
  140. xorps %xmm6, %xmm6
  141. xorps %xmm7, %xmm7
  /* k count for this tile in %eax:
       GEMM                      K
       TRMM, first #elif group   K - KK   (also saved in KKK)
       TRMM, otherwise           KK + 1   (also saved in KKK)
     Both LEFT branches below add 1; they are identical in this file
     (presumably because the tile is 1x1 -- confirm). */
  142. #ifndef TRMMKERNEL
  143. movl K, %eax
  144. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  145. movl K, %eax
  146. subl KK, %eax
  147. movl %eax, KKK
  148. #else
  149. movl KK, %eax
  150. #ifdef LEFT
  151. addl $1, %eax
  152. #else
  153. addl $1, %eax
  154. #endif
  155. movl %eax, KKK
  156. #endif
  /* %eax = k / 4 passes of the unrolled loop; skip it when zero. */
  157. sarl $2, %eax
  158. je .L15
  159. ALIGN_4
  /* ---- main k loop, four k-steps per pass ----
     The accumulation is software-pipelined: ADDSD1/ADDSD2 retire the
     products of the current step, while ADDSD3/ADDSD4 retire the
     xmm2/xmm3 products left over from the previous step (zero on the
     very first step because of the xorps above).  Each step also
     loads the first A value of the following step into xmm0, so the
     last step reads the A element just past the ones consumed here. */
  160. .L12:
  161. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  /* k-step 0 */
  162. ADDSD3 %xmm2, %xmm6
  163. movsd 1 * SIZE(AA), %xmm2
  164. movaps %xmm0, %xmm1
  165. mulsd 0 * SIZE(BB), %xmm0
  166. ADDSD4 %xmm3, %xmm7
  167. mulsd 1 * SIZE(BB), %xmm1
  168. ADDSD1 %xmm0, %xmm4
  169. movsd 2 * SIZE(AA), %xmm0
  170. movaps %xmm2, %xmm3
  171. mulsd 0 * SIZE(BB), %xmm2
  172. ADDSD2 %xmm1, %xmm5
  173. mulsd 1 * SIZE(BB), %xmm3
  /* k-step 1 */
  174. ADDSD3 %xmm2, %xmm6
  175. movsd 3 * SIZE(AA), %xmm2
  176. movaps %xmm0, %xmm1
  177. mulsd 2 * SIZE(BB), %xmm0
  178. ADDSD4 %xmm3, %xmm7
  179. mulsd 3 * SIZE(BB), %xmm1
  180. ADDSD1 %xmm0, %xmm4
  181. movsd 4 * SIZE(AA), %xmm0
  182. movaps %xmm2, %xmm3
  183. mulsd 2 * SIZE(BB), %xmm2
  184. ADDSD2 %xmm1, %xmm5
  185. mulsd 3 * SIZE(BB), %xmm3
  /* k-step 2 */
  186. ADDSD3 %xmm2, %xmm6
  187. movsd 5 * SIZE(AA), %xmm2
  188. movaps %xmm0, %xmm1
  189. mulsd 4 * SIZE(BB), %xmm0
  190. ADDSD4 %xmm3, %xmm7
  191. mulsd 5 * SIZE(BB), %xmm1
  192. ADDSD1 %xmm0, %xmm4
  193. movsd 6 * SIZE(AA), %xmm0
  194. movaps %xmm2, %xmm3
  195. mulsd 4 * SIZE(BB), %xmm2
  196. ADDSD2 %xmm1, %xmm5
  197. mulsd 5 * SIZE(BB), %xmm3
  /* k-step 3 */
  198. ADDSD3 %xmm2, %xmm6
  199. movsd 7 * SIZE(AA), %xmm2
  200. movaps %xmm0, %xmm1
  201. mulsd 6 * SIZE(BB), %xmm0
  202. ADDSD4 %xmm3, %xmm7
  203. mulsd 7 * SIZE(BB), %xmm1
  204. ADDSD1 %xmm0, %xmm4
  205. movsd 8 * SIZE(AA), %xmm0
  206. movaps %xmm2, %xmm3
  207. mulsd 6 * SIZE(BB), %xmm2
  208. ADDSD2 %xmm1, %xmm5
  209. mulsd 7 * SIZE(BB), %xmm3
  /* Four steps consumed: advance both cursors by 8 * SIZE. */
  210. addl $8 * SIZE, BB
  211. addl $8 * SIZE, AA
  212. decl %eax
  213. jne .L12
  214. ALIGN_4
  /* ---- k remainder ---- */
  215. .L15:
  216. #ifndef TRMMKERNEL
  217. movl K, %eax
  218. #else
  219. movl KKK, %eax
  220. #endif
  221. andl $3, %eax # eax = k & 3: k-steps left over after the 4x-unrolled loop
  222. BRANCH
  223. je .L18
  224. ALIGN_3
  /* One k-step per pass, same pipelining as in .L12. */
  225. .L16:
  226. ADDSD3 %xmm2, %xmm6
  227. movsd 1 * SIZE(AA), %xmm2
  228. movaps %xmm0, %xmm1
  229. mulsd 0 * SIZE(BB), %xmm0
  230. ADDSD4 %xmm3, %xmm7
  231. mulsd 1 * SIZE(BB), %xmm1
  232. ADDSD1 %xmm0, %xmm4
  233. movsd 2 * SIZE(AA), %xmm0
  234. movaps %xmm2, %xmm3
  235. mulsd 0 * SIZE(BB), %xmm2
  236. ADDSD2 %xmm1, %xmm5
  237. mulsd 1 * SIZE(BB), %xmm3
  238. addl $2 * SIZE, AA
  239. addl $2 * SIZE, BB
  240. decl %eax
  241. jg .L16
  242. ALIGN_4
  /* ---- finish the tile ---- */
  243. .L18:
  244. movsd ALPHA_R, %xmm0
  245. movsd ALPHA_I, %xmm1
  /* Retire the two products still pending from the last k-step. */
  246. ADDSD3 %xmm2, %xmm6
  247. ADDSD4 %xmm3, %xmm7
  /* P = xmm4 + xmm7 (in xmm4), Q = xmm6 + xmm5 (in xmm6). */
  248. addsd %xmm7, %xmm4
  249. addsd %xmm5, %xmm6
  /* Scale by alpha:
       xmm4 = P * ALPHA_R - Q * ALPHA_I
       xmm5 = P * ALPHA_I + Q * ALPHA_R */
  250. movaps %xmm4, %xmm5
  251. movaps %xmm6, %xmm7
  252. mulsd %xmm0, %xmm4
  253. mulsd %xmm1, %xmm5
  254. mulsd %xmm1, %xmm6
  255. mulsd %xmm0, %xmm7
  256. subsd %xmm6, %xmm4
  257. addsd %xmm7, %xmm5
  /* GEMM accumulates into the existing C values; the TRMM build
     overwrites them. */
  258. #ifndef TRMMKERNEL
  259. addsd 0 * SIZE(CO1), %xmm4
  260. addsd 1 * SIZE(CO1), %xmm5
  261. #endif
  262. movsd %xmm4, 0 * SIZE(CO1)
  263. movsd %xmm5, 1 * SIZE(CO1)
  /* TRMM variants that started at the beginning of B: step AA and BB
     over the (K - KKK) * 2 * SIZE bytes that were not processed. */
  264. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  265. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  266. movl K, %eax
  267. subl KKK, %eax
  268. leal (, %eax, SIZE), %eax
  269. leal (AA, %eax, 2), AA
  270. leal (BB, %eax, 2), BB
  271. #endif
  /* TRMM with LEFT: KK advances once per inner pass. */
  272. #if defined(TRMMKERNEL) && defined(LEFT)
  273. addl $1, KK
  274. #endif
  /* Next output pair; loop while inner passes remain. */
  275. addl $2 * SIZE, CO1
  276. decl %ebx
  277. jg .L10
  278. ALIGN_4
  /* ---- end of outer pass ---- */
  279. .L99:
  /* TRMM without LEFT: KK advances once per outer pass. */
  280. #if defined(TRMMKERNEL) && !defined(LEFT)
  281. addl $1, KK
  282. #endif
  /* B continues where the BB cursor stopped. */
  283. movl BB, B
  284. decl J
  285. jg .L01
  286. ALIGN_4
  /* ---- common exit: restore registers in reverse push order and
     release the scratch area ---- */
  287. .L999:
  288. popl %ebx
  289. popl %esi
  290. popl %edi
  291. popl %ebp
  292. addl $ARGS, %esp
  293. ret
  294. EPILOGUE