You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_kernel_1x1.S 9.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Stack-frame layout macros.  Every macro below expands to an
   * %esp-relative memory operand that is valid once the entry sequence
   * (`subl $ARGS, %esp` followed by four `pushl`) has executed.
   *   STACK - bytes occupied by the four pushed registers (4 x 4).
   *   ARGS  - bytes of local scratch reserved by `subl $ARGS, %esp`.
   */
  40. #define STACK 16
  41. #define ARGS 16
  /* Local scratch slots, located directly above the saved registers. */
  42. #define BX 0 + STACK(%esp)
  43. #define KK 4 + STACK(%esp)
  44. #define KKK 8 + STACK(%esp)
  /*
   * Incoming arguments, located above the saved registers and the
   * scratch area.  Offset 0 + STACK + ARGS is presumably the return
   * address (this assumes PROLOGUE itself pushes nothing) - confirm
   * against common.h.
   */
  45. #define STACK_M 4 + STACK + ARGS(%esp)
  46. #define STACK_N 8 + STACK + ARGS(%esp)
  47. #define STACK_K 12 + STACK + ARGS(%esp)
  /*
   * The two alpha slots are 8 bytes apart when DOUBLE is defined and
   * 4 bytes apart otherwise; every later argument shifts accordingly.
   */
  48. #ifdef DOUBLE
  49. #define ALPHA_R 16 + STACK + ARGS(%esp)
  50. #define ALPHA_I 24 + STACK + ARGS(%esp)
  51. #define STACK_A 32 + STACK + ARGS(%esp)
  52. #define STACK_B 36 + STACK + ARGS(%esp)
  53. #define STACK_C 40 + STACK + ARGS(%esp)
  54. #define STACK_LDC 44 + STACK + ARGS(%esp)
  55. #define OFFSET 48 + STACK + ARGS(%esp)
  56. #else
  57. #define ALPHA_R 16 + STACK + ARGS(%esp)
  58. #define ALPHA_I 20 + STACK + ARGS(%esp)
  59. #define STACK_A 24 + STACK + ARGS(%esp)
  60. #define STACK_B 28 + STACK + ARGS(%esp)
  61. #define STACK_C 32 + STACK + ARGS(%esp)
  62. #define STACK_LDC 36 + STACK + ARGS(%esp)
  63. #define OFFSET 40 + STACK + ARGS(%esp)
  64. #endif
  /*
   * Kernel entry.  PROLOGUE and PROFCODE are macros that are not defined
   * in this file (presumably supplied by common.h).
   * The sequence reserves ARGS bytes of scratch space, saves
   * %ebp/%edi/%esi/%ebx, names the working registers, loads K and LDC,
   * and jumps straight to the exit label .L29 when N <= 0 or M <= 0.
   */
  65. PROLOGUE
  66. subl $ARGS, %esp
  67. pushl %ebp
  68. pushl %edi
  69. pushl %esi
  70. pushl %ebx
  71. PROFCODE
  /* Register aliases used by the rest of the kernel. */
  72. #define M %esi
  73. #define K %edi
  74. #define A %ebx
  75. #define B %ecx
  76. #define C %edx
  77. #define LDC %ebp
  /* TRMM, non-LEFT build: KK starts as the negated OFFSET argument. */
  78. #if defined(TRMMKERNEL) && !defined(LEFT)
  79. movl OFFSET, %eax
  80. negl %eax
  81. movl %eax, KK
  82. #endif
  83. movl STACK_K, K
  84. movl STACK_LDC, LDC
  /*
   * LDC is later added directly to the C pointer, so this shift turns it
   * into a byte stride.  ZBASE_SHIFT is defined elsewhere; presumably it
   * is log2 of the size of one complex element - confirm.
   */
  85. sall $ZBASE_SHIFT, LDC
  /* Nothing to do for an empty problem: skip to the register restore. */
  86. cmpl $0, STACK_N
  87. jle .L29
  88. cmpl $0, STACK_M
  89. jle .L29
  90. ALIGN_4
  /*
   * .L30 is the head of the outer loop; it is re-entered once per
   * decrement of STACK_N (see the `decl STACK_N` / `jg .L30` latch).
   * .L34 is the head of the inner loop, re-entered once per decrement
   * of M.  This section sets up pointers, issues prefetch hints, clears
   * the accumulators and computes the K trip count for one C element.
   */
  91. .L30:
  /* TRMM, LEFT build: KK restarts from OFFSET on every outer pass. */
  92. #if defined(TRMMKERNEL) && defined(LEFT)
  93. movl OFFSET, %eax
  94. movl %eax, KK
  95. #endif
  /*
   * NOTE(review): %ebx is the A alias, but it is stored into BX before A
   * is reloaded from STACK_A, so on the first outer pass BX receives
   * whatever %ebx held on entry.  BX is only used as the address for the
   * prefetcht2 hints below, so this looks harmless, but it is probably
   * not the intended prefetch target - confirm.
   */
  96. movl %ebx, BX
  97. movl STACK_A, A
  98. movl STACK_C, C
  99. movl STACK_M, M
  100. ALIGN_4
  101. .L34:
  102. #if !defined(TRMMKERNEL) || \
  103. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  104. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  105. movl STACK_B, B
  106. #else
  /* Remaining TRMM variants: advance A and B by KK * 2 * SIZE bytes. */
  107. movl STACK_B, B
  108. movl KK, %eax
  109. leal (, %eax, SIZE), %eax
  110. leal (A, %eax, 2), A
  111. leal (B, %eax, 2), B
  112. #endif
  /*
   * Prefetch hints at the address kept in BX.  How many hints are issued
   * and how far BX is advanced (8, 16 or 32 * SIZE; `subl` of a negative
   * immediate is an add) depends on L2_SIZE.
   */
  113. #ifdef HAVE_SSE
  114. movl BX, %eax
  115. prefetcht2 0 * SIZE(%eax)
  116. prefetcht2 4 * SIZE(%eax)
  117. #if L2_SIZE > 262144
  118. subl $-8 * SIZE, BX
  119. #elif L2_SIZE > 131072
  120. prefetcht2 8 * SIZE(%eax)
  121. prefetcht2 12 * SIZE(%eax)
  122. subl $-16 * SIZE, BX
  123. #else
  124. prefetcht2 16 * SIZE(%eax)
  125. prefetcht2 20 * SIZE(%eax)
  126. prefetcht2 24 * SIZE(%eax)
  127. prefetcht2 28 * SIZE(%eax)
  128. subl $-32 * SIZE, BX
  129. #endif
  130. #endif
  /*
   * Four zeroed accumulators are pushed first, then elements 4 and 0 of
   * B and A are preloaded on top of them.  Assuming FLD is a push-load
   * macro (defined outside this file), that makes eight pushes, i.e.
   * all eight x87 stack registers are in use from here on.
   */
  131. fldz
  132. fldz
  133. fldz
  134. fldz
  135. FLD 4 * SIZE(B) # B5
  136. FLD 4 * SIZE(A) # A5
  137. FLD 0 * SIZE(B) # B0
  138. FLD 0 * SIZE(A) # A0
  /*
   * K trip count in %eax: plain K for GEMM; for TRMM either K - KK or
   * KK + 1, with the chosen value also saved in KKK for later use.
   */
  139. #ifndef TRMMKERNEL
  140. movl K, %eax
  141. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  142. movl K, %eax
  143. subl KK, %eax
  144. movl %eax, KKK
  145. #else
  146. movl KK, %eax
  /* Both arms of the #ifdef below add 1; they are identical here. */
  147. #ifdef LEFT
  148. addl $1, %eax
  149. #else
  150. addl $1, %eax
  151. #endif
  152. movl %eax, KKK
  153. #endif
  154. #ifdef HAVE_SSE
  155. prefetcht2 2 * SIZE(C)
  156. #endif
  /* %eax = trip count / 4 = passes of the unrolled loop; none -> .L37. */
  157. sarl $2, %eax
  158. je .L37
  159. ALIGN_4
  /* Look-ahead distance, in elements, for the prefetches inside .L38. */
  160. #define PREFETCH_OFFSET 40
  /*
   * Main K loop, unrolled four times.  Each pass consumes offsets
   * 0..7 * SIZE of both A and B (four pairs of values from each), then
   * advances both pointers by 8 * SIZE; %eax counts the remaining passes.
   *
   * For every pair, four partial products are formed and folded into
   * the four accumulators that sit deepest on the x87 stack.  The
   * NN/CN/NC/CC conditionals choose between add (faddp) and reverse
   * subtract (fsubrp) for three of the four products; presumably they
   * encode conjugation of A and/or B - confirm against the build flags.
   *
   * Operand loads run ahead of the arithmetic: offsets 8 and 12 are
   * loaded before the pointers move, so they become offsets 0 and 4 of
   * the following pass.
   * NOTE(review): these look-ahead loads are also executed on the final
   * pass; presumably the input buffers tolerate the over-read - confirm.
   */
  161. .L38:
  162. #ifdef HAVE_SSE
  163. prefetchnta (PREFETCH_OFFSET) * SIZE(B)
  164. #ifdef CORE_KATMAI
  165. prefetcht0 (PREFETCH_OFFSET) * SIZE(A)
  166. #endif
  167. #endif
  /* Pair 0: A offsets 0,1 against B offsets 0,1. */
  168. fmul %st, %st(1)
  169. FMUL 1 * SIZE(B)
  170. fxch %st(1)
  171. faddp %st, %st(5)
  172. FLD 0 * SIZE(B)
  173. fxch %st(1)
  174. #if defined(NN) || defined(CN)
  175. faddp %st, %st(4)
  176. #else
  177. fsubrp %st, %st(4)
  178. #endif
  179. FLD 1 * SIZE(A)
  180. fmul %st, %st(1)
  181. FMUL 1 * SIZE(B)
  182. fxch %st(1)
  183. #if defined(NN) || defined(NC)
  184. faddp %st, %st(7)
  185. #else
  186. fsubrp %st, %st(7)
  187. #endif
  188. FLD 2 * SIZE(B)
  189. fxch %st(1)
  190. #if defined(NN) || defined(CC)
  191. fsubrp %st, %st(6)
  192. #else
  193. faddp %st, %st(6)
  194. #endif
  /* Pair 1: A offsets 2,3 against B offsets 2,3. */
  195. FLD 2 * SIZE(A)
  196. fmul %st, %st(1)
  197. FMUL 3 * SIZE(B)
  198. fxch %st(1)
  199. faddp %st, %st(5)
  200. FLD 2 * SIZE(B)
  201. fxch %st(1)
  202. #if defined(NN) || defined(CN)
  203. faddp %st, %st(4)
  204. #else
  205. fsubrp %st, %st(4)
  206. #endif
  207. FLD 3 * SIZE(A)
  208. fmul %st, %st(1)
  209. FMUL 3 * SIZE(B)
  210. fxch %st(1)
  211. #if defined(NN) || defined(NC)
  212. faddp %st, %st(7)
  213. #else
  214. fsubrp %st, %st(7)
  215. #endif
  /* Look-ahead: offset 8 becomes offset 0 once the pointers advance. */
  216. FLD 8 * SIZE(B)
  217. fxch %st(1)
  218. #if defined(NN) || defined(CC)
  219. fsubrp %st, %st(6)
  220. #else
  221. faddp %st, %st(6)
  222. #endif
  223. FLD 8 * SIZE(A)
  224. fxch %st(2)
  225. #ifdef HAVE_SSE
  226. #ifdef DOUBLE
  227. prefetchnta (PREFETCH_OFFSET + 4) * SIZE(B)
  228. #ifdef CORE_KATMAI
  229. prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(A)
  230. #endif
  231. #endif
  232. #endif
  /* Pair 2: A offsets 4,5 against B offsets 4,5 (offset-4 values were preloaded). */
  233. fmul %st, %st(3)
  234. FMUL 5 * SIZE(B)
  235. fxch %st(3)
  236. faddp %st, %st(5)
  237. FLD 4 * SIZE(B)
  238. fxch %st(3)
  239. #if defined(NN) || defined(CN)
  240. faddp %st, %st(4)
  241. #else
  242. fsubrp %st, %st(4)
  243. #endif
  244. FLD 5 * SIZE(A)
  245. fmul %st, %st(3)
  246. FMUL 5 * SIZE(B)
  247. fxch %st(3)
  248. #if defined(NN) || defined(NC)
  249. faddp %st, %st(7)
  250. #else
  251. fsubrp %st, %st(7)
  252. #endif
  253. FLD 6 * SIZE(B)
  254. fxch %st(3)
  255. #if defined(NN) || defined(CC)
  256. fsubrp %st, %st(6)
  257. #else
  258. faddp %st, %st(6)
  259. #endif
  /* Pair 3: A offsets 6,7 against B offsets 6,7. */
  260. FLD 6 * SIZE(A)
  261. fmul %st, %st(3)
  262. FMUL 7 * SIZE(B)
  263. fxch %st(3)
  264. faddp %st, %st(5)
  265. FLD 6 * SIZE(B)
  266. fxch %st(3)
  267. #if defined(NN) || defined(CN)
  268. faddp %st, %st(4)
  269. #else
  270. fsubrp %st, %st(4)
  271. #endif
  272. FLD 7 * SIZE(A)
  273. fmul %st, %st(3)
  274. FMUL 7 * SIZE(B)
  275. fxch %st(3)
  276. #if defined(NN) || defined(NC)
  277. faddp %st, %st(7)
  278. #else
  279. fsubrp %st, %st(7)
  280. #endif
  /* Look-ahead: offset 12 becomes offset 4 once the pointers advance. */
  281. FLD 12 * SIZE(B)
  282. fxch %st(3)
  283. #if defined(NN) || defined(CC)
  284. fsubrp %st, %st(6)
  285. #else
  286. faddp %st, %st(6)
  287. #endif
  288. FLD 12 * SIZE(A)
  289. fxch %st(2)
  /* Advance both pointers by 8 * SIZE (subtracting a negative) and loop. */
  290. subl $-8 * SIZE, B
  291. subl $-8 * SIZE, A
  292. decl %eax
  293. jg .L38
  294. ALIGN_4
  /*
   * K remainder.  %eax = (K, or KKK in TRMM builds) & 3, i.e. the steps
   * left over after the 4x-unrolled loop; when that is zero control
   * goes straight to the write-back code at .L43.
   */
  295. .L37:
  296. #ifndef TRMMKERNEL
  297. movl K, %eax
  298. #else
  299. movl KKK, %eax
  300. #endif
  301. andl $3, %eax
  302. jle .L43
  303. ALIGN_2
  /*
   * Tail loop: one pair of A values against one pair of B values per
   * pass, using the same add / reverse-subtract selection as the
   * unrolled loop.  Offset 2 of B and of A is loaded ahead of the pointer
   * increment, so it is the offset-0 operand of the next pass.
   * NOTE(review): that look-ahead load also happens on the last pass;
   * presumably the buffers tolerate it - confirm.
   */
  304. .L54:
  305. fmul %st, %st(1)
  306. FMUL 1 * SIZE(B)
  307. fxch %st(1)
  308. faddp %st, %st(5)
  309. FLD 0 * SIZE(B)
  310. fxch %st(1)
  311. #if defined(NN) || defined(CN)
  312. faddp %st, %st(4)
  313. #else
  314. fsubrp %st, %st(4)
  315. #endif
  316. FLD 1 * SIZE(A)
  317. fmul %st, %st(1)
  318. FMUL 1 * SIZE(B)
  319. fxch %st(1)
  320. #if defined(NN) || defined(NC)
  321. faddp %st, %st(7)
  322. #else
  323. fsubrp %st, %st(7)
  324. #endif
  325. FLD 2 * SIZE(B)
  326. fxch %st(1)
  327. #if defined(NN) || defined(CC)
  328. fsubrp %st, %st(6)
  329. #else
  330. faddp %st, %st(6)
  331. #endif
  332. FLD 2 * SIZE(A)
  /* Step A and B forward by 2 * SIZE and repeat while steps remain. */
  333. addl $2 * SIZE, A
  334. addl $2 * SIZE, B
  335. decl %eax
  336. jg .L54
  337. ALIGN_3
  /*
   * Write-back for one C element (two values, at C[0] and C[1]).
   *
   * The four ffreep instructions drop the four look-ahead operands that
   * are still on top of the x87 stack, leaving only the accumulators.
   * ALPHA_R and ALPHA_I are then loaded, the accumulators are summed
   * pairwise into two values, and those two values are combined with
   * alpha to produce the two results.  In non-TRMM builds the existing
   * contents of C are added before each store.
   *
   * Tracing the stack suggests C[0] receives the alpha_r / alpha_i
   * cross-difference and C[1] the cross-sum (a complex multiply by
   * alpha), assuming FST is a store-and-pop macro - confirm in common.h.
   * NOTE(review): the inline ctempN remarks below do not map one-to-one
   * onto the stack slots used here; treat them as approximate.
   */
  338. .L43:
  339. ffreep %st(0)
  340. ffreep %st(0)
  341. ffreep %st(0)
  342. ffreep %st(0)
  343. FLD ALPHA_R
  344. fxch %st(3)
  345. FLD ALPHA_I
  346. fxch %st(5)
  347. faddp %st, %st(2) # ctemp3 += ctemp4
  348. faddp %st, %st(2) # ctemp1 += ctemp2
  349. fld %st(0) # copy ctemp2
  350. fmul %st(4), %st # ctemp3 *= alpha_i
  351. fld %st(2) # copy ctemp1
  352. fmul %st(4), %st # ctemp1 *= alpha_r
  353. fsubp %st, %st(1) # ctemp2 -= ctemp4
  354. #ifndef TRMMKERNEL
  355. FADD 0 * SIZE(C)
  356. #endif
  357. FST 0 * SIZE(C)
  358. fmulp %st, %st(2) # ctemp3 *= alpha_i
  359. fmulp %st, %st(2) # ctemp1 *= alpha_r
  360. faddp %st, %st(1) # ctemp1 += ctemp3
  361. #ifndef TRMMKERNEL
  362. FADD 1 * SIZE(C)
  363. #endif
  364. FST 1 * SIZE(C)
  /* Move C past the two values just written. */
  365. addl $2 * SIZE, C
  /*
   * For the TRMM variants selected below, skip the part of K that was
   * not traversed: A and B both advance by (K - KKK) * 2 * SIZE bytes.
   */
  366. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  367. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  368. movl K, %eax
  369. subl KKK, %eax
  370. leal (,%eax, SIZE), %eax
  371. leal (A, %eax, 2), A
  372. leal (B, %eax, 2), B
  373. #endif
  /* TRMM, LEFT build: KK grows by one per inner-loop pass. */
  374. #if defined(TRMMKERNEL) && defined(LEFT)
  375. addl $1, KK
  376. #endif
  /* Inner-loop latch: repeat from .L34 while M is still positive. */
  377. decl M
  378. jg .L34
  379. ALIGN_2
  /*
   * Outer-loop latch and function exit.
   * .L33: in TRMM non-LEFT builds KK grows by one per outer pass.  The
   * current B pointer (as left by the K loops) is written back to
   * STACK_B, STACK_C is advanced by the byte stride in LDC, and the loop
   * repeats from .L30 while the decremented STACK_N is still positive.
   */
  380. .L33:
  381. #if defined(TRMMKERNEL) && !defined(LEFT)
  382. addl $1, KK
  383. #endif
  384. movl B, STACK_B
  385. addl LDC, STACK_C
  386. decl STACK_N
  387. jg .L30
  388. ALIGN_2
  /*
   * .L29: common exit, also reached directly for an empty problem.
   * Registers are popped in the reverse of the entry push order, the
   * ARGS bytes of scratch are released, and control returns.
   * EPILOGUE is a macro defined outside this file.
   */
  389. .L29:
  390. popl %ebx
  391. popl %esi
  392. popl %edi
  393. popl %ebp
  394. addl $ARGS, %esp
  395. ret
  396. EPILOGUE