You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemm_tcopy_4.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #if defined(PENTIUM4) || defined(GENERIC) /* per-CPU tuning: each target picks a prefetch distance and prefetch opcodes */
  41. #define PREFETCHSIZE 16 /* prefetch distance; used below as PREFETCHSIZE * SIZE offsets */
  42. #define PREFETCH prefetcht0 /* opcode used to prefetch the source (A) */
  43. #define PREFETCHW prefetcht0 /* opcode used to prefetch the destination (B) */
  44. #endif
  45. #ifdef NEHALEM
  46. #define PREFETCHSIZE 12
  47. #define PREFETCH prefetcht0
  48. #define MOVUPS_A movups /* single 16-byte unaligned load; PREFETCHW is left undefined, so destination prefetches are compiled out */
  49. #endif
  50. #ifdef SANDYBRIDGE
  51. #define PREFETCHSIZE 12
  52. #define PREFETCH prefetcht0
  53. #define MOVUPS_A movups /* same choice as NEHALEM */
  54. #endif
  55. #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON)
  56. #define PREFETCHSIZE 16
  57. #define PREFETCH prefetcht0
  58. #define PREFETCHW prefetcht0
  59. #endif
  60. #ifdef OPTERON
  61. #define PREFETCHSIZE 16
  62. #define PREFETCH prefetch
  63. #define PREFETCHW prefetchw
  64. #endif
  65. #ifdef MOVUPS_A /* MOVUPS_A1 / MOVUPS_A2: load 16 bytes from OFF(ADDR) or OFF(ADDR, BASE, SCALE) into REGS */
  66. #define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS
  67. #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS
  68. #else /* no MOVUPS_A selected: build the 16 bytes from two 8-byte loads (movsd fills the low half, movhps the high half at OFF + 8) */
  69. #define MOVUPS_A1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS
  70. #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF(ADDR, BASE, SCALE), REGS; movhps OFF + 8(ADDR, BASE, SCALE), REGS
  71. #endif
  72. #ifndef WINDOWS_ABI /* System V AMD64: all five arguments arrive in registers */
  73. #define N ARG1 /* rdi, assuming ARGn (from common.h) is the n-th integer argument register -- TODO confirm */
  74. #define M ARG2 /* rsi (same assumption) */
  75. #define A ARG3 /* rdx */
  76. #define LDA ARG4 /* rcx */
  77. #define B ARG5 /* r8 */
  78. #define AO1 %r9 /* source cursor */
  79. #define AO2 %r10 /* second source cursor */
  80. #define LDA3 %r11 /* 3 * LDA */
  81. #define M8 %r12 /* N * SIZE, computed once at entry */
  82. #else /* Microsoft x64: four register arguments, B is passed on the stack */
  83. #define STACKSIZE 256 /* kept for compatibility; this routine never lowers %rsp by STACKSIZE, so OLD_B must not include it */
  84. #define N ARG1 /* rcx, assuming ARGn (from common.h) is the n-th integer argument register -- TODO confirm */
  85. #define M ARG2 /* rdx (same assumption) */
  86. #define A ARG3 /* r8 */
  87. #define LDA ARG4 /* r9 */
  88. #define OLD_B 40 + 56(%rsp) /* 5th argument: 8 (return address) + 32 (shadow space) above the entry %rsp, plus the 7 pushes (56 bytes) done before it is read */
  89. #define B %r12 /* loaded from OLD_B after the register saves */
  90. #define AO1 %rsi /* source cursor */
  91. #define AO2 %rdi /* second source cursor */
  92. #define LDA3 %r10 /* 3 * LDA */
  93. #define M8 %r11 /* N * SIZE, computed once at entry */
  94. #endif
  95. #define I %rax /* loop counter for the 8-wide loops */
  96. #define B0 %rbp /* store cursor for the 8-wide loops and the M & 4 remainder */
  97. #define B2 %r14 /* store cursor for the M & 2 remainder */
  98. #define B3 %r15 /* store cursor for the M & 1 remainder */
  99. PROLOGUE /* entry: save registers, then derive the three destination cursors and the byte strides used by every pass below */
  100. PROFCODE
  101. #ifdef WINDOWS_ABI
  102. pushq %rdi /* AO2 on this ABI */
  103. pushq %rsi /* AO1 on this ABI */
  104. #endif
  105. pushq %r15 /* B3 */
  106. pushq %r14 /* B2 */
  107. pushq %r13 /* saved and restored, but not otherwise referenced in the visible code */
  108. pushq %r12 /* M8 (non-Windows) or B (Windows) */
  109. pushq %rbp /* B0 */
  110. #ifdef WINDOWS_ABI
  111. movq OLD_B, B /* fetch the stack-passed B; %rsp is 7 pushes (56 bytes) below its entry value here */
  112. #endif
  113. subq $-16 * SIZE, B /* bias B by +16 * SIZE; the stores below compensate with offsets starting at -16 * SIZE */
  114. movq M, B2
  115. movq M, B3
  116. andq $-4, B2 /* M rounded down to a multiple of 4 */
  117. andq $-2, B3 /* M rounded down to a multiple of 2 */
  118. imulq N, B2
  119. imulq N, B3
  120. leaq (B, B2, SIZE), B2 /* B2 = B + (M & -4) * N * SIZE */
  121. leaq (B, B3, SIZE), B3 /* B3 = B + (M & -2) * N * SIZE */
  122. leaq (,LDA, SIZE), LDA /* LDA scaled by SIZE; used as an address offset from here on */
  123. leaq (LDA, LDA, 2), LDA3 /* LDA3 = 3 * LDA; not referenced again in the visible code */
  124. leaq (, N, SIZE), M8 /* M8 = N * SIZE, taken before N is decremented */
  125. cmpq $4, N
  126. jl .L30 /* fewer than four lines: skip the four-line pass */
  127. ALIGN_4
  128. .L21: /* four-line pass: each iteration copies from four LDA-spaced lines of A; repeated while N >= 4 */
  129. subq $4, N
  130. movq A, AO1 /* AO1 -> line 0; line 1 is addressed as (AO1, LDA) */
  131. leaq (A, LDA, 2), AO2 /* AO2 -> line 2; line 3 is addressed as (AO2, LDA) */
  132. leaq (A, LDA, 4), A /* A moves on by four lines for the next iteration */
  133. movq B, B0 /* B0 = store cursor for this iteration */
  134. addq $16 * SIZE, B /* the next iteration starts 16 * SIZE further into B */
  135. movq M, I
  136. sarq $3, I /* I = M >> 3: trip count of the 8-wide loop */
  137. jle .L24 /* zero trips: go straight to the remainders */
  138. ALIGN_4
  139. .L23: /* 8-wide loop: four 16-byte loads per line (offsets 0, 2, 4, 6 * SIZE), so SIZE is presumably 8 bytes */
  140. #ifdef PREFETCH
  141. PREFETCH PREFETCHSIZE * SIZE(AO1)
  142. #endif
  143. MOVUPS_A1(0 * SIZE, AO1, %xmm0) /* line 0 */
  144. MOVUPS_A1(2 * SIZE, AO1, %xmm1)
  145. MOVUPS_A1(4 * SIZE, AO1, %xmm2)
  146. MOVUPS_A1(6 * SIZE, AO1, %xmm3)
  147. #ifdef PREFETCHW
  148. PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) /* destination prefetch is relative to B, not B0 */
  149. #endif
  150. movaps %xmm0, -16 * SIZE(B0) /* first half of line 0 -> slot 0 of the group at B0 */
  151. movaps %xmm1, -14 * SIZE(B0)
  152. movaps %xmm2, -16 * SIZE(B0, M8, 4) /* second half -> same slot of the group 4 * M8 bytes further on */
  153. movaps %xmm3, -14 * SIZE(B0, M8, 4)
  154. #ifdef PREFETCH
  155. PREFETCH PREFETCHSIZE * SIZE(AO1, LDA)
  156. #endif
  157. MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0) /* line 1 */
  158. MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1)
  159. MOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2)
  160. MOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3)
  161. #ifdef PREFETCHW
  162. PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B)
  163. #endif
  164. movaps %xmm0, -12 * SIZE(B0) /* line 1 -> slot 1 of both groups */
  165. movaps %xmm1, -10 * SIZE(B0)
  166. movaps %xmm2, -12 * SIZE(B0, M8, 4)
  167. movaps %xmm3, -10 * SIZE(B0, M8, 4)
  168. #ifdef PREFETCH
  169. PREFETCH PREFETCHSIZE * SIZE(AO2)
  170. #endif
  171. MOVUPS_A1(0 * SIZE, AO2, %xmm0) /* line 2 */
  172. MOVUPS_A1(2 * SIZE, AO2, %xmm1)
  173. MOVUPS_A1(4 * SIZE, AO2, %xmm2)
  174. MOVUPS_A1(6 * SIZE, AO2, %xmm3)
  175. #ifdef PREFETCHW
  176. PREFETCHW (PREFETCHSIZE * 4 + 16) * SIZE(B)
  177. #endif
  178. movaps %xmm0, -8 * SIZE(B0) /* line 2 -> slot 2 of both groups */
  179. movaps %xmm1, -6 * SIZE(B0)
  180. movaps %xmm2, -8 * SIZE(B0, M8, 4)
  181. movaps %xmm3, -6 * SIZE(B0, M8, 4)
  182. #ifdef PREFETCH
  183. PREFETCH PREFETCHSIZE * SIZE(AO2, LDA)
  184. #endif
  185. MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0) /* line 3 */
  186. MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1)
  187. MOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2)
  188. MOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3)
  189. #ifdef PREFETCHW
  190. PREFETCHW (PREFETCHSIZE * 4 + 24) * SIZE(B)
  191. #endif
  192. movaps %xmm0, -4 * SIZE(B0) /* line 3 -> slot 3 of both groups */
  193. movaps %xmm1, -2 * SIZE(B0)
  194. movaps %xmm2, -4 * SIZE(B0, M8, 4)
  195. movaps %xmm3, -2 * SIZE(B0, M8, 4)
  196. addq $8 * SIZE, AO1 /* both source cursors move 8 * SIZE along their lines */
  197. addq $8 * SIZE, AO2
  198. leaq (B0, M8, 8), B0 /* two groups were written, so step B0 by 8 * M8 bytes */
  199. decq I
  200. jg .L23
  201. ALIGN_4
  202. .L24: /* M & 4 remainder: one more 4-wide group, written at B0 only */
  203. testq $4, M
  204. jle .L26 /* test leaves SF = OF = 0 for this mask, so jle is taken exactly when the bit is clear */
  205. MOVUPS_A1(0 * SIZE, AO1, %xmm0) /* line 0 */
  206. MOVUPS_A1(2 * SIZE, AO1, %xmm1)
  207. MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2) /* line 1 */
  208. MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3)
  209. movaps %xmm0, -16 * SIZE(B0)
  210. movaps %xmm1, -14 * SIZE(B0)
  211. movaps %xmm2, -12 * SIZE(B0)
  212. movaps %xmm3, -10 * SIZE(B0)
  213. MOVUPS_A1(0 * SIZE, AO2, %xmm0) /* line 2 */
  214. MOVUPS_A1(2 * SIZE, AO2, %xmm1)
  215. MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2) /* line 3 */
  216. MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3)
  217. movaps %xmm0, -8 * SIZE(B0)
  218. movaps %xmm1, -6 * SIZE(B0)
  219. movaps %xmm2, -4 * SIZE(B0)
  220. movaps %xmm3, -2 * SIZE(B0)
  221. addq $4 * SIZE, AO1
  222. addq $4 * SIZE, AO2
  223. leaq (B0, M8, 4), B0
  224. ALIGN_4
  225. .L26: /* M & 2 remainder: one 16-byte load per line, stored through the separate B2 cursor */
  226. testq $2, M
  227. jle .L28
  228. MOVUPS_A1(0 * SIZE, AO1, %xmm0) /* line 0 */
  229. MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1) /* line 1 */
  230. MOVUPS_A1(0 * SIZE, AO2, %xmm2) /* line 2 */
  231. MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3) /* line 3 */
  232. movaps %xmm0, -16 * SIZE(B2)
  233. movaps %xmm1, -14 * SIZE(B2)
  234. movaps %xmm2, -12 * SIZE(B2)
  235. movaps %xmm3, -10 * SIZE(B2)
  236. addq $2 * SIZE, AO1
  237. addq $2 * SIZE, AO2
  238. subq $-8 * SIZE, B2 /* B2 += 8 * SIZE */
  239. ALIGN_4
  240. .L28: /* M & 1 remainder: one 8-byte load per line, stored through the separate B3 cursor */
  241. testq $1, M
  242. jle .L29
  243. movsd 0 * SIZE(AO1), %xmm0 /* line 0 */
  244. movsd 0 * SIZE(AO1, LDA), %xmm1 /* line 1 */
  245. movsd 0 * SIZE(AO2), %xmm2 /* line 2 */
  246. movsd 0 * SIZE(AO2, LDA), %xmm3 /* line 3 */
  247. unpcklpd %xmm1, %xmm0 /* xmm0 = { line 0, line 1 } */
  248. unpcklpd %xmm3, %xmm2 /* xmm2 = { line 2, line 3 } */
  249. movaps %xmm0, -16 * SIZE(B3)
  250. movaps %xmm2, -14 * SIZE(B3)
  251. subq $-4 * SIZE, B3 /* B3 += 4 * SIZE */
  252. ALIGN_4
  253. .L29:
  254. cmpq $4, N
  255. jge .L21 /* at least four lines left: run another four-line iteration */
  256. ALIGN_4
  257. .L30: /* two-line pass: runs once if at least two lines of A remain */
  258. cmpq $2, N
  259. jl .L40
  260. subq $2, N
  261. movq A, AO1 /* AO1 -> line 0 */
  262. leaq (A, LDA), AO2 /* AO2 -> line 1 */
  263. leaq (A, LDA, 2), A /* A moves on by two lines */
  264. movq B, B0 /* B0 = store cursor for this pass */
  265. addq $8 * SIZE, B /* this pass takes 8 * SIZE of each destination group */
  266. movq M, I
  267. sarq $3, I /* I = M >> 3: trip count of the 8-wide loop */
  268. jle .L34
  269. ALIGN_4
  270. .L33: /* 8-wide loop: four 16-byte loads from each of the two lines */
  271. #ifdef PREFETCH
  272. PREFETCH PREFETCHSIZE * 2 * SIZE(AO1)
  273. #endif
  274. MOVUPS_A1(0 * SIZE, AO1, %xmm0) /* line 0 */
  275. MOVUPS_A1(2 * SIZE, AO1, %xmm1)
  276. MOVUPS_A1(4 * SIZE, AO1, %xmm2)
  277. MOVUPS_A1(6 * SIZE, AO1, %xmm3)
  278. #ifdef PREFETCHW
  279. PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) /* destination prefetch is relative to B, not B0 */
  280. #endif
  281. movaps %xmm0, -16 * SIZE(B0) /* first half of line 0 -> group at B0 */
  282. movaps %xmm1, -14 * SIZE(B0)
  283. movaps %xmm2, -16 * SIZE(B0, M8, 4) /* second half -> group 4 * M8 bytes further on */
  284. movaps %xmm3, -14 * SIZE(B0, M8, 4)
  285. #ifdef PREFETCH
  286. PREFETCH PREFETCHSIZE * 2 * SIZE(AO2)
  287. #endif
  288. MOVUPS_A1(0 * SIZE, AO2, %xmm0) /* line 1 */
  289. MOVUPS_A1(2 * SIZE, AO2, %xmm1)
  290. MOVUPS_A1(4 * SIZE, AO2, %xmm2)
  291. MOVUPS_A1(6 * SIZE, AO2, %xmm3)
  292. #ifdef PREFETCHW
  293. PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B)
  294. #endif
  295. movaps %xmm0, -12 * SIZE(B0) /* line 1 follows line 0 inside each group */
  296. movaps %xmm1, -10 * SIZE(B0)
  297. movaps %xmm2, -12 * SIZE(B0, M8, 4)
  298. movaps %xmm3, -10 * SIZE(B0, M8, 4)
  299. addq $8 * SIZE, AO1
  300. addq $8 * SIZE, AO2
  301. leaq (B0, M8, 8), B0 /* two groups were written, so step B0 by 8 * M8 bytes */
  302. decq I
  303. jg .L33
  304. ALIGN_4
  305. .L34: /* M & 4 remainder: two 16-byte loads per line, written at B0 only */
  306. testq $4, M
  307. jle .L36 /* test leaves SF = OF = 0 for this mask, so jle is taken exactly when the bit is clear */
  308. MOVUPS_A1(0 * SIZE, AO1, %xmm0) /* line 0 */
  309. MOVUPS_A1(2 * SIZE, AO1, %xmm1)
  310. MOVUPS_A1(0 * SIZE, AO2, %xmm2) /* line 1 */
  311. MOVUPS_A1(2 * SIZE, AO2, %xmm3)
  312. movaps %xmm0, -16 * SIZE(B0)
  313. movaps %xmm1, -14 * SIZE(B0)
  314. movaps %xmm2, -12 * SIZE(B0)
  315. movaps %xmm3, -10 * SIZE(B0)
  316. addq $4 * SIZE, AO1
  317. addq $4 * SIZE, AO2
  318. leaq (B0, M8, 4), B0
  319. ALIGN_4
  320. .L36: /* M & 2 remainder: one 16-byte load per line, stored through B2 */
  321. testq $2, M
  322. jle .L38
  323. MOVUPS_A1(0 * SIZE, AO1, %xmm0)
  324. MOVUPS_A1(0 * SIZE, AO2, %xmm1)
  325. movaps %xmm0, -16 * SIZE(B2)
  326. movaps %xmm1, -14 * SIZE(B2)
  327. addq $2 * SIZE, AO1
  328. addq $2 * SIZE, AO2
  329. subq $-4 * SIZE, B2 /* B2 += 4 * SIZE */
  330. ALIGN_4
  331. .L38: /* M & 1 remainder: one 8-byte load per line, stored through B3 */
  332. testq $1, M
  333. jle .L40
  334. movsd 0 * SIZE(AO1), %xmm0
  335. movsd 0 * SIZE(AO2), %xmm1
  336. unpcklpd %xmm1, %xmm0 /* xmm0 = { line 0, line 1 } */
  337. movaps %xmm0, -16 * SIZE(B3)
  338. subq $-2 * SIZE, B3 /* B3 += 2 * SIZE */
  339. ALIGN_4
  340. .L40: /* single-line pass: runs if one line of A remains, then falls into the epilogue */
  341. cmpq $1, N
  342. jl .L999
  343. movq A, AO1 /* AO1 -> the last line */
  344. movq B, B0 /* B0 = store cursor for this pass */
  345. movq M, I
  346. sarq $3, I /* I = M >> 3: trip count of the 8-wide loop */
  347. jle .L44
  348. ALIGN_4
  349. .L43: /* 8-wide loop: four 16-byte loads from the line */
  350. #ifdef PREFETCH
  351. PREFETCH PREFETCHSIZE * 4 * SIZE(AO1)
  352. #endif
  353. MOVUPS_A1(0 * SIZE, AO1, %xmm0)
  354. MOVUPS_A1(2 * SIZE, AO1, %xmm1)
  355. MOVUPS_A1(4 * SIZE, AO1, %xmm2)
  356. MOVUPS_A1(6 * SIZE, AO1, %xmm3)
  357. #ifdef PREFETCHW
  358. PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) /* destination prefetch is relative to B, not B0 */
  359. #endif
  360. movaps %xmm0, -16 * SIZE(B0) /* first half -> group at B0 */
  361. movaps %xmm1, -14 * SIZE(B0)
  362. movaps %xmm2, -16 * SIZE(B0, M8, 4) /* second half -> group 4 * M8 bytes further on */
  363. movaps %xmm3, -14 * SIZE(B0, M8, 4)
  364. addq $8 * SIZE, AO1
  365. leaq (B0, M8, 8), B0 /* two groups were written, so step B0 by 8 * M8 bytes */
  366. decq I
  367. jg .L43
  368. ALIGN_4
  369. .L44: /* M & 4 remainder: two 16-byte loads, written at B0 */
  370. testq $4, M
  371. jle .L45 /* test leaves SF = OF = 0 for this mask, so jle is taken exactly when the bit is clear */
  372. MOVUPS_A1(0 * SIZE, AO1, %xmm0)
  373. MOVUPS_A1(2 * SIZE, AO1, %xmm1)
  374. movaps %xmm0, -16 * SIZE(B0)
  375. movaps %xmm1, -14 * SIZE(B0)
  376. addq $4 * SIZE, AO1
  377. leaq (B0, M8, 4), B0
  378. ALIGN_4
  379. .L45: /* M & 2 remainder: one 16-byte load, stored through B2 */
  380. testq $2, M
  381. jle .L46
  382. MOVUPS_A1(0 * SIZE, AO1, %xmm0)
  383. movaps %xmm0, -16 * SIZE(B2)
  384. addq $2 * SIZE, AO1
  385. subq $-2 * SIZE, B2 /* B2 += 2 * SIZE */
  386. ALIGN_4
  387. .L46: /* M & 1 remainder: one 8-byte load, stored through B3 */
  388. testq $1, M
  389. jle .L999
  390. movsd 0 * SIZE(AO1), %xmm0
  391. movlpd %xmm0, -16 * SIZE(B3) /* 8-byte store of the low half only */
  392. jmp .L999 /* skips only the ALIGN_4 padding; .L999 is the next label */
  393. ALIGN_4
  394. .L999: /* epilogue: restore registers in the reverse order of the entry pushes */
  395. popq %rbp
  396. popq %r12
  397. popq %r13
  398. popq %r14
  399. popq %r15
  400. #ifdef WINDOWS_ABI
  401. popq %rsi
  402. popq %rdi
  403. #endif
  404. ret
  405. EPILOGUE