You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zgemm_kernel_2x1_atom.S 16 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define M %rdi
  41. #define N %rsi
  42. #define K %rdx
  43. #define A %rcx
  44. #define B %r8
  45. #define C %r9
  46. #define LDC %r10
  47. #define I %r11
  48. #define J %r12
  49. #define AO %r13
  50. #define BO %r14
  51. #define CO1 %r15
  52. #define BB %rbp
  53. #ifndef WINDOWS_ABI
  54. #define STACKSIZE 128
  55. #define OLD_LDC 8 + STACKSIZE(%rsp)
  56. #define OLD_OFFSET 16 + STACKSIZE(%rsp)
  57. #define ALPHA_R 48(%rsp)
  58. #define ALPHA_I 56(%rsp)
  59. #define OFFSET 64(%rsp)
  60. #define KKK 72(%rsp)
  61. #define KK 80(%rsp)
  62. #else
  63. #define STACKSIZE 512
  64. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  65. #define OLD_A 48 + STACKSIZE(%rsp)
  66. #define OLD_B 56 + STACKSIZE(%rsp)
  67. #define OLD_C 64 + STACKSIZE(%rsp)
  68. #define OLD_LDC 72 + STACKSIZE(%rsp)
  69. #define OLD_OFFSET 80 + STACKSIZE(%rsp)
  70. #define ALPHA_R 224(%rsp)
  71. #define ALPHA_I 232(%rsp)
  72. #define OFFSET 240(%rsp)
  73. #define KKK 248(%rsp)
  74. #define KK 256(%rsp)
  75. #endif
  76. #define PREFETCH prefetcht0
  77. #define PREFETCHSIZE (8 * 8 + 3)
  78. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  79. #define ADDSD1 addsd
  80. #define ADDSD2 addsd
  81. #define ADDSD3 addsd
  82. #define ADDSD4 subsd
  83. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  84. #define ADDSD1 addsd
  85. #define ADDSD2 subsd
  86. #define ADDSD3 addsd
  87. #define ADDSD4 addsd
  88. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  89. #define ADDSD1 addsd
  90. #define ADDSD2 addsd
  91. #define ADDSD3 subsd
  92. #define ADDSD4 addsd
  93. #else
  94. #define ADDSD1 addsd
  95. #define ADDSD2 subsd
  96. #define ADDSD3 subsd
  97. #define ADDSD4 subsd
  98. #endif
  99. PROLOGUE
  100. PROFCODE
  101. subq $STACKSIZE, %rsp
  102. movq %rbx, 0(%rsp)
  103. movq %rbp, 8(%rsp)
  104. movq %r12, 16(%rsp)
  105. movq %r13, 24(%rsp)
  106. movq %r14, 32(%rsp)
  107. movq %r15, 40(%rsp)
  108. #ifdef WINDOWS_ABI
  109. movq %rdi, 48(%rsp)
  110. movq %rsi, 56(%rsp)
  111. movups %xmm6, 64(%rsp)
  112. movups %xmm7, 80(%rsp)
  113. movups %xmm8, 96(%rsp)
  114. movups %xmm9, 112(%rsp)
  115. movups %xmm10, 128(%rsp)
  116. movups %xmm11, 144(%rsp)
  117. movups %xmm12, 160(%rsp)
  118. movups %xmm13, 176(%rsp)
  119. movups %xmm14, 192(%rsp)
  120. movups %xmm15, 208(%rsp)
  121. movq ARG1, M
  122. movq ARG2, N
  123. movq ARG3, K
  124. movq OLD_A, A
  125. movq OLD_B, B
  126. movq OLD_C, C
  127. movq OLD_LDC, LDC
  128. #ifdef TRMMKERNEL
  129. movsd OLD_OFFSET, %xmm4
  130. #endif
  131. movaps %xmm3, %xmm0
  132. movsd OLD_ALPHA_I, %xmm1
  133. #else
  134. movq OLD_LDC, LDC
  135. #ifdef TRMMKERNEL
  136. movsd OLD_OFFSET, %xmm4
  137. #endif
  138. #endif
  139. movsd %xmm0, ALPHA_R
  140. movsd %xmm1, ALPHA_I
  141. #ifdef TRMMKERNEL
  142. movsd %xmm4, OFFSET
  143. movsd %xmm4, KK
  144. #ifndef LEFT
  145. negq KK
  146. #endif
  147. #endif
  148. salq $ZBASE_SHIFT, LDC
  149. movq N, J
  150. testq N, N
  151. jle .L999
  152. ALIGN_4
  153. .L01:
  154. #if defined(TRMMKERNEL) && defined(LEFT)
  155. movq OFFSET, %rax
  156. movq %rax, KK
  157. #endif
  158. movq C, CO1
  159. addq LDC, C
  160. movq A, AO
  161. movq K, %rax
  162. salq $ZBASE_SHIFT, %rax
  163. leaq (B, %rax), BB
  164. movq M, I
  165. sarq $1, I
  166. jle .L20
  167. ALIGN_4
  168. .L10:
  169. #if !defined(TRMMKERNEL) || \
  170. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  171. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  172. movq B, BO
  173. #else
  174. movq KK, %rax
  175. leaq (, %rax, SIZE), %rax
  176. leaq (AO, %rax, 4), AO
  177. leaq (B, %rax, 2), BO
  178. #endif
  179. prefetcht0 0 * SIZE(BB)
  180. subq $-8 * SIZE, BB
  181. movsd 0 * SIZE(AO), %xmm0
  182. xorps %xmm2, %xmm2
  183. movsd 1 * SIZE(AO), %xmm4
  184. xorps %xmm5, %xmm5
  185. movsd 2 * SIZE(AO), %xmm5
  186. xorps %xmm6, %xmm6
  187. xorps %xmm7, %xmm7
  188. movsd 0 * SIZE(BO), %xmm1
  189. xorps %xmm8, %xmm8
  190. xorps %xmm9, %xmm9
  191. movsd 1 * SIZE(BO), %xmm3
  192. xorps %xmm10, %xmm10
  193. xorps %xmm11, %xmm11
  194. prefetcht0 3 * SIZE(CO1)
  195. xorps %xmm12, %xmm12
  196. xorps %xmm13, %xmm13
  197. xorps %xmm14, %xmm14
  198. xorps %xmm15, %xmm15
  199. #ifndef TRMMKERNEL
  200. movq K, %rax
  201. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  202. movq K, %rax
  203. subq KK, %rax
  204. movq %rax, KKK
  205. #else
  206. movq KK, %rax
  207. #ifdef LEFT
  208. addq $2, %rax
  209. #else
  210. addq $1, %rax
  211. #endif
  212. movq %rax, KKK
  213. #endif
  214. sarq $2, %rax
  215. je .L15
  216. ALIGN_4
  217. .L12:
  218. ADDSD2 %xmm2, %xmm13
  219. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  220. movaps %xmm0, %xmm2
  221. mulsd %xmm1, %xmm0
  222. ADDSD3 %xmm7, %xmm14
  223. movsd 3 * SIZE(AO), %xmm7
  224. mulsd %xmm3, %xmm2
  225. ADDSD4 %xmm6, %xmm15
  226. PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO)
  227. movaps %xmm4, %xmm6
  228. mulsd %xmm1, %xmm4
  229. ADDSD1 %xmm0, %xmm8
  230. movsd 4 * SIZE(AO), %xmm0
  231. mulsd %xmm3, %xmm6
  232. ADDSD2 %xmm2, %xmm9
  233. movaps %xmm5, %xmm2
  234. mulsd %xmm1, %xmm5
  235. ADDSD3 %xmm4, %xmm10
  236. movsd 5 * SIZE(AO), %xmm4
  237. mulsd %xmm3, %xmm2
  238. ADDSD4 %xmm6, %xmm11
  239. movaps %xmm7, %xmm6
  240. mulsd %xmm1, %xmm7
  241. movsd 2 * SIZE(BO), %xmm1
  242. ADDSD1 %xmm5, %xmm12
  243. movsd 6 * SIZE(AO), %xmm5
  244. mulsd %xmm3, %xmm6
  245. movsd 3 * SIZE(BO), %xmm3
  246. ADDSD2 %xmm2, %xmm13
  247. movaps %xmm0, %xmm2
  248. mulsd %xmm1, %xmm0
  249. ADDSD3 %xmm7, %xmm14
  250. movsd 7 * SIZE(AO), %xmm7
  251. mulsd %xmm3, %xmm2
  252. ADDSD4 %xmm6, %xmm15
  253. movaps %xmm4, %xmm6
  254. mulsd %xmm1, %xmm4
  255. ADDSD1 %xmm0, %xmm8
  256. movsd 8 * SIZE(AO), %xmm0
  257. mulsd %xmm3, %xmm6
  258. ADDSD2 %xmm2, %xmm9
  259. movaps %xmm5, %xmm2
  260. mulsd %xmm1, %xmm5
  261. ADDSD3 %xmm4, %xmm10
  262. movsd 9 * SIZE(AO), %xmm4
  263. mulsd %xmm3, %xmm2
  264. ADDSD4 %xmm6, %xmm11
  265. movaps %xmm7, %xmm6
  266. mulsd %xmm1, %xmm7
  267. movsd 4 * SIZE(BO), %xmm1
  268. ADDSD1 %xmm5, %xmm12
  269. movsd 10 * SIZE(AO), %xmm5
  270. mulsd %xmm3, %xmm6
  271. movsd 5 * SIZE(BO), %xmm3
  272. ADDSD2 %xmm2, %xmm13
  273. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  274. movaps %xmm0, %xmm2
  275. mulsd %xmm1, %xmm0
  276. ADDSD3 %xmm7, %xmm14
  277. movsd 11 * SIZE(AO), %xmm7
  278. mulsd %xmm3, %xmm2
  279. ADDSD4 %xmm6, %xmm15
  280. movaps %xmm4, %xmm6
  281. mulsd %xmm1, %xmm4
  282. ADDSD1 %xmm0, %xmm8
  283. movsd 12 * SIZE(AO), %xmm0
  284. mulsd %xmm3, %xmm6
  285. ADDSD2 %xmm2, %xmm9
  286. movaps %xmm5, %xmm2
  287. mulsd %xmm1, %xmm5
  288. ADDSD3 %xmm4, %xmm10
  289. movsd 13 * SIZE(AO), %xmm4
  290. mulsd %xmm3, %xmm2
  291. ADDSD4 %xmm6, %xmm11
  292. movaps %xmm7, %xmm6
  293. mulsd %xmm1, %xmm7
  294. movsd 6 * SIZE(BO), %xmm1
  295. ADDSD1 %xmm5, %xmm12
  296. movsd 14 * SIZE(AO), %xmm5
  297. mulsd %xmm3, %xmm6
  298. movsd 7 * SIZE(BO), %xmm3
  299. ADDSD2 %xmm2, %xmm13
  300. movaps %xmm0, %xmm2
  301. mulsd %xmm1, %xmm0
  302. ADDSD3 %xmm7, %xmm14
  303. movsd 15 * SIZE(AO), %xmm7
  304. mulsd %xmm3, %xmm2
  305. subq $-16 * SIZE, AO
  306. ADDSD4 %xmm6, %xmm15
  307. movaps %xmm4, %xmm6
  308. mulsd %xmm1, %xmm4
  309. ADDSD1 %xmm0, %xmm8
  310. movsd 0 * SIZE(AO), %xmm0
  311. mulsd %xmm3, %xmm6
  312. ADDSD2 %xmm2, %xmm9
  313. movaps %xmm5, %xmm2
  314. mulsd %xmm1, %xmm5
  315. addq $ 8 * SIZE, BO
  316. ADDSD3 %xmm4, %xmm10
  317. movsd 1 * SIZE(AO), %xmm4
  318. mulsd %xmm3, %xmm2
  319. decq %rax
  320. ADDSD4 %xmm6, %xmm11
  321. movaps %xmm7, %xmm6
  322. mulsd %xmm1, %xmm7
  323. movsd 0 * SIZE(BO), %xmm1
  324. ADDSD1 %xmm5, %xmm12
  325. movsd 2 * SIZE(AO), %xmm5
  326. mulsd %xmm3, %xmm6
  327. movsd 1 * SIZE(BO), %xmm3
  328. jne .L12
  329. ALIGN_4
  330. .L15:
  331. #ifndef TRMMKERNEL
  332. movq K, %rax
  333. #else
  334. movq KKK, %rax
  335. #endif
  336. andq $3, %rax
  337. BRANCH
  338. BRANCH
  339. je .L18
  340. ALIGN_4
  341. .L16:
  342. ADDSD2 %xmm2, %xmm13
  343. movaps %xmm0, %xmm2
  344. mulsd %xmm1, %xmm0
  345. ADDSD3 %xmm7, %xmm14
  346. movsd 3 * SIZE(AO), %xmm7
  347. mulsd %xmm3, %xmm2
  348. ADDSD4 %xmm6, %xmm15
  349. movaps %xmm4, %xmm6
  350. mulsd %xmm1, %xmm4
  351. ADDSD1 %xmm0, %xmm8
  352. movsd 4 * SIZE(AO), %xmm0
  353. mulsd %xmm3, %xmm6
  354. ADDSD2 %xmm2, %xmm9
  355. movaps %xmm5, %xmm2
  356. mulsd %xmm1, %xmm5
  357. ADDSD3 %xmm4, %xmm10
  358. movsd 5 * SIZE(AO), %xmm4
  359. mulsd %xmm3, %xmm2
  360. ADDSD4 %xmm6, %xmm11
  361. movaps %xmm7, %xmm6
  362. mulsd %xmm1, %xmm7
  363. movsd 2 * SIZE(BO), %xmm1
  364. ADDSD1 %xmm5, %xmm12
  365. movsd 6 * SIZE(AO), %xmm5
  366. mulsd %xmm3, %xmm6
  367. movsd 3 * SIZE(BO), %xmm3
  368. addq $4 * SIZE, AO
  369. addq $2 * SIZE, BO
  370. decq %rax
  371. BRANCH
  372. jg .L16
  373. ALIGN_4
  374. .L18:
  375. movsd ALPHA_R, %xmm0
  376. movsd ALPHA_I, %xmm1
  377. ADDSD2 %xmm2, %xmm13
  378. ADDSD3 %xmm7, %xmm14
  379. ADDSD4 %xmm6, %xmm15
  380. addsd %xmm11, %xmm8
  381. addsd %xmm9, %xmm10
  382. addsd %xmm15, %xmm12
  383. addsd %xmm13, %xmm14
  384. movaps %xmm8, %xmm9
  385. movaps %xmm10, %xmm11
  386. movaps %xmm12, %xmm13
  387. movaps %xmm14, %xmm15
  388. mulsd %xmm0, %xmm8
  389. mulsd %xmm1, %xmm9
  390. mulsd %xmm1, %xmm10
  391. mulsd %xmm0, %xmm11
  392. subsd %xmm10, %xmm8
  393. addsd %xmm11, %xmm9
  394. mulsd %xmm0, %xmm12
  395. mulsd %xmm1, %xmm13
  396. mulsd %xmm1, %xmm14
  397. mulsd %xmm0, %xmm15
  398. subsd %xmm14, %xmm12
  399. addsd %xmm15, %xmm13
  400. #if !defined(TRMMKERNEL) && !defined(BETAZERO)
  401. addsd 0 * SIZE(CO1), %xmm8
  402. addsd 1 * SIZE(CO1), %xmm9
  403. addsd 2 * SIZE(CO1), %xmm12
  404. addsd 3 * SIZE(CO1), %xmm13
  405. #endif
  406. movsd %xmm8, 0 * SIZE(CO1)
  407. movsd %xmm9, 1 * SIZE(CO1)
  408. movsd %xmm12, 2 * SIZE(CO1)
  409. movsd %xmm13, 3 * SIZE(CO1)
  410. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  411. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  412. movq K, %rax
  413. subq KKK, %rax
  414. leaq (,%rax, SIZE), %rax
  415. leaq (AO, %rax, 4), AO
  416. leaq (BO, %rax, 2), BO
  417. #endif
  418. #if defined(TRMMKERNEL) && defined(LEFT)
  419. addq $2, KK
  420. #endif
  421. addq $4 * SIZE, CO1
  422. decq I
  423. jg .L10
  424. ALIGN_4
  425. .L20:
  426. testq $1, M
  427. jle .L99
  428. #if !defined(TRMMKERNEL) || \
  429. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  430. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  431. movq B, BO
  432. #else
  433. movq KK, %rax
  434. leaq (, %rax, SIZE), %rax
  435. leaq (AO, %rax, 2), AO
  436. leaq (B, %rax, 2), BO
  437. #endif
  438. movsd 0 * SIZE(AO), %xmm0
  439. xorps %xmm2, %xmm2
  440. movsd 1 * SIZE(AO), %xmm4
  441. xorps %xmm5, %xmm5
  442. movsd 2 * SIZE(AO), %xmm5
  443. xorps %xmm6, %xmm6
  444. movsd 3 * SIZE(AO), %xmm7
  445. movsd 0 * SIZE(BO), %xmm1
  446. xorps %xmm8, %xmm8
  447. xorps %xmm9, %xmm9
  448. movsd 1 * SIZE(BO), %xmm3
  449. xorps %xmm10, %xmm10
  450. xorps %xmm11, %xmm11
  451. #ifndef TRMMKERNEL
  452. movq K, %rax
  453. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  454. movq K, %rax
  455. subq KK, %rax
  456. movq %rax, KKK
  457. #else
  458. movq KK, %rax
  459. #ifdef LEFT
  460. addq $1, %rax
  461. #else
  462. addq $1, %rax
  463. #endif
  464. movq %rax, KKK
  465. #endif
  466. sarq $2, %rax
  467. je .L25
  468. ALIGN_4
  469. .L22:
  470. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  471. ADDSD2 %xmm2, %xmm9
  472. movaps %xmm0, %xmm2
  473. mulsd %xmm1, %xmm0
  474. ADDSD4 %xmm6, %xmm11
  475. movaps %xmm4, %xmm6
  476. mulsd %xmm1, %xmm4
  477. movsd 2 * SIZE(BO), %xmm1
  478. ADDSD1 %xmm0, %xmm8
  479. movsd 4 * SIZE(AO), %xmm0
  480. mulsd %xmm3, %xmm2
  481. ADDSD3 %xmm4, %xmm10
  482. movsd 5 * SIZE(AO), %xmm4
  483. mulsd %xmm3, %xmm6
  484. movsd 3 * SIZE(BO), %xmm3
  485. ADDSD2 %xmm2, %xmm9
  486. movaps %xmm5, %xmm2
  487. mulsd %xmm1, %xmm5
  488. ADDSD4 %xmm6, %xmm11
  489. movaps %xmm7, %xmm6
  490. mulsd %xmm1, %xmm7
  491. movsd 4 * SIZE(BO), %xmm1
  492. ADDSD1 %xmm5, %xmm8
  493. movsd 6 * SIZE(AO), %xmm5
  494. mulsd %xmm3, %xmm2
  495. ADDSD3 %xmm7, %xmm10
  496. movsd 7 * SIZE(AO), %xmm7
  497. mulsd %xmm3, %xmm6
  498. movsd 5 * SIZE(BO), %xmm3
  499. ADDSD2 %xmm2, %xmm9
  500. movaps %xmm0, %xmm2
  501. mulsd %xmm1, %xmm0
  502. ADDSD4 %xmm6, %xmm11
  503. movaps %xmm4, %xmm6
  504. mulsd %xmm1, %xmm4
  505. movsd 6 * SIZE(BO), %xmm1
  506. ADDSD1 %xmm0, %xmm8
  507. movsd 8 * SIZE(AO), %xmm0
  508. mulsd %xmm3, %xmm2
  509. ADDSD3 %xmm4, %xmm10
  510. movsd 9 * SIZE(AO), %xmm4
  511. mulsd %xmm3, %xmm6
  512. movsd 7 * SIZE(BO), %xmm3
  513. ADDSD2 %xmm2, %xmm9
  514. movaps %xmm5, %xmm2
  515. mulsd %xmm1, %xmm5
  516. ADDSD4 %xmm6, %xmm11
  517. movaps %xmm7, %xmm6
  518. mulsd %xmm1, %xmm7
  519. movsd 8 * SIZE(BO), %xmm1
  520. ADDSD1 %xmm5, %xmm8
  521. movsd 10 * SIZE(AO), %xmm5
  522. mulsd %xmm3, %xmm2
  523. ADDSD3 %xmm7, %xmm10
  524. movsd 11 * SIZE(AO), %xmm7
  525. mulsd %xmm3, %xmm6
  526. movsd 9 * SIZE(BO), %xmm3
  527. addq $8 * SIZE, AO
  528. addq $8 * SIZE, BO
  529. decq %rax
  530. jne .L22
  531. ALIGN_4
  532. .L25:
  533. #ifndef TRMMKERNEL
  534. movq K, %rax
  535. #else
  536. movq KKK, %rax
  537. #endif
  538. andq $3, %rax
  539. BRANCH
  540. BRANCH
  541. je .L29
  542. ALIGN_4
  543. .L26:
  544. ADDSD2 %xmm2, %xmm9
  545. movaps %xmm0, %xmm2
  546. mulsd %xmm1, %xmm0
  547. ADDSD4 %xmm6, %xmm11
  548. movaps %xmm4, %xmm6
  549. mulsd %xmm1, %xmm4
  550. movsd 2 * SIZE(BO), %xmm1
  551. mulsd %xmm3, %xmm2
  552. ADDSD1 %xmm0, %xmm8
  553. movsd 2 * SIZE(AO), %xmm0
  554. mulsd %xmm3, %xmm6
  555. movsd 3 * SIZE(BO), %xmm3
  556. ADDSD3 %xmm4, %xmm10
  557. movsd 3 * SIZE(AO), %xmm4
  558. addq $2 * SIZE, AO
  559. addq $2 * SIZE, BO
  560. decq %rax
  561. BRANCH
  562. jg .L26
  563. ALIGN_4
  564. .L29:
  565. movsd ALPHA_R, %xmm0
  566. movsd ALPHA_I, %xmm1
  567. ADDSD2 %xmm2, %xmm9
  568. ADDSD4 %xmm6, %xmm11
  569. addsd %xmm11, %xmm8
  570. addsd %xmm9, %xmm10
  571. movaps %xmm8, %xmm9
  572. movaps %xmm10, %xmm11
  573. mulsd %xmm0, %xmm8
  574. mulsd %xmm1, %xmm9
  575. mulsd %xmm1, %xmm10
  576. mulsd %xmm0, %xmm11
  577. subsd %xmm10, %xmm8
  578. addsd %xmm11, %xmm9
  579. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  580. addsd 0 * SIZE(CO1), %xmm8
  581. addsd 1 * SIZE(CO1), %xmm9
  582. #endif
  583. movsd %xmm8, 0 * SIZE(CO1)
  584. movsd %xmm9, 1 * SIZE(CO1)
  585. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  586. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  587. movq K, %rax
  588. subq KKK, %rax
  589. leaq (,%rax, SIZE), %rax
  590. leaq (AO, %rax, 2), AO
  591. leaq (BO, %rax, 2), BO
  592. #endif
  593. #if defined(TRMMKERNEL) && defined(LEFT)
  594. addq $1, KK
  595. #endif
  596. ALIGN_4
  597. .L99:
  598. #if defined(TRMMKERNEL) && !defined(LEFT)
  599. addq $1, KK
  600. #endif
  601. movq BO, B
  602. decq J # j --
  603. jg .L01
  604. ALIGN_4
  605. .L999:
  606. movq 0(%rsp), %rbx
  607. movq 8(%rsp), %rbp
  608. movq 16(%rsp), %r12
  609. movq 24(%rsp), %r13
  610. movq 32(%rsp), %r14
  611. movq 40(%rsp), %r15
  612. #ifdef WINDOWS_ABI
  613. movq 48(%rsp), %rdi
  614. movq 56(%rsp), %rsi
  615. movups 64(%rsp), %xmm6
  616. movups 80(%rsp), %xmm7
  617. movups 96(%rsp), %xmm8
  618. movups 112(%rsp), %xmm9
  619. movups 128(%rsp), %xmm10
  620. movups 144(%rsp), %xmm11
  621. movups 160(%rsp), %xmm12
  622. movups 176(%rsp), %xmm13
  623. movups 192(%rsp), %xmm14
  624. movups 208(%rsp), %xmm15
  625. #endif
  626. addq $STACKSIZE, %rsp
  627. ret
  628. EPILOGUE