
zgemm_kernel_2x1_core2.S 15 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 0

#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
#define STACK_ALPHA_R 16 + STACK + ARGS(%esi)
#define STACK_ALPHA_I 24 + STACK + ARGS(%esi)
#define STACK_A 32 + STACK + ARGS(%esi)
#define STACK_B 36 + STACK + ARGS(%esi)
#define STACK_C 40 + STACK + ARGS(%esi)
#define STACK_LDC 44 + STACK + ARGS(%esi)
#define STACK_OFFT 48 + STACK + ARGS(%esi)

#define ALPHA_R 16(%esp)
#define ALPHA_I 32(%esp)
#define K 48(%esp)
#define N 52(%esp)
#define M 56(%esp)
#define A 60(%esp)
#define C 64(%esp)
#define J 68(%esp)
#define BX 72(%esp)
#define OLD_STACK 76(%esp)
#define OFFSET 80(%esp)
#define KK 84(%esp)
#define KKK 88(%esp)
#define BUFFER 128(%esp)

#define STACK_ALIGN 4096
#define STACK_OFFSET 1024

#define PREFETCH_R (8 * 16 + 0)
#define PREFETCH_W (PREFETCH_R * 2)
#define PREFETCHSIZE (8 * 16 + 4)
#define PREFETCH prefetcht0

#define B %edi
#define LDC %ebp
#define AA %edx
#define BB %ecx
#define C1 %esi

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define ADD1 addpd
#define ADD2 addpd
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define ADD1 addpd
#define ADD2 subpd
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define ADD1 subpd
#define ADD2 addpd
#else
#define ADD1 subpd
#define ADD2 subpd
#endif
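
/* ADD1 accumulates the re*re / im*im partial products and ADD2 the
   re*im / im*re cross terms; the add/sub selection above encodes the
   conjugation of A and/or B implied by the transpose variant
   (N/T = plain, R/C = conjugated) the kernel is built for. */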
        PROLOGUE

        pushl %ebp
        pushl %edi
        pushl %esi
        pushl %ebx

        PROFCODE

        movl %esp, %esi        # save old stack
        subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
        andl $-STACK_ALIGN, %esp        # align stack
        addl $STACK_OFFSET, %esp

        STACK_TOUCHING

        movd STACK_M, %mm0
        movl STACK_N, %eax
        movd STACK_K, %mm1
        movd STACK_A, %mm2
        movl STACK_B, B
        movd STACK_C, %mm3
        movl STACK_LDC, LDC
#ifdef TRMMKERNEL
        movd STACK_OFFT, %mm4
#endif

        movsd STACK_ALPHA_R, %xmm0
        movsd STACK_ALPHA_I, %xmm1

        movddup %xmm0, %xmm0
        movddup %xmm1, %xmm1

        movapd %xmm0, ALPHA_R
        movapd %xmm1, ALPHA_I

        movd %mm1, K
        movl %eax, N
        movd %mm0, M
        movd %mm2, A
        movd %mm3, C
        movl %esi, OLD_STACK

#ifdef TRMMKERNEL
        movd %mm4, OFFSET
        movd %mm4, KK
#ifndef LEFT
        negl KK
#endif
#endif

        subl $-16 * SIZE, A
        subl $-16 * SIZE, B

        sall $ZBASE_SHIFT, LDC

        movl %eax, J        # j = n
        testl %eax, %eax
        jle .L999
        ALIGN_2
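
/* .L01: outer loop over the columns of C (j loop).  The .L02/.L04
   loops below copy the current column of B into BUFFER, using movddup
   to duplicate the real and the imaginary part of every complex entry
   into both halves of an XMM register, so the multiply loops can use
   aligned movapd loads. */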
.L01:
        leal 16 * SIZE + BUFFER, BB

#if defined(TRMMKERNEL) && defined(LEFT)
        movl OFFSET, %eax
        movl %eax, KK
#endif

        movl K, %eax
        sarl $2, %eax
        jle .L03
        ALIGN_2

.L02:
        prefetcht0 (PREFETCH_R + 0) * SIZE(B)

        movddup -16 * SIZE(B), %xmm0
        movddup -15 * SIZE(B), %xmm1
        movddup -14 * SIZE(B), %xmm2
        movddup -13 * SIZE(B), %xmm3
        movddup -12 * SIZE(B), %xmm4
        movddup -11 * SIZE(B), %xmm5
        movddup -10 * SIZE(B), %xmm6
        movddup  -9 * SIZE(B), %xmm7

        movapd %xmm0, -16 * SIZE(BB)
        movapd %xmm1, -14 * SIZE(BB)
        movapd %xmm2, -12 * SIZE(BB)
        movapd %xmm3, -10 * SIZE(BB)
        movapd %xmm4,  -8 * SIZE(BB)
        movapd %xmm5,  -6 * SIZE(BB)
        movapd %xmm6,  -4 * SIZE(BB)
        movapd %xmm7,  -2 * SIZE(BB)

        addl $ 8 * SIZE, B
        subl $-16 * SIZE, BB
        decl %eax
        jne .L02
        ALIGN_2

.L03:
        movl K, %eax
        andl $3, %eax
        BRANCH
        jle .L05
        ALIGN_2

.L04:
        movddup -16 * SIZE(B), %xmm0
        movddup -15 * SIZE(B), %xmm1

        movapd %xmm0, -16 * SIZE(BB)
        movapd %xmm1, -14 * SIZE(BB)

        addl $ 2 * SIZE, B
        addl $ 4 * SIZE, BB
        decl %eax
        jne .L04
        ALIGN_4
.L05:
        movl B, BX

        movl C, C1        # coffset = c
        movl A, AA        # aoffset = a

        movl M, %ebx
        sarl $1, %ebx        # i = (m >> 1)
        jle .L20
        ALIGN_4
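
/* .L10: 2x1 micro-kernel.  Each pass through .L12 covers eight k
   iterations, multiplying two complex elements of A against one
   duplicated complex element of B per step, with partial sums kept in
   xmm4-xmm7 (ADD1 for the re*re/im*im terms, ADD2 for the cross
   terms). */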
.L10:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leal 16 * SIZE + BUFFER, BB
#else
        leal 16 * SIZE + BUFFER, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 4), AA
        leal (BB, %eax, 4), BB        /* because it's doubled */
#endif

        movapd -16 * SIZE(AA), %xmm0
        pxor %xmm4, %xmm4
        movapd -16 * SIZE(BB), %xmm1
        pxor %xmm5, %xmm5
        movapd  -8 * SIZE(AA), %xmm3
        pxor %xmm6, %xmm6

        prefetcht0 3 * SIZE(C1)
        pxor %xmm7, %xmm7
        movapd %xmm1, %xmm2

        movl BX, %eax
        prefetcht0 (%eax)
        subl $-8 * SIZE, %eax
        movl %eax, BX

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $2, %eax
#else
        addl $1, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L15
        ALIGN_4
.L12:
        mulpd %xmm0, %xmm1
        ADD1 %xmm1, %xmm4
        movapd -14 * SIZE(BB), %xmm1
        mulpd %xmm1, %xmm0
        ADD2 %xmm0, %xmm5
        movapd -14 * SIZE(AA), %xmm0
        mulpd %xmm0, %xmm2
        ADD1 %xmm2, %xmm6
        movapd -12 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm1
        movapd -12 * SIZE(AA), %xmm0
        ADD2 %xmm1, %xmm7
        PADDING;
        movapd %xmm2, %xmm1

        mulpd %xmm0, %xmm2
        ADD1 %xmm2, %xmm4
        movapd -10 * SIZE(BB), %xmm2
        mulpd %xmm2, %xmm0
        ADD2 %xmm0, %xmm5
        movapd -10 * SIZE(AA), %xmm0
        mulpd %xmm0, %xmm1
        ADD1 %xmm1, %xmm6
        movapd -8 * SIZE(BB), %xmm1
        mulpd %xmm0, %xmm2
        PADDING;
        movapd 0 * SIZE(AA), %xmm0
        ADD2 %xmm2, %xmm7
        PADDING;
        movapd %xmm1, %xmm2

        mulpd %xmm3, %xmm1
        ADD1 %xmm1, %xmm4
        movapd -6 * SIZE(BB), %xmm1
        mulpd %xmm1, %xmm3
        ADD2 %xmm3, %xmm5
        movapd -6 * SIZE(AA), %xmm3
        mulpd %xmm3, %xmm2
        ADD1 %xmm2, %xmm6
        movapd -4 * SIZE(BB), %xmm2
        mulpd %xmm3, %xmm1
        movapd -4 * SIZE(AA), %xmm3
        ADD2 %xmm1, %xmm7
        PADDING;
        movapd %xmm2, %xmm1

        mulpd %xmm3, %xmm2
        ADD1 %xmm2, %xmm4
        movapd -2 * SIZE(BB), %xmm2
        mulpd %xmm2, %xmm3
        ADD2 %xmm3, %xmm5
        movapd -2 * SIZE(AA), %xmm3
        mulpd %xmm3, %xmm1
        ADD1 %xmm1, %xmm6
        PADDING;
        movapd 0 * SIZE(BB), %xmm1
        mulpd %xmm3, %xmm2
        movapd 8 * SIZE(AA), %xmm3
        ADD2 %xmm2, %xmm7
        PADDING;
        movapd %xmm1, %xmm2

        mulpd %xmm0, %xmm1
        ADD1 %xmm1, %xmm4
        movapd 2 * SIZE(BB), %xmm1
        mulpd %xmm1, %xmm0
        ADD2 %xmm0, %xmm5
        movapd 2 * SIZE(AA), %xmm0
        mulpd %xmm0, %xmm2
        ADD1 %xmm2, %xmm6
        movapd 4 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm1
        movapd 4 * SIZE(AA), %xmm0
        ADD2 %xmm1, %xmm7
        PADDING;
        movapd %xmm2, %xmm1

        mulpd %xmm0, %xmm2
        ADD1 %xmm2, %xmm4
        movapd 6 * SIZE(BB), %xmm2
        mulpd %xmm2, %xmm0
        ADD2 %xmm0, %xmm5
        movapd 6 * SIZE(AA), %xmm0
        mulpd %xmm0, %xmm1
        ADD1 %xmm1, %xmm6
        movapd 8 * SIZE(BB), %xmm1
        mulpd %xmm0, %xmm2
        movapd 16 * SIZE(AA), %xmm0
        ADD2 %xmm2, %xmm7
        PADDING;
        movapd %xmm1, %xmm2

        mulpd %xmm3, %xmm1
        ADD1 %xmm1, %xmm4
        movapd 10 * SIZE(BB), %xmm1
        mulpd %xmm1, %xmm3
        ADD2 %xmm3, %xmm5
        movapd 10 * SIZE(AA), %xmm3
        mulpd %xmm3, %xmm2
        ADD1 %xmm2, %xmm6
        movapd 12 * SIZE(BB), %xmm2
        mulpd %xmm3, %xmm1
        movapd 12 * SIZE(AA), %xmm3
        ADD2 %xmm1, %xmm7
        PADDING;
        movapd %xmm2, %xmm1

        mulpd %xmm3, %xmm2
        ADD1 %xmm2, %xmm4
        movapd 14 * SIZE(BB), %xmm2
        mulpd %xmm2, %xmm3
        subl $-32 * SIZE, BB
        ADD2 %xmm3, %xmm5
        movapd 14 * SIZE(AA), %xmm3
        mulpd %xmm3, %xmm1
        ADD1 %xmm1, %xmm6
        movapd -16 * SIZE(BB), %xmm1
        mulpd %xmm3, %xmm2
        movapd 24 * SIZE(AA), %xmm3
        ADD2 %xmm2, %xmm7
        PADDING;
        movapd %xmm1, %xmm2

        subl $-32 * SIZE, AA
        decl %eax
        BRANCH
        jne .L12
        ALIGN_4
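
/* .L15/.L16: handle the remaining k % 8 iterations one at a time. */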
.L15:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L18
        ALIGN_4

.L16:
        mulpd %xmm0, %xmm1
        ADD1 %xmm1, %xmm4
        movapd -14 * SIZE(BB), %xmm1
        movapd %xmm1, %xmm3
        mulpd %xmm0, %xmm1
        movapd -14 * SIZE(AA), %xmm0
        ADD2 %xmm1, %xmm5
        movapd -12 * SIZE(BB), %xmm1
        mulpd %xmm0, %xmm2
        ADD1 %xmm2, %xmm6
        mulpd %xmm0, %xmm3
        movapd -12 * SIZE(AA), %xmm0
        ADD2 %xmm3, %xmm7
        movapd %xmm1, %xmm2

        addl $4 * SIZE, AA
        addl $4 * SIZE, BB
        decl %eax
        jg .L16
        ALIGN_4
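
/* .L18: fold the partial sums into complex products with addsubpd,
   scale by alpha (ALPHA_R/ALPHA_I were duplicated across both lanes in
   the prologue), and add the result into the 2x1 tile of C. */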
.L18:
        movapd ALPHA_R, %xmm2
        movapd ALPHA_I, %xmm3

        SHUFPD_1 %xmm5, %xmm5
        SHUFPD_1 %xmm7, %xmm7

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
        defined(NR) || defined(NC) || defined(TR) || defined(TC)
        addsubpd %xmm5, %xmm4
        addsubpd %xmm7, %xmm6

        movapd %xmm4, %xmm5
        movapd %xmm6, %xmm7
#else
        addsubpd %xmm4, %xmm5
        addsubpd %xmm6, %xmm7

        movapd %xmm5, %xmm4
        movapd %xmm7, %xmm6
#endif

#ifndef TRMMKERNEL
        movsd 0 * SIZE(C1), %xmm0
        movhpd 1 * SIZE(C1), %xmm0
        movsd 2 * SIZE(C1), %xmm1
        movhpd 3 * SIZE(C1), %xmm1
#endif

        SHUFPD_1 %xmm5, %xmm5
        SHUFPD_1 %xmm7, %xmm7

        mulpd %xmm2, %xmm4
        mulpd %xmm2, %xmm6
        mulpd %xmm3, %xmm5
        mulpd %xmm3, %xmm7

        addsubpd %xmm5, %xmm4
        addsubpd %xmm7, %xmm6

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
        addpd %xmm0, %xmm4
        addpd %xmm1, %xmm6
#endif

        movsd %xmm4, 0 * SIZE(C1)
        movhpd %xmm4, 1 * SIZE(C1)
        movsd %xmm6, 2 * SIZE(C1)
        movhpd %xmm6, 3 * SIZE(C1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl K, %eax
        subl KKK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 4), AA
        leal (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl $2, KK
#endif

        addl $4 * SIZE, C1        # coffset += 4
        decl %ebx        # i--
        jg .L10
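
/* .L20: tail case for odd m -- one remaining complex row of C,
   processed with the same ADD1/ADD2 pattern but a single accumulator
   pair. */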
.L20:
        movl M, %ebx
        testl $1, %ebx
        je .L29

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leal 16 * SIZE + BUFFER, %ecx
#else
        leal 16 * SIZE + BUFFER, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 4), BB        /* because it's doubled */
#endif

        movapd -16 * SIZE(AA), %xmm0
        pxor %xmm4, %xmm4
        movapd -16 * SIZE(BB), %xmm1
        pxor %xmm5, %xmm5
        movapd  -8 * SIZE(AA), %xmm2
        pxor %xmm6, %xmm6
        movapd  -8 * SIZE(BB), %xmm3
        pxor %xmm7, %xmm7

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
        addl $1, %eax
        movl %eax, KKK
#endif
        sarl $3, %eax
        jle .L22
.L21:
        mulpd %xmm0, %xmm1
        mulpd -14 * SIZE(BB), %xmm0
        ADD1 %xmm1, %xmm4
        movapd -12 * SIZE(BB), %xmm1
        ADD2 %xmm0, %xmm5
        movapd -14 * SIZE(AA), %xmm0
        mulpd %xmm0, %xmm1
        mulpd -10 * SIZE(BB), %xmm0
        ADD1 %xmm1, %xmm6
        movapd 0 * SIZE(BB), %xmm1
        ADD2 %xmm0, %xmm7
        movapd -12 * SIZE(AA), %xmm0
        mulpd %xmm0, %xmm3
        mulpd -6 * SIZE(BB), %xmm0
        ADD1 %xmm3, %xmm4
        movapd -4 * SIZE(BB), %xmm3
        ADD2 %xmm0, %xmm5
        movapd -10 * SIZE(AA), %xmm0
        mulpd %xmm0, %xmm3
        mulpd -2 * SIZE(BB), %xmm0
        ADD1 %xmm3, %xmm6
        movapd 8 * SIZE(BB), %xmm3
        ADD2 %xmm0, %xmm7
        movapd 0 * SIZE(AA), %xmm0
        mulpd %xmm2, %xmm1
        mulpd 2 * SIZE(BB), %xmm2
        ADD1 %xmm1, %xmm4
        movapd 4 * SIZE(BB), %xmm1
        ADD2 %xmm2, %xmm5
        movapd -6 * SIZE(AA), %xmm2
        mulpd %xmm2, %xmm1
        mulpd 6 * SIZE(BB), %xmm2
        ADD1 %xmm1, %xmm6
        movapd 16 * SIZE(BB), %xmm1
        ADD2 %xmm2, %xmm7
        movapd -4 * SIZE(AA), %xmm2
        mulpd %xmm2, %xmm3
        mulpd 10 * SIZE(BB), %xmm2
        ADD1 %xmm3, %xmm4
        movapd 12 * SIZE(BB), %xmm3
        ADD2 %xmm2, %xmm5
        movapd -2 * SIZE(AA), %xmm2
        mulpd %xmm2, %xmm3
        mulpd 14 * SIZE(BB), %xmm2
        ADD1 %xmm3, %xmm6
        movapd 24 * SIZE(BB), %xmm3
        ADD2 %xmm2, %xmm7
        movapd 8 * SIZE(AA), %xmm2

        subl $-16 * SIZE, AA
        addl $ 32 * SIZE, BB
        decl %eax        # l--
        jg .L21
        ALIGN_2
.L22:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax        # l = (k & 7)
        jle .L24
        ALIGN_2

.L23:
        mulpd %xmm0, %xmm1
        mulpd -14 * SIZE(BB), %xmm0
        ADD1 %xmm1, %xmm4
        movapd -12 * SIZE(BB), %xmm1
        ADD2 %xmm0, %xmm5
        movapd -14 * SIZE(AA), %xmm0

        addl $2 * SIZE, AA
        addl $4 * SIZE, BB
        decl %eax        # l--
        jg .L23
.L24:
        addpd %xmm6, %xmm4
        addpd %xmm7, %xmm5

        movapd ALPHA_R, %xmm2
        movapd ALPHA_I, %xmm3

        SHUFPD_1 %xmm5, %xmm5

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
        defined(NR) || defined(NC) || defined(TR) || defined(TC)
        addsubpd %xmm5, %xmm4

        movapd %xmm4, %xmm5
#else
        addsubpd %xmm4, %xmm5

        movapd %xmm5, %xmm4
#endif

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
        movsd 0 * SIZE(C1), %xmm0
        movhpd 1 * SIZE(C1), %xmm0
#endif

        SHUFPD_1 %xmm5, %xmm5

        mulpd %xmm2, %xmm4
        mulpd %xmm3, %xmm5

        addsubpd %xmm5, %xmm4

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
        addpd %xmm0, %xmm4
#endif

        movsd %xmm4, 0 * SIZE(C1)
        movhpd %xmm4, 1 * SIZE(C1)
        ALIGN_2
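
/* .L29: end of one j iteration -- bump KK for the TRMM right-side
   case, advance C by ldc, and loop back while j > 0. */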
.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addl $1, KK
#endif

        addl LDC, C        # c += ldc
        decl J        # j--
        jg .L01

.L999:
        movl OLD_STACK, %esp

        emms

        popl %ebx
        popl %esi
        popl %edi
        popl %ebp
        ret
        ALIGN_2

        EPILOGUE
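
For reference, the computation performed by the 2x1 micro-kernel above (in
its plain NN form, ignoring the conjugation variants selected by ADD1/ADD2
and the TRMM-specific paths) corresponds to the following minimal scalar
sketch. The function name and signature are illustrative only, not part of
the source:

    #include <complex.h>

    /* Hypothetical scalar reference: c[0..1] += alpha * A(2 x k) * b(k x 1),
     * with A packed two complex entries per k step, in the order the
     * kernel consumes it. */
    static void zgemm_2x1_ref(long k, double complex alpha,
                              const double complex *a,  /* 2 * k entries */
                              const double complex *b,  /* k entries */
                              double complex *c)        /* 2 entries of C */
    {
        double complex c0 = 0.0, c1 = 0.0;
        for (long l = 0; l < k; l++) {
            /* The ADD1/ADD2 instruction pairs implement these complex
             * multiplies with SSE2/SSE3 (mulpd + addsubpd). */
            c0 += a[2 * l + 0] * b[l];
            c1 += a[2 * l + 1] * b[l];
        }
        c[0] += alpha * c0;
        c[1] += alpha * c1;
    }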