
zgemm_kernel_4x1_core2.S 19 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#if !defined(HAVE_SSE) || !defined(HAVE_MMX)
#error You have to check your configuration.
#endif

#define STACK	16
#define ARGS	 0

#define STACK_M		 4 + STACK + ARGS(%esi)
#define STACK_N		 8 + STACK + ARGS(%esi)
#define STACK_K		12 + STACK + ARGS(%esi)
#define STACK_ALPHA_R	16 + STACK + ARGS(%esi)
#define STACK_ALPHA_I	20 + STACK + ARGS(%esi)
#define STACK_A		24 + STACK + ARGS(%esi)
#define STACK_B		28 + STACK + ARGS(%esi)
#define STACK_C		32 + STACK + ARGS(%esi)
#define STACK_LDC	36 + STACK + ARGS(%esi)
#define STACK_OFFT	40 + STACK + ARGS(%esi)

#define ALPHA_R	 16(%esp)
#define ALPHA_I	 32(%esp)
#define K	 48(%esp)
#define N	 52(%esp)
#define M	 56(%esp)
#define A	 60(%esp)
#define C	 64(%esp)
#define J	 68(%esp)
#define OLD_STACK 72(%esp)
#define TEMP	 76(%esp)
#define OFFSET	 80(%esp)
#define KK	 84(%esp)
#define KKK	 88(%esp)
#define BUFFER	128(%esp)

#define B	%edi
#define LDC	%ebp
#define C1	%esi

#define STACK_ALIGN	4096
#define STACK_OFFSET	1024

#define PREFETCH_R   (8 * 16 + 0)
#define PREFETCH_W   (PREFETCH_R * 2)

#define PREFETCHSIZE (8 * 16 + 4)
#define PREFETCH     prefetcht0

#define AA	%edx
#define BB	%ecx

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define ADDSUB	addps
#else
#define ADDSUB	subps
#endif
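
/* Note: ADDSUB is how the conjugation variants share one inner loop.
   The partial products formed with the imaginary part of B are
   accumulated with addps for the variants above (second letter N/T,
   B used as-is) and with subps for the remaining variants, where B
   is conjugated. */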
	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	%esp, %esi	# save old stack
	subl	$128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
	andl	$-STACK_ALIGN, %esp	# align stack
	addl	$STACK_OFFSET, %esp

	STACK_TOUCHING

	movd	STACK_M,   %mm0
	movl	STACK_N,   %eax
	movd	STACK_K,   %mm1
	movd	STACK_A,   %mm2
	movl	STACK_B,   B
	movd	STACK_C,   %mm3
	movl	STACK_LDC, LDC
#ifdef TRMMKERNEL
	movd	STACK_OFFT, %mm4
#endif

	movd	%mm1, K
	movd	%mm0, M
	movl	%eax, N
	movd	%mm2, A
	movd	%mm3, C
	movl	%esi, OLD_STACK

#ifdef TRMMKERNEL
	movd	%mm4, OFFSET
	movd	%mm4, KK
#ifndef LEFT
	negl	KK
#endif
#endif

	subl	$-32 * SIZE, A
	subl	$-32 * SIZE, B

	leal	(, LDC, SIZE * 2), LDC

	movss	STACK_ALPHA_R, %xmm0
	movss	STACK_ALPHA_I, %xmm1

	pcmpeqb	%xmm7, %xmm7
	pslld	$31,   %xmm7	# Generate mask
	shufps	$0, %xmm0, %xmm0

	movaps	%xmm0,  0 + ALPHA_R

	movss	%xmm1,  4 + ALPHA_I
	movss	%xmm1, 12 + ALPHA_I
	xorps	%xmm7, %xmm1
	movss	%xmm1,  0 + ALPHA_I
	movss	%xmm1,  8 + ALPHA_I
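
/* ALPHA_R now holds (ar, ar, ar, ar) and ALPHA_I holds (-ai, ai, -ai, ai):
   the pslld $31 mask flips only the sign bits, so alpha can later be
   applied to packed (re, im) pairs with two mulps and one addps after a
   lane swap. */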
	movl	%eax, J		# j = n
	testl	%eax, %eax
	jle	.L999

.L01:
	leal	32 * SIZE + BUFFER, BB

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	K, %eax
	sarl	$2, %eax
	jle	.L03
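
/* Pack B into BUFFER: every scalar of B (real and imaginary parts alike)
   is broadcast across a full 4-wide vector, so the compute loops below can
   feed it to mulps with a single aligned load. .L02 packs four complex
   elements per pass; .L04 mops up the k % 4 remainder. */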
.L02:
	prefetcht0	(PREFETCH_R + 0) * SIZE(B)

	movss	-32 * SIZE(B), %xmm0
	movss	-31 * SIZE(B), %xmm1
	movss	-30 * SIZE(B), %xmm2
	movss	-29 * SIZE(B), %xmm3
	movss	-28 * SIZE(B), %xmm4
	movss	-27 * SIZE(B), %xmm5
	movss	-26 * SIZE(B), %xmm6
	movss	-25 * SIZE(B), %xmm7

	prefetcht0	(PREFETCH_W +  0) * SIZE(BB)

	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm1, %xmm1
	shufps	$0, %xmm2, %xmm2
	shufps	$0, %xmm3, %xmm3
	shufps	$0, %xmm4, %xmm4
	shufps	$0, %xmm5, %xmm5
	shufps	$0, %xmm6, %xmm6
	shufps	$0, %xmm7, %xmm7

	prefetcht0	(PREFETCH_W + 16) * SIZE(BB)

	movaps	%xmm0, -32 * SIZE(BB)
	movaps	%xmm1, -28 * SIZE(BB)
	movaps	%xmm2, -24 * SIZE(BB)
	movaps	%xmm3, -20 * SIZE(BB)
	movaps	%xmm4, -16 * SIZE(BB)
	movaps	%xmm5, -12 * SIZE(BB)
	movaps	%xmm6,  -8 * SIZE(BB)
	movaps	%xmm7,  -4 * SIZE(BB)

	addl	$  8 * SIZE, B
	subl	$-32 * SIZE, BB
	decl	%eax
	jne	.L02

.L03:
	movl	K, %eax
	andl	$3, %eax
	BRANCH
	jle	.L05

.L04:
	movss	-32 * SIZE(B), %xmm0
	movss	-31 * SIZE(B), %xmm1

	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm1, %xmm1

	movaps	%xmm0, -32 * SIZE(BB)
	movaps	%xmm1, -28 * SIZE(BB)

	addl	$2 * SIZE, B
	addl	$8 * SIZE, BB
	decl	%eax
	jne	.L04
	ALIGN_4

.L05:
	movl	C, C1		# coffset = c
	movl	A, AA		# aoffset = a

	movl	M, %ebx
	sarl	$2, %ebx	# i = (m >> 2)
	jle	.L20
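
/* 4x1 kernel: each pass over .L10 produces four complex elements of one
   column of C. xmm4/xmm5 accumulate the b_r and b_i products for the low
   pair of A elements, xmm6/xmm7 for the high pair. */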
	ALIGN_4

.L10:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	32 * SIZE + BUFFER, BB
#else
	leal	32 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 4), BB	/* because it's doubled */
#endif

	movaps	-32 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movaps	-32 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movapd	-16 * SIZE(AA), %xmm3
	pxor	%xmm6, %xmm6

	prefetcht0	7 * SIZE(C1)
	pxor	%xmm7, %xmm7

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$4, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L15
	ALIGN_4
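/* Inner loop, unrolled eight k-steps deep. Each k-step multiplies four
   complex elements of A (two xmm registers) by the broadcast real part of
   B (addps into xmm4/xmm6) and by the broadcast imaginary part (ADDSUB
   into xmm5/xmm7); the A/B loads are hoisted ahead of use to hide load
   latency on Core2. */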
.L12:
	movaps	%xmm1, %xmm2
	mulps	%xmm0, %xmm1
	addps	%xmm1, %xmm4
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm1, %xmm0
	ADDSUB	%xmm0, %xmm5
	movaps	-28 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	%xmm0, %xmm1
	movaps	-24 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm6
	ADDSUB	%xmm1, %xmm7

	movaps	-24 * SIZE(BB), %xmm1
	movaps	%xmm1, %xmm2
	mulps	%xmm0, %xmm1
	addps	%xmm1, %xmm4
	movaps	-20 * SIZE(BB), %xmm1
	mulps	%xmm1, %xmm0
	ADDSUB	%xmm0, %xmm5
	movaps	-20 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	%xmm0, %xmm1
	movaps	  0 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm6
	ADDSUB	%xmm1, %xmm7

	movaps	-16 * SIZE(BB), %xmm1
	movaps	%xmm1, %xmm2
	mulps	%xmm3, %xmm1
	addps	%xmm1, %xmm4
	movaps	-12 * SIZE(BB), %xmm1
	mulps	%xmm1, %xmm3
	ADDSUB	%xmm3, %xmm5
	movaps	-12 * SIZE(AA), %xmm3
	mulps	%xmm3, %xmm2
	mulps	%xmm3, %xmm1
	movaps	 -8 * SIZE(AA), %xmm3
	addps	%xmm2, %xmm6
	ADDSUB	%xmm1, %xmm7

	movaps	 -8 * SIZE(BB), %xmm1
	movaps	%xmm1, %xmm2
	mulps	%xmm3, %xmm1
	addps	%xmm1, %xmm4
	movaps	 -4 * SIZE(BB), %xmm1
	mulps	%xmm1, %xmm3
	ADDSUB	%xmm3, %xmm5
	movaps	 -4 * SIZE(AA), %xmm3
	mulps	%xmm3, %xmm2
	mulps	%xmm3, %xmm1
	movaps	 16 * SIZE(AA), %xmm3
	addps	%xmm2, %xmm6
	ADDSUB	%xmm1, %xmm7

	movaps	  0 * SIZE(BB), %xmm1
	movaps	%xmm1, %xmm2
	mulps	%xmm0, %xmm1
	addps	%xmm1, %xmm4
	movaps	  4 * SIZE(BB), %xmm1
	mulps	%xmm1, %xmm0
	ADDSUB	%xmm0, %xmm5
	movaps	  4 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	%xmm0, %xmm1
	movaps	  8 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm6
	ADDSUB	%xmm1, %xmm7

	movaps	  8 * SIZE(BB), %xmm1
	movaps	%xmm1, %xmm2
	mulps	%xmm0, %xmm1
	addps	%xmm1, %xmm4
	movaps	 12 * SIZE(BB), %xmm1
	mulps	%xmm1, %xmm0
	ADDSUB	%xmm0, %xmm5
	movaps	 12 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	%xmm0, %xmm1
	movaps	 32 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm6
	ADDSUB	%xmm1, %xmm7

	movaps	 16 * SIZE(BB), %xmm1
	movaps	%xmm1, %xmm2
	mulps	%xmm3, %xmm1
	addps	%xmm1, %xmm4
	movaps	 20 * SIZE(BB), %xmm1
	mulps	%xmm1, %xmm3
	ADDSUB	%xmm3, %xmm5
	movaps	 20 * SIZE(AA), %xmm3
	mulps	%xmm3, %xmm2
	mulps	%xmm3, %xmm1
	addps	%xmm2, %xmm6
	movaps	 24 * SIZE(AA), %xmm3
	ADDSUB	%xmm1, %xmm7

	movaps	 24 * SIZE(BB), %xmm1
	movaps	%xmm1, %xmm2
	mulps	%xmm3, %xmm1
	addps	%xmm1, %xmm4
	movaps	 28 * SIZE(BB), %xmm1
	mulps	%xmm1, %xmm3
	ADDSUB	%xmm3, %xmm5
	movaps	 28 * SIZE(AA), %xmm3
	mulps	%xmm3, %xmm2
	mulps	%xmm3, %xmm1
	subl	$-64 * SIZE, BB
	movaps	 48 * SIZE(AA), %xmm3
	subl	$-64 * SIZE, AA
	addps	%xmm2, %xmm6
	ADDSUB	%xmm1, %xmm7
	movaps	-32 * SIZE(BB), %xmm1

	decl	%eax
	jne	.L12
	ALIGN_4

.L15:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax	# remainder: k & 7
	BRANCH
	je	.L18

.L16:
	movaps	%xmm1, %xmm2
	mulps	%xmm0, %xmm1
	addps	%xmm1, %xmm4
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm1, %xmm0
	ADDSUB	%xmm0, %xmm5
	movaps	-28 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	%xmm0, %xmm1
	movaps	-24 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm6
	ADDSUB	%xmm1, %xmm7
	movaps	-24 * SIZE(BB), %xmm1

	addl	$8 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L16

.L18:
	movaps	ALPHA_R, %xmm0
	movaps	ALPHA_I, %xmm1
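
/* Reduce to complex results: shufps $0xb1 swaps the (re, im) lanes and
   addsubps merges the two partial-sum registers with the sign pattern the
   variant needs; the merged values are then scaled by ALPHA_R and the
   sign-alternating ALPHA_I and, unless beta is zero, added to C. */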
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)

	shufps	$0xb1, %xmm5, %xmm5
	shufps	$0xb1, %xmm7, %xmm7

	addsubps	%xmm5, %xmm4
	addsubps	%xmm7, %xmm6

	movaps	%xmm4, %xmm5
	movaps	%xmm6, %xmm7

	shufps	$0xb1, %xmm4, %xmm4
	shufps	$0xb1, %xmm6, %xmm6
#else
	shufps	$0xb1, %xmm4, %xmm4
	shufps	$0xb1, %xmm6, %xmm6

	addsubps	%xmm4, %xmm5
	addsubps	%xmm6, %xmm7

	movaps	%xmm5, %xmm4
	movaps	%xmm7, %xmm6

	shufps	$0xb1, %xmm5, %xmm5
	shufps	$0xb1, %xmm7, %xmm7
#endif

	mulps	%xmm0, %xmm5
	mulps	%xmm1, %xmm4
	mulps	%xmm0, %xmm7
	mulps	%xmm1, %xmm6

	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(C1), %xmm2
	movhps	2 * SIZE(C1), %xmm2
	movsd	4 * SIZE(C1), %xmm3
	movhps	6 * SIZE(C1), %xmm3

	addps	%xmm2, %xmm4
	addps	%xmm3, %xmm6
#endif

	movsd	%xmm4, 0 * SIZE(C1)
	movhps	%xmm4, 2 * SIZE(C1)
	movsd	%xmm6, 4 * SIZE(C1)
	movhps	%xmm6, 6 * SIZE(C1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, 8), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$4, KK
#endif

	addl	$8 * SIZE, C1
	decl	%ebx
	jg	.L10
	ALIGN_2

.L20:
	movl	M, %ebx
	testl	$2, %ebx
	jle	.L30
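
/* M & 2 remainder: the same scheme at half width, two complex elements of
   A per k-step. The two accumulator pairs alternate between even and odd
   k-steps and are folded together at .L28. */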
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	32 * SIZE + BUFFER, BB
#else
	leal	32 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB	/* because it's doubled */
#endif

	movaps	-32 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movaps	-32 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movaps	-16 * SIZE(AA), %xmm2
	pxor	%xmm6, %xmm6
	movaps	-16 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L25
	ALIGN_4

.L22:
	mulps	%xmm0, %xmm1
	mulps	-28 * SIZE(BB), %xmm0
	addps	%xmm1, %xmm4
	movaps	-24 * SIZE(BB), %xmm1
	ADDSUB	%xmm0, %xmm5
	movaps	-28 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm1
	mulps	-20 * SIZE(BB), %xmm0
	addps	%xmm1, %xmm6
	movaps	  0 * SIZE(BB), %xmm1
	ADDSUB	%xmm0, %xmm7
	movaps	-24 * SIZE(AA), %xmm0

	mulps	%xmm0, %xmm3
	mulps	-12 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm4
	movaps	 -8 * SIZE(BB), %xmm3
	ADDSUB	%xmm0, %xmm5
	movaps	-20 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm3
	mulps	 -4 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm6
	movaps	 16 * SIZE(BB), %xmm3
	ADDSUB	%xmm0, %xmm7
	movaps	  0 * SIZE(AA), %xmm0

	mulps	%xmm2, %xmm1
	mulps	  4 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm4
	movaps	  8 * SIZE(BB), %xmm1
	ADDSUB	%xmm2, %xmm5
	movaps	-12 * SIZE(AA), %xmm2
	mulps	%xmm2, %xmm1
	mulps	 12 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm6
	movaps	 32 * SIZE(BB), %xmm1
	ADDSUB	%xmm2, %xmm7
	movaps	 -8 * SIZE(AA), %xmm2

	mulps	%xmm2, %xmm3
	mulps	 20 * SIZE(BB), %xmm2
	addps	%xmm3, %xmm4
	movaps	 24 * SIZE(BB), %xmm3
	ADDSUB	%xmm2, %xmm5
	movaps	 -4 * SIZE(AA), %xmm2
	mulps	%xmm2, %xmm3
	mulps	 28 * SIZE(BB), %xmm2
	addps	%xmm3, %xmm6
	movaps	 48 * SIZE(BB), %xmm3
	ADDSUB	%xmm2, %xmm7
	movaps	 16 * SIZE(AA), %xmm2

	subl	$-32 * SIZE, AA
	addl	$ 64 * SIZE, BB
	decl	%eax
	jne	.L22
	ALIGN_2
.L25:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax	# remainder: k & 7
	BRANCH
	je	.L28

.L26:
	mulps	%xmm0, %xmm1
	mulps	-28 * SIZE(BB), %xmm0
	addps	%xmm1, %xmm4
	movaps	-24 * SIZE(BB), %xmm1
	ADDSUB	%xmm0, %xmm5
	movaps	-28 * SIZE(AA), %xmm0

	addl	$4 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L26

.L28:
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5

	movaps	ALPHA_R, %xmm0
	movaps	ALPHA_I, %xmm1

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	shufps	$0xb1, %xmm5, %xmm5
	addsubps	%xmm5, %xmm4
	movaps	%xmm4, %xmm5
	shufps	$0xb1, %xmm4, %xmm4
#else
	shufps	$0xb1, %xmm4, %xmm4
	addsubps	%xmm4, %xmm5
	movaps	%xmm5, %xmm4
	shufps	$0xb1, %xmm5, %xmm5
#endif

	mulps	%xmm0, %xmm5
	mulps	%xmm1, %xmm4
	addps	%xmm5, %xmm4

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(C1), %xmm2
	movhps	2 * SIZE(C1), %xmm2

	addps	%xmm2, %xmm4
#endif

	movsd	%xmm4, 0 * SIZE(C1)
	movhps	%xmm4, 2 * SIZE(C1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, 8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif

	addl	$4 * SIZE, C1
	ALIGN_2

.L30:
	testl	$1, %ebx
	jle	.L39
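
/* M & 1 remainder: one complex element of A per k-step, loaded with movsd
   so only the low two lanes carry data; the accumulator pairs are again
   folded at .L38. */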
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	32 * SIZE + BUFFER, BB
#else
	leal	32 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB	/* because it's doubled */
#endif

	movsd	-32 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movsd	-32 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movsd	-24 * SIZE(AA), %xmm2
	pxor	%xmm6, %xmm6
	movsd	-16 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
	addl	$1, %eax
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L35
	ALIGN_4

.L32:
	mulps	%xmm0, %xmm1
	mulps	-28 * SIZE(BB), %xmm0
	addps	%xmm1, %xmm4
	movsd	-24 * SIZE(BB), %xmm1
	ADDSUB	%xmm0, %xmm5
	movsd	-30 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm1
	mulps	-20 * SIZE(BB), %xmm0
	addps	%xmm1, %xmm6
	movsd	  0 * SIZE(BB), %xmm1
	ADDSUB	%xmm0, %xmm7
	movsd	-28 * SIZE(AA), %xmm0

	mulps	%xmm0, %xmm3
	mulps	-12 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm4
	movsd	 -8 * SIZE(BB), %xmm3
	ADDSUB	%xmm0, %xmm5
	movsd	-26 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm3
	mulps	 -4 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm6
	movsd	 16 * SIZE(BB), %xmm3
	ADDSUB	%xmm0, %xmm7
	movsd	-16 * SIZE(AA), %xmm0

	mulps	%xmm2, %xmm1
	mulps	  4 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm4
	movsd	  8 * SIZE(BB), %xmm1
	ADDSUB	%xmm2, %xmm5
	movsd	-22 * SIZE(AA), %xmm2
	mulps	%xmm2, %xmm1
	mulps	 12 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm6
	movsd	 32 * SIZE(BB), %xmm1
	ADDSUB	%xmm2, %xmm7
	movsd	-20 * SIZE(AA), %xmm2

	mulps	%xmm2, %xmm3
	mulps	 20 * SIZE(BB), %xmm2
	addps	%xmm3, %xmm4
	movsd	 24 * SIZE(BB), %xmm3
	ADDSUB	%xmm2, %xmm5
	movsd	-18 * SIZE(AA), %xmm2
	mulps	%xmm2, %xmm3
	mulps	 28 * SIZE(BB), %xmm2
	addps	%xmm3, %xmm6
	movsd	 48 * SIZE(BB), %xmm3
	ADDSUB	%xmm2, %xmm7
	movsd	 -8 * SIZE(AA), %xmm2

	subl	$-16 * SIZE, AA
	addl	$ 64 * SIZE, BB
	decl	%eax
	jne	.L32
	ALIGN_2
.L35:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax	# remainder: k & 7
	BRANCH
	je	.L38

.L36:
	mulps	%xmm0, %xmm1
	mulps	-28 * SIZE(BB), %xmm0
	addps	%xmm1, %xmm4
	movsd	-24 * SIZE(BB), %xmm1
	ADDSUB	%xmm0, %xmm5
	movsd	-30 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L36

.L38:
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5

	movaps	ALPHA_R, %xmm0
	movaps	ALPHA_I, %xmm1

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	shufps	$0xb1, %xmm5, %xmm5
	addsubps	%xmm5, %xmm4
	movaps	%xmm4, %xmm5
	shufps	$0xb1, %xmm4, %xmm4
#else
	shufps	$0xb1, %xmm4, %xmm4
	addsubps	%xmm4, %xmm5
	movaps	%xmm5, %xmm4
	shufps	$0xb1, %xmm5, %xmm5
#endif

	mulps	%xmm0, %xmm5
	mulps	%xmm1, %xmm4
	addps	%xmm5, %xmm4

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(C1), %xmm2

	addps	%xmm2, %xmm4
#endif

	movsd	%xmm4, 0 * SIZE(C1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, 8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif
	ALIGN_2

.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$1, KK
#endif

	addl	LDC, C		# c += ldc
	decl	J		# j --
	jg	.L01
	ALIGN_2

.L999:
	movl	OLD_STACK, %esp

	EMMS

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE
  751. EPILOGUE