You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

zgemm_kernel_1x2_barcelona.S 16 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 16
  41. #define ARGS 16
  42. #define M 4 + STACK + ARGS(%esp)
  43. #define N 8 + STACK + ARGS(%esp)
  44. #define K 12 + STACK + ARGS(%esp)
  45. #define ALPHA_R 16 + STACK + ARGS(%esp)
  46. #define ALPHA_I 24 + STACK + ARGS(%esp)
  47. #define A 32 + STACK + ARGS(%esp)
  48. #define OLD_B 36 + STACK + ARGS(%esp)
  49. #define C 40 + STACK + ARGS(%esp)
  50. #define OLD_LDC 44 + STACK + ARGS(%esp)
  51. #define OFFSET 48 + STACK + ARGS(%esp)
  52. #define J 0 + STACK(%esp)
  53. #define BX 4 + STACK(%esp)
  54. #define KK 8 + STACK(%esp)
  55. #define KKK 12 + STACK(%esp)
  56. #define B %edi
  57. #define LDC %ebp
  58. #define AO %edx
  59. #define BO %ecx
  60. #define CO %esi
  61. #define I %ebx
  62. #define movsd movlps
  63. #define movapd movups
  64. #define movlpd movlps
  65. #define movhpd movhps
  66. #define PREFETCH prefetch
  67. #define PREFETCHSIZE (8 * 7 + 0)
  68. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  69. #define ADD1 addpd
  70. #define ADD2 addpd
  71. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  72. #define ADD1 addpd
  73. #define ADD2 subpd
  74. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  75. #define ADD1 subpd
  76. #define ADD2 addpd
  77. #else
  78. #define ADD1 subpd
  79. #define ADD2 subpd
  80. #endif
  81. #define KERNEL1(address) \
  82. mulpd %xmm0, %xmm1; \
  83. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %eax, 2); \
  84. mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \
  85. ADD1 %xmm1, %xmm4; \
  86. movapd -12 * SIZE(BO, %eax, 4), %xmm1; \
  87. ADD1 %xmm0, %xmm6; \
  88. movddup -15 * SIZE(AO, %eax, 2), %xmm0; \
  89. mulpd %xmm0, %xmm2; \
  90. mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \
  91. ADD2 %xmm0, %xmm7; \
  92. movddup -14 * SIZE(AO, %eax, 2), %xmm0
  93. #define KERNEL2(address) \
  94. ADD2 %xmm2, %xmm5; \
  95. movapd %xmm1, %xmm2; \
  96. mulpd %xmm0, %xmm1; \
  97. mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \
  98. ADD1 %xmm1, %xmm4; \
  99. movapd -8 * SIZE(BO, %eax, 4), %xmm1; \
  100. ADD1 %xmm0, %xmm6; \
  101. movddup -13 * SIZE(AO, %eax, 2), %xmm0; \
  102. mulpd %xmm0, %xmm2; \
  103. mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \
  104. ADD2 %xmm0, %xmm7; \
  105. movddup -12 * SIZE(AO, %eax, 2), %xmm0
  106. #define KERNEL3(address) \
  107. ADD2 %xmm2, %xmm5; \
  108. movapd %xmm1, %xmm2; \
  109. mulpd %xmm0, %xmm1; \
  110. mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \
  111. ADD1 %xmm1, %xmm4; \
  112. movapd -4 * SIZE(BO, %eax, 4), %xmm1; \
  113. ADD1 %xmm0, %xmm6; \
  114. movddup -11 * SIZE(AO, %eax, 2), %xmm0; \
  115. mulpd %xmm0, %xmm2; \
  116. mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \
  117. ADD2 %xmm0, %xmm7; \
  118. movddup -10 * SIZE(AO, %eax, 2), %xmm0
  119. #define KERNEL4(address) \
  120. ADD2 %xmm2, %xmm5; \
  121. movapd %xmm1, %xmm2; \
  122. mulpd %xmm0, %xmm1; \
  123. mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \
  124. ADD1 %xmm1, %xmm4; \
  125. movapd (BO, %eax, 4), %xmm1; \
  126. ADD1 %xmm0, %xmm6; \
  127. movddup -9 * SIZE(AO, %eax, 2), %xmm0; \
  128. mulpd %xmm0, %xmm2; \
  129. mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \
  130. ADD2 %xmm0, %xmm7; \
  131. movddup (AO, %eax, 2), %xmm0
  132. #define KERNEL5(address) \
  133. ADD2 %xmm2, %xmm5; \
  134. movapd %xmm1, %xmm2; \
  135. mulpd %xmm3, %xmm1; \
  136. mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \
  137. ADD1 %xmm1, %xmm4; \
  138. movapd 4 * SIZE(BO, %eax, 4), %xmm1; \
  139. ADD1 %xmm3, %xmm6; \
  140. movddup -7 * SIZE(AO, %eax, 2), %xmm3; \
  141. mulpd %xmm3, %xmm2; \
  142. mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \
  143. ADD2 %xmm3, %xmm7; \
  144. movddup -6 * SIZE(AO, %eax, 2), %xmm3
  145. #define KERNEL6(address) \
  146. ADD2 %xmm2, %xmm5; \
  147. movapd %xmm1, %xmm2; \
  148. mulpd %xmm3, %xmm1; \
  149. mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \
  150. ADD1 %xmm1, %xmm4; \
  151. movapd 8 * SIZE(BO, %eax, 4), %xmm1; \
  152. ADD1 %xmm3, %xmm6; \
  153. movddup -5 * SIZE(AO, %eax, 2), %xmm3; \
  154. mulpd %xmm3, %xmm2; \
  155. mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \
  156. ADD2 %xmm3, %xmm7; \
  157. movddup -4 * SIZE(AO, %eax, 2), %xmm3
  158. #define KERNEL7(address) \
  159. ADD2 %xmm2, %xmm5; \
  160. movapd %xmm1, %xmm2; \
  161. mulpd %xmm3, %xmm1; \
  162. mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \
  163. ADD1 %xmm1, %xmm4; \
  164. movapd 12 * SIZE(BO, %eax, 4), %xmm1; \
  165. ADD1 %xmm3, %xmm6; \
  166. movddup -3 * SIZE(AO, %eax, 2), %xmm3; \
  167. mulpd %xmm3, %xmm2; \
  168. mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \
  169. ADD2 %xmm3, %xmm7; \
  170. movddup -2 * SIZE(AO, %eax, 2), %xmm3
  171. #define KERNEL8(address) \
  172. ADD2 %xmm2, %xmm5; \
  173. movapd %xmm1, %xmm2; \
  174. mulpd %xmm3, %xmm1; \
  175. mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \
  176. ADD1 %xmm1, %xmm4; \
  177. movapd 16 * SIZE(BO, %eax, 4), %xmm1; \
  178. ADD1 %xmm3, %xmm6; \
  179. movddup -1 * SIZE(AO, %eax, 2), %xmm3; \
  180. mulpd %xmm3, %xmm2; \
  181. mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \
  182. ADD2 %xmm3, %xmm7; \
  183. movddup 8 * SIZE(AO, %eax, 2), %xmm3; \
  184. ADD2 %xmm2, %xmm5; \
  185. movapd %xmm1, %xmm2
  186. PROLOGUE
  187. subl $ARGS, %esp
  188. pushl %ebp
  189. pushl %edi
  190. pushl %esi
  191. pushl %ebx
  192. PROFCODE
  193. movl OLD_B, B
  194. movl OLD_LDC, LDC
  195. #ifdef TRMMKERNEL
  196. movl OFFSET, %eax
  197. #ifndef LEFT
  198. negl %eax
  199. #endif
  200. movl %eax, KK
  201. #endif
  202. subl $-16 * SIZE, A
  203. subl $-16 * SIZE, B
  204. sall $ZBASE_SHIFT, LDC
  205. movl N, %eax
  206. sarl $1, %eax
  207. movl %eax, J # j = n
  208. jle .L100
  209. ALIGN_4
  210. .L01:
  211. #if defined(TRMMKERNEL) && defined(LEFT)
  212. movl OFFSET, %eax
  213. movl %eax, KK
  214. #endif
  215. leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax
  216. movl %eax, BX
  217. movl C, CO
  218. movl A, AO
  219. movl M, I
  220. testl I, I
  221. jle .L100
  222. ALIGN_4
  223. .L10:
  224. #if !defined(TRMMKERNEL) || \
  225. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  226. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  227. movl B, BO
  228. #else
  229. movl KK, %eax
  230. leal (, %eax, SIZE), %eax
  231. leal (AO, %eax, 2), AO
  232. leal (B, %eax, 4), BO
  233. #endif
  234. movl BX, %eax
  235. prefetcht2 0 * SIZE(%eax)
  236. subl $-8 * SIZE, BX
  237. movddup -16 * SIZE(AO), %xmm0
  238. movapd -16 * SIZE(BO), %xmm1
  239. pxor %xmm4, %xmm4
  240. movddup -8 * SIZE(AO), %xmm3
  241. pxor %xmm5, %xmm5
  242. prefetchw 1 * SIZE(CO)
  243. pxor %xmm6, %xmm6
  244. prefetchw 1 * SIZE(CO, LDC)
  245. pxor %xmm7, %xmm7
  246. movapd %xmm1, %xmm2
  247. #ifndef TRMMKERNEL
  248. movl K, %eax
  249. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  250. movl K, %eax
  251. subl KK, %eax
  252. movl %eax, KKK
  253. #else
  254. movl KK, %eax
  255. #ifdef LEFT
  256. addl $1, %eax
  257. #else
  258. addl $2, %eax
  259. #endif
  260. movl %eax, KKK
  261. #endif
  262. andl $-8, %eax
  263. leal (, %eax, SIZE), %eax
  264. leal (AO, %eax, 2), AO
  265. leal (BO, %eax, 4), BO
  266. negl %eax
  267. NOBRANCH
  268. je .L15
  269. ALIGN_3
  270. .L12:
  271. KERNEL1(16 * 0)
  272. KERNEL2(16 * 0)
  273. KERNEL3(16 * 0)
  274. KERNEL4(16 * 0)
  275. KERNEL5(16 * 0)
  276. KERNEL6(16 * 0)
  277. KERNEL7(16 * 0)
  278. KERNEL8(16 * 0)
  279. addl $8 * SIZE, %eax
  280. NOBRANCH
  281. je .L15
  282. KERNEL1(16 * 0)
  283. KERNEL2(16 * 0)
  284. KERNEL3(16 * 0)
  285. KERNEL4(16 * 0)
  286. KERNEL5(16 * 0)
  287. KERNEL6(16 * 0)
  288. KERNEL7(16 * 0)
  289. KERNEL8(16 * 0)
  290. addl $8 * SIZE, %eax
  291. NOBRANCH
  292. je .L15
  293. KERNEL1(16 * 0)
  294. KERNEL2(16 * 0)
  295. KERNEL3(16 * 0)
  296. KERNEL4(16 * 0)
  297. KERNEL5(16 * 0)
  298. KERNEL6(16 * 0)
  299. KERNEL7(16 * 0)
  300. KERNEL8(16 * 0)
  301. addl $8 * SIZE, %eax
  302. NOBRANCH
  303. je .L15
  304. KERNEL1(16 * 0)
  305. KERNEL2(16 * 0)
  306. KERNEL3(16 * 0)
  307. KERNEL4(16 * 0)
  308. KERNEL5(16 * 0)
  309. KERNEL6(16 * 0)
  310. KERNEL7(16 * 0)
  311. KERNEL8(16 * 0)
  312. addl $8 * SIZE, %eax
  313. NOBRANCH
  314. je .L15
  315. KERNEL1(16 * 0)
  316. KERNEL2(16 * 0)
  317. KERNEL3(16 * 0)
  318. KERNEL4(16 * 0)
  319. KERNEL5(16 * 0)
  320. KERNEL6(16 * 0)
  321. KERNEL7(16 * 0)
  322. KERNEL8(16 * 0)
  323. addl $8 * SIZE, %eax
  324. NOBRANCH
  325. je .L15
  326. KERNEL1(16 * 0)
  327. KERNEL2(16 * 0)
  328. KERNEL3(16 * 0)
  329. KERNEL4(16 * 0)
  330. KERNEL5(16 * 0)
  331. KERNEL6(16 * 0)
  332. KERNEL7(16 * 0)
  333. KERNEL8(16 * 0)
  334. addl $8 * SIZE, %eax
  335. NOBRANCH
  336. je .L15
  337. KERNEL1(16 * 0)
  338. KERNEL2(16 * 0)
  339. KERNEL3(16 * 0)
  340. KERNEL4(16 * 0)
  341. KERNEL5(16 * 0)
  342. KERNEL6(16 * 0)
  343. KERNEL7(16 * 0)
  344. KERNEL8(16 * 0)
  345. addl $8 * SIZE, %eax
  346. NOBRANCH
  347. je .L15
  348. KERNEL1(16 * 0)
  349. KERNEL2(16 * 0)
  350. KERNEL3(16 * 0)
  351. KERNEL4(16 * 0)
  352. KERNEL5(16 * 0)
  353. KERNEL6(16 * 0)
  354. KERNEL7(16 * 0)
  355. KERNEL8(16 * 0)
  356. addl $8 * SIZE, %eax
  357. BRANCH
  358. jl .L12
  359. ALIGN_3
  360. .L15:
  361. #ifndef TRMMKERNEL
  362. movl K, %eax
  363. #else
  364. movl KKK, %eax
  365. #endif
  366. andl $7, %eax # if (k & 1)
  367. BRANCH
  368. je .L14
  369. leal (, %eax, SIZE), %eax
  370. leal (AO, %eax, 2), AO
  371. leal (BO, %eax, 4), BO
  372. negl %eax
  373. ALIGN_4
  374. .L16:
  375. mulpd %xmm0, %xmm1
  376. mulpd -14 * SIZE(BO, %eax, 4), %xmm0
  377. ADD1 %xmm1, %xmm4
  378. movapd -12 * SIZE(BO, %eax, 4), %xmm1
  379. ADD1 %xmm0, %xmm6
  380. movddup -15 * SIZE(AO, %eax, 2), %xmm0
  381. mulpd %xmm0, %xmm2
  382. mulpd -14 * SIZE(BO, %eax, 4), %xmm0
  383. ADD2 %xmm0, %xmm7
  384. movddup -14 * SIZE(AO, %eax, 2), %xmm0
  385. ADD2 %xmm2, %xmm5
  386. movapd %xmm1, %xmm2
  387. addl $SIZE, %eax
  388. jl .L16
  389. ALIGN_4
  390. .L14:
  391. #ifndef TRMMKERNEL
  392. movupd 0 * SIZE(CO), %xmm0
  393. movupd 0 * SIZE(CO, LDC), %xmm1
  394. #endif
  395. movddup ALPHA_R, %xmm2
  396. movddup ALPHA_I, %xmm3
  397. SHUFPD_1 %xmm5, %xmm5
  398. SHUFPD_1 %xmm7, %xmm7
  399. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  400. defined(RN) || defined(RT) || defined(CN) || defined(CT)
  401. addsubpd %xmm5, %xmm4
  402. addsubpd %xmm7, %xmm6
  403. pshufd $0x4e, %xmm4, %xmm5
  404. pshufd $0x4e, %xmm6, %xmm7
  405. #else
  406. addsubpd %xmm4, %xmm5
  407. addsubpd %xmm6, %xmm7
  408. movapd %xmm5, %xmm4
  409. pshufd $0x4e, %xmm5, %xmm5
  410. movapd %xmm7, %xmm6
  411. pshufd $0x4e, %xmm7, %xmm7
  412. #endif
  413. mulpd %xmm2, %xmm4
  414. mulpd %xmm3, %xmm5
  415. mulpd %xmm2, %xmm6
  416. mulpd %xmm3, %xmm7
  417. addsubpd %xmm5, %xmm4
  418. addsubpd %xmm7, %xmm6
  419. #ifndef TRMMKERNEL
  420. addpd %xmm0, %xmm4
  421. addpd %xmm1, %xmm6
  422. #endif
  423. movlpd %xmm4, 0 * SIZE(CO)
  424. movhpd %xmm4, 1 * SIZE(CO)
  425. movlpd %xmm6, 0 * SIZE(CO, LDC)
  426. movhpd %xmm6, 1 * SIZE(CO, LDC)
  427. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  428. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  429. movl K, %eax
  430. subl KKK, %eax
  431. leal (,%eax, SIZE), %eax
  432. leal (AO, %eax, 2), AO
  433. leal (BO, %eax, 4), BO
  434. #endif
  435. #if defined(TRMMKERNEL) && defined(LEFT)
  436. addl $1, KK
  437. #endif
  438. addl $2 * SIZE, CO # coffset += 4
  439. decl I # i --
  440. jg .L10
  441. ALIGN_4
  442. .L99:
  443. #if defined(TRMMKERNEL) && !defined(LEFT)
  444. addl $2, KK
  445. #endif
  446. movl BO, B
  447. leal (, LDC, 2), %eax
  448. addl %eax, C # c += ldc
  449. decl J # j --
  450. jg .L01
  451. ALIGN_4
  452. .L100:
  453. movl N, %eax
  454. andl $1, %eax
  455. jle .L500
  456. ALIGN_4
  457. .L101:
  458. #if defined(TRMMKERNEL) && defined(LEFT)
  459. movl OFFSET, %eax
  460. movl %eax, KK
  461. #endif
  462. movl C, CO
  463. movl A, AO
  464. movl M, I
  465. testl %ebx, I
  466. jle .L500
  467. ALIGN_4
  468. .L110:
  469. #if !defined(TRMMKERNEL) || \
  470. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  471. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  472. movl B, BO
  473. #else
  474. movl KK, %eax
  475. leal (, %eax, SIZE), %eax
  476. leal (AO, %eax, 2), AO
  477. leal (B, %eax, 2), BO
  478. #endif
  479. movddup -16 * SIZE(AO), %xmm0
  480. pxor %xmm4, %xmm4
  481. movddup -15 * SIZE(AO), %xmm1
  482. pxor %xmm5, %xmm5
  483. pxor %xmm6, %xmm6
  484. pxor %xmm7, %xmm7
  485. prefetchw 1 * SIZE(CO)
  486. #ifndef TRMMKERNEL
  487. movl K, %eax
  488. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  489. movl K, %eax
  490. subl KK, %eax
  491. movl %eax, KKK
  492. #else
  493. movl KK, %eax
  494. #ifdef LEFT
  495. addl $1, %eax
  496. #else
  497. addl $1, %eax
  498. #endif
  499. movl %eax, KKK
  500. #endif
  501. sarl $3, %eax
  502. je .L112
  503. ALIGN_4
  504. .L111:
  505. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  506. mulpd -16 * SIZE(BO), %xmm0
  507. ADD1 %xmm0, %xmm4
  508. movddup -14 * SIZE(AO), %xmm0
  509. mulpd -16 * SIZE(BO), %xmm1
  510. ADD2 %xmm1, %xmm5
  511. movddup -13 * SIZE(AO), %xmm1
  512. mulpd -14 * SIZE(BO), %xmm0
  513. ADD1 %xmm0, %xmm6
  514. movddup -12 * SIZE(AO), %xmm0
  515. mulpd -14 * SIZE(BO), %xmm1
  516. ADD2 %xmm1, %xmm7
  517. movddup -11 * SIZE(AO), %xmm1
  518. mulpd -12 * SIZE(BO), %xmm0
  519. ADD1 %xmm0, %xmm4
  520. movddup -10 * SIZE(AO), %xmm0
  521. mulpd -12 * SIZE(BO), %xmm1
  522. ADD2 %xmm1, %xmm5
  523. movddup -9 * SIZE(AO), %xmm1
  524. mulpd -10 * SIZE(BO), %xmm0
  525. ADD1 %xmm0, %xmm6
  526. movddup -8 * SIZE(AO), %xmm0
  527. mulpd -10 * SIZE(BO), %xmm1
  528. ADD2 %xmm1, %xmm7
  529. movddup -7 * SIZE(AO), %xmm1
  530. mulpd -8 * SIZE(BO), %xmm0
  531. ADD1 %xmm0, %xmm4
  532. movddup -6 * SIZE(AO), %xmm0
  533. mulpd -8 * SIZE(BO), %xmm1
  534. ADD2 %xmm1, %xmm5
  535. movddup -5 * SIZE(AO), %xmm1
  536. mulpd -6 * SIZE(BO), %xmm0
  537. ADD1 %xmm0, %xmm6
  538. movddup -4 * SIZE(AO), %xmm0
  539. mulpd -6 * SIZE(BO), %xmm1
  540. ADD2 %xmm1, %xmm7
  541. movddup -3 * SIZE(AO), %xmm1
  542. mulpd -4 * SIZE(BO), %xmm0
  543. ADD1 %xmm0, %xmm4
  544. movddup -2 * SIZE(AO), %xmm0
  545. mulpd -4 * SIZE(BO), %xmm1
  546. ADD2 %xmm1, %xmm5
  547. movddup -1 * SIZE(AO), %xmm1
  548. mulpd -2 * SIZE(BO), %xmm0
  549. ADD1 %xmm0, %xmm6
  550. movddup 0 * SIZE(AO), %xmm0
  551. mulpd -2 * SIZE(BO), %xmm1
  552. ADD2 %xmm1, %xmm7
  553. movddup 1 * SIZE(AO), %xmm1
  554. subl $-16 * SIZE, AO
  555. subl $-16 * SIZE, BO
  556. decl %eax
  557. jne .L111
  558. ALIGN_4
  559. .L112:
  560. #ifndef TRMMKERNEL
  561. movl K, %eax
  562. #else
  563. movl KKK, %eax
  564. #endif
  565. andl $7, %eax # if (k & 1)
  566. BRANCH
  567. je .L114
  568. ALIGN_4
  569. .L113:
  570. mulpd -16 * SIZE(BO), %xmm0
  571. ADD1 %xmm0, %xmm4
  572. movddup -14 * SIZE(AO), %xmm0
  573. mulpd -16 * SIZE(BO), %xmm1
  574. ADD2 %xmm1, %xmm5
  575. movddup -13 * SIZE(AO), %xmm1
  576. addl $2 * SIZE, AO
  577. addl $2 * SIZE, BO
  578. decl %eax
  579. jg .L113
  580. ALIGN_4
  581. .L114:
  582. #ifndef TRMMKERNEL
  583. movupd 0 * SIZE(CO), %xmm0
  584. #endif
  585. movddup ALPHA_R, %xmm2
  586. movddup ALPHA_I, %xmm3
  587. addpd %xmm6, %xmm4
  588. addpd %xmm7, %xmm5
  589. SHUFPD_1 %xmm5, %xmm5
  590. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  591. defined(RN) || defined(RT) || defined(CN) || defined(CT)
  592. addsubpd %xmm5, %xmm4
  593. pshufd $0x4e, %xmm4, %xmm5
  594. #else
  595. addsubpd %xmm4, %xmm5
  596. movapd %xmm5, %xmm4
  597. pshufd $0x4e, %xmm5, %xmm5
  598. #endif
  599. mulpd %xmm2, %xmm4
  600. mulpd %xmm3, %xmm5
  601. addsubpd %xmm5, %xmm4
  602. #ifndef TRMMKERNEL
  603. addpd %xmm0, %xmm4
  604. #endif
  605. movlpd %xmm4, 0 * SIZE(CO)
  606. movhpd %xmm4, 1 * SIZE(CO)
  607. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  608. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  609. movl K, %eax
  610. subl KKK, %eax
  611. leal (,%eax, SIZE), %eax
  612. leal (AO, %eax, 2), AO
  613. leal (BO, %eax, 2), BO
  614. #endif
  615. #if defined(TRMMKERNEL) && defined(LEFT)
  616. addl $1, KK
  617. #endif
  618. addl $2 * SIZE, CO # coffset += 4
  619. decl I # i --
  620. jg .L110
  621. ALIGN_4
  622. .L500:
  623. popl %ebx
  624. popl %esi
  625. popl %edi
  626. popl %ebp
  627. addl $ARGS, %esp
  628. ret
  629. EPILOGUE