zgemm_kernel_2x2_barcelona.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#define STACK	16
#define ARGS	 0

/* Incoming arguments, addressed relative to the caller's stack
   pointer, which is preserved in %esi */
#define STACK_M		 4 + STACK + ARGS(%esi)
#define STACK_N		 8 + STACK + ARGS(%esi)
#define STACK_K		12 + STACK + ARGS(%esi)
#define STACK_ALPHA_R	16 + STACK + ARGS(%esi)
#define STACK_ALPHA_I	20 + STACK + ARGS(%esi)
#define STACK_A		24 + STACK + ARGS(%esi)
#define STACK_B		28 + STACK + ARGS(%esi)
#define STACK_C		32 + STACK + ARGS(%esi)
#define STACK_LDC	36 + STACK + ARGS(%esi)
#define STACK_OFFT	40 + STACK + ARGS(%esi)

/* Local variables on the aligned stack frame */
#define POSINV	 0(%esp)
#define ALPHA_R	16(%esp)
#define ALPHA_I	32(%esp)
#define K	48(%esp)
#define N	52(%esp)
#define M	56(%esp)
#define A	60(%esp)
#define C	64(%esp)
#define J	68(%esp)
#define OLD_STACK 72(%esp)
#define OFFSET	76(%esp)
#define KK	80(%esp)
#define KKK	84(%esp)
#define BUFFER	128(%esp)

/* Register aliases */
#define B	%edi
#define LDC	%ebp
#define AA	%edx
#define BB	%ecx

#define STACK_ALIGN	4096
#define STACK_OFFSET	1024

#define PREFETCH	prefetch
#define PREFETCHSIZE	(16 * 17 + 0)
#define RPREFETCHSIZE	(16 *  9 + 0)
#define WPREFETCHSIZE	(16 *  9 + 0)
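
/* KERNEL1..KERNEL8 each perform one k-iteration of the unrolled 2x2
   complex micro-kernel: one 4-float vector of packed A is multiplied
   by four 4-float vectors of packed B and accumulated into
   %xmm4..%xmm7.  Loads for later iterations and prefetches are
   interleaved with the arithmetic to hide latency. */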
#define KERNEL1(address) \
        mulps   %xmm0, %xmm2; \
        PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
        addps   %xmm2, %xmm4; \
        movaps   4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps   %xmm0, %xmm2; \
        addps   %xmm2, %xmm5; \
        movaps   8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps   %xmm0, %xmm2; \
        mulps   12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addps   %xmm2, %xmm6; \
        movaps  32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addps   %xmm0, %xmm7; \
        movaps   4 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL2(address) \
        mulps   %xmm0, %xmm3; \
        addps   %xmm3, %xmm4; \
        movaps  20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps   %xmm0, %xmm3; \
        addps   %xmm3, %xmm5; \
        movaps  24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps   %xmm0, %xmm3; \
        mulps   28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addps   %xmm3, %xmm6; \
        movaps  48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addps   %xmm0, %xmm7; \
        movaps   8 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL3(address) \
        mulps   %xmm0, %xmm2; \
        addps   %xmm2, %xmm4; \
        movaps  36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps   %xmm0, %xmm2; \
        addps   %xmm2, %xmm5; \
        movaps  40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps   %xmm0, %xmm2; \
        mulps   44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addps   %xmm2, %xmm6; \
        movaps  64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addps   %xmm0, %xmm7; \
        movaps  12 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL4(address) \
        mulps   %xmm0, %xmm3; \
        addps   %xmm3, %xmm4; \
        movaps  52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps   %xmm0, %xmm3; \
        addps   %xmm3, %xmm5; \
        movaps  56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps   %xmm0, %xmm3; \
        mulps   60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addps   %xmm3, %xmm6; \
        movaps  80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addps   %xmm0, %xmm7; \
        movaps  32 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL5(address) \
        PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \
        mulps   %xmm1, %xmm2; \
        addps   %xmm2, %xmm4; \
        movaps  68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps   %xmm1, %xmm2; \
        addps   %xmm2, %xmm5; \
        movaps  72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps   %xmm1, %xmm2; \
        mulps   76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addps   %xmm2, %xmm6; \
        movaps  96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addps   %xmm1, %xmm7; \
        movaps  20 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL6(address) \
        mulps   %xmm1, %xmm3; \
        addps   %xmm3, %xmm4; \
        movaps  84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps   %xmm1, %xmm3; \
        addps   %xmm3, %xmm5; \
        movaps  88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps   %xmm1, %xmm3; \
        mulps   92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addps   %xmm3, %xmm6; \
        movaps  112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addps   %xmm1, %xmm7; \
        movaps  24 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL7(address) \
        mulps   %xmm1, %xmm2; \
        addps   %xmm2, %xmm4; \
        movaps  100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps   %xmm1, %xmm2; \
        addps   %xmm2, %xmm5; \
        movaps  104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps   %xmm1, %xmm2; \
        mulps   108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addps   %xmm2, %xmm6; \
        movaps  128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addps   %xmm1, %xmm7; \
        movaps  28 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL8(address) \
        mulps   %xmm1, %xmm3; \
        addps   %xmm3, %xmm4; \
        movaps  116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps   %xmm1, %xmm3; \
        addps   %xmm3, %xmm5; \
        movaps  120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps   %xmm1, %xmm3; \
        mulps   124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addps   %xmm3, %xmm6; \
        movaps  144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addps   %xmm1, %xmm7; \
        movaps  48 * SIZE + (address) * 1 * SIZE(AA), %xmm1
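
/* Entry: save the callee-saved registers, carve out an aligned local
   frame below the caller's stack, and stash the arguments there. */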
        PROLOGUE

        pushl   %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx

        PROFCODE

        movl    %esp, %esi      # save old stack
        subl    $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
        andl    $-STACK_ALIGN, %esp     # align stack
        addl    $STACK_OFFSET, %esp

        STACK_TOUCHING

        movl    STACK_M, %ebx
        movl    STACK_N, %eax
        movl    STACK_K, %ecx
        movl    STACK_A, %edx

        movl    %ebx, M
        movl    %eax, N
        movl    %ecx, K
        movl    %edx, A
        movl    %esi, OLD_STACK

        movl    STACK_B, %edi
        movl    STACK_C, %ebx

#ifdef TRMMKERNEL
        movss   STACK_OFFT, %xmm4
#endif

        movss   STACK_ALPHA_R, %xmm0
        movss   STACK_ALPHA_I, %xmm1

        xorps   %xmm7, %xmm7
        cmpeqps %xmm7, %xmm7
        pslld   $31, %xmm7      # Generate mask

        xorps   %xmm2, %xmm2

        shufps  $0, %xmm0, %xmm0
        movaps  %xmm0, 0 + ALPHA_R

        movss   %xmm1,  4 + ALPHA_I
        movss   %xmm1, 12 + ALPHA_I
        xorps   %xmm7, %xmm1
        movss   %xmm1,  0 + ALPHA_I
        movss   %xmm1,  8 + ALPHA_I

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
        movss   %xmm7,  0 + POSINV
        movss   %xmm2,  4 + POSINV
        movss   %xmm7,  8 + POSINV
        movss   %xmm2, 12 + POSINV
#else
        movss   %xmm2,  0 + POSINV
        movss   %xmm7,  4 + POSINV
        movss   %xmm2,  8 + POSINV
        movss   %xmm7, 12 + POSINV
#endif

        EMMS

        movl    %ebx, C
        movl    STACK_LDC, LDC

#ifdef TRMMKERNEL
        movss   %xmm4, OFFSET
        movss   %xmm4, KK
#ifndef LEFT
        negl    KK
#endif
#endif

        sall    $ZBASE_SHIFT, LDC

        movl    %eax, J         # j = n
        sarl    $1, J
        jle     .L100
        ALIGN_4
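
/* Outer loop over column pairs of B/C (j = n/2, ..., 1).  Each pass
   first expands the current panel of B into BUFFER: every complex
   entry is splatted across a 4-float vector, with POSINV flipping
   signs as the conjugation variant requires. */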
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
        movl    OFFSET, %eax
        movl    %eax, KK
#endif

/* Copying to Sub Buffer */
        leal    BUFFER, %ecx

        movaps  POSINV, %xmm7

        movl    K, %eax
        sarl    $1, %eax
        jle     .L03
        ALIGN_4

.L02:
        prefetch (RPREFETCHSIZE + 0) * SIZE(%edi)

        movaps  0 * SIZE(%edi), %xmm3

        pshufd  $0x00, %xmm3, %xmm0
        pshufd  $0x55, %xmm3, %xmm1
        pshufd  $0xaa, %xmm3, %xmm2
        pshufd  $0xff, %xmm3, %xmm3

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
        xorps   %xmm7, %xmm1
        xorps   %xmm7, %xmm3
#else
        xorps   %xmm7, %xmm0
        xorps   %xmm7, %xmm2
#endif

        prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx)

        movaps  %xmm0,  0 * SIZE(%ecx)
        movaps  %xmm1,  4 * SIZE(%ecx)
        movaps  %xmm2,  8 * SIZE(%ecx)
        movaps  %xmm3, 12 * SIZE(%ecx)

        movaps  4 * SIZE(%edi), %xmm3

        pshufd  $0x00, %xmm3, %xmm0
        pshufd  $0x55, %xmm3, %xmm1
        pshufd  $0xaa, %xmm3, %xmm2
        pshufd  $0xff, %xmm3, %xmm3

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
        xorps   %xmm7, %xmm1
        xorps   %xmm7, %xmm3
#else
        xorps   %xmm7, %xmm0
        xorps   %xmm7, %xmm2
#endif

        prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx)

        movaps  %xmm0, 16 * SIZE(%ecx)
        movaps  %xmm1, 20 * SIZE(%ecx)
        movaps  %xmm2, 24 * SIZE(%ecx)
        movaps  %xmm3, 28 * SIZE(%ecx)

        addl    $  8 * SIZE, %edi
        subl    $-32 * SIZE, %ecx
        decl    %eax
        jne     .L02
        ALIGN_4

.L03:
        movl    K, %eax
        andl    $1, %eax
        BRANCH
        jle     .L05
        ALIGN_4

.L04:
        movaps  0 * SIZE(%edi), %xmm3

        pshufd  $0x00, %xmm3, %xmm0
        pshufd  $0x55, %xmm3, %xmm1
        pshufd  $0xaa, %xmm3, %xmm2
        pshufd  $0xff, %xmm3, %xmm3

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
        xorps   %xmm7, %xmm1
        xorps   %xmm7, %xmm3
#else
        xorps   %xmm7, %xmm0
        xorps   %xmm7, %xmm2
#endif

        movaps  %xmm0,  0 * SIZE(%ecx)
        movaps  %xmm1,  4 * SIZE(%ecx)
        movaps  %xmm2,  8 * SIZE(%ecx)
        movaps  %xmm3, 12 * SIZE(%ecx)

        addl    $ 4 * SIZE, %edi
        ALIGN_4

.L05:
        movl    C, %esi
        movl    A, %edx
        movl    M, %ebx
        sarl    $1, %ebx
        jle     .L30
        ALIGN_4
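
/* 2x2 micro-kernel: each pass of .L1X consumes eight k-steps via
   KERNEL1..KERNEL8, accumulating a 2x2 complex block of C. */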
.L10:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leal    BUFFER, BB      # boffset1 = boffset
#else
        leal    BUFFER, BB      # boffset1 = boffset
        movl    KK, %eax
        leal    (, %eax, 8), %eax
        leal    (AA, %eax, 2), AA
        leal    (BB, %eax, 8), BB
#endif

        movaps   0 * SIZE(AA), %xmm0
        pxor    %xmm4, %xmm4
        movaps  16 * SIZE(AA), %xmm1
        pxor    %xmm5, %xmm5
        movaps   0 * SIZE(BB), %xmm2
        pxor    %xmm6, %xmm6
        movaps  16 * SIZE(BB), %xmm3
        pxor    %xmm7, %xmm7

        prefetchw 3 * SIZE(%esi)
        prefetchw 3 * SIZE(%esi, LDC)

#ifndef TRMMKERNEL
        movl    K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl    K, %eax
        subl    KK, %eax
        movl    %eax, KKK
#else
        movl    KK, %eax
#ifdef LEFT
        addl    $2, %eax
#else
        addl    $2, %eax
#endif
        movl    %eax, KKK
#endif

        andl    $-8, %eax
        sall    $4, %eax
        je      .L15

.L1X:
        KERNEL1(32 * 0)
        KERNEL2(32 * 0)
        KERNEL3(32 * 0)
        KERNEL4(32 * 0)
        KERNEL5(32 * 0)
        KERNEL6(32 * 0)
        KERNEL7(32 * 0)
        KERNEL8(32 * 0)
        cmpl    $128 * 1, %eax
        jle     .L12
        KERNEL1(32 * 1)
        KERNEL2(32 * 1)
        KERNEL3(32 * 1)
        KERNEL4(32 * 1)
        KERNEL5(32 * 1)
        KERNEL6(32 * 1)
        KERNEL7(32 * 1)
        KERNEL8(32 * 1)
        cmpl    $128 * 2, %eax
        jle     .L12
        KERNEL1(32 * 2)
        KERNEL2(32 * 2)
        KERNEL3(32 * 2)
        KERNEL4(32 * 2)
        KERNEL5(32 * 2)
        KERNEL6(32 * 2)
        KERNEL7(32 * 2)
        KERNEL8(32 * 2)
        cmpl    $128 * 3, %eax
        jle     .L12
        KERNEL1(32 * 3)
        KERNEL2(32 * 3)
        KERNEL3(32 * 3)
        KERNEL4(32 * 3)
        KERNEL5(32 * 3)
        KERNEL6(32 * 3)
        KERNEL7(32 * 3)
        KERNEL8(32 * 3)
        cmpl    $128 * 4, %eax
        jle     .L12
        KERNEL1(32 * 4)
        KERNEL2(32 * 4)
        KERNEL3(32 * 4)
        KERNEL4(32 * 4)
        KERNEL5(32 * 4)
        KERNEL6(32 * 4)
        KERNEL7(32 * 4)
        KERNEL8(32 * 4)
        cmpl    $128 * 5, %eax
        jle     .L12
        KERNEL1(32 * 5)
        KERNEL2(32 * 5)
        KERNEL3(32 * 5)
        KERNEL4(32 * 5)
        KERNEL5(32 * 5)
        KERNEL6(32 * 5)
        KERNEL7(32 * 5)
        KERNEL8(32 * 5)
        cmpl    $128 * 6, %eax
        jle     .L12
        KERNEL1(32 * 6)
        KERNEL2(32 * 6)
        KERNEL3(32 * 6)
        KERNEL4(32 * 6)
        KERNEL5(32 * 6)
        KERNEL6(32 * 6)
        KERNEL7(32 * 6)
        KERNEL8(32 * 6)
        cmpl    $128 * 7, %eax
        jle     .L12
        KERNEL1(32 * 7)
        KERNEL2(32 * 7)
        KERNEL3(32 * 7)
        KERNEL4(32 * 7)
        KERNEL5(32 * 7)
        KERNEL6(32 * 7)
        KERNEL7(32 * 7)
        KERNEL8(32 * 7)

        addl    $128 * 8 * SIZE, BB
        addl    $128 * 2 * SIZE, AA
        subl    $128 * 8, %eax
        jg      .L1X
        jmp     .L15

.L12:
        leal    (AA, %eax, 1), AA
        leal    (BB, %eax, 4), BB
        ALIGN_4

.L15:
#ifndef TRMMKERNEL
        movl    K, %eax
#else
        movl    KKK, %eax
#endif
        movaps  ALPHA_R, %xmm1
        movaps  ALPHA_I, %xmm3
        andl    $7, %eax        # remaining k iterations (k & 7)
        BRANCH
        je      .L14
        ALIGN_4

.L13:
        mulps   %xmm0, %xmm2
        addps   %xmm2, %xmm4
        movaps   4 * SIZE(BB), %xmm2
        mulps   %xmm0, %xmm2
        addps   %xmm2, %xmm5
        movaps   8 * SIZE(BB), %xmm2
        mulps   %xmm0, %xmm2
        mulps   12 * SIZE(BB), %xmm0
        addps   %xmm2, %xmm6
        movaps  16 * SIZE(BB), %xmm2
        addps   %xmm0, %xmm7
        movaps   4 * SIZE(AA), %xmm0

        addl    $ 4 * SIZE, AA
        addl    $16 * SIZE, BB
        decl    %eax
        jg      .L13
        ALIGN_4
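
/* Recombine the partial products into complex results: %xmm5/%xmm7
   hold the cross terms; swap their real/imaginary halves, add or
   subtract according to the conjugation mode, scale by alpha, and
   store back to C. */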
.L14:
        shufps  $0xb1, %xmm5, %xmm5
        shufps  $0xb1, %xmm7, %xmm7

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
        subps   %xmm5, %xmm4
        subps   %xmm7, %xmm6
#else
        addps   %xmm5, %xmm4
        addps   %xmm7, %xmm6
#endif

        movaps  %xmm4, %xmm5
        movaps  %xmm6, %xmm7

        shufps  $0xb1, %xmm4, %xmm4
        shufps  $0xb1, %xmm6, %xmm6

        mulps   %xmm1, %xmm5
        mulps   %xmm3, %xmm4
        mulps   %xmm1, %xmm7
        mulps   %xmm3, %xmm6

        addps   %xmm5, %xmm4
        addps   %xmm7, %xmm6

#ifndef TRMMKERNEL
        /* $0xe4 is the identity shuffle; it appears intended to break
           the dependency chain on %xmm0/%xmm2 before the partial
           (movsd/movhps) loads below. */
        shufps  $0xe4, %xmm0, %xmm0
        movsd   0 * SIZE(%esi), %xmm0
        movhps  2 * SIZE(%esi), %xmm0
        shufps  $0xe4, %xmm2, %xmm2
        movsd   0 * SIZE(%esi, LDC), %xmm2
        movhps  2 * SIZE(%esi, LDC), %xmm2

        addps   %xmm0, %xmm4
        addps   %xmm2, %xmm6
#endif

        movlps  %xmm4, 0 * SIZE(%esi)
        movhps  %xmm4, 2 * SIZE(%esi)
        movlps  %xmm6, 0 * SIZE(%esi, LDC)
        movhps  %xmm6, 2 * SIZE(%esi, LDC)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl    K, %eax
        subl    KKK, %eax
        leal    (, %eax, 8), %eax
        leal    (AA, %eax, 2), AA
        leal    (BB, %eax, 8), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl    $2, KK
#endif

        addl    $4 * SIZE, %esi         # coffset += 4
        decl    %ebx                    # i --
        jg      .L10
        ALIGN_4
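
/* Remainder row (m odd): one complex row of A against the current
   column pair, with A loaded two floats at a time via movsd. */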
.L30:
        movl    M, %ebx
        andl    $1, %ebx
        jle     .L99
        ALIGN_4

.L40:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leal    BUFFER, BB      # boffset1 = boffset
#else
        leal    BUFFER, BB      # boffset1 = boffset
        movl    KK, %eax
        leal    (, %eax, 8), %eax
        leal    (AA, %eax, 1), AA
        leal    (BB, %eax, 8), BB
#endif

        pxor    %xmm4, %xmm4
        pxor    %xmm5, %xmm5
        pxor    %xmm6, %xmm6
        pxor    %xmm7, %xmm7

        movsd   0 * SIZE(AA), %xmm0
        movsd   8 * SIZE(AA), %xmm1
        movaps   0 * SIZE(BB), %xmm2
        movaps  16 * SIZE(BB), %xmm3

#ifndef TRMMKERNEL
        movl    K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl    K, %eax
        subl    KK, %eax
        movl    %eax, KKK
#else
        movl    KK, %eax
#ifdef LEFT
        addl    $1, %eax
#else
        addl    $2, %eax
#endif
        movl    %eax, KKK
#endif
        sarl    $3, %eax
        je      .L42
        ALIGN_4

.L41:
        mulps   %xmm0, %xmm2
        prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA)
        addps   %xmm2, %xmm4
        movaps   4 * SIZE(BB), %xmm2
        mulps   %xmm0, %xmm2
        addps   %xmm2, %xmm5
        movaps   8 * SIZE(BB), %xmm2
        mulps   %xmm0, %xmm2
        mulps   12 * SIZE(BB), %xmm0
        addps   %xmm2, %xmm6
        movaps  32 * SIZE(BB), %xmm2
        addps   %xmm0, %xmm7
        movsd    2 * SIZE(AA), %xmm0
        mulps   %xmm0, %xmm3
        addps   %xmm3, %xmm4
        movaps  20 * SIZE(BB), %xmm3
        mulps   %xmm0, %xmm3
        addps   %xmm3, %xmm5
        movaps  24 * SIZE(BB), %xmm3
        mulps   %xmm0, %xmm3
        mulps   28 * SIZE(BB), %xmm0
        addps   %xmm3, %xmm6
        movaps  48 * SIZE(BB), %xmm3
        addps   %xmm0, %xmm7
        movsd    4 * SIZE(AA), %xmm0
        mulps   %xmm0, %xmm2
        addps   %xmm2, %xmm4
        movaps  36 * SIZE(BB), %xmm2
        mulps   %xmm0, %xmm2
        addps   %xmm2, %xmm5
        movaps  40 * SIZE(BB), %xmm2
        mulps   %xmm0, %xmm2
        mulps   44 * SIZE(BB), %xmm0
        addps   %xmm2, %xmm6
        movaps  64 * SIZE(BB), %xmm2
        addps   %xmm0, %xmm7
        movsd    6 * SIZE(AA), %xmm0
        mulps   %xmm0, %xmm3
        addps   %xmm3, %xmm4
        movaps  52 * SIZE(BB), %xmm3
        mulps   %xmm0, %xmm3
        addps   %xmm3, %xmm5
        movaps  56 * SIZE(BB), %xmm3
        mulps   %xmm0, %xmm3
        mulps   60 * SIZE(BB), %xmm0
        addps   %xmm3, %xmm6
        movaps  80 * SIZE(BB), %xmm3
        addps   %xmm0, %xmm7
        movsd   16 * SIZE(AA), %xmm0
        mulps   %xmm1, %xmm2
        prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
        addps   %xmm2, %xmm4
        movaps  68 * SIZE(BB), %xmm2
        mulps   %xmm1, %xmm2
        addps   %xmm2, %xmm5
        movaps  72 * SIZE(BB), %xmm2
        mulps   %xmm1, %xmm2
        mulps   76 * SIZE(BB), %xmm1
        addps   %xmm2, %xmm6
        movaps  96 * SIZE(BB), %xmm2
        addps   %xmm1, %xmm7
        movsd   10 * SIZE(AA), %xmm1
        mulps   %xmm1, %xmm3
        addps   %xmm3, %xmm4
        movaps  84 * SIZE(BB), %xmm3
        mulps   %xmm1, %xmm3
        addps   %xmm3, %xmm5
        movaps  88 * SIZE(BB), %xmm3
        mulps   %xmm1, %xmm3
        mulps   92 * SIZE(BB), %xmm1
        addps   %xmm3, %xmm6
        movaps  112 * SIZE(BB), %xmm3
        addps   %xmm1, %xmm7
        movsd   12 * SIZE(AA), %xmm1
        mulps   %xmm1, %xmm2
        addps   %xmm2, %xmm4
        movaps  100 * SIZE(BB), %xmm2
        mulps   %xmm1, %xmm2
        addps   %xmm2, %xmm5
        movaps  104 * SIZE(BB), %xmm2
        mulps   %xmm1, %xmm2
        mulps   108 * SIZE(BB), %xmm1
        addps   %xmm2, %xmm6
        movaps  128 * SIZE(BB), %xmm2
        addps   %xmm1, %xmm7
        movsd   14 * SIZE(AA), %xmm1
        mulps   %xmm1, %xmm3
        addps   %xmm3, %xmm4
        movaps  116 * SIZE(BB), %xmm3
        mulps   %xmm1, %xmm3
        addps   %xmm3, %xmm5
        movaps  120 * SIZE(BB), %xmm3
        mulps   %xmm1, %xmm3
        mulps   124 * SIZE(BB), %xmm1
        addps   %xmm3, %xmm6
        movaps  144 * SIZE(BB), %xmm3
        addps   %xmm1, %xmm7
        movsd   24 * SIZE(AA), %xmm1

        addl    $ 16 * SIZE, AA
        addl    $128 * SIZE, BB
        decl    %eax
        jne     .L41
        ALIGN_4

.L42:
#ifndef TRMMKERNEL
        movl    K, %eax
#else
        movl    KKK, %eax
#endif
        movaps  ALPHA_R, %xmm1
        movaps  ALPHA_I, %xmm3
        andl    $7, %eax        # remaining k iterations (k & 7)
        BRANCH
        je      .L44
        ALIGN_4

.L43:
        mulps   %xmm0, %xmm2
        addps   %xmm2, %xmm4
        movaps   4 * SIZE(BB), %xmm2
        mulps   %xmm0, %xmm2
        addps   %xmm2, %xmm5
        movaps   8 * SIZE(BB), %xmm2
        mulps   %xmm0, %xmm2
        mulps   12 * SIZE(BB), %xmm0
        addps   %xmm2, %xmm6
        movaps  16 * SIZE(BB), %xmm2
        addps   %xmm0, %xmm7
        movsd    2 * SIZE(AA), %xmm0

        addl    $ 2 * SIZE, AA
        addl    $16 * SIZE, BB
        decl    %eax
        jg      .L43
        ALIGN_4

.L44:
        shufps  $0xb1, %xmm5, %xmm5
        shufps  $0xb1, %xmm7, %xmm7

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
        subps   %xmm5, %xmm4
        subps   %xmm7, %xmm6
#else
        addps   %xmm5, %xmm4
        addps   %xmm7, %xmm6
#endif

        movaps  %xmm4, %xmm5
        movaps  %xmm6, %xmm7

        shufps  $0xb1, %xmm4, %xmm4
        shufps  $0xb1, %xmm6, %xmm6

        mulps   %xmm1, %xmm5
        mulps   %xmm3, %xmm4
        mulps   %xmm1, %xmm7
        mulps   %xmm3, %xmm6

        addps   %xmm5, %xmm4
        addps   %xmm7, %xmm6

#ifndef TRMMKERNEL
        shufps  $0xe4, %xmm4, %xmm4
        shufps  $0xe4, %xmm6, %xmm6

        movsd   0 * SIZE(%esi), %xmm0
        movsd   0 * SIZE(%esi, LDC), %xmm2

        addps   %xmm0, %xmm4
        addps   %xmm2, %xmm6
#endif

        movlps  %xmm4, 0 * SIZE(%esi)
        movlps  %xmm6, 0 * SIZE(%esi, LDC)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl    K, %eax
        subl    KKK, %eax
        leal    (, %eax, 8), %eax
        leal    (AA, %eax, 1), AA
        leal    (BB, %eax, 8), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl    $1, KK
#endif
        ALIGN_4

.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addl    $2, KK
#endif

        leal    (LDC, LDC), %eax
        addl    %eax, C         # c += 2 * ldc
        decl    J               # j --
        jg      .L01
        ALIGN_4
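
/* Tail column (n odd): same overall structure, but B contributes a
   single column, so packing stores two expanded vectors per complex
   entry and the micro-kernels accumulate into one column of C. */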
.L100:
        movl    N, %eax
        andl    $1, %eax
        jle     .L999
        ALIGN_4

.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
        movl    OFFSET, %eax
        movl    %eax, KK
#endif

/* Copying to Sub Buffer */
        leal    BUFFER, %ecx

        movaps  POSINV, %xmm7

        movl    K, %eax
        sarl    $2, %eax
        jle     .L103
        ALIGN_4

.L102:
        prefetch (RPREFETCHSIZE + 0) * SIZE(%edi)

        movaps  0 * SIZE(%edi), %xmm3

        pshufd  $0x00, %xmm3, %xmm0
        pshufd  $0x55, %xmm3, %xmm1
        pshufd  $0xaa, %xmm3, %xmm2
        pshufd  $0xff, %xmm3, %xmm3

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
        xorps   %xmm7, %xmm1
        xorps   %xmm7, %xmm3
#else
        xorps   %xmm7, %xmm0
        xorps   %xmm7, %xmm2
#endif

        prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx)

        movaps  %xmm0,  0 * SIZE(%ecx)
        movaps  %xmm1,  4 * SIZE(%ecx)
        movaps  %xmm2,  8 * SIZE(%ecx)
        movaps  %xmm3, 12 * SIZE(%ecx)

        movaps  4 * SIZE(%edi), %xmm3

        pshufd  $0x00, %xmm3, %xmm0
        pshufd  $0x55, %xmm3, %xmm1
        pshufd  $0xaa, %xmm3, %xmm2
        pshufd  $0xff, %xmm3, %xmm3

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
        xorps   %xmm7, %xmm1
        xorps   %xmm7, %xmm3
#else
        xorps   %xmm7, %xmm0
        xorps   %xmm7, %xmm2
#endif

        prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx)

        movaps  %xmm0, 16 * SIZE(%ecx)
        movaps  %xmm1, 20 * SIZE(%ecx)
        movaps  %xmm2, 24 * SIZE(%ecx)
        movaps  %xmm3, 28 * SIZE(%ecx)

        addl    $  8 * SIZE, B
        subl    $-32 * SIZE, BB
        decl    %eax
        jne     .L102
        ALIGN_4

.L103:
        movl    K, %eax
        andl    $3, %eax
        BRANCH
        jle     .L105
        ALIGN_4

.L104:
        movsd   0 * SIZE(%edi), %xmm3

        pshufd  $0x00, %xmm3, %xmm0
        pshufd  $0x55, %xmm3, %xmm1

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
        xorps   %xmm7, %xmm1
#else
        xorps   %xmm7, %xmm0
#endif

        movaps  %xmm0, 0 * SIZE(%ecx)
        movaps  %xmm1, 4 * SIZE(%ecx)

        addl    $ 2 * SIZE, %edi
        addl    $ 8 * SIZE, %ecx
        decl    %eax
        jne     .L104
        ALIGN_4

.L105:
        movl    C, %esi
        movl    A, AA
        movl    M, %ebx
        sarl    $1, %ebx
        jle     .L130
        ALIGN_4

.L110:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leal    BUFFER, BB      # boffset1 = boffset
#else
        leal    BUFFER, BB      # boffset1 = boffset
        movl    KK, %eax
        leal    (, %eax, 8), %eax
        leal    (AA, %eax, 2), AA
        leal    (BB, %eax, 4), BB
#endif

        pxor    %xmm4, %xmm4
        pxor    %xmm5, %xmm5
        pxor    %xmm6, %xmm6
        pxor    %xmm7, %xmm7

        movaps   0 * SIZE(AA), %xmm0
        movaps  16 * SIZE(AA), %xmm1
        movaps   0 * SIZE(BB), %xmm2
        movaps  16 * SIZE(BB), %xmm3

        prefetchw 3 * SIZE(%esi)

#ifndef TRMMKERNEL
        movl    K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl    K, %eax
        subl    KK, %eax
        movl    %eax, KKK
#else
        movl    KK, %eax
#ifdef LEFT
        addl    $2, %eax
#else
        addl    $1, %eax
#endif
        movl    %eax, KKK
#endif
        sarl    $3, %eax
        je      .L112
        ALIGN_4

.L111:
        mulps   %xmm0, %xmm2
        addps   %xmm2, %xmm4
        movaps   4 * SIZE(BB), %xmm2
        mulps   %xmm0, %xmm2
        movaps   4 * SIZE(AA), %xmm0
        addps   %xmm2, %xmm5
        movaps   8 * SIZE(BB), %xmm2
        mulps   %xmm0, %xmm2
        addps   %xmm2, %xmm6
        movaps  12 * SIZE(BB), %xmm2
        mulps   %xmm0, %xmm2
        movaps   8 * SIZE(AA), %xmm0
        addps   %xmm2, %xmm7
        movaps  32 * SIZE(BB), %xmm2
        mulps   %xmm0, %xmm3
        addps   %xmm3, %xmm4
        movaps  20 * SIZE(BB), %xmm3
        mulps   %xmm0, %xmm3
        movaps  12 * SIZE(AA), %xmm0
        addps   %xmm3, %xmm5
        movaps  24 * SIZE(BB), %xmm3
        mulps   %xmm0, %xmm3
        addps   %xmm3, %xmm6
        movaps  28 * SIZE(BB), %xmm3
        mulps   %xmm0, %xmm3
        movaps  32 * SIZE(AA), %xmm0
        addps   %xmm3, %xmm7
        movaps  48 * SIZE(BB), %xmm3
        mulps   %xmm1, %xmm2
        addps   %xmm2, %xmm4
        movaps  36 * SIZE(BB), %xmm2
        mulps   %xmm1, %xmm2
        movaps  20 * SIZE(AA), %xmm1
        addps   %xmm2, %xmm5
        movaps  40 * SIZE(BB), %xmm2
        mulps   %xmm1, %xmm2
        addps   %xmm2, %xmm6
        movaps  44 * SIZE(BB), %xmm2
        mulps   %xmm1, %xmm2
        movaps  24 * SIZE(AA), %xmm1
        addps   %xmm2, %xmm7
        movaps  64 * SIZE(BB), %xmm2
        mulps   %xmm1, %xmm3
        addps   %xmm3, %xmm4
        movaps  52 * SIZE(BB), %xmm3
        mulps   %xmm1, %xmm3
        movaps  28 * SIZE(AA), %xmm1
        addps   %xmm3, %xmm5
        movaps  56 * SIZE(BB), %xmm3
        mulps   %xmm1, %xmm3
        addps   %xmm3, %xmm6
        movaps  60 * SIZE(BB), %xmm3
        mulps   %xmm1, %xmm3
        movaps  48 * SIZE(AA), %xmm1
        addps   %xmm3, %xmm7
        movaps  80 * SIZE(BB), %xmm3

        addl    $ 32 * SIZE, AA
        addl    $ 64 * SIZE, BB
        decl    %eax
        jne     .L111
        ALIGN_4

.L112:
#ifndef TRMMKERNEL
        movl    K, %eax
#else
        movl    KKK, %eax
#endif
        movaps  ALPHA_R, %xmm1
        movaps  ALPHA_I, %xmm3
        andl    $7, %eax        # remaining k iterations (k & 7)
        BRANCH
        je      .L114
        ALIGN_4

.L113:
        mulps   %xmm0, %xmm2
        mulps    4 * SIZE(BB), %xmm0
        addps   %xmm2, %xmm4
        movaps   8 * SIZE(BB), %xmm2
        addps   %xmm0, %xmm5
        movaps   4 * SIZE(AA), %xmm0

        addl    $ 4 * SIZE, AA
        addl    $ 8 * SIZE, BB
        decl    %eax
        jg      .L113
        ALIGN_4

.L114:
        addps   %xmm6, %xmm4
        addps   %xmm7, %xmm5

        shufps  $0xb1, %xmm5, %xmm5

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
        subps   %xmm5, %xmm4
#else
        addps   %xmm5, %xmm4
#endif

        movaps  %xmm4, %xmm5
        shufps  $0xb1, %xmm4, %xmm4

        mulps   %xmm1, %xmm5
        mulps   %xmm3, %xmm4
        addps   %xmm5, %xmm4

#ifndef TRMMKERNEL
        shufps  $0xe4, %xmm4, %xmm4
        movsd   0 * SIZE(%esi), %xmm0
        movhps  2 * SIZE(%esi), %xmm0
        addps   %xmm0, %xmm4
#endif

        movlps  %xmm4, 0 * SIZE(%esi)
        movhps  %xmm4, 2 * SIZE(%esi)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl    K, %eax
        subl    KKK, %eax
        leal    (, %eax, 8), %eax
        leal    (AA, %eax, 2), AA
        leal    (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl    $2, KK
#endif

        addl    $4 * SIZE, %esi         # coffset += 4
        decl    %ebx                    # i --
        jg      .L110
        ALIGN_4

.L130:
        movl    M, %ebx
        andl    $1, %ebx
        jle     .L999
        ALIGN_4

.L140:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leal    BUFFER, BB      # boffset1 = boffset
#else
        leal    BUFFER, BB      # boffset1 = boffset
        movl    KK, %eax
        leal    (, %eax, 8), %eax
        leal    (AA, %eax, 1), AA
        leal    (BB, %eax, 4), BB
#endif

        movsd   0 * SIZE(AA), %xmm0
        pxor    %xmm4, %xmm4
        movsd   8 * SIZE(AA), %xmm1
        pxor    %xmm5, %xmm5
        movaps   0 * SIZE(BB), %xmm2
        pxor    %xmm6, %xmm6
        movaps  16 * SIZE(BB), %xmm3
        pxor    %xmm7, %xmm7

#ifndef TRMMKERNEL
        movl    K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl    K, %eax
        subl    KK, %eax
        movl    %eax, KKK
#else
        movl    KK, %eax
#ifdef LEFT
        addl    $1, %eax
#else
        addl    $1, %eax
#endif
        movl    %eax, KKK
#endif
        sarl    $3, %eax
        je      .L142
        ALIGN_4

.L141:
        mulps   %xmm0, %xmm2
        addps   %xmm2, %xmm4
        movaps   4 * SIZE(BB), %xmm2
        mulps   %xmm0, %xmm2
        movsd    2 * SIZE(AA), %xmm0
        addps   %xmm2, %xmm5
        movaps   8 * SIZE(BB), %xmm2
        mulps   %xmm0, %xmm2
        addps   %xmm2, %xmm6
        movaps  12 * SIZE(BB), %xmm2
        mulps   %xmm0, %xmm2
        movsd    4 * SIZE(AA), %xmm0
        addps   %xmm2, %xmm7
        movaps  32 * SIZE(BB), %xmm2
        mulps   %xmm0, %xmm3
        addps   %xmm3, %xmm4
        movaps  20 * SIZE(BB), %xmm3
        mulps   %xmm0, %xmm3
        movsd    6 * SIZE(AA), %xmm0
        addps   %xmm3, %xmm5
        movaps  24 * SIZE(BB), %xmm3
        mulps   %xmm0, %xmm3
        addps   %xmm3, %xmm6
        movaps  28 * SIZE(BB), %xmm3
        mulps   %xmm0, %xmm3
        movsd   16 * SIZE(AA), %xmm0
        addps   %xmm3, %xmm7
        movaps  48 * SIZE(BB), %xmm3
        mulps   %xmm1, %xmm2
        addps   %xmm2, %xmm4
        movaps  36 * SIZE(BB), %xmm2
        mulps   %xmm1, %xmm2
        movsd   10 * SIZE(AA), %xmm1
        addps   %xmm2, %xmm5
        movaps  40 * SIZE(BB), %xmm2
        mulps   %xmm1, %xmm2
        addps   %xmm2, %xmm6
        movaps  44 * SIZE(BB), %xmm2
        mulps   %xmm1, %xmm2
        movsd   12 * SIZE(AA), %xmm1
        addps   %xmm2, %xmm7
        movaps  64 * SIZE(BB), %xmm2
        mulps   %xmm1, %xmm3
        addps   %xmm3, %xmm4
        movaps  52 * SIZE(BB), %xmm3
        mulps   %xmm1, %xmm3
        movsd   14 * SIZE(AA), %xmm1
        addps   %xmm3, %xmm5
        movaps  56 * SIZE(BB), %xmm3
        mulps   %xmm1, %xmm3
        addps   %xmm3, %xmm6
        movaps  60 * SIZE(BB), %xmm3
        mulps   %xmm1, %xmm3
        movsd   24 * SIZE(AA), %xmm1
        addps   %xmm3, %xmm7
        movaps  80 * SIZE(BB), %xmm3

        addl    $ 16 * SIZE, AA
        addl    $ 64 * SIZE, BB
        decl    %eax
        jne     .L141
        ALIGN_4

.L142:
#ifndef TRMMKERNEL
        movl    K, %eax
#else
        movl    KKK, %eax
#endif
        movaps  ALPHA_R, %xmm1
        movaps  ALPHA_I, %xmm3
        andl    $7, %eax        # remaining k iterations (k & 7)
        BRANCH
        je      .L144
        ALIGN_4

.L143:
        mulps   %xmm0, %xmm2
        mulps    4 * SIZE(BB), %xmm0
        addps   %xmm2, %xmm4
        movaps   8 * SIZE(BB), %xmm2
        addps   %xmm0, %xmm5
        movsd    2 * SIZE(AA), %xmm0

        addl    $2 * SIZE, AA
        addl    $8 * SIZE, BB
        decl    %eax
        jg      .L143
        ALIGN_4

.L144:
        addps   %xmm6, %xmm4
        addps   %xmm7, %xmm5

        shufps  $0xb1, %xmm5, %xmm5

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
        subps   %xmm5, %xmm4
#else
        addps   %xmm5, %xmm4
#endif

        movaps  %xmm4, %xmm5
        shufps  $0xb1, %xmm4, %xmm4

        mulps   %xmm1, %xmm5
        mulps   %xmm3, %xmm4
        addps   %xmm5, %xmm4

#ifndef TRMMKERNEL
        shufps  $0xe4, %xmm4, %xmm4
        movsd   0 * SIZE(%esi), %xmm0
        addps   %xmm0, %xmm4
#endif

        movlps  %xmm4, 0 * SIZE(%esi)
        ALIGN_4

.L999:
        EMMS

        movl    OLD_STACK, %esp

        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        ret
        EPILOGUE