
zgemm3m_kernel_4x2_northwood.S (32 kB)

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define PREFETCHSIZE (8 * 4)
  41. #if !defined(HAVE_SSE2) || !defined(HAVE_MMX)
  42. #error You have to check your configuration.
  43. #endif
  44. #define STACK 16
  45. #define ARGS 0
  46. #define STACK_M 4 + STACK + ARGS(%esi)
  47. #define STACK_N 8 + STACK + ARGS(%esi)
  48. #define STACK_K 12 + STACK + ARGS(%esi)
  49. #define STACK_ALPHA_R 16 + STACK + ARGS(%esi)
  50. #define STACK_ALPHA_I 24 + STACK + ARGS(%esi)
  51. #define STACK_A 32 + STACK + ARGS(%esi)
  52. #define STACK_B 36 + STACK + ARGS(%esi)
  53. #define STACK_C 40 + STACK + ARGS(%esi)
  54. #define STACK_LDC 44 + STACK + ARGS(%esi)
  55. #define STACK_OFFT 48 + STACK + ARGS(%esi)
  56. #define ALPHA 0(%esp)
  57. #define K 16(%esp)
  58. #define N 20(%esp)
  59. #define M 24(%esp)
  60. #define A 28(%esp)
  61. #define C 32(%esp)
  62. #define J 36(%esp)
  63. #define BX 40(%esp)
  64. #define OLD_STACK 44(%esp)
  65. #define OFFSET 48(%esp)
  66. #define KK 52(%esp)
  67. #define KKK 56(%esp)
  68. #define BUFFER 128(%esp)
  69. #define B %edi
  70. #define LDC %ebp
  71. #define STACK_ALIGN 4096
  72. #define STACK_OFFSET 1024
  73. #define AA %edx
  74. #define BB %ecx
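/* Register aliases: AA (%edx) walks the packed A panel, B (%edi) the
 * original B, BB (%ecx) the duplicated copy of B held in BUFFER, and
 * LDC (%ebp) is the leading dimension of C converted to a byte stride.
 * The KERNEL1..KERNEL8 macros below form one 8-way unrolled pass over k
 * for the 4x2 micro-tile: each macro performs the multiply-adds of one
 * k iteration (four A elements against two duplicated B elements),
 * accumulating into %xmm4-%xmm7; the movq loads into %mm2 appear to
 * serve only as a software prefetch of upcoming A data. */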
  75. #define KERNEL1(address) \
  76. mulpd %xmm0, %xmm2; \
  77. mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \
  78. addpd %xmm2, %xmm4; \
  79. movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \
  80. movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \
  81. addpd %xmm0, %xmm5; \
  82. movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \
  83. mulpd %xmm0, %xmm2; \
  84. mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \
  85. addpd %xmm2, %xmm6; \
  86. movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \
  87. addpd %xmm0, %xmm7; \
  88. movapd 4 * SIZE + (address) * SIZE(AA), %xmm0
  89. #define KERNEL2(address) \
  90. mulpd %xmm0, %xmm2; \
  91. mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \
  92. addpd %xmm2, %xmm4; \
  93. movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \
  94. addpd %xmm0, %xmm5; \
  95. movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \
  96. mulpd %xmm0, %xmm2; \
  97. mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \
  98. addpd %xmm2, %xmm6; \
  99. movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \
  100. addpd %xmm0, %xmm7; \
  101. movapd 16 * SIZE + (address) * SIZE(AA), %xmm0
  102. #define KERNEL3(address) \
  103. movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \
  104. mulpd %xmm1, %xmm3; \
  105. mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \
  106. addpd %xmm3, %xmm4; \
  107. movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \
  108. addpd %xmm1, %xmm5; \
  109. movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \
  110. mulpd %xmm1, %xmm3; \
  111. mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \
  112. addpd %xmm3, %xmm6; \
  113. movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \
  114. addpd %xmm1, %xmm7; \
  115. movapd 12 * SIZE + (address) * SIZE(AA), %xmm1
  116. #define KERNEL4(address) \
  117. mulpd %xmm1, %xmm3; \
  118. mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \
  119. addpd %xmm3, %xmm4; \
  120. movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \
  121. addpd %xmm1, %xmm5; \
  122. movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \
  123. mulpd %xmm1, %xmm3; \
  124. mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \
  125. addpd %xmm3, %xmm6; \
  126. movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \
  127. addpd %xmm1, %xmm7; \
  128. movapd 24 * SIZE + (address) * SIZE(AA), %xmm1
  129. #define KERNEL5(address) \
  130. mulpd %xmm0, %xmm2; \
  131. mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \
  132. addpd %xmm2, %xmm4; \
  133. movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \
  134. movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \
  135. addpd %xmm0, %xmm5; \
  136. movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \
  137. mulpd %xmm0, %xmm2; \
  138. mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \
  139. addpd %xmm2, %xmm6; \
  140. movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \
  141. addpd %xmm0, %xmm7; \
  142. movapd 20 * SIZE + (address) * SIZE(AA), %xmm0
  143. #define KERNEL6(address) \
  144. mulpd %xmm0, %xmm2; \
  145. mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \
  146. addpd %xmm2, %xmm4; \
  147. movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \
  148. addpd %xmm0, %xmm5; \
  149. movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \
  150. mulpd %xmm0, %xmm2; \
  151. mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \
  152. addpd %xmm2, %xmm6; \
  153. movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \
  154. addpd %xmm0, %xmm7; \
  155. movapd 32 * SIZE + (address) * SIZE(AA), %xmm0
  156. #define KERNEL7(address) \
  157. movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \
  158. mulpd %xmm1, %xmm3; \
  159. mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \
  160. addpd %xmm3, %xmm4; \
  161. movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \
  162. addpd %xmm1, %xmm5; \
  163. movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \
  164. mulpd %xmm1, %xmm3; \
  165. mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \
  166. addpd %xmm3, %xmm6; \
  167. movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \
  168. addpd %xmm1, %xmm7; \
  169. movapd 28 * SIZE + (address) * SIZE(AA), %xmm1
  170. #define KERNEL8(address) \
  171. mulpd %xmm1, %xmm3; \
  172. mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \
  173. addpd %xmm3, %xmm4; \
  174. movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \
  175. addpd %xmm1, %xmm5; \
  176. movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \
  177. mulpd %xmm1, %xmm3; \
  178. mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \
  179. addpd %xmm3, %xmm6; \
  180. movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \
  181. addpd %xmm1, %xmm7; \
  182. movapd 40 * SIZE + (address) * SIZE(AA), %xmm1
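/* Entry: callee-saved registers are pushed, the stack is re-aligned to
 * STACK_ALIGN with room for the packed-B BUFFER, and the arguments
 * (M, N, K, alpha, A, B, C, LDC and, for TRMM, the offset) are copied
 * from the caller's frame, addressed through %esi, into local slots. */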
  183. PROLOGUE
  184. pushl %ebp
  185. pushl %edi
  186. pushl %esi
  187. pushl %ebx
  188. PROFCODE
  189. EMMS
  190. movl %esp, %esi # save old stack
  191. subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
  192. andl $-STACK_ALIGN, %esp
  193. addl $STACK_OFFSET, %esp
  194. STACK_TOUCHING
  195. movd STACK_M, %mm0
  196. movl STACK_N, %eax
  197. movd STACK_K, %mm1
  198. movd STACK_A, %mm2
  199. movsd STACK_ALPHA_R, %xmm0
  200. movhps STACK_ALPHA_I, %xmm0
  201. movl STACK_B, B
  202. movd STACK_C, %mm3
  203. movl STACK_LDC, LDC
  204. #ifdef TRMMKERNEL
  205. movd STACK_OFFT, %mm4
  206. #endif
  207. movaps %xmm0, ALPHA
  208. movd %mm1, K
  209. movl %eax, N
  210. movd %mm0, M
  211. movd %mm2, A
  212. movd %mm3, C
  213. movl %esi, OLD_STACK
  214. #ifdef TRMMKERNEL
  215. movd %mm4, OFFSET
  216. movd %mm4, KK
  217. #ifndef LEFT
  218. negl KK
  219. #endif
  220. #endif
  221. sall $ZBASE_SHIFT, LDC
  222. sarl $1, %eax # j = (n >> 1)
  223. movl %eax, J
  224. jle .L100
  225. ALIGN_2
  226. .L01:
  227. #if defined(TRMMKERNEL) && defined(LEFT)
  228. movl OFFSET, %eax
  229. movl %eax, KK
  230. #endif
  231. /* Copying to Sub Buffer */
  232. leal BUFFER, %ecx
  233. movl K, %eax
  234. sarl $2, %eax
  235. jle .L03
  236. ALIGN_2
  237. .L02:
  238. movsd 0 * SIZE(B), %xmm0
  239. movsd 1 * SIZE(B), %xmm1
  240. movsd 2 * SIZE(B), %xmm2
  241. movsd 3 * SIZE(B), %xmm3
  242. movsd 4 * SIZE(B), %xmm4
  243. movsd 5 * SIZE(B), %xmm5
  244. movsd 6 * SIZE(B), %xmm6
  245. movsd 7 * SIZE(B), %xmm7
  246. unpcklpd %xmm0, %xmm0
  247. unpcklpd %xmm1, %xmm1
  248. unpcklpd %xmm2, %xmm2
  249. unpcklpd %xmm3, %xmm3
  250. unpcklpd %xmm4, %xmm4
  251. unpcklpd %xmm5, %xmm5
  252. unpcklpd %xmm6, %xmm6
  253. unpcklpd %xmm7, %xmm7
  254. movapd %xmm0, 0 * SIZE(%ecx)
  255. movapd %xmm1, 2 * SIZE(%ecx)
  256. movapd %xmm2, 4 * SIZE(%ecx)
  257. movapd %xmm3, 6 * SIZE(%ecx)
  258. movapd %xmm4, 8 * SIZE(%ecx)
  259. movapd %xmm5, 10 * SIZE(%ecx)
  260. movapd %xmm6, 12 * SIZE(%ecx)
  261. movapd %xmm7, 14 * SIZE(%ecx)
  262. prefetcht0 104 * SIZE(B)
  263. addl $ 8 * SIZE, B
  264. subl $-16 * SIZE, %ecx
  265. decl %eax
  266. BRANCH
  267. jne .L02
  268. ALIGN_2
  269. .L03:
  270. movl K, %eax
  271. andl $3, %eax
  272. BRANCH
  273. jle .L05
  274. ALIGN_4
  275. .L04:
  276. movsd 0 * SIZE(B), %xmm0
  277. movsd 1 * SIZE(B), %xmm1
  278. unpcklpd %xmm0, %xmm0
  279. unpcklpd %xmm1, %xmm1
  280. movapd %xmm0, 0 * SIZE(%ecx)
  281. movapd %xmm1, 2 * SIZE(%ecx)
  282. addl $2 * SIZE, B
  283. addl $4 * SIZE, %ecx
  284. decl %eax
  285. BRANCH
  286. jne .L04
  287. ALIGN_4
  288. .L05:
  289. movl B, BX
  290. movl C, %esi # coffset = c
  291. movl A, %edx # aoffset = a
  292. movl M, %ebx
  293. sarl $2, %ebx # i = (m >> 2)
  294. NOBRANCH
  295. jle .L30
  296. ALIGN_4
  297. .L10:
  298. #if !defined(TRMMKERNEL) || \
  299. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  300. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  301. leal BUFFER, BB
  302. movapd 0 * SIZE + BUFFER, %xmm2
  303. pxor %xmm4, %xmm4
  304. movapd 0 * SIZE(AA), %xmm0
  305. pxor %xmm5, %xmm5
  306. movapd 8 * SIZE + BUFFER, %xmm3
  307. pxor %xmm6, %xmm6
  308. movapd 8 * SIZE(AA), %xmm1
  309. pxor %xmm7, %xmm7
  310. #else
  311. leal BUFFER, BB
  312. movl KK, %eax
  313. leal (, %eax, SIZE), %eax
  314. leal (AA, %eax, 4), AA
  315. leal (BB, %eax, 4), BB /* because it's doubled */
  316. movapd 0 * SIZE(BB), %xmm2
  317. pxor %xmm4, %xmm4
  318. movapd 0 * SIZE(AA), %xmm0
  319. pxor %xmm5, %xmm5
  320. movapd 8 * SIZE(BB), %xmm3
  321. pxor %xmm6, %xmm6
  322. movapd 8 * SIZE(AA), %xmm1
  323. pxor %xmm7, %xmm7
  324. #endif
  325. prefetchnta 3 * SIZE(%esi)
  326. prefetchnta 3 * SIZE(%esi, LDC)
  327. movl BX, %eax
  328. prefetcht2 0 * SIZE(%eax)
  329. subl $-8 * SIZE, %eax
  330. movl %eax, BX
  331. #ifndef TRMMKERNEL
  332. movl K, %eax
  333. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  334. movl K, %eax
  335. subl KK, %eax
  336. movl %eax, KKK
  337. #else
  338. movl KK, %eax
  339. #ifdef LEFT
  340. addl $4, %eax
  341. #else
  342. addl $2, %eax
  343. #endif
  344. movl %eax, KKK
  345. #endif
  346. #ifdef PENTIUM4
  347. andl $-8, %eax
  348. NOBRANCH
  349. je .L12
  350. sall $3, %eax
  351. .align 8
  352. .L1X:
  353. KERNEL1(32 * 0)
  354. KERNEL2(32 * 0)
  355. KERNEL3(32 * 0)
  356. KERNEL4(32 * 0)
  357. KERNEL5(32 * 0)
  358. KERNEL6(32 * 0)
  359. KERNEL7(32 * 0)
  360. KERNEL8(32 * 0)
  361. cmpl $64 * 1, %eax
  362. NOBRANCH
  363. jle .L11
  364. KERNEL1(32 * 1)
  365. KERNEL2(32 * 1)
  366. KERNEL3(32 * 1)
  367. KERNEL4(32 * 1)
  368. KERNEL5(32 * 1)
  369. KERNEL6(32 * 1)
  370. KERNEL7(32 * 1)
  371. KERNEL8(32 * 1)
  372. cmpl $64 * 2, %eax
  373. NOBRANCH
  374. jle .L11
  375. KERNEL1(32 * 2)
  376. KERNEL2(32 * 2)
  377. KERNEL3(32 * 2)
  378. KERNEL4(32 * 2)
  379. KERNEL5(32 * 2)
  380. KERNEL6(32 * 2)
  381. KERNEL7(32 * 2)
  382. KERNEL8(32 * 2)
  383. cmpl $64 * 3, %eax
  384. NOBRANCH
  385. jle .L11
  386. KERNEL1(32 * 3)
  387. KERNEL2(32 * 3)
  388. KERNEL3(32 * 3)
  389. KERNEL4(32 * 3)
  390. KERNEL5(32 * 3)
  391. KERNEL6(32 * 3)
  392. KERNEL7(32 * 3)
  393. KERNEL8(32 * 3)
  394. cmpl $64 * 4, %eax
  395. NOBRANCH
  396. jle .L11
  397. KERNEL1(32 * 4)
  398. KERNEL2(32 * 4)
  399. KERNEL3(32 * 4)
  400. KERNEL4(32 * 4)
  401. KERNEL5(32 * 4)
  402. KERNEL6(32 * 4)
  403. KERNEL7(32 * 4)
  404. KERNEL8(32 * 4)
  405. cmpl $64 * 5, %eax
  406. NOBRANCH
  407. jle .L11
  408. KERNEL1(32 * 5)
  409. KERNEL2(32 * 5)
  410. KERNEL3(32 * 5)
  411. KERNEL4(32 * 5)
  412. KERNEL5(32 * 5)
  413. KERNEL6(32 * 5)
  414. KERNEL7(32 * 5)
  415. KERNEL8(32 * 5)
  416. cmpl $64 * 6, %eax
  417. NOBRANCH
  418. jle .L11
  419. KERNEL1(32 * 6)
  420. KERNEL2(32 * 6)
  421. KERNEL3(32 * 6)
  422. KERNEL4(32 * 6)
  423. KERNEL5(32 * 6)
  424. KERNEL6(32 * 6)
  425. KERNEL7(32 * 6)
  426. KERNEL8(32 * 6)
  427. cmpl $64 * 7, %eax
  428. NOBRANCH
  429. jle .L11
  430. KERNEL1(32 * 7)
  431. KERNEL2(32 * 7)
  432. KERNEL3(32 * 7)
  433. KERNEL4(32 * 7)
  434. KERNEL5(32 * 7)
  435. KERNEL6(32 * 7)
  436. KERNEL7(32 * 7)
  437. KERNEL8(32 * 7)
  438. addl $64 * 4 * SIZE, AA
  439. addl $64 * 4 * SIZE, BB
  440. subl $64 * 8, %eax
  441. BRANCH
  442. jg .L1X
  443. .L11:
  444. leal (AA, %eax, 4), AA
  445. leal (BB, %eax, 4), BB
  446. #else
  447. sarl $3, %eax
  448. je .L12
  449. .L11:
  450. KERNEL1(32 * 0)
  451. KERNEL2(32 * 0)
  452. KERNEL3(32 * 0)
  453. KERNEL4(32 * 0)
  454. KERNEL5(32 * 0)
  455. KERNEL6(32 * 0)
  456. KERNEL7(32 * 0)
  457. KERNEL8(32 * 0)
  458. addl $32 * SIZE, %ecx
  459. addl $32 * SIZE, %edx
  460. decl %eax
  461. jne .L11
  462. #endif
  463. .L12:
  464. #ifndef TRMMKERNEL
  465. movl K, %eax
  466. #else
  467. movl KKK, %eax
  468. #endif
  469. movaps ALPHA, %xmm3
  470. andl $7, %eax # if (k & 7)
  471. BRANCH
  472. je .L14
  473. .L13:
  474. mulpd %xmm0, %xmm2
  475. mulpd 2 * SIZE(BB), %xmm0
  476. addpd %xmm2, %xmm4
  477. movapd 0 * SIZE(BB), %xmm2
  478. addpd %xmm0, %xmm5
  479. movapd 2 * SIZE(AA), %xmm0
  480. mulpd %xmm0, %xmm2
  481. mulpd 2 * SIZE(BB), %xmm0
  482. addpd %xmm2, %xmm6
  483. movapd 4 * SIZE(BB), %xmm2
  484. addpd %xmm0, %xmm7
  485. movapd 4 * SIZE(AA), %xmm0
  486. addl $4 * SIZE, AA # aoffset += 4
  487. addl $4 * SIZE, BB # boffset1 += 4
  488. subl $1, %eax
  489. jg .L13
  490. ALIGN_4
  491. .L14:
  492. movsd 0 * SIZE(%esi), %xmm0
  493. movhps 1 * SIZE(%esi), %xmm0
  494. movsd 2 * SIZE(%esi), %xmm1
  495. movhps 3 * SIZE(%esi), %xmm1
  496. pshufd $0x44, %xmm4, %xmm2
  497. unpckhpd %xmm4, %xmm4
  498. mulpd %xmm3, %xmm2
  499. addpd %xmm2, %xmm0
  500. mulpd %xmm3, %xmm4
  501. addpd %xmm4, %xmm1
  502. movlps %xmm0, 0 * SIZE(%esi)
  503. movhps %xmm0, 1 * SIZE(%esi)
  504. movlps %xmm1, 2 * SIZE(%esi)
  505. movhps %xmm1, 3 * SIZE(%esi)
  506. movsd 4 * SIZE(%esi), %xmm0
  507. movhps 5 * SIZE(%esi), %xmm0
  508. movsd 6 * SIZE(%esi), %xmm1
  509. movhps 7 * SIZE(%esi), %xmm1
  510. pshufd $0x44, %xmm6, %xmm2
  511. unpckhpd %xmm6, %xmm6
  512. mulpd %xmm3, %xmm2
  513. addpd %xmm2, %xmm0
  514. mulpd %xmm3, %xmm6
  515. addpd %xmm6, %xmm1
  516. movlps %xmm0, 4 * SIZE(%esi)
  517. movhps %xmm0, 5 * SIZE(%esi)
  518. movlps %xmm1, 6 * SIZE(%esi)
  519. movhps %xmm1, 7 * SIZE(%esi)
  520. movsd 0 * SIZE(%esi, LDC), %xmm0
  521. movhps 1 * SIZE(%esi, LDC), %xmm0
  522. movsd 2 * SIZE(%esi, LDC), %xmm1
  523. movhps 3 * SIZE(%esi, LDC), %xmm1
  524. pshufd $0x44, %xmm5, %xmm2
  525. unpckhpd %xmm5, %xmm5
  526. mulpd %xmm3, %xmm2
  527. addpd %xmm2, %xmm0
  528. mulpd %xmm3, %xmm5
  529. addpd %xmm5, %xmm1
  530. movlps %xmm0, 0 * SIZE(%esi, LDC)
  531. movhps %xmm0, 1 * SIZE(%esi, LDC)
  532. movlps %xmm1, 2 * SIZE(%esi, LDC)
  533. movhps %xmm1, 3 * SIZE(%esi, LDC)
  534. movsd 4 * SIZE(%esi, LDC), %xmm0
  535. movhps 5 * SIZE(%esi, LDC), %xmm0
  536. movsd 6 * SIZE(%esi, LDC), %xmm1
  537. movhps 7 * SIZE(%esi, LDC), %xmm1
  538. pshufd $0x44, %xmm7, %xmm2
  539. unpckhpd %xmm7, %xmm7
  540. mulpd %xmm3, %xmm2
  541. addpd %xmm2, %xmm0
  542. mulpd %xmm3, %xmm7
  543. addpd %xmm7, %xmm1
  544. movlps %xmm0, 4 * SIZE(%esi, LDC)
  545. movhps %xmm0, 5 * SIZE(%esi, LDC)
  546. movlps %xmm1, 6 * SIZE(%esi, LDC)
  547. movhps %xmm1, 7 * SIZE(%esi, LDC)
  548. addl $8 * SIZE, %esi
  549. decl %ebx # i --
  550. BRANCH
  551. jg .L10
  552. ALIGN_2
  553. .L30:
  554. movl M, %ebx
  555. testl $2, %ebx
  556. jle .L50
  557. #if !defined(TRMMKERNEL) || \
  558. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  559. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  560. leal BUFFER, %ecx
  561. movapd 0 * SIZE + BUFFER, %xmm2
  562. pxor %xmm4, %xmm4
  563. movapd 0 * SIZE(AA), %xmm0
  564. pxor %xmm5, %xmm5
  565. movapd 8 * SIZE + BUFFER, %xmm3
  566. pxor %xmm6, %xmm6
  567. movapd 8 * SIZE(AA), %xmm1
  568. pxor %xmm7, %xmm7
  569. #else
  570. leal BUFFER, BB
  571. movl KK, %eax
  572. leal (, %eax, SIZE), %eax
  573. leal (AA, %eax, 2), AA
  574. leal (BB, %eax, 4), BB /* because it's doubled */
  575. movapd 0 * SIZE(BB), %xmm2
  576. pxor %xmm4, %xmm4
  577. movapd 0 * SIZE(AA), %xmm0
  578. pxor %xmm5, %xmm5
  579. movapd 8 * SIZE(BB), %xmm3
  580. pxor %xmm6, %xmm6
  581. movapd 8 * SIZE(AA), %xmm1
  582. pxor %xmm7, %xmm7
  583. #endif
  584. #ifndef TRMMKERNEL
  585. movl K, %eax
  586. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  587. movl K, %eax
  588. subl KK, %eax
  589. movl %eax, KKK
  590. #else
  591. movl KK, %eax
  592. addl $2, %eax
  593. movl %eax, KKK
  594. #endif
  595. sarl $3, %eax
  596. je .L32
  597. .L31:
  598. mulpd %xmm0, %xmm2
  599. mulpd 2 * SIZE(BB), %xmm0
  600. addpd %xmm2, %xmm4
  601. movapd 4 * SIZE(BB), %xmm2
  602. addpd %xmm0, %xmm5
  603. movapd 2 * SIZE(AA), %xmm0
  604. mulpd %xmm0, %xmm2
  605. mulpd 6 * SIZE(BB), %xmm0
  606. addpd %xmm2, %xmm6
  607. movapd 16 * SIZE(BB), %xmm2
  608. addpd %xmm0, %xmm7
  609. movapd 4 * SIZE(AA), %xmm0
  610. mulpd %xmm0, %xmm3
  611. mulpd 10 * SIZE(BB), %xmm0
  612. addpd %xmm3, %xmm4
  613. movapd 12 * SIZE(BB), %xmm3
  614. addpd %xmm0, %xmm5
  615. movapd 6 * SIZE(AA), %xmm0
  616. mulpd %xmm0, %xmm3
  617. mulpd 14 * SIZE(BB), %xmm0
  618. addpd %xmm3, %xmm6
  619. movapd 24 * SIZE(BB), %xmm3
  620. addpd %xmm0, %xmm7
  621. movapd 16 * SIZE(AA), %xmm0
  622. mulpd %xmm1, %xmm2
  623. mulpd 18 * SIZE(BB), %xmm1
  624. addpd %xmm2, %xmm4
  625. movapd 20 * SIZE(BB), %xmm2
  626. addpd %xmm1, %xmm5
  627. movapd 10 * SIZE(AA), %xmm1
  628. mulpd %xmm1, %xmm2
  629. mulpd 22 * SIZE(BB), %xmm1
  630. addpd %xmm2, %xmm6
  631. movapd 32 * SIZE(BB), %xmm2
  632. addpd %xmm1, %xmm7
  633. movapd 12 * SIZE(AA), %xmm1
  634. mulpd %xmm1, %xmm3
  635. mulpd 26 * SIZE(BB), %xmm1
  636. addpd %xmm3, %xmm4
  637. movapd 28 * SIZE(BB), %xmm3
  638. addpd %xmm1, %xmm5
  639. movapd 14 * SIZE(AA), %xmm1
  640. mulpd %xmm1, %xmm3
  641. mulpd 30 * SIZE(BB), %xmm1
  642. addpd %xmm3, %xmm6
  643. movapd 40 * SIZE(BB), %xmm3
  644. addpd %xmm1, %xmm7
  645. movapd 24 * SIZE(AA), %xmm1
  646. addl $16 * SIZE, AA
  647. addl $32 * SIZE, BB
  648. BRANCH
  649. decl %eax
  650. jne .L31
  651. .L32:
  652. #ifndef TRMMKERNEL
  653. movl K, %eax
  654. #else
  655. movl KKK, %eax
  656. #endif
  657. movaps ALPHA, %xmm3
  658. andl $7, %eax # if (k & 7)
  659. BRANCH
  660. je .L34
  661. .L33:
  662. mulpd %xmm0, %xmm2
  663. mulpd 2 * SIZE(BB), %xmm0
  664. addpd %xmm2, %xmm4
  665. movapd 4 * SIZE(BB), %xmm2
  666. addpd %xmm0, %xmm5
  667. movapd 2 * SIZE(AA), %xmm0
  668. addl $2 * SIZE, AA # aoffset += 2
  669. addl $4 * SIZE, BB # boffset1 += 4
  670. decl %eax
  671. BRANCH
  672. jg .L33
  673. ALIGN_4
  674. .L34:
  675. addpd %xmm6, %xmm4
  676. addpd %xmm7, %xmm5
  677. movsd 0 * SIZE(%esi), %xmm0
  678. movhps 1 * SIZE(%esi), %xmm0
  679. movsd 2 * SIZE(%esi), %xmm1
  680. movhps 3 * SIZE(%esi), %xmm1
  681. pshufd $0x44, %xmm4, %xmm2
  682. unpckhpd %xmm4, %xmm4
  683. mulpd %xmm3, %xmm2
  684. addpd %xmm2, %xmm0
  685. mulpd %xmm3, %xmm4
  686. addpd %xmm4, %xmm1
  687. movlps %xmm0, 0 * SIZE(%esi)
  688. movhps %xmm0, 1 * SIZE(%esi)
  689. movlps %xmm1, 2 * SIZE(%esi)
  690. movhps %xmm1, 3 * SIZE(%esi)
  691. movsd 0 * SIZE(%esi, LDC), %xmm0
  692. movhps 1 * SIZE(%esi, LDC), %xmm0
  693. movsd 2 * SIZE(%esi, LDC), %xmm1
  694. movhps 3 * SIZE(%esi, LDC), %xmm1
  695. pshufd $0x44, %xmm5, %xmm2
  696. unpckhpd %xmm5, %xmm5
  697. mulpd %xmm3, %xmm2
  698. addpd %xmm2, %xmm0
  699. mulpd %xmm3, %xmm5
  700. addpd %xmm5, %xmm1
  701. movlps %xmm0, 0 * SIZE(%esi, LDC)
  702. movhps %xmm0, 1 * SIZE(%esi, LDC)
  703. movlps %xmm1, 2 * SIZE(%esi, LDC)
  704. movhps %xmm1, 3 * SIZE(%esi, LDC)
  705. addl $4 * SIZE, %esi # coffset += 4
  706. ALIGN_2
  707. .L50:
  708. movl M, %ebx
  709. testl $1, %ebx
  710. jle .L99
  711. #if !defined(TRMMKERNEL) || \
  712. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  713. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  714. leal BUFFER, %ecx
  715. movapd 0 * SIZE + BUFFER, %xmm2
  716. pxor %xmm4, %xmm4
  717. movapd 0 * SIZE(AA), %xmm0
  718. pxor %xmm5, %xmm5
  719. movapd 8 * SIZE + BUFFER, %xmm3
  720. pxor %xmm6, %xmm6
  721. movsd 4 * SIZE(AA), %xmm1
  722. pxor %xmm7, %xmm7
  723. #else
  724. leal BUFFER, BB
  725. movl KK, %eax
  726. leal (, %eax, SIZE), %eax
  727. leal (AA, %eax, 1), AA
  728. leal (BB, %eax, 4), BB /* because it's doubled */
  729. movapd 0 * SIZE(BB), %xmm2
  730. pxor %xmm4, %xmm4
  731. movapd 0 * SIZE(AA), %xmm0
  732. pxor %xmm5, %xmm5
  733. movapd 8 * SIZE(BB), %xmm3
  734. pxor %xmm6, %xmm6
  735. movsd 4 * SIZE(AA), %xmm1
  736. pxor %xmm7, %xmm7
  737. #endif
  738. #ifndef TRMMKERNEL
  739. movl K, %eax
  740. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  741. movl K, %eax
  742. subl KK, %eax
  743. movl %eax, KKK
  744. #else
  745. movl KK, %eax
  746. #ifdef LEFT
  747. addl $1, %eax
  748. #else
  749. addl $2, %eax
  750. #endif
  751. movl %eax, KKK
  752. #endif
  753. sarl $3, %eax
  754. je .L52
  755. .L51:
  756. mulsd %xmm0, %xmm2
  757. mulsd 2 * SIZE(BB), %xmm0
  758. addsd %xmm2, %xmm4
  759. movsd 4 * SIZE(BB), %xmm2
  760. addsd %xmm0, %xmm5
  761. movsd 1 * SIZE(AA), %xmm0
  762. mulsd %xmm0, %xmm2
  763. mulsd 6 * SIZE(BB), %xmm0
  764. addsd %xmm2, %xmm4
  765. movsd 16 * SIZE(BB), %xmm2
  766. addsd %xmm0, %xmm5
  767. movsd 2 * SIZE(AA), %xmm0
  768. mulsd %xmm0, %xmm3
  769. mulsd 10 * SIZE(BB), %xmm0
  770. addsd %xmm3, %xmm4
  771. movsd 12 * SIZE(BB), %xmm3
  772. addsd %xmm0, %xmm5
  773. movsd 3 * SIZE(AA), %xmm0
  774. mulsd %xmm0, %xmm3
  775. mulsd 14 * SIZE(BB), %xmm0
  776. addsd %xmm3, %xmm4
  777. movsd 24 * SIZE(BB), %xmm3
  778. addsd %xmm0, %xmm5
  779. movsd 8 * SIZE(AA), %xmm0
  780. mulsd %xmm1, %xmm2
  781. mulsd 18 * SIZE(BB), %xmm1
  782. addsd %xmm2, %xmm4
  783. movsd 20 * SIZE(BB), %xmm2
  784. addsd %xmm1, %xmm5
  785. movsd 5 * SIZE(AA), %xmm1
  786. mulsd %xmm1, %xmm2
  787. mulsd 22 * SIZE(BB), %xmm1
  788. addsd %xmm2, %xmm4
  789. movsd 32 * SIZE(BB), %xmm2
  790. addsd %xmm1, %xmm5
  791. movsd 6 * SIZE(AA), %xmm1
  792. mulsd %xmm1, %xmm3
  793. mulsd 26 * SIZE(BB), %xmm1
  794. addsd %xmm3, %xmm4
  795. movsd 28 * SIZE(BB), %xmm3
  796. addsd %xmm1, %xmm5
  797. movsd 7 * SIZE(AA), %xmm1
  798. mulsd %xmm1, %xmm3
  799. mulsd 30 * SIZE(BB), %xmm1
  800. addsd %xmm3, %xmm4
  801. movsd 40 * SIZE(BB), %xmm3
  802. addsd %xmm1, %xmm5
  803. movsd 12 * SIZE(AA), %xmm1
  804. addl $ 8 * SIZE, AA
  805. addl $32 * SIZE, BB
  806. BRANCH
  807. decl %eax
  808. jne .L51
  809. .L52:
  810. #ifndef TRMMKERNEL
  811. movl K, %eax
  812. #else
  813. movl KKK, %eax
  814. #endif
  815. movaps ALPHA, %xmm3
  816. andl $7, %eax # if (k & 7)
  817. BRANCH
  818. je .L54
  819. .L53:
  820. mulsd %xmm0, %xmm2
  821. mulsd 2 * SIZE(BB), %xmm0
  822. addsd %xmm2, %xmm4
  823. movsd 4 * SIZE(BB), %xmm2
  824. addsd %xmm0, %xmm5
  825. movsd 1 * SIZE(AA), %xmm0
  826. addl $1 * SIZE, AA # aoffset += 1
  827. addl $4 * SIZE, BB # boffset1 += 4
  828. decl %eax
  829. BRANCH
  830. jg .L53
  831. ALIGN_4
  832. .L54:
  833. addsd %xmm6, %xmm4
  834. addsd %xmm7, %xmm5
  835. movsd 0 * SIZE(%esi), %xmm0
  836. movhps 1 * SIZE(%esi), %xmm0
  837. movsd 0 * SIZE(%esi, LDC), %xmm1
  838. movhps 1 * SIZE(%esi, LDC), %xmm1
  839. unpcklpd %xmm4, %xmm4
  840. unpcklpd %xmm5, %xmm5
  841. mulpd %xmm3, %xmm4
  842. addpd %xmm4, %xmm0
  843. mulpd %xmm3, %xmm5
  844. addpd %xmm5, %xmm1
  845. movlps %xmm0, 0 * SIZE(%esi)
  846. movhps %xmm0, 1 * SIZE(%esi)
  847. movlps %xmm1, 0 * SIZE(%esi, LDC)
  848. movhps %xmm1, 1 * SIZE(%esi, LDC)
  849. ALIGN_2
  850. .L99:
  851. #if defined(TRMMKERNEL) && !defined(LEFT)
  852. addl $2, KK
  853. #endif
  854. leal (, LDC, 2), %eax
  855. addl %eax, C # c += 2 * ldc
  856. BRANCH
  857. decl J # j --
  858. jg .L01
  859. ALIGN_2
  860. .L100:
  861. movl N, %eax
  862. testl $1, %eax
  863. jle .L999
  864. ALIGN_2
  865. .L101:
  866. #if defined(TRMMKERNEL) && defined(LEFT)
  867. movl OFFSET, %eax
  868. movl %eax, KK
  869. #endif
  870. /* Copying to Sub Buffer */
  871. leal BUFFER, %ecx
  872. movl K, %eax
  873. sarl $3, %eax
  874. jle .L103
  875. ALIGN_4
  876. .L102:
  877. movsd 0 * SIZE(B), %xmm0
  878. movsd 1 * SIZE(B), %xmm1
  879. movsd 2 * SIZE(B), %xmm2
  880. movsd 3 * SIZE(B), %xmm3
  881. movsd 4 * SIZE(B), %xmm4
  882. movsd 5 * SIZE(B), %xmm5
  883. movsd 6 * SIZE(B), %xmm6
  884. movsd 7 * SIZE(B), %xmm7
  885. unpcklpd %xmm0, %xmm0
  886. unpcklpd %xmm1, %xmm1
  887. unpcklpd %xmm2, %xmm2
  888. unpcklpd %xmm3, %xmm3
  889. unpcklpd %xmm4, %xmm4
  890. unpcklpd %xmm5, %xmm5
  891. unpcklpd %xmm6, %xmm6
  892. unpcklpd %xmm7, %xmm7
  893. movapd %xmm0, 0 * SIZE(%ecx)
  894. movapd %xmm1, 2 * SIZE(%ecx)
  895. movapd %xmm2, 4 * SIZE(%ecx)
  896. movapd %xmm3, 6 * SIZE(%ecx)
  897. movapd %xmm4, 8 * SIZE(%ecx)
  898. movapd %xmm5, 10 * SIZE(%ecx)
  899. movapd %xmm6, 12 * SIZE(%ecx)
  900. movapd %xmm7, 14 * SIZE(%ecx)
  901. prefetcht0 104 * SIZE(B)
  902. addl $ 8 * SIZE, B
  903. addl $16 * SIZE, %ecx
  904. decl %eax
  905. BRANCH
  906. jne .L102
  907. ALIGN_2
  908. .L103:
  909. movl K, %eax
  910. andl $7, %eax
  911. BRANCH
  912. jle .L105
  913. ALIGN_2
  914. .L104:
  915. movsd 0 * SIZE(B), %xmm0
  916. unpcklpd %xmm0, %xmm0
  917. movapd %xmm0, 0 * SIZE(%ecx)
  918. addl $1 * SIZE, B
  919. addl $2 * SIZE, %ecx
  920. decl %eax
  921. jne .L104
  922. ALIGN_4
  923. .L105:
  924. movl C, %esi # coffset = c
  925. movl A, %edx # aoffset = a
  926. movl M, %ebx
  927. sarl $2, %ebx # i = (m >> 2)
  928. jle .L130
  929. ALIGN_4
  930. .L110:
  931. #if !defined(TRMMKERNEL) || \
  932. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  933. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  934. leal BUFFER, BB
  935. movapd 0 * SIZE + BUFFER, %xmm2
  936. pxor %xmm4, %xmm4
  937. movapd 0 * SIZE(AA), %xmm0
  938. pxor %xmm5, %xmm5
  939. movapd 8 * SIZE + BUFFER, %xmm3
  940. pxor %xmm6, %xmm6
  941. movapd 8 * SIZE(AA), %xmm1
  942. pxor %xmm7, %xmm7
  943. #else
  944. leal BUFFER, BB
  945. movl KK, %eax
  946. leal (, %eax, SIZE), %eax
  947. leal (AA, %eax, 4), AA
  948. leal (BB, %eax, 2), BB
  949. movapd 0 * SIZE(BB), %xmm2
  950. pxor %xmm4, %xmm4
  951. movapd 0 * SIZE(AA), %xmm0
  952. pxor %xmm5, %xmm5
  953. movapd 8 * SIZE(BB), %xmm3
  954. pxor %xmm6, %xmm6
  955. movapd 8 * SIZE(AA), %xmm1
  956. pxor %xmm7, %xmm7
  957. #endif
  958. #ifndef TRMMKERNEL
  959. movl K, %eax
  960. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  961. movl K, %eax
  962. subl KK, %eax
  963. movl %eax, KKK
  964. #else
  965. movl KK, %eax
  966. #ifdef LEFT
  967. addl $4, %eax
  968. #else
  969. addl $1, %eax
  970. #endif
  971. movl %eax, KKK
  972. #endif
  973. sarl $3, %eax
  974. je .L112
  975. .L111:
  976. mulpd %xmm2, %xmm0
  977. mulpd 2 * SIZE(AA), %xmm2
  978. addpd %xmm0, %xmm4
  979. movapd 4 * SIZE(AA), %xmm0
  980. addpd %xmm2, %xmm6
  981. movapd 2 * SIZE(BB), %xmm2
  982. mulpd %xmm2, %xmm0
  983. mulpd 6 * SIZE(AA), %xmm2
  984. addpd %xmm0, %xmm5
  985. movapd 16 * SIZE(AA), %xmm0
  986. addpd %xmm2, %xmm7
  987. movapd 4 * SIZE(BB), %xmm2
  988. mulpd %xmm2, %xmm1
  989. mulpd 10 * SIZE(AA), %xmm2
  990. addpd %xmm1, %xmm4
  991. movapd 12 * SIZE(AA), %xmm1
  992. addpd %xmm2, %xmm6
  993. movapd 6 * SIZE(BB), %xmm2
  994. mulpd %xmm2, %xmm1
  995. mulpd 14 * SIZE(AA), %xmm2
  996. addpd %xmm1, %xmm5
  997. movapd 24 * SIZE(AA), %xmm1
  998. addpd %xmm2, %xmm7
  999. movapd 16 * SIZE(BB), %xmm2
  1000. mulpd %xmm3, %xmm0
  1001. mulpd 18 * SIZE(AA), %xmm3
  1002. addpd %xmm0, %xmm4
  1003. movapd 20 * SIZE(AA), %xmm0
  1004. addpd %xmm3, %xmm6
  1005. movapd 10 * SIZE(BB), %xmm3
  1006. mulpd %xmm3, %xmm0
  1007. mulpd 22 * SIZE(AA), %xmm3
  1008. addpd %xmm0, %xmm5
  1009. movapd 32 * SIZE(AA), %xmm0
  1010. addpd %xmm3, %xmm7
  1011. movapd 12 * SIZE(BB), %xmm3
  1012. mulpd %xmm3, %xmm1
  1013. mulpd 26 * SIZE(AA), %xmm3
  1014. addpd %xmm1, %xmm4
  1015. movapd 28 * SIZE(AA), %xmm1
  1016. addpd %xmm3, %xmm6
  1017. movapd 14 * SIZE(BB), %xmm3
  1018. mulpd %xmm3, %xmm1
  1019. mulpd 30 * SIZE(AA), %xmm3
  1020. addpd %xmm1, %xmm5
  1021. movapd 40 * SIZE(AA), %xmm1
  1022. addpd %xmm3, %xmm7
  1023. movapd 24 * SIZE(BB), %xmm3
  1024. addl $32 * SIZE, AA
  1025. addl $16 * SIZE, BB
  1026. decl %eax
  1027. jne .L111
  1028. .L112:
  1029. #ifndef TRMMKERNEL
  1030. movl K, %eax
  1031. #else
  1032. movl KKK, %eax
  1033. #endif
  1034. movaps ALPHA, %xmm3
  1035. andl $7, %eax # if (k & 7)
  1036. BRANCH
  1037. je .L114
  1038. .L113:
  1039. mulpd %xmm2, %xmm0
  1040. mulpd 2 * SIZE(AA), %xmm2
  1041. addpd %xmm0, %xmm4
  1042. movapd 4 * SIZE(AA), %xmm0
  1043. addpd %xmm2, %xmm6
  1044. movapd 2 * SIZE(BB), %xmm2
  1045. addl $4 * SIZE, AA # aoffset += 4
  1046. addl $2 * SIZE, BB # boffset1 += 2
  1047. subl $1, %eax
  1048. jg .L113
  1049. ALIGN_4
  1050. .L114:
  1051. addpd %xmm5, %xmm4
  1052. addpd %xmm7, %xmm6
  1053. movsd 0 * SIZE(%esi), %xmm0
  1054. movhps 1 * SIZE(%esi), %xmm0
  1055. movsd 2 * SIZE(%esi), %xmm1
  1056. movhps 3 * SIZE(%esi), %xmm1
  1057. pshufd $0x44, %xmm4, %xmm2
  1058. unpckhpd %xmm4, %xmm4
  1059. mulpd %xmm3, %xmm2
  1060. addpd %xmm2, %xmm0
  1061. mulpd %xmm3, %xmm4
  1062. addpd %xmm4, %xmm1
  1063. movlps %xmm0, 0 * SIZE(%esi)
  1064. movhps %xmm0, 1 * SIZE(%esi)
  1065. movlps %xmm1, 2 * SIZE(%esi)
  1066. movhps %xmm1, 3 * SIZE(%esi)
  1067. movsd 4 * SIZE(%esi), %xmm0
  1068. movhps 5 * SIZE(%esi), %xmm0
  1069. movsd 6 * SIZE(%esi), %xmm1
  1070. movhps 7 * SIZE(%esi), %xmm1
  1071. pshufd $0x44, %xmm6, %xmm2
  1072. unpckhpd %xmm6, %xmm6
  1073. mulpd %xmm3, %xmm2
  1074. addpd %xmm2, %xmm0
  1075. mulpd %xmm3, %xmm6
  1076. addpd %xmm6, %xmm1
  1077. movlps %xmm0, 4 * SIZE(%esi)
  1078. movhps %xmm0, 5 * SIZE(%esi)
  1079. movlps %xmm1, 6 * SIZE(%esi)
  1080. movhps %xmm1, 7 * SIZE(%esi)
  1081. addl $8 * SIZE, %esi # coffset += 8
  1082. BRANCH
  1083. decl %ebx # i --
  1084. jg .L110
  1085. ALIGN_2
  1086. .L130:
  1087. movl M, %ebx
  1088. testl $2, %ebx
  1089. jle .L150
  1090. #if !defined(TRMMKERNEL) || \
  1091. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1092. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1093. leal BUFFER, BB
  1094. movapd 0 * SIZE + BUFFER, %xmm2
  1095. pxor %xmm4, %xmm4
  1096. movapd 0 * SIZE(AA), %xmm0
  1097. pxor %xmm5, %xmm5
  1098. movapd 8 * SIZE + BUFFER, %xmm3
  1099. pxor %xmm6, %xmm6
  1100. movapd 8 * SIZE(AA), %xmm1
  1101. pxor %xmm7, %xmm7
  1102. #else
  1103. leal BUFFER, BB
  1104. movl KK, %eax
  1105. leal (, %eax, SIZE), %eax
  1106. leal (AA, %eax, 2), AA
  1107. leal (BB, %eax, 2), BB
  1108. movapd 0 * SIZE(BB), %xmm2
  1109. pxor %xmm4, %xmm4
  1110. movapd 0 * SIZE(AA), %xmm0
  1111. pxor %xmm5, %xmm5
  1112. movapd 8 * SIZE(BB), %xmm3
  1113. pxor %xmm6, %xmm6
  1114. movapd 8 * SIZE(AA), %xmm1
  1115. pxor %xmm7, %xmm7
  1116. #endif
  1117. #ifndef TRMMKERNEL
  1118. movl K, %eax
  1119. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1120. movl K, %eax
  1121. subl KK, %eax
  1122. movl %eax, KKK
  1123. #else
  1124. movl KK, %eax
  1125. #ifdef LEFT
  1126. addl $2, %eax
  1127. #else
  1128. addl $1, %eax
  1129. #endif
  1130. movl %eax, KKK
  1131. #endif
  1132. sarl $3, %eax
  1133. je .L132
  1134. .L131:
  1135. mulpd %xmm0, %xmm2
  1136. movapd 2 * SIZE(AA), %xmm0
  1137. addpd %xmm2, %xmm4
  1138. mulpd 2 * SIZE(BB), %xmm0
  1139. movapd 16 * SIZE(BB), %xmm2
  1140. addpd %xmm0, %xmm5
  1141. movapd 4 * SIZE(AA), %xmm0
  1142. mulpd 4 * SIZE(BB), %xmm0
  1143. addpd %xmm0, %xmm6
  1144. movapd 6 * SIZE(AA), %xmm0
  1145. mulpd 6 * SIZE(BB), %xmm0
  1146. addpd %xmm0, %xmm7
  1147. movapd 16 * SIZE(AA), %xmm0
  1148. mulpd %xmm1, %xmm3
  1149. movapd 10 * SIZE(AA), %xmm1
  1150. addpd %xmm3, %xmm4
  1151. mulpd 10 * SIZE(BB), %xmm1
  1152. movapd 24 * SIZE(BB), %xmm3
  1153. addpd %xmm1, %xmm5
  1154. movapd 12 * SIZE(AA), %xmm1
  1155. mulpd 12 * SIZE(BB), %xmm1
  1156. addpd %xmm1, %xmm6
  1157. movapd 14 * SIZE(AA), %xmm1
  1158. mulpd 14 * SIZE(BB), %xmm1
  1159. addpd %xmm1, %xmm7
  1160. movapd 24 * SIZE(AA), %xmm1
  1161. addl $16 * SIZE, AA
  1162. addl $16 * SIZE, BB
  1163. BRANCH
  1164. decl %eax
  1165. jne .L131
  1166. .L132:
  1167. #ifndef TRMMKERNEL
  1168. movl K, %eax
  1169. #else
  1170. movl KKK, %eax
  1171. #endif
  1172. movaps ALPHA, %xmm3
  1173. andl $7, %eax # if (k & 7)
  1174. BRANCH
  1175. je .L134
  1176. .L133:
  1177. movapd 0 * SIZE(AA), %xmm0
  1178. mulpd 0 * SIZE(BB), %xmm0
  1179. addpd %xmm0, %xmm4
  1180. addl $2 * SIZE, AA # aoffset += 2
  1181. addl $2 * SIZE, BB # boffset1 += 2
  1182. decl %eax
  1183. BRANCH
  1184. jg .L133
  1185. ALIGN_4
  1186. .L134:
  1187. addpd %xmm5, %xmm4
  1188. addpd %xmm7, %xmm6
  1189. addpd %xmm6, %xmm4
  1190. movsd 0 * SIZE(%esi), %xmm0
  1191. movhps 1 * SIZE(%esi), %xmm0
  1192. movsd 2 * SIZE(%esi), %xmm1
  1193. movhps 3 * SIZE(%esi), %xmm1
  1194. pshufd $0x44, %xmm4, %xmm2
  1195. unpckhpd %xmm4, %xmm4
  1196. mulpd %xmm3, %xmm2
  1197. addpd %xmm2, %xmm0
  1198. mulpd %xmm3, %xmm4
  1199. addpd %xmm4, %xmm1
  1200. movlps %xmm0, 0 * SIZE(%esi)
  1201. movhps %xmm0, 1 * SIZE(%esi)
  1202. movlps %xmm1, 2 * SIZE(%esi)
  1203. movhps %xmm1, 3 * SIZE(%esi)
  1204. addl $4 * SIZE, %esi
  1205. ALIGN_2
  1206. .L150:
  1207. movl M, %ebx
  1208. testl $1, %ebx
  1209. jle .L999
  1210. #if !defined(TRMMKERNEL) || \
  1211. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1212. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1213. leal BUFFER, BB
  1214. movapd 0 * SIZE + BUFFER, %xmm2
  1215. pxor %xmm4, %xmm4
  1216. movapd 0 * SIZE(AA), %xmm0
  1217. pxor %xmm5, %xmm5
  1218. movapd 8 * SIZE + BUFFER, %xmm3
  1219. pxor %xmm6, %xmm6
  1220. movapd 4 * SIZE(AA), %xmm1
  1221. pxor %xmm7, %xmm7
  1222. #else
  1223. leal BUFFER, BB
  1224. movl KK, %eax
  1225. leal (, %eax, SIZE), %eax
  1226. leal (AA, %eax, 1), AA
  1227. leal (BB, %eax, 2), BB
  1228. movapd 0 * SIZE(BB), %xmm2
  1229. pxor %xmm4, %xmm4
  1230. movapd 0 * SIZE(AA), %xmm0
  1231. pxor %xmm5, %xmm5
  1232. movapd 8 * SIZE(BB), %xmm3
  1233. pxor %xmm6, %xmm6
  1234. movapd 4 * SIZE(AA), %xmm1
  1235. pxor %xmm7, %xmm7
  1236. #endif
  1237. #ifndef TRMMKERNEL
  1238. movl K, %eax
  1239. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1240. movl K, %eax
  1241. subl KK, %eax
  1242. movl %eax, KKK
  1243. #else
  1244. movl KK, %eax
  1245. addl $1, %eax
  1246. movl %eax, KKK
  1247. #endif
  1248. sarl $3, %eax
  1249. je .L152
  1250. .L151:
  1251. mulsd %xmm0, %xmm2
  1252. movsd 1 * SIZE(AA), %xmm0
  1253. addsd %xmm2, %xmm4
  1254. mulsd 2 * SIZE(BB), %xmm0
  1255. movsd 16 * SIZE(BB), %xmm2
  1256. addsd %xmm0, %xmm4
  1257. movsd 2 * SIZE(AA), %xmm0
  1258. mulsd 4 * SIZE(BB), %xmm0
  1259. addsd %xmm0, %xmm4
  1260. movsd 3 * SIZE(AA), %xmm0
  1261. mulsd 6 * SIZE(BB), %xmm0
  1262. addsd %xmm0, %xmm4
  1263. movsd 8 * SIZE(AA), %xmm0
  1264. mulsd %xmm1, %xmm3
  1265. movsd 5 * SIZE(AA), %xmm1
  1266. addsd %xmm3, %xmm4
  1267. mulsd 10 * SIZE(BB), %xmm1
  1268. movsd 24 * SIZE(BB), %xmm3
  1269. addsd %xmm1, %xmm4
  1270. movsd 6 * SIZE(AA), %xmm1
  1271. mulsd 12 * SIZE(BB), %xmm1
  1272. addsd %xmm1, %xmm4
  1273. movsd 7 * SIZE(AA), %xmm1
  1274. mulsd 14 * SIZE(BB), %xmm1
  1275. addsd %xmm1, %xmm4
  1276. movsd 12 * SIZE(AA), %xmm1
  1277. addl $ 8 * SIZE, AA
  1278. addl $16 * SIZE, BB
  1279. BRANCH
  1280. decl %eax
  1281. jne .L151
  1282. .L152:
  1283. #ifndef TRMMKERNEL
  1284. movl K, %eax
  1285. #else
  1286. movl KKK, %eax
  1287. #endif
  1288. movaps ALPHA, %xmm3
  1289. andl $7, %eax # if (k & 7)
  1290. BRANCH
  1291. je .L154
  1292. .L153:
  1293. movsd 0 * SIZE(AA), %xmm0
  1294. mulsd 0 * SIZE(BB), %xmm0
  1295. addsd %xmm0, %xmm4
  1296. addl $1 * SIZE, AA # aoffset += 1
  1297. addl $2 * SIZE, BB # boffset1 += 2
  1298. decl %eax
  1299. BRANCH
  1300. jg .L153
  1301. ALIGN_4
  1302. .L154:
  1303. movsd 0 * SIZE(%esi), %xmm0
  1304. movhps 1 * SIZE(%esi), %xmm0
  1305. unpcklpd %xmm4, %xmm4
  1306. mulpd %xmm3, %xmm4
  1307. addpd %xmm4, %xmm0
  1308. movlps %xmm0, 0 * SIZE(%esi)
  1309. movhps %xmm0, 1 * SIZE(%esi)
  1310. ALIGN_2
  1311. .L999:
  1312. movl OLD_STACK, %esp
  1313. EMMS
  1314. popl %ebx
  1315. popl %esi
  1316. popl %edi
  1317. popl %ebp
  1318. ret
  1319. ALIGN_2
  1320. EPILOGUE
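
For orientation, the update this micro-kernel performs can be sketched in C roughly as follows. The sketch is illustrative only: the function name, argument list and packing layout are assumptions, not OpenBLAS's actual kernel interface. The accumulators hold real dot products of the packed A and B panels, and each scalar result is scaled by the complex alpha and added into the complex C tile, which is the building block the "3M" complex-GEMM scheme invokes three times with different real operands.

/* Illustrative reference only: names, layout and interface are assumed. */
#include <complex.h>

static void zgemm3m_kernel_ref(int m, int n, int k,
                               double alpha_r, double alpha_i,
                               const double *a,   /* packed real A panel, m x k */
                               const double *b,   /* packed real B panel, k x n */
                               double _Complex *c, int ldc)
{
    for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++) {
            double s = 0.0;
            for (int l = 0; l < k; l++)            /* real inner product */
                s += a[i + l * m] * b[l + j * k];
            /* scale by complex alpha and accumulate into complex C */
            c[i + j * ldc] += (alpha_r + alpha_i * I) * s;
        }
}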