
gemm_kernel_2x4_core2.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
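
/*
 * gemm_kernel_2x4_core2.S: double-precision GEMM/TRMM micro-kernel with a
 * 2x4 register tile, tuned for Core2-class 32-bit x86 (SSE3).  It computes
 * C += alpha * A * B over packed panels of A and B; when TRMMKERNEL is
 * defined, the OFFSET/KK/KKK bookkeeping below restricts the k-range for
 * the triangular-multiply variant.
 *
 * Reference sketch (added as documentation, not part of the original
 * source): ignoring TRMM, remainder paths, and prefetching, the main
 * path below is equivalent to this C loop, where a_packed and b_packed
 * stand for the packed panel buffers (illustrative names only):
 *
 *     for (j = 0; j < n; j += 4)              // .L01: 4-column panels
 *       for (i = 0; i < m; i += 2) {          // .L11: 2-row tiles
 *         double t[2][4] = {{0.0}};
 *         for (l = 0; l < k; l++)             // .L12 / .L16: k loop
 *           for (jj = 0; jj < 4; jj++)
 *             for (ii = 0; ii < 2; ii++)
 *               t[ii][jj] += a_packed[2 * l + ii] * b_packed[4 * l + jj];
 *         for (jj = 0; jj < 4; jj++)          // .L18: scale, write back
 *           for (ii = 0; ii < 2; ii++)
 *             c[(j + jj) * ldc + i + ii] += alpha * t[ii][jj];
 *       }
 */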
#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 16

#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA	16 + STACK + ARGS(%esp)
#define A	24 + STACK + ARGS(%esp)
#define ARG_B	28 + STACK + ARGS(%esp)
#define C	32 + STACK + ARGS(%esp)
#define ARG_LDC	36 + STACK + ARGS(%esp)
#define OFFSET	40 + STACK + ARGS(%esp)

#define J	 0 + STACK(%esp)
#define BX	 4 + STACK(%esp)
#define KK	 8 + STACK(%esp)
#define KKK	12 + STACK(%esp)

#define PREFETCH_R   (8 * 4)
#define PREFETCHSIZE (8 * 21 + 4)
#define PREFETCH     prefetcht0

#define AA %edx
#define BB %ecx
#define LDC %ebp
#define B %edi
#define C1 %esi
#define I %ebx
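
/*
 * AA/BB walk the packed A and B panels, C1 points at the current block of
 * C, and I/J count the remaining row/column tiles.  A and B are biased by
 * +16 * SIZE in the prologue so the inner loops can address with small
 * negative displacements; KK/KKK carry the TRMM k-offset bookkeeping.
 */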
PROLOGUE

	subl $ARGS, %esp	# Generate Stack Frame

	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx

	PROFCODE

	movl ARG_B, B
	movl ARG_LDC, LDC

#ifdef TRMMKERNEL
	movl OFFSET, %eax
#ifndef LEFT
	negl %eax
#endif
	movl %eax, KK
#endif

	subl $-16 * SIZE, A
	subl $-16 * SIZE, B

	leal (, LDC, SIZE), LDC

	movl N, %eax
	sarl $2, %eax
	movl %eax, J
	jle .L30
	ALIGN_4
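
/* .L01: outer loop over N/4 panels of four columns of C; J holds the
   remaining panel count. */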
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
	movl OFFSET, %eax
	movl %eax, KK
#endif

	movl B, BX

	movl C, C1
	movl A, AA

	movl M, I
	sarl $1, I
	jle .L20
	ALIGN_4
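
/* .L11: 2x4 tile loop over M/2.  xmm4-xmm7 accumulate the eight dot
   products while the next packed B panel (via BX) and the four target C
   rows are prefetched. */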
.L11:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl B, BB
#else
	movl B, BB
	movl KK, %eax
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 4), BB
#endif

	movl BX, %eax
	prefetcht2 (PREFETCH_R + 0) * SIZE(%eax)
	prefetcht2 (PREFETCH_R + 8) * SIZE(%eax)
	subl $-8 * SIZE, BX

	leal (C1, LDC, 2), %eax

	movaps -16 * SIZE(AA), %xmm0
	pxor %xmm2, %xmm2
	movaps -16 * SIZE(BB), %xmm1
	pxor %xmm3, %xmm3

	pxor %xmm4, %xmm4
	prefetcht0 1 * SIZE(C1)
	pxor %xmm5, %xmm5
	prefetcht0 1 * SIZE(C1, LDC)
	pxor %xmm6, %xmm6
	prefetcht0 1 * SIZE(%eax)
	pxor %xmm7, %xmm7
	prefetcht0 1 * SIZE(%eax, LDC)

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $2, %eax
#else
	addl $4, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L15
	ALIGN_4
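
/* .L12: main k-loop, unrolled 8x.  Each step multiplies an A pair
   {a0,a1} by two B pairs; pshufd $0x4e (a Core2-friendly stand-in for
   the SHUFPD_1 macro, kept below as comments) swaps the A pair so the
   cross products a1*b / a0*b accumulate into xmm6/xmm7, to be
   deinterleaved at .L18. */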
.L12:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	addpd %xmm2, %xmm6
	movapd %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	addpd %xmm1, %xmm4
	movaps -14 * SIZE(BB), %xmm1
	addpd %xmm3, %xmm7
	movapd %xmm1, %xmm3
	mulpd %xmm0, %xmm1
//	SHUFPD_1 %xmm0, %xmm0
	pshufd $0x4e, %xmm0, %xmm0
	mulpd %xmm0, %xmm2
	mulpd %xmm0, %xmm3
	movaps -14 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps -12 * SIZE(BB), %xmm1

	addpd %xmm2, %xmm6
	movapd %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	addpd %xmm1, %xmm4
	movaps -10 * SIZE(BB), %xmm1
	addpd %xmm3, %xmm7
	movapd %xmm1, %xmm3
	mulpd %xmm0, %xmm1
//	SHUFPD_1 %xmm0, %xmm0
	pshufd $0x4e, %xmm0, %xmm0
	mulpd %xmm0, %xmm2
	mulpd %xmm0, %xmm3
	movaps -12 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps -8 * SIZE(BB), %xmm1

	addpd %xmm2, %xmm6
	movapd %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	addpd %xmm1, %xmm4
	movaps -6 * SIZE(BB), %xmm1
	addpd %xmm3, %xmm7
	movapd %xmm1, %xmm3
	mulpd %xmm0, %xmm1
//	SHUFPD_1 %xmm0, %xmm0
	pshufd $0x4e, %xmm0, %xmm0
	mulpd %xmm0, %xmm2
	mulpd %xmm0, %xmm3
	movaps -10 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps -4 * SIZE(BB), %xmm1

	addpd %xmm2, %xmm6
	movapd %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	addpd %xmm1, %xmm4
	movaps -2 * SIZE(BB), %xmm1
	addpd %xmm3, %xmm7
	movapd %xmm1, %xmm3
	mulpd %xmm0, %xmm1
//	SHUFPD_1 %xmm0, %xmm0
	pshufd $0x4e, %xmm0, %xmm0
	mulpd %xmm0, %xmm2
	mulpd %xmm0, %xmm3
	movaps -8 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps 0 * SIZE(BB), %xmm1

	PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
	addpd %xmm2, %xmm6
	movapd %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	addpd %xmm1, %xmm4
	movaps 2 * SIZE(BB), %xmm1
	addpd %xmm3, %xmm7
	movapd %xmm1, %xmm3
	mulpd %xmm0, %xmm1
//	SHUFPD_1 %xmm0, %xmm0
	pshufd $0x4e, %xmm0, %xmm0
	mulpd %xmm0, %xmm2
	mulpd %xmm0, %xmm3
	movaps -6 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps 4 * SIZE(BB), %xmm1

	addpd %xmm2, %xmm6
	movapd %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	addpd %xmm1, %xmm4
	movaps 6 * SIZE(BB), %xmm1
	addpd %xmm3, %xmm7
	movapd %xmm1, %xmm3
	mulpd %xmm0, %xmm1
//	SHUFPD_1 %xmm0, %xmm0
	pshufd $0x4e, %xmm0, %xmm0
	mulpd %xmm0, %xmm2
	mulpd %xmm0, %xmm3
	movaps -4 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps 8 * SIZE(BB), %xmm1

	addpd %xmm2, %xmm6
	movapd %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	addpd %xmm1, %xmm4
	movaps 10 * SIZE(BB), %xmm1
	addpd %xmm3, %xmm7
	movapd %xmm1, %xmm3
	mulpd %xmm0, %xmm1
//	SHUFPD_1 %xmm0, %xmm0
	pshufd $0x4e, %xmm0, %xmm0
	mulpd %xmm0, %xmm2
	mulpd %xmm0, %xmm3
	movaps -2 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps 12 * SIZE(BB), %xmm1

	addpd %xmm2, %xmm6
	movapd %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	addpd %xmm1, %xmm4
	movaps 14 * SIZE(BB), %xmm1
	addpd %xmm3, %xmm7
	movapd %xmm1, %xmm3
	mulpd %xmm0, %xmm1
//	SHUFPD_1 %xmm0, %xmm0
	pshufd $0x4e, %xmm0, %xmm0
	mulpd %xmm0, %xmm2
	mulpd %xmm0, %xmm3
	movaps 0 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps 16 * SIZE(BB), %xmm1

	subl $-32 * SIZE, BB
	subl $-16 * SIZE, AA
	subl $1, %eax
	BRANCH
	jne .L12
	ALIGN_4
.L15:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	andl $7, %eax
	BRANCH
	je .L18
	ALIGN_4
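
/* .L16: handle the remaining k % 8 iterations one at a time, using the
   same multiply/swap pattern as .L12. */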
.L16:
	addpd %xmm2, %xmm6
	movapd %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	addpd %xmm1, %xmm4
	movaps -14 * SIZE(BB), %xmm1
	addpd %xmm3, %xmm7
	movapd %xmm1, %xmm3
	mulpd %xmm0, %xmm1
	SHUFPD_1 %xmm0, %xmm0
	mulpd %xmm0, %xmm2
	mulpd %xmm0, %xmm3
	movaps -14 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps -12 * SIZE(BB), %xmm1

	addl $2 * SIZE, AA
	addl $4 * SIZE, BB
	decl %eax
	jg .L16
	ALIGN_4
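
/* .L18: fold in the last pending products, deinterleave the accumulators
   with unpcklpd/unpckhpd, scale by alpha (broadcast via movddup), add the
   existing C values (GEMM path only), and store the 2x4 block.  The TRMM
   variants then step AA/BB past the unused tail of the panel. */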
.L18:
	addpd %xmm2, %xmm6
	addpd %xmm3, %xmm7

	movddup ALPHA, %xmm3

	movaps %xmm4, %xmm0
	unpcklpd %xmm6, %xmm4
	unpckhpd %xmm0, %xmm6

	movaps %xmm5, %xmm1
	unpcklpd %xmm7, %xmm5
	unpckhpd %xmm1, %xmm7

	mulpd %xmm3, %xmm4
	mulpd %xmm3, %xmm5
	mulpd %xmm3, %xmm6
	mulpd %xmm3, %xmm7

	leal (C1, LDC, 2), %eax

#ifndef TRMMKERNEL
	movsd 0 * SIZE(C1), %xmm0
	movhpd 1 * SIZE(C1), %xmm0
	movsd 0 * SIZE(C1, LDC), %xmm1
	movhpd 1 * SIZE(C1, LDC), %xmm1
	movsd 0 * SIZE(%eax), %xmm2
	movhpd 1 * SIZE(%eax), %xmm2
	movsd 0 * SIZE(%eax, LDC), %xmm3
	movhpd 1 * SIZE(%eax, LDC), %xmm3

	addpd %xmm0, %xmm4
	addpd %xmm1, %xmm6
	addpd %xmm2, %xmm5
	addpd %xmm3, %xmm7
#endif

	movsd %xmm4, 0 * SIZE(C1)
	movhpd %xmm4, 1 * SIZE(C1)
	movsd %xmm6, 0 * SIZE(C1, LDC)
	movhpd %xmm6, 1 * SIZE(C1, LDC)
	movsd %xmm5, 0 * SIZE(%eax)
	movhpd %xmm5, 1 * SIZE(%eax)
	movsd %xmm7, 0 * SIZE(%eax, LDC)
	movhpd %xmm7, 1 * SIZE(%eax, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $2, KK
#endif

	addl $2 * SIZE, C1
	decl I
	jg .L11
	ALIGN_4
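
/* .L20: M remainder, one row of A against the 4-column panel (1x4 tile). */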
.L20:
	movl M, I
	testl $1, I
	jle .L29

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl B, BB
#else
	movl B, BB
	movl KK, %eax
	leal (, %eax, SIZE), %eax
	addl %eax, AA
	leal (BB, %eax, 4), BB
#endif

	movaps -16 * SIZE(AA), %xmm0
	pxor %xmm4, %xmm4
	movaps -16 * SIZE(BB), %xmm2
	pxor %xmm5, %xmm5
	movaps -14 * SIZE(BB), %xmm3
	pxor %xmm6, %xmm6
	pxor %xmm7, %xmm7

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $1, %eax
#else
	addl $4, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L25
	ALIGN_4

.L22:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	pshufd $0x44, %xmm0, %xmm1
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm4
	movaps -12 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm5
	movaps -10 * SIZE(BB), %xmm3
	pshufd $0xee, %xmm0, %xmm1
	movaps -14 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm6
	movaps -8 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm7
	movaps -6 * SIZE(BB), %xmm3

	pshufd $0x44, %xmm0, %xmm1
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm4
	movaps -4 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm5
	movaps -2 * SIZE(BB), %xmm3
	pshufd $0xee, %xmm0, %xmm1
	movaps -12 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm6
	movaps 0 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm7
	movaps 2 * SIZE(BB), %xmm3

	pshufd $0x44, %xmm0, %xmm1
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm4
	movaps 4 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm5
	movaps 6 * SIZE(BB), %xmm3
	pshufd $0xee, %xmm0, %xmm1
	movaps -10 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm6
	movaps 8 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm7
	movaps 10 * SIZE(BB), %xmm3

	pshufd $0x44, %xmm0, %xmm1
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm4
	movaps 12 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm5
	movaps 14 * SIZE(BB), %xmm3
	pshufd $0xee, %xmm0, %xmm1
	movaps -8 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm6
	movaps 16 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm7
	movaps 18 * SIZE(BB), %xmm3

	subl $-8 * SIZE, AA
	subl $-32 * SIZE, BB
	subl $1, %eax
	jne .L22
	ALIGN_4

.L25:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	andl $7, %eax
	BRANCH
	je .L28
	ALIGN_4

.L26:
	pshufd $0x44, %xmm0, %xmm1
	movsd -15 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm4
	movaps -12 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm5
	movaps -10 * SIZE(BB), %xmm3

	addl $1 * SIZE, AA
	addl $4 * SIZE, BB
	decl %eax
	jg .L26
	ALIGN_4

.L28:
	movddup ALPHA, %xmm3

	addpd %xmm6, %xmm4
	addpd %xmm7, %xmm5

	leal (C1, LDC, 2), %eax

#ifndef TRMMKERNEL
	movsd 0 * SIZE(C1), %xmm0
	movhpd 0 * SIZE(C1, LDC), %xmm0
	movsd 0 * SIZE(%eax), %xmm1
	movhpd 0 * SIZE(%eax, LDC), %xmm1
#endif

	mulpd %xmm3, %xmm4
	mulpd %xmm3, %xmm5

#ifndef TRMMKERNEL
	addpd %xmm0, %xmm4
	addpd %xmm1, %xmm5
#endif

	movsd %xmm4, 0 * SIZE(C1)
	movhpd %xmm4, 0 * SIZE(C1, LDC)
	movsd %xmm5, 0 * SIZE(%eax)
	movhpd %xmm5, 0 * SIZE(%eax, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, SIZE), %eax
	addl %eax, AA
	leal (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $1, KK
#endif
	ALIGN_4
.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl $4, KK
#endif

	movl BB, B

	leal (, LDC, 4), %eax
	addl %eax, C
	decl J
	jg .L01
	ALIGN_4
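
/* .L30: N & 2 path, the same structure for a two-column panel: 2x2 tiles,
   then a 1x2 remainder at .L40. */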
.L30:
	movl N, %eax
	testl $2, %eax
	jle .L50

#if defined(TRMMKERNEL) && defined(LEFT)
	movl OFFSET, %eax
	movl %eax, KK
#endif

	movl C, C1
	movl A, AA

	movl M, I
	sarl $1, I
	jle .L40
	ALIGN_4

.L31:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl B, BB
#else
	movl B, BB
	movl KK, %eax
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 2), BB
#endif

	movaps -16 * SIZE(AA), %xmm0
	pxor %xmm4, %xmm4
	movaps -16 * SIZE(BB), %xmm1
	pxor %xmm5, %xmm5
	prefetcht0 1 * SIZE(C1)
	pxor %xmm6, %xmm6
	prefetcht0 1 * SIZE(C1, LDC)
	pxor %xmm7, %xmm7

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $2, %eax
#else
	addl $2, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L35
	ALIGN_4

.L32:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps -14 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps -14 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4

	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps -12 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm7
	movaps -12 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm6

	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps -10 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps -10 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4

	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps -8 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm7
	movaps -8 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm6

	PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps -6 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps -6 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4

	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps -4 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm7
	movaps -4 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm6

	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps -2 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps -2 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4

	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps 0 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm7
	movaps 0 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm6

	subl $-16 * SIZE, AA
	subl $-16 * SIZE, BB
	subl $1, %eax
	jne .L32
	ALIGN_4

.L35:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	andl $7, %eax
	BRANCH
	je .L38
	ALIGN_4

.L36:
	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps -14 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps -14 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4

	addl $2 * SIZE, AA
	addl $2 * SIZE, BB
	decl %eax
	jg .L36
	ALIGN_4

.L38:
	movddup ALPHA, %xmm3

	addpd %xmm6, %xmm4
	addpd %xmm7, %xmm5

	movaps %xmm4, %xmm0
	movsd %xmm5, %xmm4
	mulpd %xmm3, %xmm4
	movsd %xmm0, %xmm5
	mulpd %xmm3, %xmm5

#ifndef TRMMKERNEL
	movsd 0 * SIZE(C1), %xmm0
	movhpd 1 * SIZE(C1), %xmm0
	movsd 0 * SIZE(C1, LDC), %xmm1
	movhpd 1 * SIZE(C1, LDC), %xmm1

	addpd %xmm0, %xmm4
	addpd %xmm1, %xmm5
#endif

	movsd %xmm4, 0 * SIZE(C1)
	movhpd %xmm4, 1 * SIZE(C1)
	movsd %xmm5, 0 * SIZE(C1, LDC)
	movhpd %xmm5, 1 * SIZE(C1, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $2, KK
#endif

	addl $2 * SIZE, C1
	decl I
	jg .L31
	ALIGN_4
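
/* .L40: M remainder within the two-column panel (1x2 tile). */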
.L40:
	movl M, I
	testl $1, I
	jle .L49

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl B, BB
#else
	movl B, BB
	movl KK, %eax
	leal (, %eax, SIZE), %eax
	addl %eax, AA
	leal (BB, %eax, 2), BB
#endif

	movaps -16 * SIZE(AA), %xmm0
	pxor %xmm4, %xmm4
	movaps -16 * SIZE(BB), %xmm2
	pxor %xmm5, %xmm5

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $1, %eax
#else
	addl $2, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L45
	ALIGN_4

.L42:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	pshufd $0x44, %xmm0, %xmm1
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm4
	movaps -14 * SIZE(BB), %xmm2
	pshufd $0xee, %xmm0, %xmm1
	movaps -14 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm5
	movaps -12 * SIZE(BB), %xmm2

	pshufd $0x44, %xmm0, %xmm1
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm4
	movaps -10 * SIZE(BB), %xmm2
	pshufd $0xee, %xmm0, %xmm1
	movaps -12 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm5
	movaps -8 * SIZE(BB), %xmm2

	pshufd $0x44, %xmm0, %xmm1
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm4
	movaps -6 * SIZE(BB), %xmm2
	pshufd $0xee, %xmm0, %xmm1
	movaps -10 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm5
	movaps -4 * SIZE(BB), %xmm2

	pshufd $0x44, %xmm0, %xmm1
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm4
	movaps -2 * SIZE(BB), %xmm2
	pshufd $0xee, %xmm0, %xmm1
	movaps -8 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm5
	movaps 0 * SIZE(BB), %xmm2

	subl $-8 * SIZE, AA
	subl $-16 * SIZE, BB
	subl $1, %eax
	jne .L42
	ALIGN_4

.L45:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	andl $7, %eax
	BRANCH
	je .L48
	ALIGN_4

.L46:
	pshufd $0x44, %xmm0, %xmm1
	movsd -15 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm4
	movaps -14 * SIZE(BB), %xmm2

	addl $1 * SIZE, AA
	addl $2 * SIZE, BB
	decl %eax
	jg .L46
	ALIGN_4

.L48:
	movddup ALPHA, %xmm3

	addpd %xmm5, %xmm4

#ifndef TRMMKERNEL
	movsd 0 * SIZE(C1), %xmm0
	movhpd 0 * SIZE(C1, LDC), %xmm0
#endif

	mulpd %xmm3, %xmm4

#ifndef TRMMKERNEL
	addpd %xmm0, %xmm4
#endif

	movsd %xmm4, 0 * SIZE(C1)
	movhpd %xmm4, 0 * SIZE(C1, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, SIZE), %eax
	addl %eax, AA
	leal (BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $1, KK
#endif
	ALIGN_4

.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl $2, KK
#endif

	movl BB, B

	leal (, LDC, 2), %eax
	addl %eax, C
	ALIGN_4
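
/* .L50: N & 1 path, the last single column: 2x1 tiles, then a 1x1
   remainder at .L60. */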
.L50:
	movl N, %eax
	testl $1, %eax
	jle .L999

#if defined(TRMMKERNEL) && defined(LEFT)
	movl OFFSET, %eax
	movl %eax, KK
#endif

	movl C, C1
	movl A, AA

	movl M, I
	sarl $1, I
	jle .L60
	ALIGN_4

.L51:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl B, BB
#else
	movl B, BB
	movl KK, %eax
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	addl %eax, BB
#endif

	movaps -16 * SIZE(AA), %xmm0
	pxor %xmm4, %xmm4
	movaps -16 * SIZE(BB), %xmm1
	pxor %xmm5, %xmm5
	prefetcht0 1 * SIZE(C1)

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $2, %eax
#else
	addl $1, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L55
	ALIGN_4

.L52:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	pshufd $0x44, %xmm1, %xmm2
	mulpd %xmm0, %xmm2
	movaps -14 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm4
	pshufd $0xee, %xmm1, %xmm2
	movaps -14 * SIZE(BB), %xmm1
	mulpd %xmm0, %xmm2
	movaps -12 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm5

	pshufd $0x44, %xmm1, %xmm2
	mulpd %xmm0, %xmm2
	movaps -10 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm4
	pshufd $0xee, %xmm1, %xmm2
	movaps -12 * SIZE(BB), %xmm1
	mulpd %xmm0, %xmm2
	movaps -8 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm5

	PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
	pshufd $0x44, %xmm1, %xmm2
	mulpd %xmm0, %xmm2
	movaps -6 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm4
	pshufd $0xee, %xmm1, %xmm2
	movaps -10 * SIZE(BB), %xmm1
	mulpd %xmm0, %xmm2
	movaps -4 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm5

	pshufd $0x44, %xmm1, %xmm2
	mulpd %xmm0, %xmm2
	movaps -2 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm4
	pshufd $0xee, %xmm1, %xmm2
	movaps -8 * SIZE(BB), %xmm1
	mulpd %xmm0, %xmm2
	movaps 0 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm5

	subl $-16 * SIZE, AA
	subl $-8 * SIZE, BB
	subl $1, %eax
	jne .L52
	ALIGN_4

.L55:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	andl $7, %eax
	BRANCH
	je .L58
	ALIGN_4

.L56:
	pshufd $0x44, %xmm1, %xmm2
	movsd -15 * SIZE(BB), %xmm1
	mulpd %xmm0, %xmm2
	movaps -14 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm4

	addl $2 * SIZE, AA
	addl $1 * SIZE, BB
	decl %eax
	jg .L56
	ALIGN_4

.L58:
	movddup ALPHA, %xmm3

	addpd %xmm5, %xmm4
	mulpd %xmm3, %xmm4

#ifndef TRMMKERNEL
	movsd 0 * SIZE(C1), %xmm0
	movhpd 1 * SIZE(C1), %xmm0

	addpd %xmm0, %xmm4
#endif

	movsd %xmm4, 0 * SIZE(C1)
	movhpd %xmm4, 1 * SIZE(C1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	addl %eax, BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $2, KK
#endif

	addl $2 * SIZE, C1
	decl I
	jg .L51
	ALIGN_4
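
/* .L60: 1x1 remainder, a plain dot product reduced with haddpd. */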
.L60:
	movl M, I
	testl $1, I
	jle .L999

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl B, BB
#else
	movl B, BB
	movl KK, %eax
	leal (, %eax, SIZE), %eax
	addl %eax, AA
	addl %eax, BB
#endif

	movaps -16 * SIZE(AA), %xmm0
	pxor %xmm4, %xmm4
	movaps -16 * SIZE(BB), %xmm2
	pxor %xmm5, %xmm5

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $1, %eax
#else
	addl $1, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L65
	ALIGN_4

.L62:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	mulpd %xmm0, %xmm2
	movaps -14 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm4
	movaps -14 * SIZE(BB), %xmm2

	mulpd %xmm0, %xmm2
	movaps -12 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm5
	movaps -12 * SIZE(BB), %xmm2

	mulpd %xmm0, %xmm2
	movaps -10 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm4
	movaps -10 * SIZE(BB), %xmm2

	mulpd %xmm0, %xmm2
	movaps -8 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm5
	movaps -8 * SIZE(BB), %xmm2

	subl $-8 * SIZE, AA
	subl $-8 * SIZE, BB
	subl $1, %eax
	jne .L62
	ALIGN_4

.L65:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	andl $7, %eax
	BRANCH
	je .L68
	ALIGN_4

.L66:
	mulsd %xmm0, %xmm2
	movsd -15 * SIZE(AA), %xmm0
	addsd %xmm2, %xmm4
	movsd -15 * SIZE(BB), %xmm2

	addl $1 * SIZE, AA
	addl $1 * SIZE, BB
	decl %eax
	jg .L66
	ALIGN_4

.L68:
	movddup ALPHA, %xmm3

	addpd %xmm5, %xmm4
	haddpd %xmm4, %xmm4

#ifndef TRMMKERNEL
	movsd 0 * SIZE(C1), %xmm0
#endif

	mulsd %xmm3, %xmm4

#ifndef TRMMKERNEL
	addsd %xmm0, %xmm4
#endif

	movsd %xmm4, 0 * SIZE(C1)
	ALIGN_4
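
/* .L999: restore the callee-saved registers and the stack frame, then
   return. */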
.L999:
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp

	addl $ARGS, %esp
	ret

EPILOGUE