
zgemm3m_kernel_2x4_penryn.S 25 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 16

#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
#define ALPHA 16 + STACK + ARGS(%esp)
#define A 32 + STACK + ARGS(%esp)
#define ARG_B 36 + STACK + ARGS(%esp)
#define C 40 + STACK + ARGS(%esp)
#define ARG_LDC 44 + STACK + ARGS(%esp)

#define J 0 + STACK(%esp)
#define BX 4 + STACK(%esp)
#define KK 8 + STACK(%esp)
#define KKK 12 + STACK(%esp)

#define AA %edx
#define BB %ecx
#define LDC %ebp
#define B %edi
#define C1 %esi
#define I %ebx

#ifdef NANO
#define PREFETCHSIZE (8 * 3 + 4)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif

#ifndef PREFETCH
#define PREFETCH prefetcht0
#endif

#ifndef PREFETCHW
#define PREFETCHW prefetcht2
#endif

#ifndef PREFETCHB
#define PREFETCHB prefetcht2
#endif

#ifndef PREFETCHSIZE
#define PREFETCHSIZE (8 * 21 + 4)
#endif
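
/*
 * Control-flow map (labels defined below):
 *   .L01  loop over 4-column panels of C (N / 4)
 *         .L11  2 (M) x 4 (N) blocks; K loop .L12, tail .L16, store .L18
 *         .L20  1 x 4 block for the M remainder
 *   .L30  2-column panel (N & 2): .L31 2 x 2 blocks, .L40 1 x 2 block
 *   .L50  1-column panel (N & 1): .L51 2 x 1 blocks, .L60 1 x 1 block
 * Each block accumulates in xmm registers, then scales the result by
 * ALPHA and adds it into C.
 */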
        PROLOGUE

        subl $ARGS, %esp        # Generate Stack Frame

        pushl %ebp
        pushl %edi
        pushl %esi
        pushl %ebx

        PROFCODE

        movl ARG_B, B
        movl ARG_LDC, LDC

#ifdef TRMMKERNEL
        movl OFFSET, %eax
#ifndef LEFT
        negl %eax
#endif
        movl %eax, KK
#endif

        subl $-16 * SIZE, A
        subl $-16 * SIZE, B

        sall $ZBASE_SHIFT, LDC

        movl N, %eax
        sarl $2, %eax
        movl %eax, J
        jle .L30
        ALIGN_4

.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
        movl OFFSET, %eax
        movl %eax, KK
#endif

        movl K, %eax
        sall $BASE_SHIFT + 2, %eax
        leal (B, %eax), %eax
        movl %eax, BX

        movl C, C1
        movl A, AA

        movl M, I
        sarl $1, I
        jle .L20
        ALIGN_4
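
/* .L11: one 2 (M) x 4 (N) block of C per iteration */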
.L11:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl B, BB
#else
        movl B, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 4), BB
#endif

        movl BX, %eax
        PREFETCHB -16 * SIZE(%eax)
        subl $-8 * SIZE, BX

        leal (C1, LDC, 2), %eax

        movaps -16 * SIZE(AA), %xmm0
        pxor %xmm2, %xmm2
        movaps -16 * SIZE(BB), %xmm1
        pxor %xmm3, %xmm3

        pxor %xmm4, %xmm4
        PREFETCHW 1 * SIZE(C1)
        pxor %xmm5, %xmm5
        PREFETCHW 1 * SIZE(C1, LDC)
        pxor %xmm6, %xmm6
        PREFETCHW 1 * SIZE(%eax)
        pxor %xmm7, %xmm7
        PREFETCHW 1 * SIZE(%eax, LDC)

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $2, %eax
#else
        addl $4, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L15
        ALIGN_4
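
/* .L12: main K loop for the 2x4 block, unrolled 8 iterations deep */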
.L12:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        addpd %xmm3, %xmm7
        movaps -14 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps -12 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        mulpd %xmm0, %xmm2
        movaps -14 * SIZE(AA), %xmm0
        addpd %xmm3, %xmm7
        movaps -10 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps -8 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        mulpd %xmm0, %xmm2
        movaps -12 * SIZE(AA), %xmm0
        addpd %xmm3, %xmm7
        movaps -6 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps -4 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        mulpd %xmm0, %xmm2
        movaps -10 * SIZE(AA), %xmm0
        addpd %xmm3, %xmm7
        movaps -2 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps 0 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        mulpd %xmm0, %xmm2
        movaps -8 * SIZE(AA), %xmm0
        PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
        addpd %xmm3, %xmm7
        movaps 2 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps 4 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        mulpd %xmm0, %xmm2
        movaps -6 * SIZE(AA), %xmm0
        addpd %xmm3, %xmm7
        movaps 6 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps 8 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        mulpd %xmm0, %xmm2
        movaps -4 * SIZE(AA), %xmm0
        addpd %xmm3, %xmm7
        movaps 10 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps 12 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        mulpd %xmm0, %xmm2
        movaps -2 * SIZE(AA), %xmm0
        addpd %xmm3, %xmm7
        movaps 14 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps 16 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        subl $-32 * SIZE, BB
        mulpd %xmm0, %xmm2
        movaps 0 * SIZE(AA), %xmm0
        subl $-16 * SIZE, AA

        subl $1, %eax
        BRANCH
        jne .L12
        ALIGN_4

.L15:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L18
        ALIGN_4
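
/* .L16: K remainder (K & 7) for the 2x4 block */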
.L16:
        addpd %xmm3, %xmm7
        movaps -14 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps -12 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        mulpd %xmm0, %xmm2
        movaps -14 * SIZE(AA), %xmm0

        addl $2 * SIZE, AA
        addl $4 * SIZE, BB
        decl %eax
        jg .L16
        ALIGN_4
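
/* .L18: scale the accumulated 2x4 block by ALPHA and add it into C */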
.L18:
        addpd %xmm2, %xmm6
        addpd %xmm3, %xmm7

        movups ALPHA, %xmm3

        movaps %xmm4, %xmm0
        movsd %xmm5, %xmm4
        movsd %xmm0, %xmm5

        movaps %xmm6, %xmm0
        movsd %xmm7, %xmm6
        movsd %xmm0, %xmm7

        leal (C1, LDC, 2), %eax

        movsd 0 * SIZE(C1), %xmm0
        movhps 1 * SIZE(C1), %xmm0
        movsd 2 * SIZE(C1), %xmm1
        movhps 3 * SIZE(C1), %xmm1

        pshufd $0x44, %xmm4, %xmm2
        unpckhpd %xmm4, %xmm4

        mulpd %xmm3, %xmm2
        addpd %xmm2, %xmm0
        mulpd %xmm3, %xmm4
        addpd %xmm4, %xmm1

        movlps %xmm0, 0 * SIZE(C1)
        movhps %xmm0, 1 * SIZE(C1)
        movlps %xmm1, 2 * SIZE(C1)
        movhps %xmm1, 3 * SIZE(C1)

        movsd 0 * SIZE(C1, LDC), %xmm0
        movhps 1 * SIZE(C1, LDC), %xmm0
        movsd 2 * SIZE(C1, LDC), %xmm1
        movhps 3 * SIZE(C1, LDC), %xmm1

        pshufd $0x44, %xmm5, %xmm2
        unpckhpd %xmm5, %xmm5

        mulpd %xmm3, %xmm2
        addpd %xmm2, %xmm0
        mulpd %xmm3, %xmm5
        addpd %xmm5, %xmm1

        movlps %xmm0, 0 * SIZE(C1, LDC)
        movhps %xmm0, 1 * SIZE(C1, LDC)
        movlps %xmm1, 2 * SIZE(C1, LDC)
        movhps %xmm1, 3 * SIZE(C1, LDC)

        movsd 0 * SIZE(%eax), %xmm0
        movhps 1 * SIZE(%eax), %xmm0
        movsd 2 * SIZE(%eax), %xmm1
        movhps 3 * SIZE(%eax), %xmm1

        pshufd $0x44, %xmm6, %xmm2
        unpckhpd %xmm6, %xmm6

        mulpd %xmm3, %xmm2
        addpd %xmm2, %xmm0
        mulpd %xmm3, %xmm6
        addpd %xmm6, %xmm1

        movlps %xmm0, 0 * SIZE(%eax)
        movhps %xmm0, 1 * SIZE(%eax)
        movlps %xmm1, 2 * SIZE(%eax)
        movhps %xmm1, 3 * SIZE(%eax)

        movsd 0 * SIZE(%eax, LDC), %xmm0
        movhps 1 * SIZE(%eax, LDC), %xmm0
        movsd 2 * SIZE(%eax, LDC), %xmm1
        movhps 3 * SIZE(%eax, LDC), %xmm1

        pshufd $0x44, %xmm7, %xmm2
        unpckhpd %xmm7, %xmm7

        mulpd %xmm3, %xmm2
        addpd %xmm2, %xmm0
        mulpd %xmm3, %xmm7
        addpd %xmm7, %xmm1

        movlps %xmm0, 0 * SIZE(%eax, LDC)
        movhps %xmm0, 1 * SIZE(%eax, LDC)
        movlps %xmm1, 2 * SIZE(%eax, LDC)
        movhps %xmm1, 3 * SIZE(%eax, LDC)

        addl $4 * SIZE, C1
        decl I
        jg .L11
        ALIGN_4
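
/* .L20: M remainder, a 1x4 block against the same 4-column panel */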
.L20:
        movl M, I
        testl $1, I
        jle .L29

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl B, BB
#else
        movl B, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        addl %eax, AA
        leal (BB, %eax, 4), BB
#endif

        movaps -16 * SIZE(AA), %xmm0
        pxor %xmm4, %xmm4
        movaps -16 * SIZE(BB), %xmm2
        pxor %xmm5, %xmm5
        movaps -14 * SIZE(BB), %xmm3
        pxor %xmm6, %xmm6
        pxor %xmm7, %xmm7

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $1, %eax
#else
        addl $4, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L25
        ALIGN_4

.L22:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        pshufd $0x44, %xmm0, %xmm1
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm4
        movaps -12 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm5
        movaps -10 * SIZE(BB), %xmm3
        pshufd $0xee, %xmm0, %xmm1
        movaps -14 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm6
        movaps -8 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm7
        movaps -6 * SIZE(BB), %xmm3
        pshufd $0x44, %xmm0, %xmm1
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm4
        movaps -4 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm5
        movaps -2 * SIZE(BB), %xmm3
        pshufd $0xee, %xmm0, %xmm1
        movaps -12 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm6
        movaps 0 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm7
        movaps 2 * SIZE(BB), %xmm3
        pshufd $0x44, %xmm0, %xmm1
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm4
        movaps 4 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm5
        movaps 6 * SIZE(BB), %xmm3
        pshufd $0xee, %xmm0, %xmm1
        movaps -10 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm6
        movaps 8 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm7
        movaps 10 * SIZE(BB), %xmm3
        pshufd $0x44, %xmm0, %xmm1
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm4
        movaps 12 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm5
        movaps 14 * SIZE(BB), %xmm3
        pshufd $0xee, %xmm0, %xmm1
        movaps -8 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm6
        movaps 16 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm7
        movaps 18 * SIZE(BB), %xmm3

        subl $ -8 * SIZE, AA
        subl $-32 * SIZE, BB
        subl $1, %eax
        jne .L22
        ALIGN_4

.L25:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L28
        ALIGN_4

.L26:
        pshufd $0x44, %xmm0, %xmm1
        movsd -15 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm4
        movaps -12 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm5
        movaps -10 * SIZE(BB), %xmm3

        addl $1 * SIZE, AA
        addl $4 * SIZE, BB
        decl %eax
        jg .L26
        ALIGN_4

.L28:
        movups ALPHA, %xmm3

        addpd %xmm6, %xmm4
        addpd %xmm7, %xmm5

        leal (C1, LDC, 2), %eax

        movsd 0 * SIZE(C1), %xmm0
        movhps 1 * SIZE(C1), %xmm0
        movsd 0 * SIZE(C1, LDC), %xmm1
        movhps 1 * SIZE(C1, LDC), %xmm1

        pshufd $0x44, %xmm4, %xmm2
        unpckhpd %xmm4, %xmm4

        mulpd %xmm3, %xmm2
        addpd %xmm2, %xmm0
        mulpd %xmm3, %xmm4
        addpd %xmm4, %xmm1

        movlps %xmm0, 0 * SIZE(C1)
        movhps %xmm0, 1 * SIZE(C1)
        movlps %xmm1, 0 * SIZE(C1, LDC)
        movhps %xmm1, 1 * SIZE(C1, LDC)

        movsd 0 * SIZE(%eax), %xmm0
        movhps 1 * SIZE(%eax), %xmm0
        movsd 0 * SIZE(%eax, LDC), %xmm1
        movhps 1 * SIZE(%eax, LDC), %xmm1

        pshufd $0x44, %xmm5, %xmm2
        unpckhpd %xmm5, %xmm5

        mulpd %xmm3, %xmm2
        addpd %xmm2, %xmm0
        mulpd %xmm3, %xmm5
        addpd %xmm5, %xmm1

        movlps %xmm0, 0 * SIZE(%eax)
        movhps %xmm0, 1 * SIZE(%eax)
        movlps %xmm1, 0 * SIZE(%eax, LDC)
        movhps %xmm1, 1 * SIZE(%eax, LDC)
        ALIGN_4

.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addl $4, KK
#endif

        movl BB, B

        leal (, LDC, 4), %eax
        addl %eax, C
        decl J
        jg .L01
        ALIGN_4
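
/* .L30: remaining 2-column panel of C (N & 2) */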
.L30:
        movl N, %eax
        testl $2, %eax
        jle .L50

#if defined(TRMMKERNEL) && defined(LEFT)
        movl OFFSET, %eax
        movl %eax, KK
#endif

        movl C, C1
        movl A, AA

        movl M, I
        sarl $1, I
        jle .L40
        ALIGN_4
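
/* .L31: 2x2 block; K loop .L32 unrolled by 8, tail .L36, store .L38 */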
.L31:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl B, BB
#else
        movl B, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 2), BB
#endif

        movaps -16 * SIZE(AA), %xmm0
        pxor %xmm4, %xmm4
        movaps -16 * SIZE(BB), %xmm1
        pxor %xmm5, %xmm5
        PREFETCHW 1 * SIZE(C1)
        pxor %xmm6, %xmm6
        PREFETCHW 1 * SIZE(C1, LDC)
        pxor %xmm7, %xmm7

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $2, %eax
#else
        addl $2, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L35
        ALIGN_4

.L32:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps -14 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm5
        movaps -14 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps -12 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm7
        movaps -12 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps -10 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm5
        movaps -10 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps -8 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm7
        movaps -8 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm6
        PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps -6 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm5
        movaps -6 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps -4 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm7
        movaps -4 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps -2 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm5
        movaps -2 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps 0 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm7
        movaps 0 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm6

        subl $-16 * SIZE, AA
        subl $-16 * SIZE, BB
        subl $1, %eax
        jne .L32
        ALIGN_4

.L35:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L38
        ALIGN_4

.L36:
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps -14 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm5
        movaps -14 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4

        addl $2 * SIZE, AA
        addl $2 * SIZE, BB
        decl %eax
        jg .L36
        ALIGN_4

.L38:
        movups ALPHA, %xmm3

        addpd %xmm6, %xmm4
        addpd %xmm7, %xmm5

        movaps %xmm4, %xmm0
        movsd %xmm5, %xmm4
        movsd %xmm0, %xmm5

        movsd 0 * SIZE(C1), %xmm0
        movhps 1 * SIZE(C1), %xmm0
        movsd 2 * SIZE(C1), %xmm1
        movhps 3 * SIZE(C1), %xmm1

        pshufd $0x44, %xmm4, %xmm2
        unpckhpd %xmm4, %xmm4

        mulpd %xmm3, %xmm2
        addpd %xmm2, %xmm0
        mulpd %xmm3, %xmm4
        addpd %xmm4, %xmm1

        movlps %xmm0, 0 * SIZE(C1)
        movhps %xmm0, 1 * SIZE(C1)
        movlps %xmm1, 2 * SIZE(C1)
        movhps %xmm1, 3 * SIZE(C1)

        movsd 0 * SIZE(C1, LDC), %xmm0
        movhps 1 * SIZE(C1, LDC), %xmm0
        movsd 2 * SIZE(C1, LDC), %xmm1
        movhps 3 * SIZE(C1, LDC), %xmm1

        pshufd $0x44, %xmm5, %xmm2
        unpckhpd %xmm5, %xmm5

        mulpd %xmm3, %xmm2
        addpd %xmm2, %xmm0
        mulpd %xmm3, %xmm5
        addpd %xmm5, %xmm1

        movlps %xmm0, 0 * SIZE(C1, LDC)
        movhps %xmm0, 1 * SIZE(C1, LDC)
        movlps %xmm1, 2 * SIZE(C1, LDC)
        movhps %xmm1, 3 * SIZE(C1, LDC)

        addl $4 * SIZE, C1
        decl I
        jg .L31
        ALIGN_4
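
/* .L40: M remainder for the 2-column panel (1x2 block) */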
.L40:
        movl M, I
        testl $1, I
        jle .L49

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl B, BB
#else
        movl B, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        addl %eax, AA
        leal (BB, %eax, 2), BB
#endif

        movaps -16 * SIZE(AA), %xmm0
        pxor %xmm4, %xmm4
        movaps -16 * SIZE(BB), %xmm2
        pxor %xmm5, %xmm5

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $1, %eax
#else
        addl $2, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L45
        ALIGN_4

.L42:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        pshufd $0x44, %xmm0, %xmm1
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm4
        movaps -14 * SIZE(BB), %xmm2
        pshufd $0xee, %xmm0, %xmm1
        movaps -14 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm5
        movaps -12 * SIZE(BB), %xmm2
        pshufd $0x44, %xmm0, %xmm1
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm4
        movaps -10 * SIZE(BB), %xmm2
        pshufd $0xee, %xmm0, %xmm1
        movaps -12 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm5
        movaps -8 * SIZE(BB), %xmm2
        pshufd $0x44, %xmm0, %xmm1
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm4
        movaps -6 * SIZE(BB), %xmm2
        pshufd $0xee, %xmm0, %xmm1
        movaps -10 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm5
        movaps -4 * SIZE(BB), %xmm2
        pshufd $0x44, %xmm0, %xmm1
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm4
        movaps -2 * SIZE(BB), %xmm2
        pshufd $0xee, %xmm0, %xmm1
        movaps -8 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm5
        movaps 0 * SIZE(BB), %xmm2

        subl $ -8 * SIZE, AA
        subl $-16 * SIZE, BB
        subl $1, %eax
        jne .L42
        ALIGN_4

.L45:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L48
        ALIGN_4

.L46:
        pshufd $0x44, %xmm0, %xmm1
        movsd -15 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm4
        movaps -14 * SIZE(BB), %xmm2

        addl $1 * SIZE, AA
        addl $2 * SIZE, BB
        decl %eax
        jg .L46
        ALIGN_4

.L48:
        movups ALPHA, %xmm3

        addpd %xmm5, %xmm4

        movsd 0 * SIZE(C1), %xmm0
        movhps 1 * SIZE(C1), %xmm0
        movsd 0 * SIZE(C1, LDC), %xmm1
        movhps 1 * SIZE(C1, LDC), %xmm1

        pshufd $0x44, %xmm4, %xmm2
        unpckhpd %xmm4, %xmm4

        mulpd %xmm3, %xmm2
        addpd %xmm2, %xmm0
        mulpd %xmm3, %xmm4
        addpd %xmm4, %xmm1

        movlps %xmm0, 0 * SIZE(C1)
        movhps %xmm0, 1 * SIZE(C1)
        movlps %xmm1, 0 * SIZE(C1, LDC)
        movhps %xmm1, 1 * SIZE(C1, LDC)
        ALIGN_4

.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addl $2, KK
#endif

        movl BB, B

        leal (, LDC, 2), %eax
        addl %eax, C
        ALIGN_4
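
/* .L50: remaining single column of C (N & 1) */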
.L50:
        movl N, %eax
        testl $1, %eax
        jle .L999

#if defined(TRMMKERNEL) && defined(LEFT)
        movl OFFSET, %eax
        movl %eax, KK
#endif

        movl C, C1
        movl A, AA

        movl M, I
        sarl $1, I
        jle .L60
        ALIGN_4
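
/* .L51: 2x1 block; K loop .L52 unrolled by 8, tail .L56, store .L58 */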
.L51:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl B, BB
#else
        movl B, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        addl %eax, BB
#endif

        movaps -16 * SIZE(AA), %xmm0
        pxor %xmm4, %xmm4
        movaps -16 * SIZE(BB), %xmm1
        pxor %xmm5, %xmm5
        PREFETCHW 1 * SIZE(C1)

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $2, %eax
#else
        addl $1, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L55
        ALIGN_4

.L52:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        pshufd $0x44, %xmm1, %xmm2
        mulpd %xmm0, %xmm2
        movaps -14 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm4
        pshufd $0xee, %xmm1, %xmm2
        movaps -14 * SIZE(BB), %xmm1
        mulpd %xmm0, %xmm2
        movaps -12 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm5
        pshufd $0x44, %xmm1, %xmm2
        mulpd %xmm0, %xmm2
        movaps -10 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm4
        pshufd $0xee, %xmm1, %xmm2
        movaps -12 * SIZE(BB), %xmm1
        mulpd %xmm0, %xmm2
        movaps -8 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm5
        PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
        pshufd $0x44, %xmm1, %xmm2
        mulpd %xmm0, %xmm2
        movaps -6 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm4
        pshufd $0xee, %xmm1, %xmm2
        movaps -10 * SIZE(BB), %xmm1
        mulpd %xmm0, %xmm2
        movaps -4 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm5
        pshufd $0x44, %xmm1, %xmm2
        mulpd %xmm0, %xmm2
        movaps -2 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm4
        pshufd $0xee, %xmm1, %xmm2
        movaps -8 * SIZE(BB), %xmm1
        mulpd %xmm0, %xmm2
        movaps 0 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm5

        subl $-16 * SIZE, AA
        subl $ -8 * SIZE, BB
        subl $1, %eax
        jne .L52
        ALIGN_4

.L55:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L58
        ALIGN_4

.L56:
        pshufd $0x44, %xmm1, %xmm2
        movsd -15 * SIZE(BB), %xmm1
        mulpd %xmm0, %xmm2
        movaps -14 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm4

        addl $2 * SIZE, AA
        addl $1 * SIZE, BB
        decl %eax
        jg .L56
        ALIGN_4

.L58:
        movups ALPHA, %xmm3

        addpd %xmm5, %xmm4

        movsd 0 * SIZE(C1), %xmm0
        movhps 1 * SIZE(C1), %xmm0
        movsd 2 * SIZE(C1), %xmm1
        movhps 3 * SIZE(C1), %xmm1

        pshufd $0x44, %xmm4, %xmm2
        unpckhpd %xmm4, %xmm4

        mulpd %xmm3, %xmm2
        addpd %xmm2, %xmm0
        mulpd %xmm3, %xmm4
        addpd %xmm4, %xmm1

        movlps %xmm0, 0 * SIZE(C1)
        movhps %xmm0, 1 * SIZE(C1)
        movlps %xmm1, 2 * SIZE(C1)
        movhps %xmm1, 3 * SIZE(C1)

        addl $4 * SIZE, C1
        decl I
        jg .L51
        ALIGN_4
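
/* .L60: final 1x1 block */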
.L60:
        movl M, I
        testl $1, I
        jle .L999

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl B, BB
#else
        movl B, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        addl %eax, AA
        addl %eax, BB
#endif

        movaps -16 * SIZE(AA), %xmm0
        pxor %xmm4, %xmm4
        movaps -16 * SIZE(BB), %xmm2
        pxor %xmm5, %xmm5

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $1, %eax
#else
        addl $1, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L65
        ALIGN_4

.L62:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        mulpd %xmm0, %xmm2
        movaps -14 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm4
        movaps -14 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        movaps -12 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm5
        movaps -12 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        movaps -10 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm4
        movaps -10 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        movaps -8 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm5
        movaps -8 * SIZE(BB), %xmm2

        subl $-8 * SIZE, AA
        subl $-8 * SIZE, BB
        subl $1, %eax
        jne .L62
        ALIGN_4

.L65:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L68
        ALIGN_4

.L66:
        mulsd %xmm0, %xmm2
        movsd -15 * SIZE(AA), %xmm0
        addsd %xmm2, %xmm4
        movsd -15 * SIZE(BB), %xmm2

        addl $1 * SIZE, AA
        addl $1 * SIZE, BB
        decl %eax
        jg .L66
        ALIGN_4

.L68:
        movups ALPHA, %xmm3

        addpd %xmm5, %xmm4
        haddpd %xmm4, %xmm4

        movsd 0 * SIZE(C1), %xmm0
        movhps 1 * SIZE(C1), %xmm0

        pshufd $0x44, %xmm4, %xmm2

        mulpd %xmm3, %xmm2
        addpd %xmm2, %xmm0

        movlps %xmm0, 0 * SIZE(C1)
        movhps %xmm0, 1 * SIZE(C1)
        ALIGN_4

.L999:
        popl %ebx
        popl %esi
        popl %edi
        popl %ebp

        addl $ARGS, %esp
        ret

        EPILOGUE