
gemm_kernel_4x2_sse2.S 32 kB

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define PREFETCHSIZE (8 * 4)
  41. #if !defined(HAVE_SSE2) || !defined(HAVE_MMX)
  42. #error You have to check your configuration.
  43. #endif
  44. #define STACK 16
  45. #define ARGS 0
  46. #define STACK_M 4 + STACK + ARGS(%esi)
  47. #define STACK_N 8 + STACK + ARGS(%esi)
  48. #define STACK_K 12 + STACK + ARGS(%esi)
  49. #define STACK_ALPHA 16 + STACK + ARGS(%esi)
  50. #define STACK_A 24 + STACK + ARGS(%esi)
  51. #define STACK_B 28 + STACK + ARGS(%esi)
  52. #define STACK_C 32 + STACK + ARGS(%esi)
  53. #define STACK_LDC 36 + STACK + ARGS(%esi)
  54. #define STACK_OFFT 40 + STACK + ARGS(%esi)
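/* Sketch of the calling convention assumed by the offsets above (the usual */
/* GotoBLAS/OpenBLAS gemm kernel interface): m, n, k are 32-bit integers,   */
/* alpha is one 8-byte double (so A starts at offset 24), followed by the   */
/* pointers a, b, c, the 32-bit ldc and, for TRMMKERNEL builds, the offset  */
/* argument.  Offsets are taken relative to %esi, the stack pointer saved   */
/* after the four register pushes in the prologue.                          */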
  55. #define ALPHA 0(%esp)
  56. #define K 16(%esp)
  57. #define N 20(%esp)
  58. #define M 24(%esp)
  59. #define A 28(%esp)
  60. #define C 32(%esp)
  61. #define J 36(%esp)
  62. #define BX 40(%esp)
  63. #define OLD_STACK 44(%esp)
  64. #define OFFSET 48(%esp)
  65. #define KK 52(%esp)
  66. #define KKK 56(%esp)
  67. #define BUFFER 128(%esp)
  68. #define B %edi
  69. #define LDC %ebp
  70. #define STACK_ALIGN 4096
  71. #define STACK_OFFSET 1024
  72. #define AA %edx
  73. #define BB %ecx
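/* KERNEL1..KERNEL8 below each perform one k iteration of the 4x2 tile:     */
/* a duplicated B value (xmm2/xmm3) is multiplied into two pairs of A       */
/* values (xmm0/xmm1) and accumulated into xmm4..xmm7.  The eight macros    */
/* together unroll k by 8, walking 32 doubles of A and 32 doubles of the    */
/* expanded B buffer.  The movq loads into %mm2 are never consumed; they    */
/* appear to act as software prefetches of the A panel.                     */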
  74. #define KERNEL1(address) \
  75. mulpd %xmm0, %xmm2; \
  76. mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \
  77. addpd %xmm2, %xmm4; \
  78. movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \
  79. movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \
  80. addpd %xmm0, %xmm5; \
  81. movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \
  82. mulpd %xmm0, %xmm2; \
  83. mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \
  84. addpd %xmm2, %xmm6; \
  85. movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \
  86. addpd %xmm0, %xmm7; \
  87. movapd 4 * SIZE + (address) * SIZE(AA), %xmm0
  88. #define KERNEL2(address) \
  89. mulpd %xmm0, %xmm2; \
  90. mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \
  91. addpd %xmm2, %xmm4; \
  92. movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \
  93. addpd %xmm0, %xmm5; \
  94. movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \
  95. mulpd %xmm0, %xmm2; \
  96. mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \
  97. addpd %xmm2, %xmm6; \
  98. movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \
  99. addpd %xmm0, %xmm7; \
  100. movapd 16 * SIZE + (address) * SIZE(AA), %xmm0
  101. #define KERNEL3(address) \
  102. movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \
  103. mulpd %xmm1, %xmm3; \
  104. mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \
  105. addpd %xmm3, %xmm4; \
  106. movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \
  107. addpd %xmm1, %xmm5; \
  108. movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \
  109. mulpd %xmm1, %xmm3; \
  110. mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \
  111. addpd %xmm3, %xmm6; \
  112. movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \
  113. addpd %xmm1, %xmm7; \
  114. movapd 12 * SIZE + (address) * SIZE(AA), %xmm1
  115. #define KERNEL4(address) \
  116. mulpd %xmm1, %xmm3; \
  117. mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \
  118. addpd %xmm3, %xmm4; \
  119. movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \
  120. addpd %xmm1, %xmm5; \
  121. movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \
  122. mulpd %xmm1, %xmm3; \
  123. mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \
  124. addpd %xmm3, %xmm6; \
  125. movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \
  126. addpd %xmm1, %xmm7; \
  127. movapd 24 * SIZE + (address) * SIZE(AA), %xmm1
  128. #define KERNEL5(address) \
  129. mulpd %xmm0, %xmm2; \
  130. mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \
  131. addpd %xmm2, %xmm4; \
  132. movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \
  133. movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \
  134. addpd %xmm0, %xmm5; \
  135. movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \
  136. mulpd %xmm0, %xmm2; \
  137. mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \
  138. addpd %xmm2, %xmm6; \
  139. movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \
  140. addpd %xmm0, %xmm7; \
  141. movapd 20 * SIZE + (address) * SIZE(AA), %xmm0
  142. #define KERNEL6(address) \
  143. mulpd %xmm0, %xmm2; \
  144. mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \
  145. addpd %xmm2, %xmm4; \
  146. movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \
  147. addpd %xmm0, %xmm5; \
  148. movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \
  149. mulpd %xmm0, %xmm2; \
  150. mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \
  151. addpd %xmm2, %xmm6; \
  152. movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \
  153. addpd %xmm0, %xmm7; \
  154. movapd 32 * SIZE + (address) * SIZE(AA), %xmm0
  155. #define KERNEL7(address) \
  156. movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \
  157. mulpd %xmm1, %xmm3; \
  158. mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \
  159. addpd %xmm3, %xmm4; \
  160. movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \
  161. addpd %xmm1, %xmm5; \
  162. movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \
  163. mulpd %xmm1, %xmm3; \
  164. mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \
  165. addpd %xmm3, %xmm6; \
  166. movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \
  167. addpd %xmm1, %xmm7; \
  168. movapd 28 * SIZE + (address) * SIZE(AA), %xmm1
  169. #define KERNEL8(address) \
  170. mulpd %xmm1, %xmm3; \
  171. mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \
  172. addpd %xmm3, %xmm4; \
  173. movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \
  174. addpd %xmm1, %xmm5; \
  175. movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \
  176. mulpd %xmm1, %xmm3; \
  177. mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \
  178. addpd %xmm3, %xmm6; \
  179. movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \
  180. addpd %xmm1, %xmm7; \
  181. movapd 40 * SIZE + (address) * SIZE(AA), %xmm1
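/* Entry: save the callee-saved registers, carve an aligned scratch area    */
/* off the stack (BUFFER holds the expanded B panel), copy the arguments    */
/* into local slots, and process C in strips of two columns (j = n >> 1),   */
/* with an odd last column handled separately at .L100.                     */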
  182. PROLOGUE
  183. pushl %ebp
  184. pushl %edi
  185. pushl %esi
  186. pushl %ebx
  187. PROFCODE
  188. EMMS
  189. movl %esp, %esi # save old stack
  190. subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
  191. andl $-STACK_ALIGN, %esp
  192. addl $STACK_OFFSET, %esp
  193. STACK_TOUCHING
  194. movd STACK_M, %mm0
  195. movl STACK_N, %eax
  196. movd STACK_K, %mm1
  197. movd STACK_A, %mm2
  198. movq STACK_ALPHA, %mm7
  199. movl STACK_B, B
  200. movd STACK_C, %mm3
  201. movl STACK_LDC, LDC
  202. #ifdef TRMMKERNEL
  203. movd STACK_OFFT, %mm4
  204. #endif
  205. movq %mm7, 0 * SIZE + ALPHA
  206. movq %mm7, 1 * SIZE + ALPHA
  207. movd %mm1, K
  208. movl %eax, N
  209. movd %mm0, M
  210. movd %mm2, A
  211. movd %mm3, C
  212. movl %esi, OLD_STACK
  213. #ifdef TRMMKERNEL
  214. movd %mm4, OFFSET
  215. movd %mm4, KK
  216. #ifndef LEFT
  217. negl KK
  218. #endif
  219. #endif
  220. sall $BASE_SHIFT, LDC
  221. sarl $1, %eax # j = (n >> 1)
  222. movl %eax, J
  223. jle .L100
  224. ALIGN_2
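/* .L01: outer loop over pairs of B columns.  Each pass first expands the   */
/* current 2-column panel of B into BUFFER, duplicating every element with  */
/* unpcklpd so the compute loops can use aligned movapd loads instead of    */
/* runtime broadcasts.                                                      */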
  225. .L01:
  226. #if defined(TRMMKERNEL) && defined(LEFT)
  227. movl OFFSET, %eax
  228. movl %eax, KK
  229. #endif
  230. /* Copying to Sub Buffer */
  231. leal BUFFER, %ecx
  232. movl K, %eax
  233. sarl $2, %eax
  234. jle .L03
  235. ALIGN_2
  236. .L02:
  237. movsd 0 * SIZE(B), %xmm0
  238. movsd 1 * SIZE(B), %xmm1
  239. movsd 2 * SIZE(B), %xmm2
  240. movsd 3 * SIZE(B), %xmm3
  241. movsd 4 * SIZE(B), %xmm4
  242. movsd 5 * SIZE(B), %xmm5
  243. movsd 6 * SIZE(B), %xmm6
  244. movsd 7 * SIZE(B), %xmm7
  245. unpcklpd %xmm0, %xmm0
  246. unpcklpd %xmm1, %xmm1
  247. unpcklpd %xmm2, %xmm2
  248. unpcklpd %xmm3, %xmm3
  249. unpcklpd %xmm4, %xmm4
  250. unpcklpd %xmm5, %xmm5
  251. unpcklpd %xmm6, %xmm6
  252. unpcklpd %xmm7, %xmm7
  253. movapd %xmm0, 0 * SIZE(%ecx)
  254. movapd %xmm1, 2 * SIZE(%ecx)
  255. movapd %xmm2, 4 * SIZE(%ecx)
  256. movapd %xmm3, 6 * SIZE(%ecx)
  257. movapd %xmm4, 8 * SIZE(%ecx)
  258. movapd %xmm5, 10 * SIZE(%ecx)
  259. movapd %xmm6, 12 * SIZE(%ecx)
  260. movapd %xmm7, 14 * SIZE(%ecx)
  261. prefetcht0 104 * SIZE(B)
  262. addl $ 8 * SIZE, B
  263. subl $-16 * SIZE, %ecx
  264. decl %eax
  265. BRANCH
  266. jne .L02
  267. ALIGN_2
  268. .L03:
  269. movl K, %eax
  270. andl $3, %eax
  271. BRANCH
  272. jle .L05
  273. ALIGN_4
  274. .L04:
  275. movsd 0 * SIZE(B), %xmm0
  276. movsd 1 * SIZE(B), %xmm1
  277. unpcklpd %xmm0, %xmm0
  278. unpcklpd %xmm1, %xmm1
  279. movapd %xmm0, 0 * SIZE(%ecx)
  280. movapd %xmm1, 2 * SIZE(%ecx)
  281. addl $2 * SIZE, B
  282. addl $4 * SIZE, %ecx
  283. decl %eax
  284. BRANCH
  285. jne .L04
  286. ALIGN_4
  287. .L05:
  288. movl B, BX
  289. movl C, %esi # coffset = c
  290. movl A, %edx # aoffset = a
  291. movl M, %ebx
  292. sarl $2, %ebx # i = (m >> 2)
  293. NOBRANCH
  294. jle .L30
  295. ALIGN_4
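/* .L10: 4x2 tiles of C (i = m >> 2).  xmm4..xmm7 accumulate the tile; the  */
/* PENTIUM4 build runs the fully unrolled KERNEL1..8 chain, while the       */
/* generic path loops over the same eight macros, eight k steps at a time.  */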
  296. .L10:
  297. #if !defined(TRMMKERNEL) || \
  298. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  299. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  300. leal BUFFER, BB
  301. movapd 0 * SIZE + BUFFER, %xmm2
  302. pxor %xmm4, %xmm4
  303. movapd 0 * SIZE(AA), %xmm0
  304. pxor %xmm5, %xmm5
  305. movapd 8 * SIZE + BUFFER, %xmm3
  306. pxor %xmm6, %xmm6
  307. movapd 8 * SIZE(AA), %xmm1
  308. pxor %xmm7, %xmm7
  309. #else
  310. leal BUFFER, BB
  311. movl KK, %eax
  312. leal (, %eax, SIZE), %eax
  313. leal (AA, %eax, 4), AA
  314. leal (BB, %eax, 4), BB /* because it's doubled */
  315. movapd 0 * SIZE(BB), %xmm2
  316. pxor %xmm4, %xmm4
  317. movapd 0 * SIZE(AA), %xmm0
  318. pxor %xmm5, %xmm5
  319. movapd 8 * SIZE(BB), %xmm3
  320. pxor %xmm6, %xmm6
  321. movapd 8 * SIZE(AA), %xmm1
  322. pxor %xmm7, %xmm7
  323. #endif
  324. prefetchnta 3 * SIZE(%esi)
  325. prefetchnta 3 * SIZE(%esi, LDC)
  326. movl BX, %eax
  327. prefetcht2 0 * SIZE(%eax)
  328. subl $-8 * SIZE, %eax
  329. movl %eax, BX
  330. #ifndef TRMMKERNEL
  331. movl K, %eax
  332. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  333. movl K, %eax
  334. subl KK, %eax
  335. movl %eax, KKK
  336. #else
  337. movl KK, %eax
  338. #ifdef LEFT
  339. addl $4, %eax
  340. #else
  341. addl $2, %eax
  342. #endif
  343. movl %eax, KKK
  344. #endif
  345. #ifdef PENTIUM4
  346. andl $-8, %eax
  347. NOBRANCH
  348. je .L12
  349. sall $3, %eax
  350. .align 8
  351. .L1X:
  352. KERNEL1(32 * 0)
  353. KERNEL2(32 * 0)
  354. KERNEL3(32 * 0)
  355. KERNEL4(32 * 0)
  356. KERNEL5(32 * 0)
  357. KERNEL6(32 * 0)
  358. KERNEL7(32 * 0)
  359. KERNEL8(32 * 0)
  360. cmpl $64 * 1, %eax
  361. NOBRANCH
  362. jle .L11
  363. KERNEL1(32 * 1)
  364. KERNEL2(32 * 1)
  365. KERNEL3(32 * 1)
  366. KERNEL4(32 * 1)
  367. KERNEL5(32 * 1)
  368. KERNEL6(32 * 1)
  369. KERNEL7(32 * 1)
  370. KERNEL8(32 * 1)
  371. cmpl $64 * 2, %eax
  372. NOBRANCH
  373. jle .L11
  374. KERNEL1(32 * 2)
  375. KERNEL2(32 * 2)
  376. KERNEL3(32 * 2)
  377. KERNEL4(32 * 2)
  378. KERNEL5(32 * 2)
  379. KERNEL6(32 * 2)
  380. KERNEL7(32 * 2)
  381. KERNEL8(32 * 2)
  382. cmpl $64 * 3, %eax
  383. NOBRANCH
  384. jle .L11
  385. KERNEL1(32 * 3)
  386. KERNEL2(32 * 3)
  387. KERNEL3(32 * 3)
  388. KERNEL4(32 * 3)
  389. KERNEL5(32 * 3)
  390. KERNEL6(32 * 3)
  391. KERNEL7(32 * 3)
  392. KERNEL8(32 * 3)
  393. cmpl $64 * 4, %eax
  394. NOBRANCH
  395. jle .L11
  396. KERNEL1(32 * 4)
  397. KERNEL2(32 * 4)
  398. KERNEL3(32 * 4)
  399. KERNEL4(32 * 4)
  400. KERNEL5(32 * 4)
  401. KERNEL6(32 * 4)
  402. KERNEL7(32 * 4)
  403. KERNEL8(32 * 4)
  404. cmpl $64 * 5, %eax
  405. NOBRANCH
  406. jle .L11
  407. KERNEL1(32 * 5)
  408. KERNEL2(32 * 5)
  409. KERNEL3(32 * 5)
  410. KERNEL4(32 * 5)
  411. KERNEL5(32 * 5)
  412. KERNEL6(32 * 5)
  413. KERNEL7(32 * 5)
  414. KERNEL8(32 * 5)
  415. cmpl $64 * 6, %eax
  416. NOBRANCH
  417. jle .L11
  418. KERNEL1(32 * 6)
  419. KERNEL2(32 * 6)
  420. KERNEL3(32 * 6)
  421. KERNEL4(32 * 6)
  422. KERNEL5(32 * 6)
  423. KERNEL6(32 * 6)
  424. KERNEL7(32 * 6)
  425. KERNEL8(32 * 6)
  426. cmpl $64 * 7, %eax
  427. NOBRANCH
  428. jle .L11
  429. KERNEL1(32 * 7)
  430. KERNEL2(32 * 7)
  431. KERNEL3(32 * 7)
  432. KERNEL4(32 * 7)
  433. KERNEL5(32 * 7)
  434. KERNEL6(32 * 7)
  435. KERNEL7(32 * 7)
  436. KERNEL8(32 * 7)
  437. addl $64 * 4 * SIZE, AA
  438. addl $64 * 4 * SIZE, BB
  439. subl $64 * 8, %eax
  440. BRANCH
  441. jg .L1X
  442. .L11:
  443. leal (AA, %eax, 4), AA
  444. leal (BB, %eax, 4), BB
  445. #else
  446. sarl $3, %eax
  447. je .L12
  448. .L11:
  449. KERNEL1(32 * 0)
  450. KERNEL2(32 * 0)
  451. KERNEL3(32 * 0)
  452. KERNEL4(32 * 0)
  453. KERNEL5(32 * 0)
  454. KERNEL6(32 * 0)
  455. KERNEL7(32 * 0)
  456. KERNEL8(32 * 0)
  457. addl $32 * SIZE, %ecx
  458. addl $32 * SIZE, %edx
  459. decl %eax
  460. jne .L11
  461. #endif
  462. .L12:
  463. #ifndef TRMMKERNEL
  464. movl K, %eax
  465. #else
  466. movl KKK, %eax
  467. #endif
  468. movapd ALPHA, %xmm3
  469. andl $7, %eax # k & 7 (remainder loop count)
  470. BRANCH
  471. je .L14
  472. .L13:
  473. mulpd %xmm0, %xmm2
  474. mulpd 2 * SIZE(BB), %xmm0
  475. addpd %xmm2, %xmm4
  476. movapd 0 * SIZE(BB), %xmm2
  477. addpd %xmm0, %xmm5
  478. movapd 2 * SIZE(AA), %xmm0
  479. mulpd %xmm0, %xmm2
  480. mulpd 2 * SIZE(BB), %xmm0
  481. addpd %xmm2, %xmm6
  482. movapd 4 * SIZE(BB), %xmm2
  483. addpd %xmm0, %xmm7
  484. movapd 4 * SIZE(AA), %xmm0
  485. addl $4 * SIZE, AA # aoffset += 4
  486. addl $4 * SIZE, BB # boffset += 4
  487. subl $1, %eax
  488. jg .L13
  489. ALIGN_4
  490. .L14:
  491. mulpd %xmm3, %xmm4
  492. mulpd %xmm3, %xmm5
  493. mulpd %xmm3, %xmm6
  494. mulpd %xmm3, %xmm7
  495. movl %esi, %eax
  496. orl LDC, %eax
  497. testl $15, %eax
  498. NOBRANCH
  499. jne .L18x
  500. #ifndef TRMMKERNEL
  501. movapd 0 * SIZE(%esi), %xmm0
  502. movapd 2 * SIZE(%esi), %xmm1
  503. movapd 0 * SIZE(%esi, LDC), %xmm2
  504. movapd 2 * SIZE(%esi, LDC), %xmm3
  505. addpd %xmm0, %xmm4
  506. addpd %xmm1, %xmm6
  507. addpd %xmm2, %xmm5
  508. addpd %xmm3, %xmm7
  509. #endif
  510. movapd %xmm4, 0 * SIZE(%esi)
  511. movapd %xmm6, 2 * SIZE(%esi)
  512. movapd %xmm5, 0 * SIZE(%esi, LDC)
  513. movapd %xmm7, 2 * SIZE(%esi, LDC)
  514. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  515. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  516. movl K, %eax
  517. subl KKK, %eax
  518. leal (,%eax, SIZE), %eax
  519. leal (AA, %eax, 4), AA
  520. leal (BB, %eax, 4), BB
  521. #endif
  522. #if defined(TRMMKERNEL) && defined(LEFT)
  523. addl $4, KK
  524. #endif
  525. addl $4 * SIZE, %esi # coffset += 4
  526. decl %ebx # i --
  527. BRANCH
  528. jg .L10
  529. jmp .L30
  530. ALIGN_2
  531. .L18x:
  532. #ifndef TRMMKERNEL
  533. movsd 0 * SIZE(%esi), %xmm0
  534. movhpd 1 * SIZE(%esi), %xmm0
  535. movsd 2 * SIZE(%esi), %xmm1
  536. movhpd 3 * SIZE(%esi), %xmm1
  537. movsd 0 * SIZE(%esi, LDC), %xmm2
  538. movhpd 1 * SIZE(%esi, LDC), %xmm2
  539. movsd 2 * SIZE(%esi, LDC), %xmm3
  540. movhpd 3 * SIZE(%esi, LDC), %xmm3
  541. addpd %xmm0, %xmm4
  542. addpd %xmm1, %xmm6
  543. addpd %xmm2, %xmm5
  544. addpd %xmm3, %xmm7
  545. #endif
  546. movsd %xmm4, 0 * SIZE(%esi)
  547. movhpd %xmm4, 1 * SIZE(%esi)
  548. movsd %xmm6, 2 * SIZE(%esi)
  549. movhpd %xmm6, 3 * SIZE(%esi)
  550. movsd %xmm5, 0 * SIZE(%esi, LDC)
  551. movhpd %xmm5, 1 * SIZE(%esi, LDC)
  552. movsd %xmm7, 2 * SIZE(%esi, LDC)
  553. movhpd %xmm7, 3 * SIZE(%esi, LDC)
  554. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  555. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  556. movl K, %eax
  557. subl KKK, %eax
  558. leal (,%eax, SIZE), %eax
  559. leal (AA, %eax, 4), AA
  560. leal (BB, %eax, 4), BB
  561. #endif
  562. #if defined(TRMMKERNEL) && defined(LEFT)
  563. addl $4, KK
  564. #endif
  565. addl $4 * SIZE, %esi # coffset += 4
  566. decl %ebx # i --
  567. BRANCH
  568. jg .L10
  569. ALIGN_2
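/* .L30: two leftover rows (m & 2) against the current two columns.         */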
  570. .L30:
  571. movl M, %ebx
  572. testl $2, %ebx
  573. jle .L50
  574. #if !defined(TRMMKERNEL) || \
  575. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  576. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  577. leal BUFFER, %ecx
  578. movapd 0 * SIZE + BUFFER, %xmm2
  579. pxor %xmm4, %xmm4
  580. movapd 0 * SIZE(AA), %xmm0
  581. pxor %xmm5, %xmm5
  582. movapd 8 * SIZE + BUFFER, %xmm3
  583. pxor %xmm6, %xmm6
  584. movapd 8 * SIZE(AA), %xmm1
  585. pxor %xmm7, %xmm7
  586. #else
  587. leal BUFFER, BB
  588. movl KK, %eax
  589. leal (, %eax, SIZE), %eax
  590. leal (AA, %eax, 2), AA
  591. leal (BB, %eax, 4), BB /* because it's doubled */
  592. movapd 0 * SIZE(BB), %xmm2
  593. pxor %xmm4, %xmm4
  594. movapd 0 * SIZE(AA), %xmm0
  595. pxor %xmm5, %xmm5
  596. movapd 8 * SIZE(BB), %xmm3
  597. pxor %xmm6, %xmm6
  598. movapd 8 * SIZE(AA), %xmm1
  599. pxor %xmm7, %xmm7
  600. #endif
  601. #ifndef TRMMKERNEL
  602. movl K, %eax
  603. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  604. movl K, %eax
  605. subl KK, %eax
  606. movl %eax, KKK
  607. #else
  608. movl KK, %eax
  609. addl $2, %eax
  610. movl %eax, KKK
  611. #endif
  612. sarl $3, %eax
  613. je .L32
  614. .L31:
  615. mulpd %xmm0, %xmm2
  616. mulpd 2 * SIZE(BB), %xmm0
  617. addpd %xmm2, %xmm4
  618. movapd 4 * SIZE(BB), %xmm2
  619. addpd %xmm0, %xmm5
  620. movapd 2 * SIZE(AA), %xmm0
  621. mulpd %xmm0, %xmm2
  622. mulpd 6 * SIZE(BB), %xmm0
  623. addpd %xmm2, %xmm6
  624. movapd 16 * SIZE(BB), %xmm2
  625. addpd %xmm0, %xmm7
  626. movapd 4 * SIZE(AA), %xmm0
  627. mulpd %xmm0, %xmm3
  628. mulpd 10 * SIZE(BB), %xmm0
  629. addpd %xmm3, %xmm4
  630. movapd 12 * SIZE(BB), %xmm3
  631. addpd %xmm0, %xmm5
  632. movapd 6 * SIZE(AA), %xmm0
  633. mulpd %xmm0, %xmm3
  634. mulpd 14 * SIZE(BB), %xmm0
  635. addpd %xmm3, %xmm6
  636. movapd 24 * SIZE(BB), %xmm3
  637. addpd %xmm0, %xmm7
  638. movapd 16 * SIZE(AA), %xmm0
  639. mulpd %xmm1, %xmm2
  640. mulpd 18 * SIZE(BB), %xmm1
  641. addpd %xmm2, %xmm4
  642. movapd 20 * SIZE(BB), %xmm2
  643. addpd %xmm1, %xmm5
  644. movapd 10 * SIZE(AA), %xmm1
  645. mulpd %xmm1, %xmm2
  646. mulpd 22 * SIZE(BB), %xmm1
  647. addpd %xmm2, %xmm6
  648. movapd 32 * SIZE(BB), %xmm2
  649. addpd %xmm1, %xmm7
  650. movapd 12 * SIZE(AA), %xmm1
  651. mulpd %xmm1, %xmm3
  652. mulpd 26 * SIZE(BB), %xmm1
  653. addpd %xmm3, %xmm4
  654. movapd 28 * SIZE(BB), %xmm3
  655. addpd %xmm1, %xmm5
  656. movapd 14 * SIZE(AA), %xmm1
  657. mulpd %xmm1, %xmm3
  658. mulpd 30 * SIZE(BB), %xmm1
  659. addpd %xmm3, %xmm6
  660. movapd 40 * SIZE(BB), %xmm3
  661. addpd %xmm1, %xmm7
  662. movapd 24 * SIZE(AA), %xmm1
  663. addl $16 * SIZE, AA
  664. addl $32 * SIZE, BB
  665. BRANCH
  666. decl %eax
  667. jne .L31
  668. .L32:
  669. #ifndef TRMMKERNEL
  670. movl K, %eax
  671. #else
  672. movl KKK, %eax
  673. #endif
  674. movapd ALPHA, %xmm3
  675. andl $7, %eax # k & 7 (remainder loop count)
  676. BRANCH
  677. je .L34
  678. .L33:
  679. mulpd %xmm0, %xmm2
  680. mulpd 2 * SIZE(BB), %xmm0
  681. addpd %xmm2, %xmm4
  682. movapd 4 * SIZE(BB), %xmm2
  683. addpd %xmm0, %xmm5
  684. movapd 2 * SIZE(AA), %xmm0
  685. addl $2 * SIZE, AA # aoffset += 2
  686. addl $4 * SIZE, BB # boffset += 4
  687. decl %eax
  688. BRANCH
  689. jg .L33
  690. ALIGN_4
  691. .L34:
  692. addpd %xmm6, %xmm4
  693. addpd %xmm7, %xmm5
  694. mulpd %xmm3, %xmm4
  695. mulpd %xmm3, %xmm5
  696. #ifndef TRMMKERNEL
  697. SHUFPD_1 %xmm0, %xmm0
  698. movsd 0 * SIZE(%esi), %xmm0
  699. movhpd 1 * SIZE(%esi), %xmm0
  700. SHUFPD_1 %xmm2, %xmm2
  701. movsd 0 * SIZE(%esi, LDC), %xmm2
  702. movhpd 1 * SIZE(%esi, LDC), %xmm2
  703. addpd %xmm0, %xmm4
  704. addpd %xmm2, %xmm5
  705. #endif
  706. movsd %xmm4, 0 * SIZE(%esi)
  707. movhpd %xmm4, 1 * SIZE(%esi)
  708. movsd %xmm5, 0 * SIZE(%esi, LDC)
  709. movhpd %xmm5, 1 * SIZE(%esi, LDC)
  710. addl $2 * SIZE, %esi # coffset += 2
  711. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  712. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  713. movl K, %eax
  714. subl KKK, %eax
  715. leal (,%eax, SIZE), %eax
  716. leal (AA, %eax, 2), AA
  717. leal (BB, %eax, 4), BB
  718. #endif
  719. #if defined(TRMMKERNEL) && defined(LEFT)
  720. addl $2, KK
  721. #endif
  722. ALIGN_2
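/* .L50: one leftover row (m & 1) against the current two columns, done     */
/* with scalar mulsd/addsd.                                                  */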
  723. .L50:
  724. movl M, %ebx
  725. testl $1, %ebx
  726. jle .L99
  727. #if !defined(TRMMKERNEL) || \
  728. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  729. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  730. leal BUFFER, %ecx
  731. movapd 0 * SIZE + BUFFER, %xmm2
  732. pxor %xmm4, %xmm4
  733. movapd 0 * SIZE(AA), %xmm0
  734. pxor %xmm5, %xmm5
  735. movapd 8 * SIZE + BUFFER, %xmm3
  736. pxor %xmm6, %xmm6
  737. movsd 4 * SIZE(AA), %xmm1
  738. pxor %xmm7, %xmm7
  739. #else
  740. leal BUFFER, BB
  741. movl KK, %eax
  742. leal (, %eax, SIZE), %eax
  743. leal (AA, %eax, 1), AA
  744. leal (BB, %eax, 4), BB /* because it's doubled */
  745. movapd 0 * SIZE(BB), %xmm2
  746. pxor %xmm4, %xmm4
  747. movapd 0 * SIZE(AA), %xmm0
  748. pxor %xmm5, %xmm5
  749. movapd 8 * SIZE(BB), %xmm3
  750. pxor %xmm6, %xmm6
  751. movsd 4 * SIZE(AA), %xmm1
  752. pxor %xmm7, %xmm7
  753. #endif
  754. #ifndef TRMMKERNEL
  755. movl K, %eax
  756. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  757. movl K, %eax
  758. subl KK, %eax
  759. movl %eax, KKK
  760. #else
  761. movl KK, %eax
  762. #ifdef LEFT
  763. addl $1, %eax
  764. #else
  765. addl $2, %eax
  766. #endif
  767. movl %eax, KKK
  768. #endif
  769. sarl $3, %eax
  770. je .L52
  771. .L51:
  772. mulsd %xmm0, %xmm2
  773. mulsd 2 * SIZE(BB), %xmm0
  774. addsd %xmm2, %xmm4
  775. movsd 4 * SIZE(BB), %xmm2
  776. addsd %xmm0, %xmm5
  777. movsd 1 * SIZE(AA), %xmm0
  778. mulsd %xmm0, %xmm2
  779. mulsd 6 * SIZE(BB), %xmm0
  780. addsd %xmm2, %xmm4
  781. movsd 16 * SIZE(BB), %xmm2
  782. addsd %xmm0, %xmm5
  783. movsd 2 * SIZE(AA), %xmm0
  784. mulsd %xmm0, %xmm3
  785. mulsd 10 * SIZE(BB), %xmm0
  786. addsd %xmm3, %xmm4
  787. movsd 12 * SIZE(BB), %xmm3
  788. addsd %xmm0, %xmm5
  789. movsd 3 * SIZE(AA), %xmm0
  790. mulsd %xmm0, %xmm3
  791. mulsd 14 * SIZE(BB), %xmm0
  792. addsd %xmm3, %xmm4
  793. movsd 24 * SIZE(BB), %xmm3
  794. addsd %xmm0, %xmm5
  795. movsd 8 * SIZE(AA), %xmm0
  796. mulsd %xmm1, %xmm2
  797. mulsd 18 * SIZE(BB), %xmm1
  798. addsd %xmm2, %xmm4
  799. movsd 20 * SIZE(BB), %xmm2
  800. addsd %xmm1, %xmm5
  801. movsd 5 * SIZE(AA), %xmm1
  802. mulsd %xmm1, %xmm2
  803. mulsd 22 * SIZE(BB), %xmm1
  804. addsd %xmm2, %xmm4
  805. movsd 32 * SIZE(BB), %xmm2
  806. addsd %xmm1, %xmm5
  807. movsd 6 * SIZE(AA), %xmm1
  808. mulsd %xmm1, %xmm3
  809. mulsd 26 * SIZE(BB), %xmm1
  810. addsd %xmm3, %xmm4
  811. movsd 28 * SIZE(BB), %xmm3
  812. addsd %xmm1, %xmm5
  813. movsd 7 * SIZE(AA), %xmm1
  814. mulsd %xmm1, %xmm3
  815. mulsd 30 * SIZE(BB), %xmm1
  816. addsd %xmm3, %xmm4
  817. movsd 40 * SIZE(BB), %xmm3
  818. addsd %xmm1, %xmm5
  819. movsd 12 * SIZE(AA), %xmm1
  820. addl $ 8 * SIZE, AA
  821. addl $32 * SIZE, BB
  822. BRANCH
  823. decl %eax
  824. jne .L51
  825. .L52:
  826. #ifndef TRMMKERNEL
  827. movl K, %eax
  828. #else
  829. movl KKK, %eax
  830. #endif
  831. movsd ALPHA, %xmm3
  832. andl $7, %eax # k & 7 (remainder loop count)
  833. BRANCH
  834. je .L54
  835. .L53:
  836. mulsd %xmm0, %xmm2
  837. mulsd 2 * SIZE(BB), %xmm0
  838. addsd %xmm2, %xmm4
  839. movsd 4 * SIZE(BB), %xmm2
  840. addsd %xmm0, %xmm5
  841. movsd 1 * SIZE(AA), %xmm0
  842. addl $1 * SIZE, AA # aoffset += 1
  843. addl $4 * SIZE, BB # boffset += 4
  844. decl %eax
  845. BRANCH
  846. jg .L53
  847. ALIGN_4
  848. .L54:
  849. addsd %xmm6, %xmm4
  850. addsd %xmm7, %xmm5
  851. mulsd %xmm3, %xmm4
  852. mulsd %xmm3, %xmm5
  853. #ifndef TRMMKERNEL
  854. addsd 0 * SIZE(%esi), %xmm4
  855. addsd 0 * SIZE(%esi, LDC), %xmm5
  856. #endif
  857. movsd %xmm4, 0 * SIZE(%esi)
  858. movsd %xmm5, 0 * SIZE(%esi, LDC)
  859. addl $1 * SIZE, %esi
  860. #if defined(TRMMKERNEL) && defined(LEFT)
  861. addl $1, KK
  862. #endif
  863. ALIGN_2
  864. .L99:
  865. #if defined(TRMMKERNEL) && !defined(LEFT)
  866. addl $2, KK
  867. #endif
  868. leal (, LDC, 2), %eax
  869. addl %eax, C # c += 2 * ldc
  870. BRANCH
  871. decl J # j --
  872. jg .L01
  873. ALIGN_2
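/* .L100: remaining single column when n is odd.  B is expanded into        */
/* BUFFER again, then C is updated four, two and one rows at a time,        */
/* mirroring the structure above.                                           */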
  874. .L100:
  875. movl N, %eax
  876. testl $1, %eax
  877. jle .L999
  878. ALIGN_2
  879. .L101:
  880. #if defined(TRMMKERNEL) && defined(LEFT)
  881. movl OFFSET, %eax
  882. movl %eax, KK
  883. #endif
  884. /* Copying to Sub Buffer */
  885. leal BUFFER, %ecx
  886. movl K, %eax
  887. sarl $3, %eax
  888. jle .L103
  889. ALIGN_4
  890. .L102:
  891. movsd 0 * SIZE(B), %xmm0
  892. movsd 1 * SIZE(B), %xmm1
  893. movsd 2 * SIZE(B), %xmm2
  894. movsd 3 * SIZE(B), %xmm3
  895. movsd 4 * SIZE(B), %xmm4
  896. movsd 5 * SIZE(B), %xmm5
  897. movsd 6 * SIZE(B), %xmm6
  898. movsd 7 * SIZE(B), %xmm7
  899. unpcklpd %xmm0, %xmm0
  900. unpcklpd %xmm1, %xmm1
  901. unpcklpd %xmm2, %xmm2
  902. unpcklpd %xmm3, %xmm3
  903. unpcklpd %xmm4, %xmm4
  904. unpcklpd %xmm5, %xmm5
  905. unpcklpd %xmm6, %xmm6
  906. unpcklpd %xmm7, %xmm7
  907. movapd %xmm0, 0 * SIZE(%ecx)
  908. movapd %xmm1, 2 * SIZE(%ecx)
  909. movapd %xmm2, 4 * SIZE(%ecx)
  910. movapd %xmm3, 6 * SIZE(%ecx)
  911. movapd %xmm4, 8 * SIZE(%ecx)
  912. movapd %xmm5, 10 * SIZE(%ecx)
  913. movapd %xmm6, 12 * SIZE(%ecx)
  914. movapd %xmm7, 14 * SIZE(%ecx)
  915. prefetcht0 104 * SIZE(B)
  916. addl $ 8 * SIZE, B
  917. addl $16 * SIZE, %ecx
  918. decl %eax
  919. BRANCH
  920. jne .L102
  921. ALIGN_2
  922. .L103:
  923. movl K, %eax
  924. andl $7, %eax
  925. BRANCH
  926. jle .L105
  927. ALIGN_2
  928. .L104:
  929. movsd 0 * SIZE(B), %xmm0
  930. unpcklpd %xmm0, %xmm0
  931. movapd %xmm0, 0 * SIZE(%ecx)
  932. addl $1 * SIZE, B
  933. addl $2 * SIZE, %ecx
  934. decl %eax
  935. jne .L104
  936. ALIGN_4
  937. .L105:
  938. movl C, %esi # coffset = c
  939. movl A, %edx # aoffset = a
  940. movl M, %ebx
  941. sarl $2, %ebx # i = (m >> 2)
  942. jle .L130
  943. ALIGN_4
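/* .L110: 4x1 tiles for the last column.  xmm4..xmm7 hold partial sums      */
/* over alternating k iterations and are combined just before the store.    */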
  944. .L110:
  945. #if !defined(TRMMKERNEL) || \
  946. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  947. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  948. leal BUFFER, BB
  949. movapd 0 * SIZE + BUFFER, %xmm2
  950. pxor %xmm4, %xmm4
  951. movapd 0 * SIZE(AA), %xmm0
  952. pxor %xmm5, %xmm5
  953. movapd 8 * SIZE + BUFFER, %xmm3
  954. pxor %xmm6, %xmm6
  955. movapd 8 * SIZE(AA), %xmm1
  956. pxor %xmm7, %xmm7
  957. #else
  958. leal BUFFER, BB
  959. movl KK, %eax
  960. leal (, %eax, SIZE), %eax
  961. leal (AA, %eax, 4), AA
  962. leal (BB, %eax, 2), BB
  963. movapd 0 * SIZE(BB), %xmm2
  964. pxor %xmm4, %xmm4
  965. movapd 0 * SIZE(AA), %xmm0
  966. pxor %xmm5, %xmm5
  967. movapd 8 * SIZE(BB), %xmm3
  968. pxor %xmm6, %xmm6
  969. movapd 8 * SIZE(AA), %xmm1
  970. pxor %xmm7, %xmm7
  971. #endif
  972. #ifndef TRMMKERNEL
  973. movl K, %eax
  974. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  975. movl K, %eax
  976. subl KK, %eax
  977. movl %eax, KKK
  978. #else
  979. movl KK, %eax
  980. #ifdef LEFT
  981. addl $4, %eax
  982. #else
  983. addl $1, %eax
  984. #endif
  985. movl %eax, KKK
  986. #endif
  987. sarl $3, %eax
  988. je .L112
  989. .L111:
  990. mulpd %xmm2, %xmm0
  991. mulpd 2 * SIZE(AA), %xmm2
  992. addpd %xmm0, %xmm4
  993. movapd 4 * SIZE(AA), %xmm0
  994. addpd %xmm2, %xmm6
  995. movapd 2 * SIZE(BB), %xmm2
  996. mulpd %xmm2, %xmm0
  997. mulpd 6 * SIZE(AA), %xmm2
  998. addpd %xmm0, %xmm5
  999. movapd 16 * SIZE(AA), %xmm0
  1000. addpd %xmm2, %xmm7
  1001. movapd 4 * SIZE(BB), %xmm2
  1002. mulpd %xmm2, %xmm1
  1003. mulpd 10 * SIZE(AA), %xmm2
  1004. addpd %xmm1, %xmm4
  1005. movapd 12 * SIZE(AA), %xmm1
  1006. addpd %xmm2, %xmm6
  1007. movapd 6 * SIZE(BB), %xmm2
  1008. mulpd %xmm2, %xmm1
  1009. mulpd 14 * SIZE(AA), %xmm2
  1010. addpd %xmm1, %xmm5
  1011. movapd 24 * SIZE(AA), %xmm1
  1012. addpd %xmm2, %xmm7
  1013. movapd 16 * SIZE(BB), %xmm2
  1014. mulpd %xmm3, %xmm0
  1015. mulpd 18 * SIZE(AA), %xmm3
  1016. addpd %xmm0, %xmm4
  1017. movapd 20 * SIZE(AA), %xmm0
  1018. addpd %xmm3, %xmm6
  1019. movapd 10 * SIZE(BB), %xmm3
  1020. mulpd %xmm3, %xmm0
  1021. mulpd 22 * SIZE(AA), %xmm3
  1022. addpd %xmm0, %xmm5
  1023. movapd 32 * SIZE(AA), %xmm0
  1024. addpd %xmm3, %xmm7
  1025. movapd 12 * SIZE(BB), %xmm3
  1026. mulpd %xmm3, %xmm1
  1027. mulpd 26 * SIZE(AA), %xmm3
  1028. addpd %xmm1, %xmm4
  1029. movapd 28 * SIZE(AA), %xmm1
  1030. addpd %xmm3, %xmm6
  1031. movapd 14 * SIZE(BB), %xmm3
  1032. mulpd %xmm3, %xmm1
  1033. mulpd 30 * SIZE(AA), %xmm3
  1034. addpd %xmm1, %xmm5
  1035. movapd 40 * SIZE(AA), %xmm1
  1036. addpd %xmm3, %xmm7
  1037. movapd 24 * SIZE(BB), %xmm3
  1038. addl $32 * SIZE, AA
  1039. addl $16 * SIZE, BB
  1040. decl %eax
  1041. jne .L111
  1042. .L112:
  1043. #ifndef TRMMKERNEL
  1044. movl K, %eax
  1045. #else
  1046. movl KKK, %eax
  1047. #endif
  1048. movapd ALPHA, %xmm3
  1049. andl $7, %eax # k & 7 (remainder loop count)
  1050. BRANCH
  1051. je .L114
  1052. .L113:
  1053. mulpd %xmm2, %xmm0
  1054. mulpd 2 * SIZE(AA), %xmm2
  1055. addpd %xmm0, %xmm4
  1056. movapd 4 * SIZE(AA), %xmm0
  1057. addpd %xmm2, %xmm6
  1058. movapd 2 * SIZE(BB), %xmm2
  1059. addl $4 * SIZE, AA # aoffset += 4
  1060. addl $2 * SIZE, BB # boffset += 2
  1061. subl $1, %eax
  1062. jg .L113
  1063. ALIGN_4
  1064. .L114:
  1065. addpd %xmm5, %xmm4
  1066. addpd %xmm7, %xmm6
  1067. mulpd %xmm3, %xmm4
  1068. mulpd %xmm3, %xmm6
  1069. #ifndef TRMMKERNEL
  1070. movsd 0 * SIZE(%esi), %xmm0
  1071. movhpd 1 * SIZE(%esi), %xmm0
  1072. addpd %xmm0, %xmm4
  1073. movsd 2 * SIZE(%esi), %xmm1
  1074. movhpd 3 * SIZE(%esi), %xmm1
  1075. addpd %xmm1, %xmm6
  1076. #endif
  1077. movsd %xmm4, 0 * SIZE(%esi)
  1078. unpckhpd %xmm4, %xmm4
  1079. movsd %xmm4, 1 * SIZE(%esi)
  1080. movsd %xmm6, 2 * SIZE(%esi)
  1081. unpckhpd %xmm6, %xmm6
  1082. movsd %xmm6, 3 * SIZE(%esi)
  1083. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1084. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1085. movl K, %eax
  1086. subl KKK, %eax
  1087. leal (,%eax, SIZE), %eax
  1088. leal (AA, %eax, 4), AA
  1089. leal (BB, %eax, 2), BB
  1090. #endif
  1091. #if defined(TRMMKERNEL) && defined(LEFT)
  1092. addl $4, KK
  1093. #endif
  1094. addl $4 * SIZE, %esi # coffset += 4
  1095. BRANCH
  1096. decl %ebx # i --
  1097. jg .L110
  1098. ALIGN_2
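/* .L130: two leftover rows (m & 2) for the last column.                    */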
  1099. .L130:
  1100. movl M, %ebx
  1101. testl $2, %ebx
  1102. jle .L150
  1103. #if !defined(TRMMKERNEL) || \
  1104. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1105. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1106. leal BUFFER, BB
  1107. movapd 0 * SIZE + BUFFER, %xmm2
  1108. pxor %xmm4, %xmm4
  1109. movapd 0 * SIZE(AA), %xmm0
  1110. pxor %xmm5, %xmm5
  1111. movapd 8 * SIZE + BUFFER, %xmm3
  1112. pxor %xmm6, %xmm6
  1113. movapd 8 * SIZE(AA), %xmm1
  1114. pxor %xmm7, %xmm7
  1115. #else
  1116. leal BUFFER, BB
  1117. movl KK, %eax
  1118. leal (, %eax, SIZE), %eax
  1119. leal (AA, %eax, 2), AA
  1120. leal (BB, %eax, 2), BB
  1121. movapd 0 * SIZE(BB), %xmm2
  1122. pxor %xmm4, %xmm4
  1123. movapd 0 * SIZE(AA), %xmm0
  1124. pxor %xmm5, %xmm5
  1125. movapd 8 * SIZE(BB), %xmm3
  1126. pxor %xmm6, %xmm6
  1127. movapd 8 * SIZE(AA), %xmm1
  1128. pxor %xmm7, %xmm7
  1129. #endif
  1130. #ifndef TRMMKERNEL
  1131. movl K, %eax
  1132. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1133. movl K, %eax
  1134. subl KK, %eax
  1135. movl %eax, KKK
  1136. #else
  1137. movl KK, %eax
  1138. #ifdef LEFT
  1139. addl $2, %eax
  1140. #else
  1141. addl $1, %eax
  1142. #endif
  1143. movl %eax, KKK
  1144. #endif
  1145. sarl $3, %eax
  1146. je .L132
  1147. .L131:
  1148. mulpd %xmm0, %xmm2
  1149. movapd 2 * SIZE(AA), %xmm0
  1150. addpd %xmm2, %xmm4
  1151. mulpd 2 * SIZE(BB), %xmm0
  1152. movapd 16 * SIZE(BB), %xmm2
  1153. addpd %xmm0, %xmm5
  1154. movapd 4 * SIZE(AA), %xmm0
  1155. mulpd 4 * SIZE(BB), %xmm0
  1156. addpd %xmm0, %xmm6
  1157. movapd 6 * SIZE(AA), %xmm0
  1158. mulpd 6 * SIZE(BB), %xmm0
  1159. addpd %xmm0, %xmm7
  1160. movapd 16 * SIZE(AA), %xmm0
  1161. mulpd %xmm1, %xmm3
  1162. movapd 10 * SIZE(AA), %xmm1
  1163. addpd %xmm3, %xmm4
  1164. mulpd 10 * SIZE(BB), %xmm1
  1165. movapd 24 * SIZE(BB), %xmm3
  1166. addpd %xmm1, %xmm5
  1167. movapd 12 * SIZE(AA), %xmm1
  1168. mulpd 12 * SIZE(BB), %xmm1
  1169. addpd %xmm1, %xmm6
  1170. movapd 14 * SIZE(AA), %xmm1
  1171. mulpd 14 * SIZE(BB), %xmm1
  1172. addpd %xmm1, %xmm7
  1173. movapd 24 * SIZE(AA), %xmm1
  1174. addl $16 * SIZE, AA
  1175. addl $16 * SIZE, BB
  1176. BRANCH
  1177. decl %eax
  1178. jne .L131
  1179. .L132:
  1180. #ifndef TRMMKERNEL
  1181. movl K, %eax
  1182. #else
  1183. movl KKK, %eax
  1184. #endif
  1185. movapd ALPHA, %xmm3
  1186. andl $7, %eax # k & 7 (remainder loop count)
  1187. BRANCH
  1188. je .L134
  1189. .L133:
  1190. movapd 0 * SIZE(AA), %xmm0
  1191. mulpd 0 * SIZE(BB), %xmm0
  1192. addpd %xmm0, %xmm4
  1193. addl $2 * SIZE, AA # aoffset += 2
  1194. addl $2 * SIZE, BB # boffset += 2
  1195. decl %eax
  1196. BRANCH
  1197. jg .L133
  1198. ALIGN_4
  1199. .L134:
  1200. addpd %xmm5, %xmm4
  1201. addpd %xmm7, %xmm6
  1202. addpd %xmm6, %xmm4
  1203. mulpd %xmm3, %xmm4
  1204. #ifndef TRMMKERNEL
  1205. movsd 0 * SIZE(%esi), %xmm0
  1206. movhpd 1 * SIZE(%esi), %xmm0
  1207. addpd %xmm0, %xmm4
  1208. #endif
  1209. movsd %xmm4, 0 * SIZE(%esi)
  1210. unpckhpd %xmm4, %xmm4
  1211. movsd %xmm4, 1 * SIZE(%esi)
  1212. addl $2 * SIZE, %esi # coffset += 2
  1213. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1214. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1215. movl K, %eax
  1216. subl KKK, %eax
  1217. leal (,%eax, SIZE), %eax
  1218. leal (AA, %eax, 2), AA
  1219. leal (BB, %eax, 2), BB
  1220. #endif
  1221. #if defined(TRMMKERNEL) && defined(LEFT)
  1222. addl $2, KK
  1223. #endif
  1224. ALIGN_2
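/* .L150: final single element (m & 1 of the last column).                  */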
  1225. .L150:
  1226. movl M, %ebx
  1227. testl $1, %ebx
  1228. jle .L999
  1229. #if !defined(TRMMKERNEL) || \
  1230. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1231. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1232. leal BUFFER, BB
  1233. movapd 0 * SIZE + BUFFER, %xmm2
  1234. pxor %xmm4, %xmm4
  1235. movapd 0 * SIZE(AA), %xmm0
  1236. pxor %xmm5, %xmm5
  1237. movapd 8 * SIZE + BUFFER, %xmm3
  1238. pxor %xmm6, %xmm6
  1239. movapd 4 * SIZE(AA), %xmm1
  1240. pxor %xmm7, %xmm7
  1241. #else
  1242. leal BUFFER, BB
  1243. movl KK, %eax
  1244. leal (, %eax, SIZE), %eax
  1245. leal (AA, %eax, 1), AA
  1246. leal (BB, %eax, 2), BB
  1247. movapd 0 * SIZE(BB), %xmm2
  1248. pxor %xmm4, %xmm4
  1249. movapd 0 * SIZE(AA), %xmm0
  1250. pxor %xmm5, %xmm5
  1251. movapd 8 * SIZE(BB), %xmm3
  1252. pxor %xmm6, %xmm6
  1253. movapd 4 * SIZE(AA), %xmm1
  1254. pxor %xmm7, %xmm7
  1255. #endif
  1256. #ifndef TRMMKERNEL
  1257. movl K, %eax
  1258. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1259. movl K, %eax
  1260. subl KK, %eax
  1261. movl %eax, KKK
  1262. #else
  1263. movl KK, %eax
  1264. addl $1, %eax
  1265. movl %eax, KKK
  1266. #endif
  1267. sarl $3, %eax
  1268. je .L152
  1269. .L151:
  1270. mulsd %xmm0, %xmm2
  1271. movsd 1 * SIZE(AA), %xmm0
  1272. addsd %xmm2, %xmm4
  1273. mulsd 2 * SIZE(BB), %xmm0
  1274. movsd 16 * SIZE(BB), %xmm2
  1275. addsd %xmm0, %xmm4
  1276. movsd 2 * SIZE(AA), %xmm0
  1277. mulsd 4 * SIZE(BB), %xmm0
  1278. addsd %xmm0, %xmm4
  1279. movsd 3 * SIZE(AA), %xmm0
  1280. mulsd 6 * SIZE(BB), %xmm0
  1281. addsd %xmm0, %xmm4
  1282. movsd 8 * SIZE(AA), %xmm0
  1283. mulsd %xmm1, %xmm3
  1284. movsd 5 * SIZE(AA), %xmm1
  1285. addsd %xmm3, %xmm4
  1286. mulsd 10 * SIZE(BB), %xmm1
  1287. movsd 24 * SIZE(BB), %xmm3
  1288. addsd %xmm1, %xmm4
  1289. movsd 6 * SIZE(AA), %xmm1
  1290. mulsd 12 * SIZE(BB), %xmm1
  1291. addsd %xmm1, %xmm4
  1292. movsd 7 * SIZE(AA), %xmm1
  1293. mulsd 14 * SIZE(BB), %xmm1
  1294. addsd %xmm1, %xmm4
  1295. movsd 12 * SIZE(AA), %xmm1
  1296. addl $ 8 * SIZE, AA
  1297. addl $16 * SIZE, BB
  1298. BRANCH
  1299. decl %eax
  1300. jne .L151
  1301. .L152:
  1302. #ifndef TRMMKERNEL
  1303. movl K, %eax
  1304. #else
  1305. movl KKK, %eax
  1306. #endif
  1307. movsd ALPHA, %xmm3
  1308. andl $7, %eax # k & 7 (remainder loop count)
  1309. BRANCH
  1310. je .L154
  1311. .L153:
  1312. movsd 0 * SIZE(AA), %xmm0
  1313. mulsd 0 * SIZE(BB), %xmm0
  1314. addsd %xmm0, %xmm4
  1315. addl $1 * SIZE, AA # aoffset += 1
  1316. addl $2 * SIZE, BB # boffset += 2
  1317. decl %eax
  1318. BRANCH
  1319. jg .L153
  1320. ALIGN_4
  1321. .L154:
  1322. addsd %xmm6, %xmm4
  1323. addsd %xmm7, %xmm5
  1324. mulsd %xmm3, %xmm4
  1325. #ifndef TRMMKERNEL
  1326. addsd 0 * SIZE(%esi), %xmm4
  1327. #endif
  1328. movsd %xmm4, 0 * SIZE(%esi)
  1329. ALIGN_2
  1330. .L999:
  1331. movl OLD_STACK, %esp
  1332. EMMS
  1333. popl %ebx
  1334. popl %esi
  1335. popl %edi
  1336. popl %ebp
  1337. ret
  1338. ALIGN_2
  1339. EPILOGUE